#include "utf8.h" /* * utf8_to_unicode() * * Convert a UTF-8 sequence to its unicode value, and return the length of * the sequence in bytes. * * NOTE! Invalid UTF-8 will be converted to a one-byte sequence, so you can * either use it as-is (ie as Latin1) or you can check for invalid UTF-8 * by checking for a length of 1 and a result > 127. * * NOTE 2! This does *not* verify things like minimality. So overlong forms * are happily accepted and decoded, as are the various "invalid values". */ unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res) { unsigned value; unsigned char c = line[index]; unsigned bytes, mask, i; *res = c; line += index; len -= index; /* * 0xxxxxxx is valid utf8 * 10xxxxxx is invalid UTF-8, we assume it is Latin1 */ if (c < 0xc0) return 1; /* Ok, it's 11xxxxxx, do a stupid decode */ mask = 0x20; bytes = 2; while (c & mask) { bytes++; mask >>= 1; } /* Invalid? Do it as a single byte Latin1 */ if (bytes > 6) return 1; if (bytes > len) return 1; value = c & (mask-1); /* Ok, do the bytes */ for (i = 1; i < bytes; i++) { c = line[i]; if ((c & 0xc0) != 0x80) return 1; value = (value << 6) | (c & 0x3f); } *res = value; return bytes; } static void reverse_string(char *begin, char *end) { do { char a = *begin, b = *end; *end = a; *begin = b; begin++; end--; } while (begin < end); } /* * unicode_to_utf8() * * Convert a unicode value to its canonical utf-8 sequence. * * NOTE! This does not check for - or care about - the "invalid" unicode * values. Also, converting a utf-8 sequence to unicode and back does * *not* guarantee the same sequence, since this generates the shortest * possible sequence, while utf8_to_unicode() accepts both Latin1 and * overlong utf-8 sequences. */ unsigned unicode_to_utf8(unsigned int c, char *utf8) { int bytes = 1; *utf8 = c; if (c > 0x7f) { int prefix = 0x40; char *p = utf8; do { *p++ = 0x80 + (c & 0x3f); bytes++; prefix >>= 1; c >>= 6; } while (c > prefix); *p = c - 2*prefix; reverse_string(utf8, p); } return bytes; }