/* utf8.c */
1 #include "utf8.h" 2 3 // TODO(not important): 4 // optimize charsize algorthim 5 6 #define LEN(a) (sizeof(a) / sizeof(a)[0]) 7 #define BETWEEN(x, a, b) ((a) <= (x) && (x) <= (b)) 8 9 static const uint8_t utfbyte[UTF_SIZ + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0}; 10 static const uint8_t utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8}; 11 12 static size_t utf8_decode(const char *, rune_t *, size_t); 13 static rune_t utf8_decodebyte(char, size_t *); 14 static char utf8_encodebyte(rune_t, size_t); 15 static size_t utf8_validate(rune_t *, size_t); 16 17 rune_t 18 utf8_decodebyte(char c, size_t *i) 19 { 20 for (*i = 0; *i < LEN(utfmask); ++(*i)) 21 if (((uint8_t)c & utfmask[*i]) == utfbyte[*i]) 22 return (uint8_t)c & ~utfmask[*i]; 23 return 0; 24 } 25 26 size_t 27 utf8_encode(rune_t u, char *c) 28 { 29 size_t len, i; 30 31 len = utf8_validate(&u, 0); 32 if (len > UTF_SIZ) 33 return 0; 34 35 for (i = len - 1; i != 0; --i) { 36 c[i] = utf8_encodebyte(u, 0); 37 u >>= 6; 38 } 39 c[0] = utf8_encodebyte(u, len); 40 41 return len; 42 } 43 44 char 45 utf8_encodebyte(rune_t u, size_t i) 46 { 47 return utfbyte[i] | (u & ~utfmask[i]); 48 } 49 50 size_t 51 utf8_validate(rune_t *u, size_t i) 52 { 53 const rune_t utfmin[UTF_SIZ + 1] = { 0, 0, 0x80, 0x800, 0x10000}; 54 const rune_t utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF}; 55 56 if (!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF)) 57 *u = UTF_INVALID; 58 for (i = 1; *u > utfmax[i]; ++i) 59 ; 60 61 return i; 62 } 63 64 size_t 65 utf8_decode(const char *c, rune_t *u, size_t clen) 66 { 67 size_t i, j, len, type; 68 rune_t udecoded; 69 70 *u = UTF_INVALID; 71 if (!clen) 72 return 0; 73 udecoded = utf8_decodebyte(c[0], &len); 74 if (!BETWEEN(len, 1, UTF_SIZ)) 75 return 1; 76 for (i = 1, j = 1; i < clen && j < len; ++i, ++j) { 77 udecoded = (udecoded << 6) | utf8_decodebyte(c[i], &type); 78 if (type != 0) 79 return j; 80 } 81 if (j < len) 82 return 0; 83 *u = udecoded; 84 utf8_validate(u, len); 85 86 
return len; 87 } 88 89 int 90 utf8_decode_buffer(const char* buffer, const int buflen, rune_t* u) 91 { 92 if (!buflen) return 0; 93 94 rune_t u_tmp; 95 int charsize; 96 if (!u) 97 u = &u_tmp; 98 99 // process a complete utf8 char 100 charsize = utf8_decode(buffer, u, buflen); 101 102 return charsize; 103 } 104 105 void 106 utf8_remove_string_end(char* string) 107 { 108 char* end = string + strlen(string); 109 if (end == string) 110 return; 111 112 do { 113 end--; 114 // if byte starts with 0b10, byte is an UTF-8 extender 115 } while (end > string && (*end & 0xC0) == 0x80); 116 *end = 0; 117 }