se

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

utf8.c (2431B)


      1 #include "utf8.h"
      2 
      3 // TODO(not important):
      4 // optimize the charsize algorithm
      5 
      6 #define LEN(a)			(sizeof(a) / sizeof(a)[0])
      7 #define BETWEEN(x, a, b)	((a) <= (x) && (x) <= (b))
      8 
      9 static const uint8_t utfbyte[UTF_SIZ + 1] = {0x80,    0, 0xC0, 0xE0, 0xF0};
     10 static const uint8_t utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
     11 
     12 static size_t utf8_decode(const char *, rune_t *, size_t);
     13 static rune_t utf8_decodebyte(char, size_t *);
     14 static char   utf8_encodebyte(rune_t, size_t);
     15 static size_t utf8_validate(rune_t *, size_t);
     16 
     17 rune_t
     18 utf8_decodebyte(char c, size_t *i)
     19 {
     20 		for (*i = 0; *i < LEN(utfmask); ++(*i))
     21 				if (((uint8_t)c & utfmask[*i]) == utfbyte[*i])
     22 						return (uint8_t)c & ~utfmask[*i];
     23 		return 0;
     24 }
     25 
     26 size_t
     27 utf8_encode(rune_t u, char *c)
     28 {
     29 		size_t len, i;
     30 
     31 		len = utf8_validate(&u, 0);
     32 		if (len > UTF_SIZ)
     33 				return 0;
     34 
     35 		for (i = len - 1; i != 0; --i) {
     36 				c[i] = utf8_encodebyte(u, 0);
     37 				u >>= 6;
     38 		}
     39 		c[0] = utf8_encodebyte(u, len);
     40 
     41 		return len;
     42 }
     43 
     44 char
     45 utf8_encodebyte(rune_t u, size_t i)
     46 {
     47 		return utfbyte[i] | (u & ~utfmask[i]);
     48 }
     49 
     50 size_t
     51 utf8_validate(rune_t *u, size_t i)
     52 {
     53 		const rune_t utfmin[UTF_SIZ + 1] = {       0,    0,  0x80,  0x800,  0x10000};
     54 		const rune_t utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
     55 
     56 		if (!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF))
     57 				*u = UTF_INVALID;
     58 		for (i = 1; *u > utfmax[i]; ++i)
     59 				;
     60 
     61 		return i;
     62 }
     63 
     64 size_t
     65 utf8_decode(const char *c, rune_t *u, size_t clen)
     66 {
     67 		size_t i, j, len, type;
     68 		rune_t udecoded;
     69 
     70 		*u = UTF_INVALID;
     71 		if (!clen)
     72 				return 0;
     73 		udecoded = utf8_decodebyte(c[0], &len);
     74 		if (!BETWEEN(len, 1, UTF_SIZ))
     75 				return 1;
     76 		for (i = 1, j = 1; i < clen && j < len; ++i, ++j) {
     77 				udecoded = (udecoded << 6) | utf8_decodebyte(c[i], &type);
     78 				if (type != 0)
     79 						return j;
     80 		}
     81 		if (j < len)
     82 				return 0;
     83 		*u = udecoded;
     84 		utf8_validate(u, len);
     85 
     86 		return len;
     87 }
     88 
     89 int
     90 utf8_decode_buffer(const char* buffer, const int buflen, rune_t* u)
     91 {
     92 		if (!buflen) return 0;
     93 
     94 		rune_t u_tmp;
     95 		int charsize;
     96 		if (!u)
     97 				u = &u_tmp;
     98 
     99 		// process a complete utf8 char
    100 		charsize = utf8_decode(buffer, u, buflen);
    101 
    102 		return charsize;
    103 }
    104 
    105 void
    106 utf8_remove_string_end(char* string)
    107 {
    108 		char* end = string + strlen(string);
    109 		if (end == string)
    110 				return;
    111 
    112 		do {
    113 				end--;
    114 				// if byte starts with 0b10, byte is an UTF-8 extender
    115 		} while (end > string && (*end & 0xC0) == 0x80);
    116 		*end = 0;
    117 }