diff options
| author | William Casarin <jb55@jb55.com> | 2018-07-09 12:10:32 -0700 |
|---|---|---|
| committer | William Casarin <jb55@jb55.com> | 2018-07-09 12:10:32 -0700 |
| commit | 1b8fbbd843ddeb5fc81c9303db9c590a436d499b (patch) | |
| tree | a7227dfe8e4fbaee7b1e0b58b24994dce8078f3f /util.c | |
| parent | 37a9cdd2e80386f2c94e14e4f511284ae14c745a (diff) | |
progress
Diffstat (limited to 'util.c')
| -rw-r--r-- | util.c | 49 |
1 files changed, 49 insertions, 0 deletions
@@ -6,6 +6,11 @@ #include "util.h" +static const unsigned char utfbyte[UTF_SIZ + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0}; +static const unsigned char utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8}; +static const long utfmin[UTF_SIZ + 1] = { 0, 0, 0x80, 0x800, 0x10000}; +static const long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF}; + void * ecalloc(size_t nmemb, size_t size) { @@ -33,3 +38,47 @@ die(const char *fmt, ...) { exit(1); } + +long +utf8decodebyte(const char c, size_t *i) +{ + for (*i = 0; *i < (UTF_SIZ + 1); ++(*i)) + if (((unsigned char)c & utfmask[*i]) == utfbyte[*i]) + return (unsigned char)c & ~utfmask[*i]; + return 0; +} + +size_t +utf8validate(long *u, size_t i) +{ + if (!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF)) + *u = UTF_INVALID; + for (i = 1; *u > utfmax[i]; ++i) + ; + return i; +} + +size_t +utf8decode(const char *c, long *u, size_t clen) +{ + size_t i, j, len, type; + long udecoded; + + *u = UTF_INVALID; + if (!clen) + return 0; + udecoded = utf8decodebyte(c[0], &len); + if (!BETWEEN(len, 1, UTF_SIZ)) + return 1; + for (i = 1, j = 1; i < clen && j < len; ++i, ++j) { + udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type); + if (type) + return j; + } + if (j < len) + return 0; + *u = udecoded; + utf8validate(u, len); + + return len; +} |
