/* See LICENSE file for copyright and license details. */ #include #include #include #include #include "util.h" static const unsigned char utfbyte[UTF_SIZ + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0}; static const unsigned char utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8}; static const long utfmin[UTF_SIZ + 1] = { 0, 0, 0x80, 0x800, 0x10000}; static const long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF}; void * ecalloc(size_t nmemb, size_t size) { void *p; if (!(p = calloc(nmemb, size))) die("calloc:"); return p; } void die(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); if (fmt[0] && fmt[strlen(fmt)-1] == ':') { fputc(' ', stderr); perror(NULL); } else { fputc('\n', stderr); } exit(1); } long utf8decodebyte(const char c, size_t *i) { for (*i = 0; *i < (UTF_SIZ + 1); ++(*i)) if (((unsigned char)c & utfmask[*i]) == utfbyte[*i]) return (unsigned char)c & ~utfmask[*i]; return 0; } size_t utf8validate(long *u, size_t i) { if (!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF)) *u = UTF_INVALID; for (i = 1; *u > utfmax[i]; ++i) ; return i; } size_t utf8decode(const char *c, long *u, size_t clen) { size_t i, j, len, type; long udecoded; *u = UTF_INVALID; if (!clen) return 0; udecoded = utf8decodebyte(c[0], &len); if (!BETWEEN(len, 1, UTF_SIZ)) return 1; for (i = 1, j = 1; i < clen && j < len; ++i, ++j) { udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type); if (type) return j; } if (j < len) return 0; *u = udecoded; utf8validate(u, len); return len; }