#include "utf8.h"

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <wchar.h>

/* Code point substituted for input that does not decode as UTF-8. */
enum { Utf8_Replacement_Codepoint = 0xfffd };

/*
 * True if both of the two most significant bits of `byte` are set,
 * i.e. the byte has the form of a multi-byte sequence lead byte.
 */
bool utf8_byte_is_unicode_start(uint8_t byte) { return (byte & 0xc0) == 0xc0; }

/*
 * True if `byte` has its high bit set but is not a start byte
 * (bit pattern 10xxxxxx).
 */
bool utf8_byte_is_unicode_continuation(uint8_t byte) {
  return utf8_byte_is_unicode(byte) && !utf8_byte_is_unicode_start(byte);
}

/* True if the most significant bit of `byte` is set (not 7-bit ASCII). */
bool utf8_byte_is_unicode(uint8_t byte) { return (byte & 0x80) != 0x0; }

/* True if the most significant bit of `byte` is clear (7-bit ASCII). */
bool utf8_byte_is_ascii(uint8_t byte) { return !utf8_byte_is_unicode(byte); }

/*
 * States of the decoder automaton. Only the two terminal states are
 * named; while a multi-byte sequence is being decoded the state holds
 * other values taken from the transition part of `utf8d`.
 */
enum utf8_state {
  Utf8_Accept = 0,
  Utf8_Reject = 1,
};

/*
 * Decoder table: the first 256 entries map a byte to its character
 * class, the entries after that hold the state transitions, 16 per
 * state, indexed by character class.
 */
// clang-format off
static const uint8_t utf8d[] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
// clang-format on

/*
 * UTF-8 decoding algorithm (table-driven automaton) from
 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
 *
 * Feeds one byte into the decoder.
 *
 * state: in/out automaton state, start with Utf8_Accept.
 * codep: in/out code point accumulator, complete once the state is
 *        Utf8_Accept again.
 * byte:  next input byte, must be in the range 0..255 since it is used
 *        as an index into `utf8d`.
 *
 * Returns the new state (also stored in *state).
 */
static enum utf8_state decode(enum utf8_state *state, uint32_t *codep,
                              uint32_t byte) {
  uint32_t type = utf8d[byte];

  // mid-sequence: append the six payload bits of a continuation byte,
  // otherwise start over with the payload bits of the first byte
  *codep = (*state != Utf8_Accept) ? (byte & 0x3fu) | (*codep << 6)
                                   : (0xff >> type) & (byte);

  *state = utf8d[256 + *state * 16 + type];
  return *state;
}

/*
 * Decode the code point that starts at `bytes`.
 *
 * bytes:  input buffer.
 * nbytes: number of readable bytes in `bytes`.
 *
 * Returns the code point together with the number of bytes it
 * occupies. Input that is not valid UTF-8, including a sequence cut
 * short by the end of the buffer, yields U+FFFD. For non-empty input
 * at least one byte is always consumed; for empty input the result is
 * code point 0 with nbytes 0.
 */
static struct codepoint next_utf8_codepoint(const uint8_t *bytes,
                                            uint64_t nbytes) {
  uint32_t codepoint = 0;
  enum utf8_state state = Utf8_Accept;
  uint32_t bi = 0;

  while (bi < nbytes) {
    enum utf8_state prev = state;
    enum utf8_state res = decode(&state, &codepoint, bytes[bi]);

    if (res == Utf8_Reject && prev != Utf8_Accept) {
      // The byte that broke the sequence may itself begin a valid
      // character, so leave it for the next call instead of
      // swallowing it together with the error. This only happens
      // after at least one byte has been consumed.
      break;
    }

    ++bi;
    if (res == Utf8_Accept || res == Utf8_Reject) {
      break;
    }
  }

  // Anything but Accept means nothing complete was decoded: either an
  // explicit reject or a sequence truncated by the end of the input.
  if (state != Utf8_Accept) {
    codepoint = Utf8_Replacement_Codepoint;
  }

  return (struct codepoint){.codepoint = codepoint, .nbytes = bi};
}

/*
 * Advance the iterator by one code point.
 *
 * Returns a pointer to the iterator's `current` member, which is
 * overwritten by the next call, or NULL once the offset has reached
 * the end of the data.
 */
struct codepoint *utf8_next_codepoint(struct utf8_codepoint_iterator *iter) {
  if (iter->offset >= iter->nbytes) {
    return NULL;
  }

  iter->current = next_utf8_codepoint(iter->data + iter->offset,
                                      iter->nbytes - iter->offset);
  iter->offset += iter->current.nbytes;
  return &iter->current;
}

/*
 * Create an iterator over the `len` bytes at `data`, starting at byte
 * offset `initial_offset`. The data is referenced, not copied.
 */
struct utf8_codepoint_iterator
create_utf8_codepoint_iterator(uint8_t *data, uint64_t len,
                               uint64_t initial_offset) {
  return (struct utf8_codepoint_iterator){
      .data = data,
      .nbytes = len,
      .offset = initial_offset,
  };
}

/* TODO: grapheme clusters and other classification, this
 * returns the number of unicode code points
 *
 * Counts the code points in the first `nbytes` bytes of `bytes`; each
 * stretch of undecodable input counts as one code point.
 */
uint32_t utf8_nchars(uint8_t *bytes, uint32_t nbytes) {
  uint32_t bi = 0;
  uint32_t nchars = 0;

  while (bi < nbytes) {
    struct codepoint codepoint = next_utf8_codepoint(bytes + bi, nbytes - bi);
    ++nchars;
    bi += codepoint.nbytes;
  }

  return nchars;
}

/* TODO: grapheme clusters and other classification, this
 * returns the number of unicode code points
 *
 * Returns the number of bytes occupied by the first `nchars` code
 * points of `bytes`, or by all the code points there are if the
 * buffer holds fewer than `nchars`.
 */
uint32_t utf8_nbytes(uint8_t *bytes, uint32_t nbytes, uint32_t nchars) {
  uint32_t bi = 0;
  uint32_t chars = 0;

  while (chars < nchars && bi < nbytes) {
    struct codepoint codepoint = next_utf8_codepoint(bytes + bi, nbytes - bi);
    bi += codepoint.nbytes;
    ++chars;
  }

  // TODO: reject invalid?
  return bi;
}

/*
 * Number of display columns used by `codepoint`.
 *
 * Returns 0 for a code point that spans no bytes, the result of
 * wcwidth() when that is non-negative and 2 otherwise.
 */
uint32_t unicode_visual_char_width(const struct codepoint *codepoint) {
  if (codepoint->nbytes > 0) {
    // TODO: use unicode classification instead
    // wcwidth() reports "no width available" as -1, so the result has
    // to stay signed for the fallback below to be reachable
    int w = wcwidth((wchar_t)codepoint->codepoint);
    return w >= 0 ? (uint32_t)w : 2;
  } else {
    return 0;
  }
}