summaryrefslogtreecommitdiff
path: root/src/dged/utf8.h
blob: b91e7fde7149fb2240991017aef01ba9d65a9070 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/*
 * Include guard. The macro name deliberately avoids the
 * underscore-plus-uppercase form, which C reserves for the implementation.
 */
#ifndef DGED_UTF8_H
#define DGED_UTF8_H

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/*!
 * \brief A codepoint value together with a byte count.
 */
struct codepoint {
  /*! The codepoint value (presumably the decoded Unicode codepoint; TODO
   * confirm against the decoder). */
  uint32_t codepoint;
  /*! Byte count associated with the codepoint; presumably the length of its
   * utf-8 encoding (TODO confirm). */
  uint32_t nbytes;
};

/*!
 * \brief Iterator state over a byte buffer, yielding codepoints.
 *
 * Values of this type are returned by create_utf8_codepoint_iterator() and
 * passed to utf8_next_codepoint().
 */
struct utf8_codepoint_iterator {
  /*! Pointer to the bytes being iterated. */
  uint8_t *data;
  /*! Presumably the total length of `data` in bytes (TODO confirm). */
  uint64_t nbytes;
  /*! Presumably the current byte position within `data` (TODO confirm). */
  uint64_t offset;
  /*! Codepoint storage embedded in the iterator; presumably holds the most
   * recently decoded codepoint (TODO confirm). */
  struct codepoint current;
};

/*!
 * \brief Create a codepoint iterator over a byte buffer.
 *
 * \param data Pointer to the bytes to iterate over.
 * \param len Length value for `data` (presumably in bytes; TODO confirm).
 * \param initial_offset Starting offset (presumably a byte offset into
 * `data`; TODO confirm).
 * \return The iterator, returned by value.
 */
struct utf8_codepoint_iterator
create_utf8_codepoint_iterator(uint8_t *data, uint64_t len,
                               uint64_t initial_offset);
/*!
 * \brief Get the next codepoint from `iter`.
 *
 * \param iter Iterator to read from; passed by non-const pointer, so its
 * state may be modified.
 * \return Pointer to a codepoint. NOTE(review): presumably NULL once the input
 * is exhausted, and otherwise pointing at storage owned by the iterator.
 * Confirm against the implementation before relying on either.
 */
struct codepoint *utf8_next_codepoint(struct utf8_codepoint_iterator *iter);

/*!
 * \brief Return the number of chars represented by the utf-8 sequence pointed
 * at by `bytes`, which is `nbytes` bytes long.
 *
 * \param bytes Start of the utf-8 sequence.
 * \param nbytes Length of the sequence in bytes.
 * \return The number of chars in the sequence.
 */
uint32_t utf8_nchars(uint8_t *bytes, uint32_t nbytes);

/*!
 * \brief Return the visual width of a codepoint.
 *
 * \param codepoint The codepoint to measure; not modified.
 * \return The width. NOTE(review): presumably measured in display columns;
 * confirm the unit against the implementation.
 */
uint32_t unicode_visual_char_width(const struct codepoint *codepoint);
/*!
 * \brief Tell whether a codepoint is printable.
 *
 * The exact criteria are defined by the implementation and are not visible in
 * this header.
 *
 * \param codepoint The codepoint to test; not modified.
 */
bool unicode_is_printable(const struct codepoint *codepoint);

/*!
 * \brief Classify a single byte as a unicode start byte.
 *
 * NOTE(review): presumably true for the lead byte of a multi-byte utf-8
 * sequence; confirm against the implementation.
 */
bool utf8_byte_is_unicode_start(uint8_t byte);
/*!
 * \brief Classify a single byte as a unicode continuation byte.
 *
 * NOTE(review): presumably true for the trailing bytes of a multi-byte utf-8
 * sequence; confirm against the implementation.
 */
bool utf8_byte_is_unicode_continuation(uint8_t byte);
/*!
 * \brief Classify a single byte as unicode.
 *
 * NOTE(review): presumably true for any byte belonging to a multi-byte utf-8
 * sequence (start or continuation); confirm against the implementation.
 */
bool utf8_byte_is_unicode(uint8_t byte);
/*!
 * \brief Classify a single byte as ASCII.
 *
 * NOTE(review): presumably true for single-byte (7-bit) characters; confirm
 * against the implementation.
 */
bool utf8_byte_is_ascii(uint8_t byte);

/*!
 * \brief Encode `codepoint` into `buf`.
 *
 * \param codepoint The codepoint to encode.
 * \param buf Output buffer. The `[4]` in the declaration documents the
 * expected capacity only; the parameter is a plain pointer, so the compiler
 * does not enforce the size.
 * \return A size. NOTE(review): presumably the number of bytes written to
 * `buf`; confirm against the implementation.
 */
size_t utf8_encode(uint32_t codepoint, uint8_t buf[4]);

#endif