Overhaul unicode parsing

It now iterates over the actual Unicode code points instead. This is better than what it was doing previously, but it is still not entirely correct with respect to Unicode sequences. Handling Unicode code points this way does, however, make it slightly easier to support UTF-16 if needed in the future. This also adds some long-needed tests for buffer methods.
author: Albert Cervin <albert@acervin.com> 2024-08-23 17:07:27 +0200
committer: Albert Cervin <albert@acervin.com> 2024-09-11 16:22:58 +0200
commit: 4ab7e453e26afc6e9f4938c65f89463fbba9e267 (patch)
tree: 4745d99e70d645a8134dafc3814dc68bf678daf4 /src/dged/utf8.h
parent: 991283f684c224db46fe68738470921b8c394f13 (diff)
download: dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.tar.gz
dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.tar.xz
dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.zip
1 files changed, 23 insertions, 5 deletions
diff --git a/src/dged/utf8.h b/src/dged/utf8.h
index 04aa242..22ce22d 100644
--- a/src/dged/utf8.h
+++ b/src/dged/utf8.h
@@ -1,19 +1,37 @@
+#ifndef _UTF8_H
+#define _UTF8_H
+
 #include <stdbool.h>
 #include <stdint.h>
 
+struct codepoint {
+  uint32_t codepoint;
+  uint32_t nbytes;
+};
+
+struct utf8_codepoint_iterator {
+  uint8_t *data;
+  uint64_t nbytes;
+  uint64_t offset;
+  struct codepoint current;
+};
+
+struct utf8_codepoint_iterator
+create_utf8_codepoint_iterator(uint8_t *data, uint64_t len,
+                               uint64_t initial_offset);
+struct codepoint *utf8_next_codepoint(struct utf8_codepoint_iterator *iter);
+
 /*!
  * \brief Return the number of chars the utf-8 sequence pointed at by `bytes` of
  * length `nbytes`, represents
  */
 uint32_t utf8_nchars(uint8_t *bytes, uint32_t nbytes);
 
-/* Return the number of bytes used to make up the next `nchars` characters */
-uint32_t utf8_nbytes(uint8_t *bytes, uint32_t nbytes, uint32_t nchars);
+uint32_t unicode_visual_char_width(const struct codepoint *codepoint);
 
-/* true if `byte` is a unicode byte sequence start byte */
 bool utf8_byte_is_unicode_start(uint8_t byte);
 bool utf8_byte_is_unicode_continuation(uint8_t byte);
-bool utf8_byte_is_ascii(uint8_t byte);
 bool utf8_byte_is_unicode(uint8_t byte);
+bool utf8_byte_is_ascii(uint8_t byte);
 
-uint32_t utf8_visual_char_width(uint8_t *bytes, uint32_t len);
+#endif
author	Albert Cervin <albert@acervin.com>	2024-08-23 17:07:27 +0200
committer	Albert Cervin <albert@acervin.com>	2024-09-11 16:22:58 +0200
commit	4ab7e453e26afc6e9f4938c65f89463fbba9e267 (patch)
tree	4745d99e70d645a8134dafc3814dc68bf678daf4 /src/dged/utf8.h
parent	991283f684c224db46fe68738470921b8c394f13 (diff)
download	dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.tar.gz dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.tar.xz dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.zip