summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/dged/json.c 51
-rw-r--r-- src/dged/utf8.c 26
-rw-r--r-- src/dged/utf8.h 3
3 files changed, 74 insertions, 6 deletions
diff --git a/src/dged/json.c b/src/dged/json.c
index a514f00..69823cb 100644
--- a/src/dged/json.c
+++ b/src/dged/json.c
@@ -2,10 +2,13 @@
#include "hash.h"
#include "hashmap.h"
+#include "utf8.h"
#include "vec.h"
+#include <assert.h>
#include <stddef.h>
#include <stdio.h>
+#include <stdlib.h>
struct json_key_value {
struct s8 key;
@@ -49,20 +52,49 @@ static struct json_value create_object(struct json_value *parent) {
return val;
}
+/**
+ * Decode four hexadecimal digit characters into a number.
+ *
+ * Digits are read most-significant first and may be upper or lower case.
+ * A byte that is not a hex digit counts as zero for its position; no
+ * error is reported.
+ *
+ * @param bytes the four characters to decode.
+ * @return the decoded value, in the range 0x0000-0xFFFF.
+ */
+static uint32_t codepoint_from_hex(uint8_t bytes[4]) {
+  uint32_t result = 0;
+  for (size_t pos = 0; pos < 4; ++pos) {
+    const uint8_t ch = bytes[pos];
+    uint32_t digit = 0;
+    if ('0' <= ch && ch <= '9') {
+      digit = ch - '0';
+    } else if ('a' <= ch && ch <= 'f') {
+      digit = 10 + (ch - 'a');
+    } else if ('A' <= ch && ch <= 'F') {
+      digit = 10 + (ch - 'A');
+    }
+
+    // every digit is one nibble: make room for it, then append
+    result = (result << 4) | digit;
+  }
+
+  return result;
+}
+
struct s8 unescape_json_string(struct s8 input) {
- /* FIXME: this is a bit funky and does not take
- unicode characters into account and probably also
- misses some escape codes. */
size_t new_size = 0;
bool escape = false;
for (size_t bi = 0; bi < input.l; ++bi) {
uint8_t b = input.s[bi];
+
+ size_t sz = 1;
if (b == '\\' && !escape) {
escape = true;
continue;
}
- ++new_size;
+ if (b == 'u' && escape) {
+ // unicode codepoint, calculate byte-width
+ // format is \uXXXX where X is a hex digit.
+ uint8_t chars[4];
+ uint32_t codepoint = codepoint_from_hex(&input.s[bi + 1]);
+ sz = utf8_encode(codepoint, chars);
+ bi += 4;
+ }
+
+ new_size += sz;
escape = false;
}
@@ -77,6 +109,7 @@ struct s8 unescape_json_string(struct s8 input) {
continue;
}
+ size_t skip = 1;
if (escape) {
switch (b) {
case 'b':
@@ -97,6 +130,14 @@ struct s8 unescape_json_string(struct s8 input) {
case 't':
buf[bufi] = '\t';
break;
+ case 'u': {
+ uint8_t chars[4] = {0};
+ uint32_t codepoint = codepoint_from_hex(&input.s[bi + 1]);
+ size_t size = utf8_encode(codepoint, chars);
+ memcpy(&buf[bufi], chars, size);
+ skip = size;
+ bi += 4;
+ } break;
case '"':
buf[bufi] = '"';
break;
@@ -108,7 +149,7 @@ struct s8 unescape_json_string(struct s8 input) {
}
escape = false;
- ++bufi;
+ bufi += skip;
}
return (struct s8){
diff --git a/src/dged/utf8.c b/src/dged/utf8.c
index b47f5fc..cc5a66e 100644
--- a/src/dged/utf8.c
+++ b/src/dged/utf8.c
@@ -36,7 +36,7 @@ static const uint8_t utf8d[] = {
// clang-format on
/*
- * emoji decoding algorithm from
+ * unicode decoding algorithm from
* https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
*/
static enum utf8_state decode(enum utf8_state *state, uint32_t *codep,
@@ -141,3 +141,27 @@ uint32_t unicode_visual_char_width(const struct codepoint *codepoint) {
return 0;
}
}
+
+/**
+ * Encode a codepoint as a UTF-8 byte sequence.
+ *
+ * @param codepoint the value to encode.
+ * @param buf output buffer; one to four bytes are written starting at
+ *            buf[0]. Nothing is written when 0 is returned.
+ * @return the number of bytes written (1-4), or 0 if codepoint is
+ *         larger than 0x10FFFF.
+ *
+ * NOTE: values in the surrogate range (0xD800-0xDFFF) are not rejected;
+ * they are encoded as three bytes like any other value up to 0xFFFF.
+ */
+size_t utf8_encode(uint32_t codepoint, uint8_t buf[4]) {
+  // single byte: the value is stored as-is
+  if (codepoint < 0x80) {
+    buf[0] = (uint8_t)codepoint;
+    return 1;
+  }
+
+  // pick the sequence length and the marker bits of the leading byte
+  size_t len = 0;
+  uint8_t lead = 0;
+  if (codepoint < 0x800) {
+    len = 2;
+    lead = 0xC0;
+  } else if (codepoint < 0x10000) {
+    len = 3;
+    lead = 0xE0;
+  } else if (codepoint < 0x110000) {
+    len = 4;
+    lead = 0xF0;
+  } else {
+    return 0;
+  }
+
+  // continuation bytes carry six bits each; fill them from the tail
+  uint32_t rest = codepoint;
+  for (size_t i = len - 1; i > 0; --i) {
+    buf[i] = (uint8_t)(0x80 | (rest & 0x3F));
+    rest >>= 6;
+  }
+
+  // whatever bits remain go into the leading byte
+  buf[0] = (uint8_t)(lead | rest);
+  return len;
+}
diff --git a/src/dged/utf8.h b/src/dged/utf8.h
index 150fe02..b91e7fd 100644
--- a/src/dged/utf8.h
+++ b/src/dged/utf8.h
@@ -2,6 +2,7 @@
#define _UTF8_H
#include <stdbool.h>
+#include <stddef.h>
#include <stdint.h>
struct codepoint {
@@ -35,4 +36,6 @@ bool utf8_byte_is_unicode_continuation(uint8_t byte);
bool utf8_byte_is_unicode(uint8_t byte);
bool utf8_byte_is_ascii(uint8_t byte);
+/**
+ * Encode a codepoint as UTF-8.
+ *
+ * @param codepoint the value to encode.
+ * @param buf output buffer with room for four bytes; the encoded
+ *            sequence is written starting at buf[0].
+ * @return the number of bytes written (1-4), or 0 if codepoint is
+ *         larger than 0x10FFFF, in which case buf is left untouched.
+ */
+size_t utf8_encode(uint32_t codepoint, uint8_t buf[4]);
+
#endif