TOML settings parsing

Currently a very simplistic parser that do not support all TOML datatypes. Supported are: - Tables - Strings (incl multiline) - Integers - Inline Tables - Booleans - Comments
author: Albert Cervin <albert@acervin.com> 2023-05-18 23:51:51 +0200
committer: Albert Cervin <albert@acervin.com> 2023-05-24 22:18:12 +0200
commit: 4f3b576db6b01c8c88076985478e2a7fa37be340 (patch)
tree: 9723ed39a19872fd52f2867613e78e02de3cf79b /src/dged/settings-parse.c
parent: a4d17ddb8e7d23ccca13132f4d88cfc5f5730b76 (diff)
download: dged-4f3b576db6b01c8c88076985478e2a7fa37be340.tar.gz
dged-4f3b576db6b01c8c88076985478e2a7fa37be340.tar.xz
dged-4f3b576db6b01c8c88076985478e2a7fa37be340.zip
1 files changed, 277 insertions, 0 deletions
diff --git a/src/dged/settings-parse.c b/src/dged/settings-parse.c
new file mode 100644
index 0000000..6c1c482
--- /dev/null
+++ b/src/dged/settings-parse.c
@@ -0,0 +1,277 @@
+#include "settings-parse.h"
+
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "utf8.h"
+
+enum byte_class {
+  Byte_Alphanumeric,
+  Byte_Symbol,
+};
+
+struct parser parser_create(struct reader reader) {
+  struct parser state = {
+      .row = 0,
+      .col = 0,
+      .reader = reader,
+  };
+
+  VEC_INIT(&state.buffer, 32);
+
+  return state;
+}
+
+void parser_destroy(struct parser *parser) { VEC_DESTROY(&parser->buffer); }
+
+static enum byte_class classify(uint8_t byte) {
+  if ((byte >= 'a' && byte <= 'z') || (byte >= 'A' && byte <= 'Z') ||
+      (byte >= '0' && byte <= '9') || byte == '_' || byte == '-' ||
+      utf8_byte_is_unicode(byte)) {
+    return Byte_Alphanumeric;
+  }
+
+  return Byte_Symbol;
+}
+
+static void trim_parse_buffer_whitespace(void **data, uint32_t *len) {
+  uint8_t *d = (uint8_t *)*data;
+  uint32_t new_len = *len;
+
+  // beginning
+  while (d[0] == ' ' || d[0] == '\t') {
+    --new_len;
+    ++d;
+  }
+
+  // end
+  while (d[new_len - 1] == ' ' || d[new_len - 1] == '\t') {
+    --new_len;
+  }
+
+  *data = d;
+  *len = new_len;
+}
+
+static bool read_data_with_initial(struct parser *state, uint8_t *initial_byte,
+                                   uint8_t end, void **data_out,
+                                   uint32_t *len_out) {
+  uint8_t byte;
+  VEC_CLEAR(&state->buffer);
+  if (initial_byte != NULL) {
+    VEC_PUSH(&state->buffer, *initial_byte);
+  }
+
+  while (state->reader.getbytes(1, &byte, state->reader.userdata) > 0 &&
+         byte != end) {
+    ++state->col;
+    VEC_PUSH(&state->buffer, byte);
+  }
+
+  *data_out = VEC_FRONT(&state->buffer);
+  *len_out = VEC_SIZE(&state->buffer);
+
+  trim_parse_buffer_whitespace(data_out, len_out);
+
+  return byte == end;
+}
+
+static bool read_data(struct parser *state, uint8_t end, void **data_out,
+                      uint32_t *len_out) {
+  return read_data_with_initial(state, NULL, end, data_out, len_out);
+}
+
+static bool discard(struct parser *state, uint8_t end) {
+  uint8_t byte;
+  while (state->reader.getbytes(1, &byte, state->reader.userdata) > 0 &&
+         byte != end) {
+    ++state->col;
+  }
+
+  return byte == end;
+}
+
+static void errtoken(struct token *token_out, const char *fmt, ...) {
+  static char errmsgbuf[256] = {0};
+  va_list args;
+  va_start(args, fmt);
+  size_t written = vsnprintf(errmsgbuf, 256, fmt, args);
+  va_end(args);
+
+  token_out->type = Token_Error;
+  token_out->data = errmsgbuf;
+  token_out->len = written;
+}
+
+bool parser_next_token(struct parser *state, struct token *token_out) {
+  uint8_t byte;
+  static bool parse_value = false;
+  static int64_t int_value = 0;
+  static bool bool_value = false;
+
+  memset(token_out, 0, sizeof(struct token));
+
+  while (state->reader.getbytes(1, &byte, state->reader.userdata) > 0) {
+    switch (classify(byte)) {
+    case Byte_Alphanumeric: // unquoted key / value
+      if (!parse_value) {
+        token_out->type = Token_Key;
+        token_out->row = state->row;
+        token_out->col = state->col;
+
+        if (!read_data_with_initial(state, &byte, '=', &token_out->data,
+                                    &token_out->len)) {
+          errtoken(token_out, "Unexpected EOF while looking for end of key");
+          return true;
+        }
+
+        parse_value = true;
+      } else {
+        parse_value = false;
+        token_out->row = state->row;
+        token_out->col = state->col;
+
+        if (byte >= '0' && byte <= '9') {
+          token_out->type = Token_IntValue;
+          void *data;
+          uint32_t len;
+          read_data_with_initial(state, &byte, '\n', &data, &len);
+
+          char *s = calloc(len + 1, 1);
+          strncpy(s, (char *)data, len);
+
+          errno = 0;
+          int_value = strtol(s, NULL, 0);
+          free(s);
+          if (errno != 0) {
+            errtoken(token_out, "Invalid integer value %.*s: %s", len,
+                     (char *)data, strerror(errno));
+            return true;
+          }
+
+          token_out->data = &int_value;
+          token_out->len = 0;
+        } else if (byte == 't' || byte == 'f') {
+          token_out->type = Token_BoolValue;
+          void *data = NULL;
+          uint32_t len = 0;
+          read_data_with_initial(state, &byte, '\n', &data, &len);
+
+          if (strncmp((char *)data, "true", len) == 0) {
+            bool_value = true;
+            token_out->data = &bool_value;
+            token_out->len = 0;
+          } else if (strncmp((char *)data, "false", len) == 0) {
+            bool_value = false;
+            token_out->data = &bool_value;
+            token_out->len = 0;
+          } else {
+            errtoken(token_out, "Invalid bool value: %.*s", len, (char *)data);
+          }
+        }
+      }
+
+      return true;
+
+    case Byte_Symbol:
+      switch (byte) {
+      case '#': // comment
+        token_out->type = Token_Comment;
+        token_out->row = state->row;
+        token_out->col = state->col;
+        if (!read_data(state, '\n', &token_out->data, &token_out->len)) {
+          errtoken(token_out,
+                   "Unexpected EOF while looking for end of comment line");
+          return true;
+        }
+
+        uint8_t *data = (uint8_t *)token_out->data;
+        if (data[token_out->len - 1] == '\r') {
+          --token_out->len;
+        }
+
+        state->col = 0;
+        ++state->row;
+
+        return true;
+
+      case '{': // inline table
+        parse_value = false;
+        token_out->type = Token_InlineTable;
+        token_out->row = state->row;
+        token_out->col = state->col;
+        return true;
+        break;
+
+      case '}': // end inline table
+        parse_value = false;
+        break;
+
+      case '[': // table open
+        token_out->type = Token_Table;
+        token_out->row = state->row;
+        token_out->col = state->col;
+        if (!read_data(state, ']', &token_out->data, &token_out->len)) {
+          errtoken(token_out, "Unexpected EOF while looking for matching ']'");
+          return true;
+        }
+
+        ++state->col;
+        return true;
+
+      case '"': // quoted key or string value
+        bool multiline = false;
+        if (parse_value) {
+          token_out->type = Token_StringValue;
+        } else {
+          token_out->type = Token_Key;
+        }
+        token_out->row = state->row;
+        token_out->col = state->col;
+
+        // check for multiline
+        uint32_t numquotes = 1;
+        while (state->reader.getbytes(1, &byte, state->reader.userdata) > 0 &&
+               byte == '"') {
+          ++numquotes;
+        }
+
+        if (numquotes == 3) {
+          multiline = true;
+        }
+
+        if (!read_data_with_initial(state, &byte, '"', &token_out->data,
+                                    &token_out->len)) {
+          errtoken(token_out, "Unexpected EOF while looking for matching '\"'");
+          parse_value = false;
+          return true;
+        }
+
+        if (!parse_value) {
+          discard(state, '=');
+        }
+
+        if (multiline) {
+          discard(state, '"');
+          discard(state, '"');
+        }
+
+        ++state->col;
+        parse_value = false;
+        return true;
+
+      case '\n':
+      case '\r':
+        state->col = 0;
+        ++state->row;
+        break;
+      }
+      break;
+    }
+  }
+
+  return false;
+}
author	Albert Cervin <albert@acervin.com>	2023-05-18 23:51:51 +0200
committer	Albert Cervin <albert@acervin.com>	2023-05-24 22:18:12 +0200
commit	4f3b576db6b01c8c88076985478e2a7fa37be340 (patch)
tree	9723ed39a19872fd52f2867613e78e02de3cf79b /src/dged/settings-parse.c
parent	a4d17ddb8e7d23ccca13132f4d88cfc5f5730b76 (diff)
download	dged-4f3b576db6b01c8c88076985478e2a7fa37be340.tar.gz dged-4f3b576db6b01c8c88076985478e2a7fa37be340.tar.xz dged-4f3b576db6b01c8c88076985478e2a7fa37be340.zip