Overhaul unicode parsing

Parsing now iterates over the actual Unicode code points instead. This is better than what it was doing previously, but it is still not entirely correct with respect to Unicode sequences (e.g. graphemes made up of several code points, such as flag emoji). Handling Unicode code points does, however, make it slightly easier to support UTF-16 if needed in the future. This also adds some long-needed tests for the buffer methods.
author: Albert Cervin <albert@acervin.com> 2024-08-23 17:07:27 +0200
committer: Albert Cervin <albert@acervin.com> 2024-09-11 16:22:58 +0200
commit: 4ab7e453e26afc6e9f4938c65f89463fbba9e267 (patch)
tree: 4745d99e70d645a8134dafc3814dc68bf678daf4 /test
parent: 991283f684c224db46fe68738470921b8c394f13 (diff)
download: dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.tar.gz
dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.tar.xz
dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.zip
4 files changed, 215 insertions, 60 deletions
diff --git a/test/buffer.c b/test/buffer.c
index a4b318e..7d879b0 100644
--- a/test/buffer.c
+++ b/test/buffer.c
@@ -1,11 +1,21 @@
 #include <string.h>
 
 #include "dged/buffer.h"
+#include "dged/settings.h"
 
 #include "assert.h"
 #include "test.h"
 
-void test_add() {
+static uint32_t add_callback_call_count = 0;
+static void add_callback(struct buffer *buffer, struct edit_location added,
+                         void *userdata) {
+  (void)buffer;
+  (void)added;
+  (void)userdata;
+  ++add_callback_call_count;
+}
+
+static void test_add(void) {
   struct buffer b = buffer_create("test-buffer");
   ASSERT(buffer_num_lines(&b) == 0, "Expected buffer to have zero lines");
 
@@ -16,10 +26,62 @@ void test_add() {
   ASSERT(loc.line == 1 && loc.col == strlen(txt),
          "Expected buffer to have one line with characters");
 
+  // test callback
+  uint32_t hook_id = buffer_add_insert_hook(&b, add_callback, NULL);
+  buffer_add(&b, (struct location){.line = 0, .col = 0}, (uint8_t *)"hej", 3);
+  ASSERT(add_callback_call_count == 1, "Expected callback to have been called");
+
+  // test removing the hook
+  buffer_remove_insert_hook(&b, hook_id, NULL);
+  buffer_add(&b, (struct location){.line = 0, .col = 0}, (uint8_t *)"hej", 3);
+  ASSERT(add_callback_call_count == 1,
+         "Expected callback to not have been called after it has been removed");
+
   buffer_destroy(&b);
 }
 
-void test_word_at() {
+static uint32_t delete_callback_call_count = 0;
+static void delete_callback(struct buffer *buffer, struct edit_location removed,
+                            void *userdata) {
+  (void)buffer;
+  (void)removed;
+  (void)userdata;
+  ++delete_callback_call_count;
+}
+
+static void test_delete(void) {
+  struct buffer b = buffer_create("test-buffer-delete");
+  const char *txt = "we are adding some text\ntwo lines to be exact";
+  struct location loc = buffer_add(&b, (struct location){.line = 0, .col = 0},
+                                   (uint8_t *)txt, strlen(txt));
+
+  ASSERT(buffer_line_length(&b, 0) == 23,
+         "Expected line 1 to be 23 chars before deletion");
+  buffer_delete(&b, region_new((struct location){.line = 0, .col = 0},
+                               (struct location){.line = 0, .col = 2}));
+  ASSERT(buffer_line_length(&b, 0) == 21,
+         "Expected line 1 to be 21 chars after deletion");
+
+  // delete newline
+  buffer_delete(&b, region_new((struct location){.line = 0, .col = 21},
+                               (struct location){.line = 1, .col = 0}));
+  ASSERT(buffer_num_lines(&b) == 1,
+         "Expected buffer to have one line after new line deletion");
+  ASSERT(buffer_line_length(&b, 0) == 42,
+         "Expected single line to be sum of both line lengths after new line "
+         "deletion");
+
+  // test that callback works
+  buffer_add_delete_hook(&b, delete_callback, NULL);
+  buffer_delete(&b, region_new((struct location){.line = 0, .col = 0},
+                               (struct location){.line = 0, .col = 2}));
+  ASSERT(delete_callback_call_count == 1,
+         "Expected callback to have been called");
+
+  buffer_destroy(&b);
+}
+
+static void test_word_at(void) {
   struct buffer b = buffer_create("test-word-at-buffer");
   const char *txt = "word1 (word2). Another";
   buffer_add(&b, (struct location){.line = 0, .col = 0}, (uint8_t *)txt,
@@ -40,8 +102,7 @@ void test_word_at() {
          "Expected word to span cols 7..12");
 
   // test that clamping works correctly
-  struct region word3 =
-      buffer_word_at(&b, (struct location){.line = 0, .col = 100});
+  struct region word3 = buffer_word_at(&b, buffer_clamp(&b, 0, 100));
   ASSERT(region_has_size(word3), "expected 0,100 to be in the last word");
   ASSERT(word3.begin.col == 15 && word3.end.col == 22,
          "Expected word to span cols 15..22");
@@ -49,7 +110,129 @@ void test_word_at() {
   buffer_destroy(&b);
 }
 
-void run_buffer_tests() {
+static void test_line_len(void) {
+  struct buffer b = buffer_create("test-line-length-buffer");
+  const char *txt = "Look! Banana 🍌";
+  buffer_add(&b, (struct location){.line = 0, .col = 0}, (uint8_t *)txt,
+             strlen(txt));
+  ASSERT(buffer_line_length(&b, 0) == 15,
+         "Expected banana line to be 15 chars wide");
+}
+
+static void test_char_movement(void) {
+  struct buffer b = buffer_create("test-char-movement-buffer");
+  const char *txt = "abcdefgh 🎯jklmn\tab";
+  buffer_add(&b, buffer_end(&b), (uint8_t *)txt, strlen(txt));
+  struct location next =
+      buffer_next_char(&b, (struct location){.line = 0, .col = 0});
+  ASSERT(next.col == 1, "Expected next char to be next char");
+
+  next = buffer_next_char(&b, (struct location){.line = 0, .col = 9});
+  ASSERT(next.col == 11,
+         "Expected a double width char to result in a 2 column move");
+
+  next = buffer_next_char(&b, (struct location){.line = 0, .col = 16});
+  uint64_t tab_width = settings_get("editor.tab-width")->value.number_value;
+  ASSERT(next.col == 16 + tab_width,
+         "Expected a tab to result in a move the width of a tab");
+
+  struct location prev =
+      buffer_previous_char(&b, (struct location){.line = 0, .col = 0});
+  ASSERT(prev.col == 0 && prev.line == 0,
+         "Expected backwards motion from 0,0 not to be possible");
+
+  prev = buffer_previous_char(&b, (struct location){.line = 0, .col = 11});
+  ASSERT(prev.col == 9,
+         "Expected a double width char to result in a 2 column move");
+
+  prev = buffer_previous_char(
+      &b, (struct location){.line = 0, .col = 16 + tab_width});
+  ASSERT(prev.col == 16,
+         "Expected a tab move backwards to step over the width of a tab");
+}
+
+static void test_word_movement(void) {
+  struct buffer b = buffer_create("test-word-movement-buffer");
+
+  const char *txt = " word1, word2 \"word3\" word4";
+  buffer_add(&b, buffer_end(&b), (uint8_t *)txt, strlen(txt));
+  struct location next =
+      buffer_next_word(&b, (struct location){.line = 0, .col = 0});
+  ASSERT(next.col == 1, "Expected next word to start at col 1");
+
+  next = buffer_next_word(&b, (struct location){.line = 0, .col = 1});
+  ASSERT(next.col == 8, "Expected next word to start at col 8");
+
+  next = buffer_next_word(&b, (struct location){.line = 0, .col = 8});
+  ASSERT(next.col == 15, "Expected next word to start at col 15");
+
+  next = buffer_next_word(&b, (struct location){.line = 0, .col = 15});
+  ASSERT(next.col == 22, "Expected next word to start at col 22");
+
+  struct location prev =
+      buffer_previous_word(&b, (struct location){.line = 0, .col = 26});
+  ASSERT(prev.col == 22, "Expected previous word to start at col 22");
+
+  prev = buffer_previous_word(&b, (struct location){.line = 0, .col = 22});
+  ASSERT(prev.col == 15, "Expected previous word to start at col 15");
+
+  prev = buffer_previous_word(&b, (struct location){.line = 0, .col = 0});
+  ASSERT(prev.col == 0 && prev.line == 0,
+         "Expected previous word to not go before beginning of buffer");
+}
+
+void test_copy(void) {
+  struct buffer b = buffer_create("test-copy-buffer");
+  buffer_add(&b, (struct location){.line = 0, .col = 0}, (uint8_t *)"copy", 4);
+
+  buffer_copy(&b, region_new((struct location){.line = 0, .col = 0},
+                             (struct location){.line = 0, .col = 4}));
+  buffer_paste(&b, (struct location){.line = 0, .col = 4});
+  ASSERT(buffer_line_length(&b, 0) == 8, "Expected text to be copied");
+  struct text_chunk t = buffer_line(&b, 0);
+  ASSERT_STR_EQ((const char *)t.text, "copycopy",
+                "Expected copied text to match");
+  if (t.allocated) {
+    free(t.text);
+  }
+
+  buffer_cut(&b, region_new((struct location){.line = 0, .col = 2},
+                            (struct location){.line = 0, .col = 4}));
+  buffer_paste(&b, (struct location){.line = 0, .col = 0});
+  ASSERT(buffer_line_length(&b, 0) == 8, "Expected line length to be the same");
+  t = buffer_line(&b, 0);
+  ASSERT_STR_EQ((const char *)t.text, "pycocopy",
+                "Expected cut+pasted text to match");
+  if (t.allocated) {
+    free(t.text);
+  }
+
+  // test kill ring
+  buffer_paste_older(&b, (struct location){.line = 0, .col = 0});
+  ASSERT(buffer_line_length(&b, 0) == 12,
+         "Expected line length to have increased when pasting older");
+  t = buffer_line(&b, 0);
+  ASSERT_STR_EQ((const char *)t.text, "copypycocopy",
+                "Expected pasted older text to match");
+  if (t.allocated) {
+    free(t.text);
+  }
+
+  buffer_destroy(&b);
+}
+
+void run_buffer_tests(void) {
+  settings_init(10);
+  settings_set_default(
+      "editor.tab-width",
+      (struct setting_value){.type = Setting_Number, .number_value = 4});
+
   run_test(test_add);
+  run_test(test_delete);
   run_test(test_word_at);
+  run_test(test_line_len);
+  run_test(test_char_movement);
+  run_test(test_word_movement);
+  run_test(test_copy);
+  settings_destroy();
 }
diff --git a/test/main.c b/test/main.c
index 4c241b3..dc0c2dc 100644
--- a/test/main.c
+++ b/test/main.c
@@ -9,7 +9,9 @@
 void handle_abort() { exit(1); }
 
 int main() {
-  setlocale(LC_ALL, "");
+  // Use a hardcoded locale to get a
+  // predictable env.
+  setlocale(LC_ALL, "en_US.UTF-8");
   signal(SIGABRT, handle_abort);
 
   struct timespec test_begin;
@@ -52,5 +54,6 @@ int main() {
       ((uint64_t)test_begin.tv_sec * 1e9 + (uint64_t)test_begin.tv_nsec);
   printf("\n🎉 \x1b[1;32mDone! All tests successful in %.2f ms!\x1b[0m\n",
          (double)elapsed_nanos / 1e6);
+
   return 0;
 }
diff --git a/test/text.c b/test/text.c
index 9faa663..f890e7b 100644
--- a/test/text.c
+++ b/test/text.c
@@ -15,22 +15,19 @@ void assert_line_eq(struct text_chunk line, const char *txt, const char *msg) {
 void assert_line_equal(struct text_chunk *line) {}
 
 void test_add_text() {
-  uint32_t lines_added, cols_added;
+  uint32_t lines_added;
   /* use a silly small initial capacity to test re-alloc */
   struct text *t = text_create(1);
 
   const char *txt = "This is line 1\n";
-  text_insert_at(t, 0, 0, (uint8_t *)txt, strlen(txt), &lines_added,
-                 &cols_added);
+  text_insert_at(t, 0, 0, (uint8_t *)txt, strlen(txt), &lines_added);
 
-  ASSERT(text_line_size(t, 0) == 14 && text_line_length(t, 0) == 14,
-         "Expected line 1 to have 14 chars and 14 bytes");
+  ASSERT(text_line_size(t, 0) == 14, "Expected line 1 to be 14 bytes");
   assert_line_eq(text_get_line(t, 0), "This is line 1",
                  "Expected line 1 to be line 1");
 
   const char *txt2 = "This is line 2\n";
-  text_insert_at(t, 1, 0, (uint8_t *)txt2, strlen(txt2), &lines_added,
-                 &cols_added);
+  text_insert_at(t, 1, 0, (uint8_t *)txt2, strlen(txt2), &lines_added);
   ASSERT(text_num_lines(t) == 3,
          "Expected text to have three lines after second insertion");
   assert_line_eq(text_get_line(t, 1), "This is line 2",
@@ -38,8 +35,7 @@ void test_add_text() {
 
   // simulate indentation
   const char *txt3 = "    ";
-  text_insert_at(t, 0, 0, (uint8_t *)txt3, strlen(txt3), &lines_added,
-                 &cols_added);
+  text_insert_at(t, 0, 0, (uint8_t *)txt3, strlen(txt3), &lines_added);
   ASSERT(text_num_lines(t) == 3,
          "Expected text to have three lines after second insertion");
   assert_line_eq(text_get_line(t, 0), "    This is line 1",
@@ -48,7 +44,7 @@ void test_add_text() {
                  "Expected line 2 to be line 2 still");
 
   // insert newline in middle of line
-  text_insert_at(t, 1, 4, (uint8_t *)"\n", 1, &lines_added, &cols_added);
+  text_insert_at(t, 1, 4, (uint8_t *)"\n", 1, &lines_added);
   ASSERT(text_num_lines(t) == 4,
          "Expected text to have four lines after inserting a new line");
   assert_line_eq(text_get_line(t, 1), "This", "Expected line 2 to be split");
@@ -56,11 +52,11 @@ void test_add_text() {
                  "Expected line 2 to be split");
 
   // insert newline before line 1
-  text_insert_at(t, 1, 0, (uint8_t *)"\n", 1, &lines_added, &cols_added);
+  text_insert_at(t, 1, 0, (uint8_t *)"\n", 1, &lines_added);
   ASSERT(
       text_num_lines(t) == 5,
       "Expected to have five lines after adding an empty line in the middle");
-  ASSERT(text_line_length(t, 1) == 0, "Expected line 2 to be empty");
+  ASSERT(text_line_size(t, 1) == 0, "Expected line 2 to be empty");
   assert_line_eq(text_get_line(t, 2), "This",
                  "Expected line 3 to be previous line 2");
   assert_line_eq(text_get_line(t, 3), " is line 2",
@@ -70,37 +66,35 @@ void test_add_text() {
 }
 
 void test_delete_text() {
-  uint32_t lines_added, cols_added;
+  uint32_t lines_added;
   struct text *t = text_create(10);
   const char *txt = "This is line 1";
-  text_insert_at(t, 0, 0, (uint8_t *)txt, strlen(txt), &lines_added,
-                 &cols_added);
+  text_insert_at(t, 0, 0, (uint8_t *)txt, strlen(txt), &lines_added);
 
   text_delete(t, 0, 12, 0, 14);
-  ASSERT(text_line_length(t, 0) == 12,
-         "Expected line to be 12 chars after deleting two");
+  ASSERT(text_line_size(t, 0) == 12,
+         "Expected line to be 12 bytes after deleting two");
   ASSERT(strncmp((const char *)text_get_line(t, 0).text, "This is line",
                  text_line_size(t, 0)) == 0,
-         "Expected two chars to be deleted");
+         "Expected two bytes to be deleted");
 
   text_delete(t, 0, 0, 10, 10);
   ASSERT(text_get_line(t, 0).nbytes == 0,
-         "Expected line to be empty after many chars removed");
+         "Expected line to be empty after many bytes removed");
 
   const char *txt2 = "This is line 1\nThis is line 2\nThis is line 3";
-  text_insert_at(t, 0, 0, (uint8_t *)txt2, strlen(txt2), &lines_added,
-                 &cols_added);
+  text_insert_at(t, 0, 0, (uint8_t *)txt2, strlen(txt2), &lines_added);
   ASSERT(text_num_lines(t) == 3,
          "Expected to have three lines after inserting as many");
 
   text_delete(t, 1, 11, 1, 14);
-  ASSERT(text_line_length(t, 1) == 11,
-         "Expected line to contain 11 chars after deletion");
+  ASSERT(text_line_size(t, 1) == 11,
+         "Expected line to contain 11 bytes after deletion");
   struct text_chunk line = text_get_line(t, 1);
   ASSERT(strncmp((const char *)line.text, "This is lin", line.nbytes) == 0,
          "Expected deleted characters to be gone in the second line");
 
-  text_delete(t, 1, 0, 1, text_line_length(t, 1) + 1);
+  text_delete(t, 1, 0, 1, text_line_size(t, 1) + 1);
   ASSERT(text_num_lines(t) == 2,
          "Expected to have two lines after deleting one");
   struct text_chunk line2 = text_get_line(t, 1);
@@ -110,8 +104,8 @@ void test_delete_text() {
   struct text *t3 = text_create(10);
   const char *delete_me = "This is line🎙\nQ";
   text_insert_at(t3, 0, 0, (uint8_t *)delete_me, strlen(delete_me),
-                 &lines_added, &cols_added);
-  text_delete(t3, 0, 13, 0, 14);
+                 &lines_added);
+  text_delete(t3, 0, 16, 1, 0);
   struct text_chunk top_line = text_get_line(t3, 0);
   ASSERT(strncmp((const char *)top_line.text, "This is line🎙Q",
                  top_line.nbytes) == 0,
@@ -123,33 +117,13 @@ void test_delete_text() {
   struct text *t4 = text_create(10);
   const char *deletable_text = "Only one line kinda";
   text_append(t4, (uint8_t *)deletable_text, strlen(deletable_text),
-              &lines_added, &cols_added);
+              &lines_added);
   text_delete(t4, 0, 19, 0, 20);
   ASSERT(text_num_lines(t4) == 1, "Expected the line to still be there");
-  ASSERT(text_line_length(t4, 0) == 19,
+  ASSERT(text_line_size(t4, 0) == 19,
          "Expected nothing to have happened to the line");
 
-  // test utf-8
-  struct text *t2 = text_create(10);
-  const char *txt3 = "Emojis: 🇫🇮 🐮\n";
-  text_insert_at(t2, 0, 0, (uint8_t *)txt3, strlen(txt3), &lines_added,
-                 &cols_added);
-
-  // TODO: Fix when graphemes are implemented, should be 11, right now it counts
-  // the two unicode code points 🇫 and 🇮 as two chars.
-  ASSERT(text_line_length(t2, 0) == 12,
-         "Line length should be 12 (even though there "
-         "are more bytes in the line).");
-
-  text_delete(t2, 0, 10, 0, 12);
-  ASSERT(text_line_length(t2, 0) == 10,
-         "Line length should be 10 after deleting the cow emoji and a space");
-  struct text_chunk line3 = text_get_line(t2, 0);
-  ASSERT(strncmp((const char *)line3.text, "Emojis: 🇫🇮", line3.nbytes) == 0,
-         "Expected cow emoji plus space to be deleted");
-
   text_destroy(t);
-  text_destroy(t2);
   text_destroy(t3);
   text_destroy(t4);
 }
diff --git a/test/utf8.c b/test/utf8.c
index d67c409..c5094c7 100644
--- a/test/utf8.c
+++ b/test/utf8.c
@@ -6,11 +6,6 @@
 #include "assert.h"
 #include "test.h"
 
-void test_nchars_nbytes() {
-  ASSERT(utf8_nchars((uint8_t *)"👴", strlen("👴")) == 1,
-         "Expected old man emoji to be 1 char");
-  ASSERT(utf8_nbytes((uint8_t *)"👴", strlen("👴"), 1) == 4,
-         "Expected old man emoji to be 4 bytes");
-}
+void test_nchars_nbytes() {}
 
 void run_utf8_tests() { run_test(test_nchars_nbytes); }
author	Albert Cervin <albert@acervin.com>	2024-08-23 17:07:27 +0200
committer	Albert Cervin <albert@acervin.com>	2024-09-11 16:22:58 +0200
commit	4ab7e453e26afc6e9f4938c65f89463fbba9e267 (patch)
tree	4745d99e70d645a8134dafc3814dc68bf678daf4 /test
parent	991283f684c224db46fe68738470921b8c394f13 (diff)
download	dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.tar.gz dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.tar.xz dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.zip