11 files changed, 620 insertions, 539 deletions
diff --git a/src/dged/buffer.c b/src/dged/buffer.c
index 6051f69..1062a47 100644
--- a/src/dged/buffer.c
+++ b/src/dged/buffer.c
@@ -157,6 +157,42 @@ void buffer_static_teardown() {
   }
 }
 
+static uint32_t get_tab_width(struct buffer *buffer) {
+  struct setting *tw = lang_setting(&buffer->lang, "tab-width");
+  if (tw == NULL) {
+    tw = settings_get("editor.tab-width");
+  }
+
+  uint32_t tab_width = 4;
+  if (tw != NULL && tw->value.type == Setting_Number) {
+    tab_width = tw->value.number_value;
+  }
+  return tab_width;
+}
+
+static bool use_tabs(struct buffer *buffer) {
+  struct setting *ut = lang_setting(&buffer->lang, "use-tabs");
+  if (ut == NULL) {
+    ut = settings_get("editor.use-tabs");
+  }
+
+  bool use_tabs = false;
+  if (ut != NULL && ut->value.type == Setting_Bool) {
+    use_tabs = ut->value.bool_value;
+  }
+
+  return use_tabs;
+}
+
+static uint32_t visual_char_width(struct codepoint *codepoint,
+                                  uint32_t tab_width) {
+  if (codepoint->codepoint == '\t') {
+    return tab_width;
+  } else {
+    return unicode_visual_char_width(codepoint);
+  }
+}
+
 static struct buffer create_internal(const char *name, char *filename) {
   struct buffer b = (struct buffer){
       .filename = filename,
@@ -185,7 +221,7 @@ static struct buffer create_internal(const char *name, char *filename) {
 
 static void strip_final_newline(struct buffer *b) {
   uint32_t nlines = text_num_lines(b->text);
-  if (nlines > 0 && text_line_length(b->text, nlines - 1) == 0) {
+  if (nlines > 0 && buffer_line_length(b, nlines - 1) == 0) {
     text_delete(b->text, nlines - 1, 0, nlines - 1, 1);
   }
 }
@@ -207,7 +243,7 @@ static void buffer_read_from_file(struct buffer *b) {
       int bytes = fread(buff, 1, 4096, file);
       if (bytes > 0) {
         uint32_t ignore;
-        text_append(b->text, buff, bytes, &ignore, &ignore);
+        text_append(b->text, buff, bytes, &ignore);
       } else if (bytes == 0) {
         break; // EOF
       } else {
@@ -239,70 +275,66 @@ static void write_line(struct text_chunk *chunk, void *userdata) {
   fputc('\n', file);
 }
 
-static bool is_word_break(uint8_t c) {
+static bool is_word_break(const struct codepoint *codepoint) {
+  uint32_t c = codepoint->codepoint;
   return c == ' ' || c == '.' || c == '(' || c == ')' || c == '[' || c == ']' ||
-         c == '{' || c == '}' || c == ';' || c == '<' || c == '>' || c == ':';
+         c == '{' || c == '}' || c == ';' || c == '<' || c == '>' || c == ':' ||
+         c == '"';
 }
 
-static bool is_word_char(uint8_t c) { return !is_word_break(c); }
-
-struct match_result {
-  struct location at;
-  bool found;
-};
-
-static struct match_result find_next_in_line(struct buffer *buffer,
-                                             struct location start,
-                                             bool (*predicate)(uint8_t c)) {
-  struct text_chunk line = text_get_line(buffer->text, start.line);
-  bool found = false;
+static bool is_word_char(const struct codepoint *c) {
+  return !is_word_break(c);
+}
 
-  if (line.nbytes == 0) {
+static struct match_result
+find_next_in_line(struct buffer *buffer, struct location start,
+                  bool (*predicate)(const struct codepoint *c)) {
+  if (text_line_size(buffer->text, start.line) == 0) {
     return (struct match_result){.at = start, .found = false};
   }
 
-  uint32_t bytei = text_col_to_byteindex(buffer->text, start.line, start.col);
-  while (bytei < line.nbytes) {
-    if (predicate(line.text[bytei])) {
+  bool found = false;
+  struct utf8_codepoint_iterator iter =
+      text_line_codepoint_iterator(buffer->text, start.line);
+  uint32_t coli = 0, tab_width = get_tab_width(buffer);
+  struct codepoint *codepoint;
+  while ((codepoint = utf8_next_codepoint(&iter)) != NULL) {
+    if (coli >= start.col && predicate(codepoint)) {
       found = true;
       break;
     }
-    ++bytei;
+
+    coli += visual_char_width(codepoint, tab_width);
   }
 
-  uint32_t target_col = text_byteindex_to_col(buffer->text, start.line, bytei);
   return (struct match_result){
-      .at = (struct location){.line = start.line, .col = target_col},
-      .found = found};
+      .at = (struct location){.line = start.line, .col = coli}, .found = found};
 }
 
-static struct match_result find_prev_in_line(struct buffer *buffer,
-                                             struct location start,
-                                             bool (*predicate)(uint8_t c)) {
-  struct text_chunk line = text_get_line(buffer->text, start.line);
-  bool found = false;
+static struct match_result
+find_prev_in_line(struct buffer *buffer, struct location start,
+                  bool (*predicate)(const struct codepoint *c)) {
 
-  if (line.nbytes == 0) {
+  if (text_line_size(buffer->text, start.line) == 0) {
     return (struct match_result){.at = start, .found = false};
   }
 
-  uint32_t bytei = text_col_to_byteindex(buffer->text, start.line, start.col);
-  while (bytei > 0) {
-    if (predicate(line.text[bytei])) {
+  bool found = false;
+  struct utf8_codepoint_iterator iter =
+      text_line_codepoint_iterator(buffer->text, start.line);
+  uint32_t coli = 0, tab_width = get_tab_width(buffer), found_at;
+  struct codepoint *codepoint;
+  while (coli < start.col && (codepoint = utf8_next_codepoint(&iter)) != NULL) {
+    if (predicate(codepoint)) {
       found = true;
-      break;
+      found_at = coli;
     }
-    --bytei;
-  }
-
-  // first byte on line can also be a match
-  if (predicate(line.text[bytei])) {
-    found = true;
+    coli += visual_char_width(codepoint, tab_width);
   }
 
-  uint32_t target_col = text_byteindex_to_col(buffer->text, start.line, bytei);
   return (struct match_result){
-      .at = (struct location){.line = start.line, .col = target_col},
+      .at =
+          (struct location){.line = start.line, .col = found ? found_at : coli},
       .found = found};
 }
 
@@ -315,13 +347,52 @@ static struct text_chunk *copy_region(struct buffer *buffer,
     free(curr->text);
   }
 
+  struct location begin_bytes =
+      buffer_location_to_byte_coords(buffer, region.begin);
+  struct location end_bytes =
+      buffer_location_to_byte_coords(buffer, region.end);
+
   struct text_chunk txt =
-      text_get_region(buffer->text, region.begin.line, region.begin.col,
-                      region.end.line, region.end.col);
+      text_get_region(buffer->text, begin_bytes.line, begin_bytes.col,
+                      end_bytes.line, end_bytes.col);
   *curr = txt;
   return curr;
 }
 
+static struct location do_indent(struct buffer *buffer, struct location at,
+                                 uint32_t tab_width, bool use_tabs) {
+  if (use_tabs) {
+    return buffer_add(buffer, at, (uint8_t *)"\t", 1);
+  } else {
+    return buffer_add(buffer, at, (uint8_t *)"                ",
+                      tab_width > 16 ? 16 : tab_width);
+  }
+}
+
+static uint64_t to_global_offset(struct buffer *buffer,
+                                 struct location bytecoords) {
+  uint32_t line = bytecoords.line;
+  uint32_t col = bytecoords.col;
+  uint32_t byteoff = 0;
+  uint32_t nlines = buffer_num_lines(buffer);
+
+  if (nlines == 0) {
+    return 0;
+  }
+
+  for (uint32_t l = 0; l < line && l < nlines; ++l) {
+    // +1 for newline
+    byteoff += text_line_size(buffer->text, l) + 1;
+  }
+
+  // add the in-line offset for the target line (clamped to the last line)
+  uint32_t l = line < nlines ? line : nlines - 1;
+  uint32_t nbytes = text_line_size(buffer->text, l);
+  byteoff += col <= nbytes ? col : nbytes + 1;
+
+  return byteoff;
+}
+
 /* --------------------- buffer methods -------------------- */
 
 struct buffer buffer_create(const char *name) {
@@ -452,18 +523,29 @@ struct location buffer_add(struct buffer *buffer, struct location at,
   struct location initial = at;
   struct location final = at;
 
-  uint32_t lines_added, cols_added;
-  text_insert_at(buffer->text, initial.line, initial.col, text, nbytes,
-                 &lines_added, &cols_added);
+  struct location at_bytes = buffer_location_to_byte_coords(buffer, at);
+
+  uint32_t lines_added;
+  text_insert_at(buffer->text, at_bytes.line, at_bytes.col, text, nbytes,
+                 &lines_added);
 
   // move to after inserted text
   if (lines_added > 0) {
     final = buffer_clamp(buffer, (int64_t)at.line + lines_added, 0);
   } else {
+    uint32_t cols_added = 0, tab_width = get_tab_width(buffer);
+    struct utf8_codepoint_iterator iter =
+        create_utf8_codepoint_iterator(text, nbytes, 0);
+    struct codepoint *codepoint;
+    while ((codepoint = utf8_next_codepoint(&iter)) != NULL) {
+      cols_added += visual_char_width(codepoint, tab_width);
+    }
     final =
         buffer_clamp(buffer, (int64_t)at.line, (int64_t)at.col + cols_added);
   }
 
+  struct location final_bytes = buffer_location_to_byte_coords(buffer, final);
+
   undo_push_add(
       &buffer->undo,
       (struct undo_add){.begin = {.row = initial.line, .col = initial.col},
@@ -474,11 +556,17 @@ struct location buffer_add(struct buffer *buffer, struct location at,
                        (struct undo_boundary){.save_point = false});
   }
 
-  uint32_t begin_idx = text_global_idx(buffer->text, initial.line, initial.col);
-  uint32_t end_idx = text_global_idx(buffer->text, final.line, final.col);
+  uint32_t begin_idx = to_global_offset(buffer, at_bytes);
+  uint32_t end_idx = to_global_offset(buffer, final_bytes);
 
   VEC_FOR_EACH(&buffer->hooks->insert_hooks, struct insert_hook * h) {
-    h->callback(buffer, region_new(initial, final), begin_idx, end_idx,
+    h->callback(buffer,
+                (struct edit_location){
+                    .coordinates = region_new(initial, final),
+                    .bytes = region_new(at_bytes, final_bytes),
+                    .global_byte_begin = begin_idx,
+                    .global_byte_end = end_idx,
+                },
                 h->userdata);
   }
 
@@ -488,15 +576,16 @@ struct location buffer_add(struct buffer *buffer, struct location at,
 
 struct location buffer_set_text(struct buffer *buffer, uint8_t *text,
                                 uint32_t nbytes) {
-  uint32_t lines, cols;
+  uint32_t lines_added;
 
   text_clear(buffer->text);
-  text_append(buffer->text, text, nbytes, &lines, &cols);
+  text_append(buffer->text, text, nbytes, &lines_added);
 
   // if last line is empty, remove it
   strip_final_newline(buffer);
 
-  return buffer_clamp(buffer, lines, cols);
+  return buffer_clamp(buffer, lines_added,
+                      buffer_line_length(buffer, lines_added));
 }
 
 void buffer_clear(struct buffer *buffer) { text_clear(buffer->text); }
@@ -524,9 +613,18 @@ struct location buffer_previous_char(struct buffer *buffer,
     }
 
     --dot.line;
-    dot.col = buffer_num_chars(buffer, dot.line);
+    dot.col = buffer_line_length(buffer, dot.line);
   } else {
-    --dot.col;
+    struct utf8_codepoint_iterator iter =
+        text_line_codepoint_iterator(buffer->text, dot.line);
+    struct codepoint *codepoint;
+    uint32_t coli = 0, tab_width = get_tab_width(buffer), last_width = 0;
+    while (coli < dot.col && (codepoint = utf8_next_codepoint(&iter)) != NULL) {
+      last_width = visual_char_width(codepoint, tab_width);
+      coli += last_width;
+    }
+
+    dot.col = coli - last_width;
   }
 
   return dot;
@@ -571,14 +669,14 @@ struct location buffer_previous_line(struct buffer *buffer,
   }
 
   --dot.line;
-  uint32_t nchars = buffer_num_chars(buffer, dot.line);
+  uint32_t nchars = buffer_line_length(buffer, dot.line);
   uint32_t new_col = dot.col > nchars ? nchars : dot.col;
 
   return dot;
 }
 
 struct location buffer_next_char(struct buffer *buffer, struct location dot) {
-  if (dot.col == buffer_num_chars(buffer, dot.line)) {
+  if (dot.col == buffer_line_length(buffer, dot.line)) {
     uint32_t lastline = buffer->lazy_row_add ? buffer_num_lines(buffer)
                                              : buffer_num_lines(buffer) - 1;
     if (dot.line == lastline) {
@@ -588,7 +686,16 @@ struct location buffer_next_char(struct buffer *buffer, struct location dot) {
     dot.col = 0;
     ++dot.line;
   } else {
-    ++dot.col;
+    struct utf8_codepoint_iterator iter =
+        text_line_codepoint_iterator(buffer->text, dot.line);
+    struct codepoint *codepoint;
+    uint32_t coli = 0;
+    while (coli <= dot.col &&
+           (codepoint = utf8_next_codepoint(&iter)) != NULL) {
+      coli += visual_char_width(codepoint, get_tab_width(buffer));
+    }
+
+    dot.col = coli;
   }
 
   return dot;
@@ -635,7 +742,7 @@ struct location buffer_next_line(struct buffer *buffer, struct location dot) {
 
   ++dot.line;
   uint32_t new_col = dot.col;
-  uint32_t nchars = buffer_num_chars(buffer, dot.line);
+  uint32_t nchars = buffer_line_length(buffer, dot.line);
   new_col = new_col > nchars ? nchars : new_col;
 
   return dot;
@@ -664,8 +771,8 @@ struct location buffer_clamp(struct buffer *buffer, int64_t line, int64_t col) {
   // clamp col
   if (col < 0) {
     col = 0;
-  } else if (col > buffer_num_chars(buffer, line)) {
-    col = buffer_num_chars(buffer, line);
+  } else if (col > buffer_line_length(buffer, line)) {
+    col = buffer_line_length(buffer, line);
   }
 
   location.col = col;
@@ -681,7 +788,7 @@ struct location buffer_end(struct buffer *buffer) {
     return (struct location){.line = nlines, .col = 0};
   } else {
     return (struct location){.line = nlines - 1,
-                             .col = buffer_num_chars(buffer, nlines - 1)};
+                             .col = buffer_line_length(buffer, nlines - 1)};
   }
 }
 
@@ -689,55 +796,22 @@ uint32_t buffer_num_lines(struct buffer *buffer) {
   return text_num_lines(buffer->text);
 }
 
-uint32_t buffer_num_chars(struct buffer *buffer, uint32_t line) {
-  if (line >= buffer_num_lines(buffer)) {
-    return 0;
+uint32_t buffer_line_length(struct buffer *buffer, uint32_t line) {
+  uint32_t tab_size = get_tab_width(buffer), len = 0;
+  struct utf8_codepoint_iterator iter =
+      text_line_codepoint_iterator(buffer->text, line);
+  struct codepoint *codepoint;
+  while ((codepoint = utf8_next_codepoint(&iter)) != NULL) {
+    len += visual_char_width(codepoint, tab_size);
   }
 
-  return text_line_length(buffer->text, line);
+  return len;
 }
 
 struct location buffer_newline(struct buffer *buffer, struct location at) {
   return buffer_add(buffer, at, (uint8_t *)"\n", 1);
 }
 
-static uint32_t get_tab_width(struct buffer *buffer) {
-  struct setting *tw = lang_setting(&buffer->lang, "tab-width");
-  if (tw == NULL) {
-    tw = settings_get("editor.tab-width");
-  }
-
-  uint32_t tab_width = 4;
-  if (tw != NULL && tw->value.type == Setting_Number) {
-    tab_width = tw->value.number_value;
-  }
-  return tab_width;
-}
-
-static bool use_tabs(struct buffer *buffer) {
-  struct setting *ut = lang_setting(&buffer->lang, "use-tabs");
-  if (ut == NULL) {
-    ut = settings_get("editor.use-tabs");
-  }
-
-  bool use_tabs = false;
-  if (ut != NULL && ut->value.type == Setting_Bool) {
-    use_tabs = ut->value.bool_value;
-  }
-
-  return use_tabs;
-}
-
-static struct location do_indent(struct buffer *buffer, struct location at,
-                                 uint32_t tab_width, bool use_tabs) {
-  if (use_tabs) {
-    return buffer_add(buffer, at, (uint8_t *)"\t", 1);
-  } else {
-    return buffer_add(buffer, at, (uint8_t *)"                ",
-                      tab_width > 16 ? 16 : tab_width);
-  }
-}
-
 struct location buffer_indent(struct buffer *buffer, struct location at) {
   return do_indent(buffer, at, get_tab_width(buffer), use_tabs(buffer));
 }
@@ -778,16 +852,13 @@ struct location buffer_undo(struct buffer *buffer, struct location dot) {
     case Undo_Add: {
       struct undo_add *add = &rec->add;
 
-      pos =
-          buffer_delete(buffer, (struct region){.begin =
-                                                    (struct location){
-                                                        .line = add->begin.row,
-                                                        .col = add->begin.col,
-                                                    },
-                                                .end = (struct location){
-                                                    .line = add->end.row,
-                                                    .col = add->end.col,
-                                                }});
+      pos = buffer_delete(buffer,
+                          (struct region){
+                              .begin = (struct location){.line = add->begin.row,
+                                                         .col = add->begin.col},
+                              .end = (struct location){.line = add->end.row,
+                                                       .col = add->end.col},
+                          });
 
       break;
     }
@@ -888,9 +959,14 @@ struct location buffer_delete(struct buffer *buffer, struct region region) {
     return region.begin;
   }
 
+  struct location begin_bytes =
+      buffer_location_to_byte_coords(buffer, region.begin);
+  struct location end_bytes =
+      buffer_location_to_byte_coords(buffer, region.end);
+
   struct text_chunk txt =
-      text_get_region(buffer->text, region.begin.line, region.begin.col,
-                      region.end.line, region.end.col);
+      text_get_region(buffer->text, begin_bytes.line, begin_bytes.col,
+                      end_bytes.line, end_bytes.col);
 
   undo_push_boundary(&buffer->undo,
                      (struct undo_boundary){.save_point = false});
@@ -903,17 +979,22 @@ struct location buffer_delete(struct buffer *buffer, struct region region) {
   undo_push_boundary(&buffer->undo,
                      (struct undo_boundary){.save_point = false});
 
-  uint32_t begin_idx =
-      text_global_idx(buffer->text, region.begin.line, region.begin.col);
-  uint32_t end_idx =
-      text_global_idx(buffer->text, region.end.line, region.end.col);
+  uint64_t begin_idx = to_global_offset(buffer, begin_bytes);
+  uint64_t end_idx = to_global_offset(buffer, end_bytes);
 
-  text_delete(buffer->text, region.begin.line, region.begin.col,
-              region.end.line, region.end.col);
+  text_delete(buffer->text, begin_bytes.line, begin_bytes.col, end_bytes.line,
+              end_bytes.col);
   buffer->modified = true;
 
   VEC_FOR_EACH(&buffer->hooks->delete_hooks, struct delete_hook * h) {
-    h->callback(buffer, region, begin_idx, end_idx, h->userdata);
+    h->callback(buffer,
+                (struct edit_location){
+                    .coordinates = region,
+                    .bytes = region_new(begin_bytes, end_bytes),
+                    .global_byte_begin = begin_idx,
+                    .global_byte_end = end_idx,
+                },
+                h->userdata);
   }
 
   return region.begin;
@@ -1035,27 +1116,6 @@ struct cmdbuf {
   struct buffer *buffer;
 };
 
-static uint32_t visual_char_width(uint8_t *byte, uint32_t maxlen) {
-  if (*byte == '\t') {
-    return 4;
-  } else {
-    return utf8_visual_char_width(byte, maxlen);
-  }
-}
-
-uint32_t visual_string_width(uint8_t *txt, uint32_t len, uint32_t start_col,
-                             uint32_t end_col) {
-  uint32_t start_byte = utf8_nbytes(txt, len, start_col);
-  uint32_t end_byte = utf8_nbytes(txt, len, end_col);
-
-  uint32_t width = 0;
-  for (uint32_t bytei = start_byte; bytei < end_byte; ++bytei) {
-    width += visual_char_width(&txt[bytei], len - bytei);
-  }
-
-  return width;
-}
-
 static void apply_properties(struct command_list *cmds,
                              struct text_property *properties[],
                              uint32_t nproperties) {
@@ -1097,65 +1157,67 @@ void render_line(struct text_chunk *line, void *userdata) {
   command_list_set_show_whitespace(cmdbuf->cmds, cmdbuf->show_ws);
 
   // calculate scroll offsets
-  uint32_t scroll_bytes =
-      utf8_nbytes(line->text, line->nbytes, cmdbuf->origin.col);
-  uint32_t text_nbytes_scroll =
-      scroll_bytes > line->nbytes ? 0 : line->nbytes - scroll_bytes;
-  uint8_t *text = line->text + scroll_bytes;
-
-  uint32_t visual_col_start = 0;
-  uint32_t cur_visual_col = 0;
-  uint32_t start_byte = 0, text_nbytes = 0;
   struct text_property *properties[32] = {0};
   uint64_t prev_properties_hash = 0;
 
-  for (uint32_t cur_byte = start_byte, coli = 0;
-       cur_byte < text_nbytes_scroll && cur_visual_col < cmdbuf->width &&
-       coli < line->nchars - cmdbuf->origin.col;
-       ++coli) {
+  uint32_t tab_width = get_tab_width(cmdbuf->buffer);
+
+  // handle scroll column offset
+  uint32_t coli = 0, bytei = 0;
+  struct utf8_codepoint_iterator iter = text_chunk_codepoint_iterator(line);
+  struct codepoint *codepoint;
+  while (coli < cmdbuf->origin.col &&
+         (codepoint = utf8_next_codepoint(&iter)) != NULL) {
+    coli += visual_char_width(codepoint, tab_width);
+    bytei += codepoint->nbytes;
+  }
 
-    uint32_t bytes_remaining = text_nbytes_scroll - cur_byte;
-    uint32_t char_nbytes = utf8_nbytes(text + cur_byte, bytes_remaining, 1);
-    uint32_t char_vwidth = visual_char_width(text + cur_byte, bytes_remaining);
+  // from here on coli is the on-screen visual column [0..width-1]
+  coli = 0;
+  uint32_t drawn_bytei = bytei;
+  uint32_t drawn_coli = coli;
 
+  while (coli < cmdbuf->width &&
+         (codepoint = utf8_next_codepoint(&iter)) != NULL) {
     // calculate character properties
     uint32_t nproperties = 0;
-    text_get_properties(
-        cmdbuf->buffer->text,
-        (struct location){.line = line->line, .col = coli + cmdbuf->origin.col},
-        properties, 32, &nproperties);
+    text_get_properties(cmdbuf->buffer->text, line->line, bytei, properties, 32,
+                        &nproperties);
 
     // if we have any new or lost props, flush text up until now, reset
     // and re-apply current properties
     uint64_t new_properties_hash = properties_hash(properties, nproperties);
     if (new_properties_hash != prev_properties_hash) {
-      command_list_draw_text(cmdbuf->cmds, visual_col_start, visual_line,
-                             text + start_byte, cur_byte - start_byte);
+      command_list_draw_text(cmdbuf->cmds, drawn_coli, visual_line,
+                             line->text + drawn_bytei, bytei - drawn_bytei);
       command_list_reset_color(cmdbuf->cmds);
 
-      visual_col_start = cur_visual_col;
-      start_byte = cur_byte;
+      drawn_coli = coli;
+      drawn_bytei = bytei;
 
       // apply new properties
       apply_properties(cmdbuf->cmds, properties, nproperties);
     }
 
     prev_properties_hash = new_properties_hash;
-    cur_byte += char_nbytes;
-    text_nbytes += char_nbytes;
-    cur_visual_col += char_vwidth;
+    bytei += codepoint->nbytes;
+    coli += visual_char_width(codepoint, tab_width);
   }
 
   // flush remaining
-  command_list_draw_text(cmdbuf->cmds, visual_col_start, visual_line,
-                         text + start_byte, text_nbytes - start_byte);
+  command_list_draw_text(cmdbuf->cmds, drawn_coli, visual_line,
+                         line->text + drawn_bytei, bytei - drawn_bytei);
+
+  drawn_coli = coli;
+  drawn_bytei = bytei;
 
   command_list_reset_color(cmdbuf->cmds);
   command_list_set_show_whitespace(cmdbuf->cmds, false);
 
-  if (cur_visual_col < cmdbuf->width) {
-    command_list_draw_repeated(cmdbuf->cmds, cur_visual_col, visual_line, ' ',
-                               cmdbuf->width - cur_visual_col);
+  // TODO: considering the whole screen is cleared, is this really needed?
+  if (drawn_coli < cmdbuf->width) {
+    command_list_draw_repeated(cmdbuf->cmds, drawn_coli, visual_line, ' ',
+                               cmdbuf->width - drawn_coli);
   }
 }
 
@@ -1200,19 +1262,19 @@ void buffer_render(struct buffer *buffer, struct buffer_render_params *params) {
 void buffer_add_text_property(struct buffer *buffer, struct location start,
                               struct location end,
                               struct text_property property) {
-  text_add_property(
-      buffer->text, (struct location){.line = start.line, .col = start.col},
-      (struct location){.line = end.line, .col = end.col}, property);
+  struct location bytestart = buffer_location_to_byte_coords(buffer, start);
+  struct location byteend = buffer_location_to_byte_coords(buffer, end);
+  text_add_property(buffer->text, bytestart.line, bytestart.col, byteend.line,
+                    byteend.col, property);
 }
 
 void buffer_get_text_properties(struct buffer *buffer, struct location location,
                                 struct text_property **properties,
                                 uint32_t max_nproperties,
                                 uint32_t *nproperties) {
-  text_get_properties(
-      buffer->text,
-      (struct location){.line = location.line, .col = location.col}, properties,
-      max_nproperties, nproperties);
+  struct location bytecoords = buffer_location_to_byte_coords(buffer, location);
+  text_get_properties(buffer->text, bytecoords.line, bytecoords.col, properties,
+                      max_nproperties, nproperties);
 }
 
 void buffer_clear_text_properties(struct buffer *buffer) {
@@ -1244,9 +1306,12 @@ void buffer_sort_lines(struct buffer *buffer, uint32_t start_line,
                  (struct location){.line = end + 1, .col = 0});
 
   struct s8 *lines = (struct s8 *)malloc(sizeof(struct s8) * ntosort);
-  struct text_chunk txt =
-      text_get_region(buffer->text, region.begin.line, region.begin.col,
-                      region.end.line, region.end.col);
+
+  struct location bytebeg =
+      buffer_location_to_byte_coords(buffer, region.begin);
+  struct location byteend = buffer_location_to_byte_coords(buffer, region.end);
+  struct text_chunk txt = text_get_region(
+      buffer->text, bytebeg.line, bytebeg.col, byteend.line, byteend.col);
 
   uint32_t line_start = 0;
   uint32_t curr_line = 0;
@@ -1278,3 +1343,41 @@ void buffer_sort_lines(struct buffer *buffer, uint32_t start_line,
     free(txt.text);
   }
 }
+
+struct location buffer_location_to_byte_coords(struct buffer *buffer,
+                                               struct location coords) {
+  struct utf8_codepoint_iterator iter =
+      text_line_codepoint_iterator(buffer->text, coords.line);
+  uint32_t byteoffset = 0, col = 0, tab_width = get_tab_width(buffer);
+  struct codepoint *codepoint;
+
+  /* Walk up to and including the first codepoint that moves past the
+   * target column, so that any zero-width codepoints sitting at the
+   * target column are included in the byte offset.
+   */
+  while (col <= coords.col &&
+         (codepoint = utf8_next_codepoint(&iter)) != NULL) {
+    byteoffset += codepoint->nbytes;
+    col += visual_char_width(codepoint, tab_width);
+  }
+
+  /* Subtract the size of the last codepoint read to land right before it,
+   * i.e. after any zero-width codepoints. If the line ended first
+   * (codepoint == NULL), nothing is subtracted.
+   */
+  return (struct location){.line = coords.line,
+                           .col = byteoffset -
+                                  (codepoint != NULL ? codepoint->nbytes : 0)};
+}
+
+struct match_result
+buffer_find_prev_in_line(struct buffer *buffer, struct location start,
+                         bool (*predicate)(const struct codepoint *c)) {
+  return find_prev_in_line(buffer, start, predicate);
+}
+
+struct match_result
+buffer_find_next_in_line(struct buffer *buffer, struct location start,
+                         bool (*predicate)(const struct codepoint *c)) {
+  return find_next_in_line(buffer, start, predicate);
+}
diff --git a/src/dged/buffer.h b/src/dged/buffer.h
index cd5bd95..c9fe2ca 100644
--- a/src/dged/buffer.h
+++ b/src/dged/buffer.h
@@ -295,13 +295,13 @@ struct location buffer_end(struct buffer *buffer);
 uint32_t buffer_num_lines(struct buffer *buffer);
 
 /**
- * Get the number of chars in a given line in buffer.
+ * Get the line length in number of column positions.
  *
  * @param [in] buffer The buffer to use.
- * @param [in] line The line to get number of chars for.
- * @returns The number of chars in @ref line.
+ * @param [in] line The line to get number of columns for.
+ * @returns The number of column positions in @p line.
  */
-uint32_t buffer_num_chars(struct buffer *buffer, uint32_t line);
+uint32_t buffer_line_length(struct buffer *buffer, uint32_t line);
 
 /**
  * Insert a newline in the buffer.
@@ -555,6 +555,13 @@ uint32_t buffer_add_reload_hook(struct buffer *buffer, reload_hook_cb callback,
 void buffer_remove_reload_hook(struct buffer *buffer, uint32_t hook_id,
                                remove_hook_cb callback);
 
+struct edit_location {
+  struct region coordinates;
+  struct region bytes;
+  uint64_t global_byte_begin;
+  uint64_t global_byte_end;
+};
+
 /**
  * Buffer insert hook callback function.
  *
@@ -565,9 +572,8 @@ void buffer_remove_reload_hook(struct buffer *buffer, uint32_t hook_id,
  * @param end_idx The global byte offset to the end of where text was inserted.
  * @param userdata The userdata as sent in to @ref buffer_add_insert_hook.
  */
-typedef void (*insert_hook_cb)(struct buffer *buffer, struct region inserted,
-                               uint32_t begin_idx, uint32_t end_idx,
-                               void *userdata);
+typedef void (*insert_hook_cb)(struct buffer *buffer,
+                               struct edit_location inserted, void *userdata);
 
 /**
  * Add an insert hook, called when text is inserted into the @p buffer.
@@ -600,9 +606,8 @@ void buffer_remove_insert_hook(struct buffer *buffer, uint32_t hook_id,
  * @param end_idx The global byte offset to the end of the removed text.
  * @param userdata The userdata as sent in to @ref buffer_add_delete_hook.
  */
-typedef void (*delete_hook_cb)(struct buffer *buffer, struct region removed,
-                               uint32_t begin_idx, uint32_t end_idx,
-                               void *userdata);
+typedef void (*delete_hook_cb)(struct buffer *buffer,
+                               struct edit_location removed, void *userdata);
 
 /**
  * Add a delete hook, called when text is removed from the @p buffer.
@@ -724,10 +729,6 @@ void buffer_update(struct buffer *buffer, struct buffer_update_params *params);
  */
 void buffer_render(struct buffer *buffer, struct buffer_render_params *params);
 
-// TODO: move this to where it makes sense
-uint32_t visual_string_width(uint8_t *txt, uint32_t len, uint32_t start_col,
-                             uint32_t end_col);
-
 /**
  * Sort lines in a buffer alphabetically.
  *
@@ -738,4 +739,19 @@ uint32_t visual_string_width(uint8_t *txt, uint32_t len, uint32_t start_col,
 void buffer_sort_lines(struct buffer *buffer, uint32_t start_line,
                        uint32_t end_line);
 
+struct location buffer_location_to_byte_coords(struct buffer *buffer,
+                                               struct location coords);
+
+struct match_result {
+  struct location at;
+  bool found;
+};
+
+struct match_result
+buffer_find_prev_in_line(struct buffer *buffer, struct location start,
+                         bool (*predicate)(const struct codepoint *c));
+struct match_result
+buffer_find_next_in_line(struct buffer *buffer, struct location start,
+                         bool (*predicate)(const struct codepoint *c));
+
 #endif
diff --git a/src/dged/buffer_view.c b/src/dged/buffer_view.c
index 4e67d78..f3dd2b9 100644
--- a/src/dged/buffer_view.c
+++ b/src/dged/buffer_view.c
@@ -128,7 +128,7 @@ void buffer_view_backward_nlines(struct buffer_view *view, uint32_t nlines) {
 }
 
 void buffer_view_goto_end_of_line(struct buffer_view *view) {
-  view->dot.col = buffer_num_chars(view->buffer, view->dot.line);
+  view->dot.col = buffer_line_length(view->buffer, view->dot.line);
 }
 
 void buffer_view_goto_beginning_of_line(struct buffer_view *view) {
@@ -224,15 +224,22 @@ void buffer_view_delete_word(struct buffer_view *view) {
 }
 
 void buffer_view_kill_line(struct buffer_view *view) {
-  uint32_t nchars =
-      buffer_num_chars(view->buffer, view->dot.line) - view->dot.col;
-  if (nchars == 0) {
-    nchars = 1;
+  uint32_t ncols =
+      buffer_line_length(view->buffer, view->dot.line) - view->dot.col;
+
+  uint32_t line = view->dot.line;
+  uint32_t col = view->dot.col + ncols;
+
+  // kill the newline if we are at the end of the line
+  if (ncols == 0) {
+    struct location loc = buffer_next_char(view->buffer, view->dot);
+    line = loc.line;
+    col = loc.col;
   }
 
   struct region reg = region_new(view->dot, (struct location){
-                                                .line = view->dot.line,
-                                                .col = view->dot.col + nchars,
+                                                .line = line,
+                                                .col = col,
                                             });
 
   buffer_cut(view->buffer, reg);
@@ -241,7 +248,8 @@ void buffer_view_kill_line(struct buffer_view *view) {
 void buffer_view_sort_lines(struct buffer_view *view) {
   struct region reg = region_new(view->dot, view->mark);
   if (view->mark_set && region_has_size(reg)) {
-    if (reg.end.line > 0 && buffer_num_chars(view->buffer, reg.end.line) == 0) {
+    if (reg.end.line > 0 &&
+        buffer_line_length(view->buffer, reg.end.line) == 0) {
       reg.end.line -= 1;
     }
 
@@ -271,21 +279,7 @@ struct location buffer_view_dot_to_relative(struct buffer_view *view) {
 }
 
 struct location buffer_view_dot_to_visual(struct buffer_view *view) {
-  // calculate visual column index for dot column
-  struct text_chunk c = buffer_line(view->buffer, view->dot.line);
-  uint32_t width = visual_string_width(c.text, c.nbytes, 0, view->dot.col);
-  if (view->scroll.col > 0) {
-    width -= visual_string_width(c.text, c.nbytes, 0, view->scroll.col);
-  }
-
-  struct location l = buffer_view_dot_to_relative(view);
-  l.col = width + view->fringe_width;
-
-  if (c.allocated) {
-    free(c.text);
-  }
-
-  return l;
+  return buffer_view_dot_to_relative(view);
 }
 
 void buffer_view_undo(struct buffer_view *view) {
diff --git a/src/dged/display.c b/src/dged/display.c
index bc604f0..ea3f459 100644
--- a/src/dged/display.c
+++ b/src/dged/display.c
@@ -60,7 +60,7 @@ struct push_fmt_cmd {
 struct repeat_cmd {
   uint32_t col;
   uint32_t row;
-  int32_t c;
+  uint32_t c;
   uint32_t nrepeat;
 };
 
@@ -135,21 +135,7 @@ void display_destroy(struct display *display) {
 uint32_t display_width(struct display *display) { return display->width; }
 uint32_t display_height(struct display *display) { return display->height; }
 
-void putch(uint8_t c) {
-  // TODO: move this to buffer rendering
-  if (c < ' ') {
-    fprintf(stdout, "^%c", c + 0x40);
-  } else if (c == 0x7f) {
-    fprintf(stdout, "^?");
-  } else if (utf8_byte_is_unicode_start(c) ||
-             utf8_byte_is_unicode_continuation(c)) {
-    putc(c, stdout);
-  } else if (c >= ' ' && c < 0x7f) {
-    putc(c, stdout);
-  } else {
-    fprintf(stdout, "|0x%02x|", c);
-  }
-}
+void putch(uint8_t c) { putc(c, stdout); }
 
 static void apply_fmt(uint8_t *fmt_stack, uint32_t fmt_stack_len) {
   if (fmt_stack == NULL || fmt_stack_len == 0) {
@@ -164,6 +150,7 @@ static void apply_fmt(uint8_t *fmt_stack, uint32_t fmt_stack_len) {
 
 void putch_ws(uint8_t c, bool show_whitespace, uint8_t *fmt_stack,
               uint32_t fmt_stack_len) {
+  // TODO: pass the tab width in here; the tab marker below has a fixed width
   if (show_whitespace && c == '\t') {
     fputs("\x1b[90m →  \x1b[39m", stdout);
     apply_fmt(fmt_stack, fmt_stack_len);
@@ -295,7 +282,7 @@ void command_list_draw_text_copy(struct command_list *list, uint32_t col,
 }
 
 void command_list_draw_repeated(struct command_list *list, uint32_t col,
-                                uint32_t row, int32_t c, uint32_t nrepeat) {
+                                uint32_t row, uint32_t c, uint32_t nrepeat) {
   struct repeat_cmd *cmd = add_command(list, RenderCommand_Repeat)->repeat;
   cmd->col = col;
   cmd->row = row;
@@ -401,10 +388,14 @@ void display_render(struct display *display,
         display_move_cursor(display, repeat_cmd->row + cl->yoffset,
                             repeat_cmd->col + cl->xoffset);
         apply_fmt(fmt_stack, fmt_stack_len);
-        uint32_t nbytes = utf8_nbytes((uint8_t *)&repeat_cmd->c, 4, 1);
-        for (uint32_t i = 0; i < repeat_cmd->nrepeat; ++i) {
-          putbytes((uint8_t *)&repeat_cmd->c, nbytes, show_whitespace_state,
-                   fmt_stack, fmt_stack_len);
+        struct utf8_codepoint_iterator iter =
+            create_utf8_codepoint_iterator((uint8_t *)&repeat_cmd->c, 4, 0);
+        struct codepoint *codepoint = utf8_next_codepoint(&iter);
+        if (codepoint != NULL) {
+          for (uint32_t i = 0; i < repeat_cmd->nrepeat; ++i) {
+            putbytes((uint8_t *)&repeat_cmd->c, codepoint->nbytes,
+                     show_whitespace_state, fmt_stack, fmt_stack_len);
+          }
         }
         break;
       }
diff --git a/src/dged/display.h b/src/dged/display.h
index 0fda30d..f9c7ef8 100644
--- a/src/dged/display.h
+++ b/src/dged/display.h
@@ -238,7 +238,7 @@ void command_list_draw_text_copy(struct command_list *list, uint32_t col,
  * @param nrepeat Number of times to repeat byte.
  */
 void command_list_draw_repeated(struct command_list *list, uint32_t col,
-                                uint32_t row, int32_t c, uint32_t nrepeat);
+                                uint32_t row, uint32_t c, uint32_t nrepeat);
 
 void command_list_draw_command_list(struct command_list *list,
                                     struct command_list *to_draw);
diff --git a/src/dged/keyboard.c b/src/dged/keyboard.c
index 26eb308..04565e0 100644
--- a/src/dged/keyboard.c
+++ b/src/dged/keyboard.c
@@ -78,20 +78,24 @@ void parse_keys(uint8_t *bytes, uint32_t nbytes, struct key *out_keys,
       } else if (utf8_byte_is_unicode_continuation(b)) {
         // do nothing for these
       } else { // ascii char or unicode start byte (self-inserting)
-        uint32_t nb = utf8_byte_is_unicode_start(b)
-                          ? utf8_nbytes(bytes + bytei, nbytes - bytei, 1)
-                          : 1;
-
-        // "compress" number of keys if previous key was also a
-        // "simple" key
-        if (prev_kp != NULL && prev_kp->mod == None) {
-          prev_kp->end += nb;
-        } else {
-          kp->mod = None;
-          kp->key = b;
-          kp->start = bytei;
-          kp->end = bytei + nb;
-          ++nkps;
+        // TODO: avoid building an iterator just to size a single codepoint
+        struct utf8_codepoint_iterator iter =
+            create_utf8_codepoint_iterator(bytes + bytei, nbytes - bytei, 0);
+        struct codepoint *codepoint = utf8_next_codepoint(&iter);
+        if (codepoint != NULL) {
+          uint32_t nb = codepoint->nbytes;
+
+          // "compress" number of keys if previous key was also a
+          // "simple" key
+          if (prev_kp != NULL && prev_kp->mod == None) {
+            prev_kp->end += nb;
+          } else {
+            kp->mod = None;
+            kp->key = b;
+            kp->start = bytei;
+            kp->end = bytei + nb;
+            ++nkps;
+          }
         }
       }
     }
diff --git a/src/dged/syntax.c b/src/dged/syntax.c
index 8d0fd1a..569dc70 100644
--- a/src/dged/syntax.c
+++ b/src/dged/syntax.c
@@ -342,7 +342,8 @@ static void update_parser(struct buffer *buffer, void *userdata,
                           : origin.line + height;
   ts_query_cursor_set_point_range(
       cursor, (TSPoint){.row = origin.line, .column = origin.col},
-      (TSPoint){.row = end_line, .column = buffer_num_chars(buffer, end_line)});
+      (TSPoint){.row = end_line,
+                .column = buffer_line_length(buffer, end_line)});
   ts_query_cursor_exec(cursor, h->query, ts_tree_root_node(h->tree));
 
   TSQueryMatch match;
@@ -406,47 +407,39 @@ static void update_parser(struct buffer *buffer, void *userdata,
         continue;
       }
 
-      buffer_add_text_property(
-          buffer,
-          (struct location){.line = start.row,
-                            .col = text_byteindex_to_col(
-                                buffer->text, start.row, start.column)},
-          (struct location){.line = end.row,
-                            .col = text_byteindex_to_col(buffer->text, end.row,
-                                                         end.column - 1)},
-          (struct text_property){
-              .type = TextProperty_Colors,
-              .colors =
-                  (struct text_property_colors){
-                      .set_fg = true,
-                      .fg = color,
-                  },
-          });
+      text_add_property(buffer->text, start.row, start.column, end.row,
+                        end.column > 0 ? end.column - 1 : 0,
+                        (struct text_property){
+                            .type = TextProperty_Colors,
+                            .colors =
+                                (struct text_property_colors){
+                                    .set_fg = true,
+                                    .fg = color,
+                                },
+                        });
     }
   }
 
   ts_query_cursor_delete(cursor);
 }
 
-static void text_removed(struct buffer *buffer, struct region removed,
-                         uint32_t begin_idx, uint32_t end_idx, void *userdata) {
+static void text_removed(struct buffer *buffer, struct edit_location removed,
+                         void *userdata) {
   struct highlight *h = (struct highlight *)userdata;
 
-  TSPoint begin = {.row = removed.begin.line,
-                   .column = text_col_to_byteindex(
-                       buffer->text, removed.begin.line, removed.begin.col)};
+  TSPoint begin = {.row = removed.bytes.begin.line,
+                   .column = removed.bytes.begin.col};
   TSPoint new_end = begin;
-  TSPoint old_end = {.row = removed.end.line,
-                     .column = text_col_to_byteindex(
-                         buffer->text, removed.end.line, removed.end.col)};
+  TSPoint old_end = {.row = removed.bytes.end.line,
+                     .column = removed.bytes.end.col};
 
   TSInputEdit edit = {
       .start_point = begin,
       .old_end_point = old_end,
       .new_end_point = new_end,
-      .start_byte = begin_idx,
-      .old_end_byte = end_idx,
-      .new_end_byte = begin_idx,
+      .start_byte = removed.global_byte_begin,
+      .old_end_byte = removed.global_byte_end,
+      .new_end_byte = removed.global_byte_begin,
   };
 
   ts_tree_edit(h->tree, &edit);
@@ -479,27 +472,24 @@ static void buffer_reloaded(struct buffer *buffer, void *userdata) {
   }
 }
 
-static void text_inserted(struct buffer *buffer, struct region inserted,
-                          uint32_t begin_idx, uint32_t end_idx,
+static void text_inserted(struct buffer *buffer, struct edit_location inserted,
                           void *userdata) {
   struct timer *text_inserted = timer_start("syntax.txt-inserted");
   struct highlight *h = (struct highlight *)userdata;
 
-  TSPoint begin = {.row = inserted.begin.line,
-                   .column = text_col_to_byteindex(
-                       buffer->text, inserted.begin.line, inserted.begin.col)};
+  TSPoint begin = {.row = inserted.bytes.begin.line,
+                   .column = inserted.bytes.begin.col};
   TSPoint old_end = begin;
-  TSPoint new_end = {.row = inserted.end.line,
-                     .column = text_col_to_byteindex(
-                         buffer->text, inserted.end.line, inserted.end.col)};
+  TSPoint new_end = {.row = inserted.bytes.end.line,
+                     .column = inserted.bytes.end.col};
 
   TSInputEdit edit = {
       .start_point = begin,
       .old_end_point = old_end,
       .new_end_point = new_end,
-      .start_byte = begin_idx,
-      .old_end_byte = begin_idx,
-      .new_end_byte = end_idx,
+      .start_byte = inserted.global_byte_begin,
+      .old_end_byte = inserted.global_byte_begin,
+      .new_end_byte = inserted.global_byte_end,
   };
 
   ts_tree_edit(h->tree, &edit);
diff --git a/src/dged/text.c b/src/dged/text.c
index 3d1078f..18ab04f 100644
--- a/src/dged/text.c
+++ b/src/dged/text.c
@@ -18,7 +18,6 @@ struct line {
   uint8_t *data;
   uint8_t flags;
   uint32_t nbytes;
-  uint32_t nchars;
 };
 
 struct text_property_entry {
@@ -54,11 +53,9 @@ void text_destroy(struct text *text) {
     text->lines[li].data = NULL;
     text->lines[li].flags = 0;
     text->lines[li].nbytes = 0;
-    text->lines[li].nchars = 0;
   }
 
   free(text->lines);
-
   free(text);
 }
 
@@ -68,68 +65,25 @@ void text_clear(struct text *text) {
     text->lines[li].data = NULL;
     text->lines[li].flags = 0;
     text->lines[li].nbytes = 0;
-    text->lines[li].nchars = 0;
   }
 
   text->nlines = 0;
   text_clear_properties(text);
 }
 
-// given `char_idx` as a character index, return the byte index
-uint32_t charidx_to_byteidx(struct line *line, uint32_t char_idx) {
-  if (line->nchars == 0) {
-    return 0;
-  }
-
-  if (char_idx > line->nchars) {
-    return line->nbytes - 1;
-  }
-
-  return utf8_nbytes(line->data, line->nbytes, char_idx);
-}
-
-uint32_t text_col_to_byteindex(struct text *text, uint32_t line, uint32_t col) {
-  return charidx_to_byteidx(&text->lines[line], col);
-}
-
-// given `byte_idx` as a byte index, return the character index
-uint32_t byteidx_to_charidx(struct line *line, uint32_t byte_idx) {
-  if (byte_idx > line->nbytes) {
-    return line->nchars;
+struct utf8_codepoint_iterator
+text_line_codepoint_iterator(const struct text *text, uint32_t lineidx) {
+  if (lineidx >= text_num_lines(text)) {
+    return create_utf8_codepoint_iterator(NULL, 0, 0);
   }
 
-  return utf8_nchars(line->data, byte_idx);
+  return create_utf8_codepoint_iterator(text->lines[lineidx].data,
+                                        text->lines[lineidx].nbytes, 0);
 }
 
-uint32_t text_byteindex_to_col(struct text *text, uint32_t line,
-                               uint32_t byteindex) {
-  return byteidx_to_charidx(&text->lines[line], byteindex);
-}
-
-uint32_t text_global_idx(struct text *text, uint32_t line, uint32_t col) {
-  uint32_t byteoff = 0;
-  uint32_t nlines = text_num_lines(text);
-
-  if (nlines == 0) {
-    return 0;
-  }
-
-  for (uint32_t l = 0; l < line && l < nlines; ++l) {
-    // +1 for newline
-    byteoff += text_line_size(text, l) + 1;
-  }
-
-  uint32_t l = line < nlines ? line : nlines - 1;
-  uint32_t nchars = text_line_length(text, l);
-  uint32_t c = col < nchars ? col : nchars;
-  byteoff += text_col_to_byteindex(text, l, c);
-
-  if (col > nchars) {
-    // account for newline
-    ++byteoff;
-  }
-
-  return byteoff;
+struct utf8_codepoint_iterator
+text_chunk_codepoint_iterator(const struct text_chunk *chunk) {
+  return create_utf8_codepoint_iterator(chunk->text, chunk->nbytes, 0);
 }
 
 void append_empty_lines(struct text *text, uint32_t numlines) {
@@ -145,17 +99,10 @@ void append_empty_lines(struct text *text, uint32_t numlines) {
     struct line *nline = &text->lines[text->nlines];
     nline->data = NULL;
     nline->nbytes = 0;
-    nline->nchars = 0;
     nline->flags = 0;
 
     ++text->nlines;
   }
-
-  if (text->nlines > text->capacity) {
-    printf("text->nlines: %d, text->capacity: %d\n", text->nlines,
-           text->capacity);
-    raise(SIGTRAP);
-  }
 }
 
 void ensure_line(struct text *text, uint32_t line) {
@@ -166,8 +113,8 @@ void ensure_line(struct text *text, uint32_t line) {
 
 // It is assumed that `data` does not contain any \n, that is handled by
 // higher-level functions
-void insert_at(struct text *text, uint32_t line, uint32_t col, uint8_t *data,
-               uint32_t len, uint32_t nchars) {
+static void insert_at(struct text *text, uint32_t line, uint32_t offset,
+                      uint8_t *data, uint32_t len) {
 
   if (len == 0) {
     return;
@@ -178,11 +125,10 @@ void insert_at(struct text *text, uint32_t line, uint32_t col, uint8_t *data,
   struct line *l = &text->lines[line];
 
   l->nbytes += len;
-  l->nchars += nchars;
   l->flags = LineChanged;
   l->data = realloc(l->data, l->nbytes);
 
-  uint32_t bytei = charidx_to_byteidx(l, col);
+  uint32_t bytei = offset;
 
   // move following bytes out of the way
   if (bytei + len < l->nbytes) {
@@ -194,15 +140,7 @@ void insert_at(struct text *text, uint32_t line, uint32_t col, uint8_t *data,
   memcpy(l->data + bytei, data, len);
 }
 
-uint32_t text_line_length(struct text *text, uint32_t lineidx) {
-  if (lineidx >= text_num_lines(text)) {
-    return 0;
-  }
-
-  return text->lines[lineidx].nchars;
-}
-
-uint32_t text_line_size(struct text *text, uint32_t lineidx) {
+uint32_t text_line_size(const struct text *text, uint32_t lineidx) {
   if (lineidx >= text_num_lines(text)) {
     return 0;
   }
@@ -210,20 +148,19 @@ uint32_t text_line_size(struct text *text, uint32_t lineidx) {
   return text->lines[lineidx].nbytes;
 }
 
-uint32_t text_num_lines(struct text *text) { return text->nlines; }
+uint32_t text_num_lines(const struct text *text) { return text->nlines; }
+
+static void split_line(struct text *text, uint32_t offset, uint32_t lineidx,
+                       uint32_t newlineidx) {
+  struct line *line = &text->lines[lineidx];
+  struct line *next = &text->lines[newlineidx];
 
-void split_line(uint32_t col, struct line *line, struct line *next) {
   uint8_t *data = line->data;
   uint32_t nbytes = line->nbytes;
-  uint32_t nchars = line->nchars;
-
-  uint32_t chari = col;
-  uint32_t bytei = charidx_to_byteidx(line, chari);
+  uint32_t bytei = offset;
 
   line->nbytes = bytei;
-  line->nchars = chari;
   next->nbytes = nbytes - bytei;
-  next->nchars = nchars - chari;
   line->flags = next->flags = line->flags;
 
   next->data = NULL;
@@ -260,7 +197,7 @@ void shift_lines(struct text *text, uint32_t start, int32_t direction) {
   memmove(dest, src, nlines * sizeof(struct line));
 }
 
-void new_line_at(struct text *text, uint32_t line, uint32_t col) {
+void new_line_at(struct text *text, uint32_t line, uint32_t offset) {
   ensure_line(text, line);
 
   uint32_t newline = line + 1;
@@ -274,7 +211,7 @@ void new_line_at(struct text *text, uint32_t line, uint32_t col) {
   }
 
   // split line if needed
-  split_line(col, &text->lines[line], &text->lines[newline]);
+  split_line(text, offset, line, newline);
 }
 
 void delete_line(struct text *text, uint32_t line) {
@@ -294,29 +231,25 @@ void delete_line(struct text *text, uint32_t line) {
   --text->nlines;
   text->lines[text->nlines].data = NULL;
   text->lines[text->nlines].nbytes = 0;
-  text->lines[text->nlines].nchars = 0;
 }
 
-void text_insert_at_inner(struct text *text, uint32_t line, uint32_t col,
-                          uint8_t *bytes, uint32_t nbytes,
-                          uint32_t *lines_added, uint32_t *cols_added) {
+static void text_insert_at_inner(struct text *text, uint32_t line,
+                                 uint32_t offset, uint8_t *bytes,
+                                 uint32_t nbytes, uint32_t *lines_added) {
   uint32_t linelen = 0, start_line = line;
 
-  *cols_added = 0;
   for (uint32_t bytei = 0; bytei < nbytes; ++bytei) {
     uint8_t byte = bytes[bytei];
     if (byte == '\n') {
       uint8_t *line_data = bytes + (bytei - linelen);
-      uint32_t nchars = utf8_nchars(line_data, linelen);
+      insert_at(text, line, offset, line_data, linelen);
 
-      insert_at(text, line, col, line_data, linelen, nchars);
-
-      col += nchars;
-      new_line_at(text, line, col);
+      offset += linelen;
+      new_line_at(text, line, offset);
 
       ++line;
       linelen = 0;
-      col = 0;
+      offset = 0;
     } else {
       ++linelen;
     }
@@ -325,30 +258,26 @@ void text_insert_at_inner(struct text *text, uint32_t line, uint32_t col,
   // handle remaining
   if (linelen > 0) {
     uint8_t *line_data = bytes + (nbytes - linelen);
-    uint32_t nchars = utf8_nchars(line_data, linelen);
-    insert_at(text, line, col, line_data, linelen, nchars);
-    *cols_added = nchars;
+    insert_at(text, line, offset, line_data, linelen);
   }
 
   *lines_added = line - start_line;
 }
 
 void text_append(struct text *text, uint8_t *bytes, uint32_t nbytes,
-                 uint32_t *lines_added, uint32_t *cols_added) {
+                 uint32_t *lines_added) {
   uint32_t line = text->nlines > 0 ? text->nlines - 1 : 0;
-  uint32_t col = text_line_length(text, line);
-
-  text_insert_at_inner(text, line, col, bytes, nbytes, lines_added, cols_added);
+  uint32_t offset = text_line_size(text, line);
+  text_insert_at_inner(text, line, offset, bytes, nbytes, lines_added);
 }
 
-void text_insert_at(struct text *text, uint32_t line, uint32_t col,
-                    uint8_t *bytes, uint32_t nbytes, uint32_t *lines_added,
-                    uint32_t *cols_added) {
-  text_insert_at_inner(text, line, col, bytes, nbytes, lines_added, cols_added);
+void text_insert_at(struct text *text, uint32_t line, uint32_t offset,
+                    uint8_t *bytes, uint32_t nbytes, uint32_t *lines_added) {
+  text_insert_at_inner(text, line, offset, bytes, nbytes, lines_added);
 }
 
-void text_delete(struct text *text, uint32_t start_line, uint32_t start_col,
-                 uint32_t end_line, uint32_t end_col) {
+void text_delete(struct text *text, uint32_t start_line, uint32_t start_offset,
+                 uint32_t end_line, uint32_t end_offset) {
 
   if (text->nlines == 0) {
     return;
@@ -362,45 +291,44 @@ void text_delete(struct text *text, uint32_t start_line, uint32_t start_col,
 
   if (end_line > maxline) {
     end_line = maxline;
-    end_col = text->lines[end_line].nchars;
+    end_offset = text_line_size(text, end_line);
   }
 
   struct line *firstline = &text->lines[start_line];
   struct line *lastline = &text->lines[end_line];
 
   // clamp column
-  if (start_col > firstline->nchars) {
-    start_col = firstline->nchars > 0 ? firstline->nchars - 1 : 0;
+  uint32_t firstline_len = text_line_size(text, start_line);
+  if (start_offset > firstline_len) {
+    start_offset = firstline_len > 0 ? firstline_len - 1 : 0;
   }
 
   // handle deletion of newlines
-  if (end_col > lastline->nchars) {
+  uint32_t lastline_len = text_line_size(text, end_line);
+  if (end_offset > lastline_len) {
     if (end_line + 1 < text->nlines) {
-      end_col = 0;
+      end_offset = 0;
       ++end_line;
       lastline = &text->lines[end_line];
     } else {
-      end_col = lastline->nchars;
+      end_offset = lastline_len;
     }
   }
 
-  uint32_t bytei = utf8_nbytes(lastline->data, lastline->nbytes, end_col);
+  uint32_t srcbytei = end_offset;
+  uint32_t dstbytei = start_offset;
+  uint32_t ncopy = lastline->nbytes - srcbytei;
   if (lastline == firstline) {
     // in this case we can "overwrite"
-    uint32_t dstbytei =
-        utf8_nbytes(firstline->data, firstline->nbytes, start_col);
-    memmove(firstline->data + dstbytei, lastline->data + bytei,
-            lastline->nbytes - bytei);
+    memmove(firstline->data + dstbytei, lastline->data + srcbytei, ncopy);
   } else {
     // otherwise we actually have to copy from the last line
-    insert_at(text, start_line, start_col, lastline->data + bytei,
-              lastline->nbytes - bytei, lastline->nchars - end_col);
+    insert_at(text, start_line, start_offset, lastline->data + srcbytei, ncopy);
   }
 
-  firstline->nchars = start_col + (lastline->nchars - end_col);
-  firstline->nbytes =
-      utf8_nbytes(firstline->data, firstline->nbytes, start_col) +
-      (lastline->nbytes - bytei);
+  // new byte count is whatever we had before (left of dstbytei)
+  // plus what we copied
+  firstline->nbytes = dstbytei + ncopy;
 
   // delete full lines, backwards to not shift old, crappy data upwards
   for (uint32_t linei = end_line >= text->nlines ? end_line - 1 : end_line;
@@ -429,7 +357,6 @@ void text_for_each_line(struct text *text, uint32_t line, uint32_t nlines,
         .allocated = false,
         .text = src_line->data,
         .nbytes = src_line->nbytes,
-        .nchars = src_line->nchars,
         .line = li,
     };
     callback(&line, userdata);
@@ -441,8 +368,8 @@ struct text_chunk text_get_line(struct text *text, uint32_t line) {
   return (struct text_chunk){
       .text = src_line->data,
       .nbytes = src_line->nbytes,
-      .nchars = src_line->nchars,
       .line = line,
+      .allocated = false,
   };
 }
 
@@ -453,33 +380,34 @@ struct copy_cmd {
 };
 
 struct text_chunk text_get_region(struct text *text, uint32_t start_line,
-                                  uint32_t start_col, uint32_t end_line,
-                                  uint32_t end_col) {
-  if (start_line == end_line && start_col == end_col) {
+                                  uint32_t start_offset, uint32_t end_line,
+                                  uint32_t end_offset) {
+  if (start_line == end_line && start_offset == end_offset) {
     return (struct text_chunk){0};
   }
 
   struct line *first_line = &text->lines[start_line];
   struct line *last_line = &text->lines[end_line];
+  uint32_t first_line_len = first_line->nbytes;
+  uint32_t last_line_len = last_line->nbytes;
 
-  if (start_col > first_line->nchars) {
+  if (start_offset > first_line_len) {
     return (struct text_chunk){0};
   }
 
   // handle copying of newlines
-  if (end_col > last_line->nchars) {
+  if (end_offset > last_line_len) {
     ++end_line;
-    end_col = 0;
+    end_offset = 0;
     last_line = &text->lines[end_line];
   }
 
   uint32_t nlines = end_line - start_line + 1;
   struct copy_cmd *copy_cmds = calloc(nlines, sizeof(struct copy_cmd));
 
-  uint32_t total_chars = 0, total_bytes = 0;
+  uint32_t total_bytes = 0;
   for (uint32_t line = start_line; line <= end_line; ++line) {
     struct line *l = &text->lines[line];
-    total_chars += l->nchars;
     total_bytes += l->nbytes;
 
     struct copy_cmd *cmd = &copy_cmds[line - start_line];
@@ -490,19 +418,14 @@ struct text_chunk text_get_region(struct text *text, uint32_t start_line,
 
   // correct first line
   struct copy_cmd *cmd_first = &copy_cmds[0];
-  uint32_t byteoff =
-      utf8_nbytes(first_line->data, first_line->nbytes, start_col);
-  cmd_first->byteoffset += byteoff;
-  cmd_first->nbytes -= byteoff;
-  total_bytes -= byteoff;
-  total_chars -= start_col;
+  cmd_first->byteoffset += start_offset;
+  cmd_first->nbytes -= start_offset;
+  total_bytes -= start_offset;
 
   // correct last line
   struct copy_cmd *cmd_last = &copy_cmds[nlines - 1];
-  uint32_t byteindex = utf8_nbytes(last_line->data, last_line->nbytes, end_col);
-  cmd_last->nbytes -= (last_line->nbytes - byteindex);
-  total_bytes -= (last_line->nbytes - byteindex);
-  total_chars -= (last_line->nchars - end_col);
+  cmd_last->nbytes -= (last_line->nbytes - end_offset);
+  total_bytes -= (last_line->nbytes - end_offset);
 
   uint8_t *data = (uint8_t *)malloc(
       total_bytes + /* nr of newline chars */ (end_line - start_line));
@@ -518,7 +441,6 @@ struct text_chunk text_get_region(struct text *text, uint32_t start_line,
       data[curr] = '\n';
       ++curr;
       ++total_bytes;
-      ++total_chars;
     }
   }
 
@@ -527,28 +449,25 @@ struct text_chunk text_get_region(struct text *text, uint32_t start_line,
       .text = data,
       .line = 0,
       .nbytes = total_bytes,
-      .nchars = total_chars,
       .allocated = true,
   };
 }
 
-bool text_line_contains_unicode(struct text *text, uint32_t line) {
-  return text->lines[line].nbytes != text->lines[line].nchars;
-}
-
-void text_add_property(struct text *text, struct location start,
-                       struct location end, struct text_property property) {
+void text_add_property(struct text *text, uint32_t start_line,
+                       uint32_t start_offset, uint32_t end_line,
+                       uint32_t end_offset, struct text_property property) {
   struct text_property_entry entry = {
-      .start = start,
-      .end = end,
+      .start = (struct location){.line = start_line, .col = start_offset},
+      .end = (struct location){.line = end_line, .col = end_offset},
       .property = property,
   };
   VEC_PUSH(&text->properties, entry);
 }
 
-void text_get_properties(struct text *text, struct location location,
+void text_get_properties(struct text *text, uint32_t line, uint32_t offset,
                          struct text_property **properties,
                          uint32_t max_nproperties, uint32_t *nproperties) {
+  struct location location = {.line = line, .col = offset};
   uint32_t nres = 0;
   VEC_FOR_EACH(&text->properties, struct text_property_entry * prop) {
     if (location_is_between(location, prop->start, prop->end)) {
diff --git a/src/dged/text.h b/src/dged/text.h
index 8b49ef4..28bd325 100644
--- a/src/dged/text.h
+++ b/src/dged/text.h
@@ -6,9 +6,16 @@
 #include <stdint.h>
 
 #include "location.h"
+#include "utf8.h"
 
 struct text;
-struct render_command;
+
+struct text_chunk {
+  uint8_t *text;
+  uint32_t nbytes;
+  uint32_t line;
+  bool allocated;
+};
 
 struct text *text_create(uint32_t initial_capacity);
 void text_destroy(struct text *text);
@@ -18,31 +25,21 @@ void text_destroy(struct text *text);
  */
 void text_clear(struct text *text);
 
-void text_insert_at(struct text *text, uint32_t line, uint32_t col,
-                    uint8_t *bytes, uint32_t nbytes, uint32_t *lines_added,
-                    uint32_t *cols_added);
+void text_insert_at(struct text *text, uint32_t line, uint32_t offset,
+                    uint8_t *bytes, uint32_t nbytes, uint32_t *lines_added);
 
 void text_append(struct text *text, uint8_t *bytes, uint32_t nbytes,
-                 uint32_t *lines_added, uint32_t *cols_added);
+                 uint32_t *lines_added);
 
-void text_delete(struct text *text, uint32_t start_line, uint32_t start_col,
-                 uint32_t end_line, uint32_t end_col);
+void text_delete(struct text *text, uint32_t start_line, uint32_t start_offset,
+                 uint32_t end_line, uint32_t end_offset);
 
-uint32_t text_num_lines(struct text *text);
-uint32_t text_line_length(struct text *text, uint32_t lineidx);
-uint32_t text_line_size(struct text *text, uint32_t lineidx);
-uint32_t text_col_to_byteindex(struct text *text, uint32_t line, uint32_t col);
-uint32_t text_byteindex_to_col(struct text *text, uint32_t line,
-                               uint32_t byteindex);
-uint32_t text_global_idx(struct text *text, uint32_t line, uint32_t col);
-
-struct text_chunk {
-  uint8_t *text;
-  uint32_t nbytes;
-  uint32_t nchars;
-  uint32_t line;
-  bool allocated;
-};
+uint32_t text_num_lines(const struct text *text);
+uint32_t text_line_size(const struct text *text, uint32_t lineidx);
+struct utf8_codepoint_iterator
+text_line_codepoint_iterator(const struct text *text, uint32_t lineidx);
+struct utf8_codepoint_iterator
+text_chunk_codepoint_iterator(const struct text_chunk *chunk);
 
 typedef void (*chunk_cb)(struct text_chunk *chunk, void *userdata);
 void text_for_each_line(struct text *text, uint32_t line, uint32_t nlines,
@@ -52,10 +49,8 @@ void text_for_each_chunk(struct text *text, chunk_cb callback, void *userdata);
 
 struct text_chunk text_get_line(struct text *text, uint32_t line);
 struct text_chunk text_get_region(struct text *text, uint32_t start_line,
-                                  uint32_t start_col, uint32_t end_line,
-                                  uint32_t end_col);
-
-bool text_line_contains_unicode(struct text *text, uint32_t line);
+                                  uint32_t start_offset, uint32_t end_line,
+                                  uint32_t end_offset);
 
 enum text_property_type {
   TextProperty_Colors,
@@ -77,10 +72,11 @@ struct text_property {
   };
 };
 
-void text_add_property(struct text *text, struct location start,
-                       struct location end, struct text_property property);
+void text_add_property(struct text *text, uint32_t start_line,
+                       uint32_t start_offset, uint32_t end_line,
+                       uint32_t end_offset, struct text_property property);
 
-void text_get_properties(struct text *text, struct location location,
+void text_get_properties(struct text *text, uint32_t line, uint32_t offset,
                          struct text_property **properties,
                          uint32_t max_nproperties, uint32_t *nproperties);
 
diff --git a/src/dged/utf8.c b/src/dged/utf8.c
index 52de2da..ede4fb1 100644
--- a/src/dged/utf8.c
+++ b/src/dged/utf8.c
@@ -1,5 +1,6 @@
 #include "utf8.h"
 
+#include <assert.h>
 #include <stdio.h>
 #include <wchar.h>
 
@@ -10,76 +11,125 @@ bool utf8_byte_is_unicode_continuation(uint8_t byte) {
 bool utf8_byte_is_unicode(uint8_t byte) { return (byte & 0x80) != 0x0; }
 bool utf8_byte_is_ascii(uint8_t byte) { return !utf8_byte_is_unicode(byte); }
 
-uint32_t utf8_nbytes_in_char(uint8_t byte) {
-  // length of char is the number of leading ones
-  // flip it and count number of leading zeros
-  uint8_t invb = ~byte;
-  return __builtin_clz((uint32_t)invb) - 24;
+enum utf8_state {
+  Utf8_Accept = 0, // a complete codepoint has been decoded
+  Utf8_Reject = 1, // the bytes seen do not form valid utf-8
+};
+
+// clang-format off
+static const uint8_t utf8d[] = {
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+// clang-format on
+
+/*
+ * utf-8 decoding algorithm (table-driven DFA) from
+ * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+ */
+static enum utf8_state decode(enum utf8_state *state, uint32_t *codep,
+                              uint32_t byte) {
+  uint32_t type = utf8d[byte]; // entries 0..255 map a byte to its class
+  // mid-sequence: append the low 6 bits; otherwise mask the byte by class
+  *codep = (*state != Utf8_Accept) ? (byte & 0x3fu) | (*codep << 6)
+                                   : (0xff >> type) & (byte);
+  // entries from 256 on hold the transitions, 16 slots per state
+  *state = utf8d[256 + *state * 16 + type];
+  return *state;
+}
+
+static struct codepoint next_utf8_codepoint(uint8_t *bytes, uint64_t nbytes) {
+  uint32_t codepoint = 0;
+  enum utf8_state state = Utf8_Accept;
+  uint32_t bi = 0; // number of bytes consumed so far
+  while (bi < nbytes) {
+    enum utf8_state res = decode(&state, &codepoint, bytes[bi]);
+    ++bi;
+    // stop at the first byte that completes or invalidates the sequence
+    if (res == Utf8_Accept || res == Utf8_Reject) {
+      break;
+    }
+  }
+  // invalid or truncated (input ended mid-sequence): use U+FFFD instead
+  if (state != Utf8_Accept) {
+    codepoint = 0xfffd;
+  }
+  // with no input at all this yields codepoint 0 and nbytes 0
+  return (struct codepoint){.codepoint = codepoint, .nbytes = bi};
 }
 
-// TODO: grapheme clusters, this returns the number of unicode code points
+struct codepoint *utf8_next_codepoint(struct utf8_codepoint_iterator *iter) {
+  if (iter->offset >= iter->nbytes) {
+    return NULL; // no bytes left to decode
+  }
+
+  iter->current = next_utf8_codepoint(iter->data + iter->offset,
+                                      iter->nbytes - iter->offset);
+  iter->offset += iter->current.nbytes;
+  return &iter->current; // points into `iter`; overwritten by the next call
+}
+
+struct utf8_codepoint_iterator
+create_utf8_codepoint_iterator(uint8_t *data, uint64_t len,
+                               uint64_t initial_offset) {
+  return (struct utf8_codepoint_iterator){
+      .data = data,
+      .nbytes = len,
+      .offset = initial_offset,
+  };
+}
+
+/* TODO: grapheme clusters and other classification, this
+ * returns the number of unicode code points
+ */
 uint32_t utf8_nchars(uint8_t *bytes, uint32_t nbytes) {
+  uint32_t bi = 0;
   uint32_t nchars = 0;
-  uint32_t expected = 0;
-  for (uint32_t bi = 0; bi < nbytes; ++bi) {
-    uint8_t byte = bytes[bi];
-    if (utf8_byte_is_unicode(byte)) {
-      if (utf8_byte_is_unicode_start(byte)) {
-        expected = utf8_nbytes_in_char(byte) - 1;
-      } else { // continuation byte
-        --expected;
-        if (expected == 0) {
-          ++nchars;
-        }
-      }
-    } else { // ascii
-      ++nchars;
-    }
+  while (bi < nbytes) {
+    struct codepoint codepoint = next_utf8_codepoint(bytes + bi, nbytes - bi);
+    ++nchars;
+    bi += codepoint.nbytes;
   }
+
   return nchars;
 }
 
-// TODO: grapheme clusters, this uses the number of unicode code points
+/* TODO: grapheme clusters and other classification, this
+ * returns the number of bytes spanned by `nchars` unicode code points
+ */
 uint32_t utf8_nbytes(uint8_t *bytes, uint32_t nbytes, uint32_t nchars) {
-
   uint32_t bi = 0;
   uint32_t chars = 0;
   uint32_t expected = 0;
 
   while (chars < nchars && bi < nbytes) {
-    uint8_t byte = bytes[bi];
-    if (utf8_byte_is_unicode(byte)) {
-      if (utf8_byte_is_unicode_start(byte)) {
-        expected = utf8_nbytes_in_char(byte) - 1;
-      } else { // continuation char
-        --expected;
-        if (expected == 0) {
-          ++chars;
-        }
-      }
-    } else { // ascii
-      ++chars;
-    }
-
-    ++bi;
+    struct codepoint codepoint = next_utf8_codepoint(bytes + bi, nbytes - bi);
+    bi += codepoint.nbytes;
+    ++chars;
   }
 
+  // TODO: reject invalid?
   return bi;
 }
 
-uint32_t utf8_visual_char_width(uint8_t *bytes, uint32_t len) {
-  if (utf8_byte_is_unicode_start(*bytes)) {
-    wchar_t wc;
-    size_t nbytes = 0;
-    if ((nbytes = mbrtowc(&wc, (char *)bytes, len, NULL)) > 0) {
-      size_t w = wcwidth(wc);
-      return w > 0 ? w : 2;
-    } else {
-      return 1;
-    }
-  } else if (utf8_byte_is_unicode_continuation(*bytes)) {
-    return 0;
+uint32_t unicode_visual_char_width(const struct codepoint *codepoint) {
+  if (codepoint->nbytes > 0) {
+    // TODO: use unicode classification instead
+    int w = wcwidth(codepoint->codepoint); // signed: -1 means not printable
+    return w >= 0 ? (uint32_t)w : 2;
   } else {
-    return 1;
+    return 0;
   }
 }
diff --git a/src/dged/utf8.h b/src/dged/utf8.h
index 04aa242..22ce22d 100644
--- a/src/dged/utf8.h
+++ b/src/dged/utf8.h
@@ -1,19 +1,37 @@
+#ifndef _UTF8_H
+#define _UTF8_H
+
 #include <stdbool.h>
 #include <stdint.h>
 
+struct codepoint {
+  uint32_t codepoint; // decoded code point value (0xfffd if invalid)
+  uint32_t nbytes;    // number of bytes consumed while decoding it
+};
+
+struct utf8_codepoint_iterator {
+  uint8_t *data;            // bytes being iterated over
+  uint64_t nbytes;          // length of `data` in bytes
+  uint64_t offset;          // byte offset where the next decode starts
+  struct codepoint current; // codepoint most recently decoded
+};
+
+struct utf8_codepoint_iterator
+create_utf8_codepoint_iterator(uint8_t *data, uint64_t len,
+                               uint64_t initial_offset);
+struct codepoint *utf8_next_codepoint(struct utf8_codepoint_iterator *iter);
+
 /*!
  * \brief Return the number of chars the utf-8 sequence pointed at by `bytes` of
  * length `nbytes`, represents
  */
 uint32_t utf8_nchars(uint8_t *bytes, uint32_t nbytes);
 
-/* Return the number of bytes used to make up the next `nchars` characters */
-uint32_t utf8_nbytes(uint8_t *bytes, uint32_t nbytes, uint32_t nchars);
+uint32_t unicode_visual_char_width(const struct codepoint *codepoint);
 
-/* true if `byte` is a unicode byte sequence start byte */
 bool utf8_byte_is_unicode_start(uint8_t byte);
 bool utf8_byte_is_unicode_continuation(uint8_t byte);
-bool utf8_byte_is_ascii(uint8_t byte);
 bool utf8_byte_is_unicode(uint8_t byte);
+bool utf8_byte_is_ascii(uint8_t byte);
 
-uint32_t utf8_visual_char_width(uint8_t *bytes, uint32_t len);
+#endif