cmark

My personal build of CMark ✏️

Commit
b0a4cfa36e99c27dd2b20be8f8888fa7721bad58
Parent
75b48c5938f5984dbcf79a579d15c9cbd6447d12
Author
Nick Wellnhofer <wellnhofer@aevum.de>
Date

Use C string instead of chunk for literal text

Use zero-terminated C strings and a separate length field instead of cmark_chunks. Literal inline text will now be copied from the parent block's content buffer, slowing the benchmark down by 10-15%.

The node struct never references memory of other nodes now, fixing #309. Node accessors don't have to check for delayed creation of C strings, so parsing and iterating all literals using the public API should actually be faster than before.

Diffstat

7 files changed, 71 insertions, 33 deletions

Status File Name N° Changes Insertions Deletions
Modified api_test/main.c 19 16 3
Modified src/blocks.c 3 2 1
Modified src/commonmark.c 3 1 2
Modified src/inlines.c 59 39 20
Modified src/iterator.c 5 3 2
Modified src/node.c 7 4 3
Modified src/node.h 8 6 2
diff --git a/api_test/main.c b/api_test/main.c
@@ -243,6 +243,21 @@ static void accessors(test_batch_runner *runner) {
   cmark_node_free(doc);
 }
 
+// Checks that an inline node unlinked from the tree still returns its
+// literal text after the rest of the document has been freed.
+static void free_parent(test_batch_runner *runner) {
+  static const char source[] = "text\n";
+
+  // sizeof - 1 drops the terminating zero of the string literal.
+  cmark_node *root =
+      cmark_parse_document(source, sizeof(source) - 1, CMARK_OPT_DEFAULT);
+
+  // First child of the document's first child.
+  cmark_node *text = cmark_node_first_child(cmark_node_first_child(root));
+
+  // Detach the inline before the tree it came from is destroyed.
+  cmark_node_unlink(text);
+  cmark_node_free(root);
+
+  STR_EQ(runner, cmark_node_get_literal(text), "text",
+         "inline content after freeing parent block");
+  cmark_node_free(text);
+}
+
 static void node_check(test_batch_runner *runner) {
   // Construct an incomplete tree.
   cmark_node *doc = cmark_node_new(CMARK_NODE_DOCUMENT);
@@ -381,9 +396,6 @@ static void create_tree(test_batch_runner *runner) {
   free(html);
 
   cmark_node_free(doc);
-
-  // TODO: Test that the contents of an unlinked inline are valid
-  // after the parent block was destroyed. This doesn't work so far.
   cmark_node_free(emph);
 }
 
@@ -1031,6 +1043,7 @@ int main() {
   version(runner);
   constructor(runner);
   accessors(runner);
+  free_parent(runner);
   node_check(runner);
   iterator(runner);
   iterator_delete(runner);
diff --git a/src/blocks.c b/src/blocks.c
@@ -322,7 +322,8 @@ static cmark_node *finalize(cmark_parser *parser, cmark_node *b) {
     break;
 
   case CMARK_NODE_HTML_BLOCK:
-    b->as.literal = cmark_chunk_buf_detach(node_content);
+    b->as.literal.len = node_content->size;
+    b->as.literal.data = cmark_strbuf_detach(node_content);
     break;
 
   case CMARK_NODE_LIST:      // determine tight/loose status
diff --git a/src/commonmark.c b/src/commonmark.c
@@ -146,8 +146,7 @@ static bool is_autolink(cmark_node *node) {
   if (strcmp((const char *)url, "mailto:") == 0) {
     url += 7;
   }
-  return strncmp((const char *)url, (char *)link_text->as.literal.data,
-                 link_text->as.literal.len) == 0;
+  return strcmp((const char *)url, (char *)link_text->as.literal.data) == 0;
 }
 
 // if node is a block node, returns node.
diff --git a/src/inlines.c b/src/inlines.c
@@ -22,9 +22,6 @@ static const char *LEFTSINGLEQUOTE = "\xE2\x80\x98";
 static const char *RIGHTSINGLEQUOTE = "\xE2\x80\x99";
 
 // Macros for creating various kinds of simple.
-#define make_str(subj, sc, ec, s) make_literal(subj, CMARK_NODE_TEXT, sc, ec, s)
-#define make_code(subj, sc, ec, s) make_literal(subj, CMARK_NODE_CODE, sc, ec, s)
-#define make_raw_html(subj, sc, ec, s) make_literal(subj, CMARK_NODE_HTML_INLINE, sc, ec, s)
 #define make_linebreak(mem) make_simple(mem, CMARK_NODE_LINEBREAK)
 #define make_softbreak(mem) make_simple(mem, CMARK_NODE_SOFTBREAK)
 #define make_emph(mem) make_simple(mem, CMARK_NODE_EMPH)
@@ -81,12 +78,10 @@ static bufsize_t subject_find_special_char(subject *subj, int options);
 
 // Create an inline with a literal string value.
 static CMARK_INLINE cmark_node *make_literal(subject *subj, cmark_node_type t,
-                                             int start_column, int end_column,
-                                             cmark_chunk s) {
+                                             int start_column, int end_column) {
   cmark_node *e = (cmark_node *)subj->mem->calloc(1, sizeof(*e));
   cmark_strbuf_init(subj->mem, &e->content, 0);
   e->type = (uint16_t)t;
-  e->as.literal = s;
   e->start_line = e->end_line = subj->line;
   // columns are 1 based.
   e->start_column = start_column + 1 + subj->column_offset + subj->block_offset;
@@ -102,6 +97,23 @@ static CMARK_INLINE cmark_node *make_simple(cmark_mem *mem, cmark_node_type t) {
   return e;
 }
 
+// Create a text inline holding its own zero-terminated copy of the bytes
+// in chunk `s`. `sc` and `ec` are the start and end columns handed on to
+// make_literal. The copy is allocated with the subject's allocator.
+static cmark_node *make_str(subject *subj, int sc, int ec, cmark_chunk s) {
+  cmark_node *e = make_literal(subj, CMARK_NODE_TEXT, sc, ec);
+  e->as.literal.data = (unsigned char *)subj->mem->realloc(NULL, s.len + 1);
+  // An empty chunk may carry a NULL data pointer. Passing NULL to memcpy
+  // is undefined behavior even when the length is zero, so skip the copy.
+  if (s.data != NULL) {
+    memcpy(e->as.literal.data, s.data, s.len);
+  }
+  e->as.literal.data[s.len] = 0;
+  e->as.literal.len = s.len;
+  return e;
+}
+
+// Create a text inline whose literal is the data detached from `buf`.
+// `sc` and `ec` are the start and end columns handed on to make_literal.
+static cmark_node *make_str_from_buf(subject *subj, int sc, int ec,
+                                     cmark_strbuf *buf) {
+  cmark_node *node = make_literal(subj, CMARK_NODE_TEXT, sc, ec);
+  // Capture the size before detaching.
+  // NOTE(review): cmark_strbuf_detach presumably resets the buffer — confirm.
+  const bufsize_t text_len = buf->size;
+  node->as.literal.data = cmark_strbuf_detach(buf);
+  node->as.literal.len = text_len;
+  return node;
+}
+
 // Like make_str, but parses entities.
 static cmark_node *make_str_with_entities(subject *subj,
                                           int start_column, int end_column,
@@ -109,7 +121,7 @@ static cmark_node *make_str_with_entities(subject *subj,
   cmark_strbuf unescaped = CMARK_BUF_INIT(subj->mem);
 
   if (houdini_unescape_html(&unescaped, content->data, content->len)) {
-    return make_str(subj, start_column, end_column, cmark_chunk_buf_detach(&unescaped));
+    return make_str_from_buf(subj, start_column, end_column, &unescaped);
   } else {
     return make_str(subj, start_column, end_column, *content);
   }
@@ -368,7 +380,10 @@ static cmark_node *handle_backticks(subject *subj, int options) {
                      endpos - startpos - openticks.len);
     S_normalize_code(&buf);
 
-    cmark_node *node = make_code(subj, startpos, endpos - openticks.len - 1, cmark_chunk_buf_detach(&buf));
+    cmark_node *node = make_literal(subj, CMARK_NODE_CODE, startpos,
+                                    endpos - openticks.len - 1);
+    node->as.literal.len = buf.size;
+    node->as.literal.data = cmark_strbuf_detach(&buf);
     adjust_subj_node_newlines(subj, node, endpos - startpos, openticks.len, options);
     return node;
   }
@@ -579,7 +594,7 @@ static cmark_node *handle_hyphen(subject *subj, bool smart) {
     cmark_strbuf_puts(&buf, ENDASH);
   }
 
-  return make_str(subj, startpos, subj->pos - 1, cmark_chunk_buf_detach(&buf));
+  return make_str_from_buf(subj, startpos, subj->pos - 1, &buf);
 }
 
 // Assumes we have a period at the current position.
@@ -656,19 +671,15 @@ static void process_emphasis(subject *subj, delimiter *stack_bottom) {
           closer = closer->next;
         }
       } else if (closer->delim_char == '\'') {
-        cmark_chunk_free(subj->mem, &closer->inl_text->as.literal);
-        closer->inl_text->as.literal = cmark_chunk_literal(RIGHTSINGLEQUOTE);
+        cmark_node_set_literal(closer->inl_text, RIGHTSINGLEQUOTE);
         if (opener_found) {
-          cmark_chunk_free(subj->mem, &opener->inl_text->as.literal);
-          opener->inl_text->as.literal = cmark_chunk_literal(LEFTSINGLEQUOTE);
+          cmark_node_set_literal(opener->inl_text, LEFTSINGLEQUOTE);
         }
         closer = closer->next;
       } else if (closer->delim_char == '"') {
-        cmark_chunk_free(subj->mem, &closer->inl_text->as.literal);
-        closer->inl_text->as.literal = cmark_chunk_literal(RIGHTDOUBLEQUOTE);
+        cmark_node_set_literal(closer->inl_text, RIGHTDOUBLEQUOTE);
         if (opener_found) {
-          cmark_chunk_free(subj->mem, &opener->inl_text->as.literal);
-          opener->inl_text->as.literal = cmark_chunk_literal(LEFTDOUBLEQUOTE);
+          cmark_node_set_literal(opener->inl_text, LEFTDOUBLEQUOTE);
         }
         closer = closer->next;
       }
@@ -709,7 +720,9 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener,
   opener_num_chars -= use_delims;
   closer_num_chars -= use_delims;
   opener_inl->as.literal.len = opener_num_chars;
+  opener_inl->as.literal.data[opener_num_chars] = 0;
   closer_inl->as.literal.len = closer_num_chars;
+  closer_inl->as.literal.data[closer_num_chars] = 0;
 
   // free delimiters between opener and closer
   delim = closer->previous;
@@ -785,7 +798,7 @@ static cmark_node *handle_entity(subject *subj) {
     return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("&"));
 
   subj->pos += len;
-  return make_str(subj, subj->pos - 1 - len, subj->pos - 1, cmark_chunk_buf_detach(&ent));
+  return make_str_from_buf(subj, subj->pos - 1 - len, subj->pos - 1, &ent);
 }
 
 // Clean a URL: remove surrounding whitespace, and remove \ that escape
@@ -853,9 +866,15 @@ static cmark_node *handle_pointy_brace(subject *subj, int options) {
   // finally, try to match an html tag
   matchlen = scan_html_tag(&subj->input, subj->pos);
   if (matchlen > 0) {
-    contents = cmark_chunk_dup(&subj->input, subj->pos - 1, matchlen + 1);
+    unsigned char *src = subj->input.data + subj->pos - 1;
+    bufsize_t len = matchlen + 1;
     subj->pos += matchlen;
-    cmark_node *node = make_raw_html(subj, subj->pos - matchlen - 1, subj->pos - 1, contents);
+    cmark_node *node = make_literal(subj, CMARK_NODE_HTML_INLINE,
+                                    subj->pos - matchlen - 1, subj->pos - 1);
+    node->as.literal.data = (unsigned char *)subj->mem->realloc(NULL, len + 1);
+    memcpy(node->as.literal.data, src, len);
+    node->as.literal.data[len] = 0;
+    node->as.literal.len = len;
     adjust_subj_node_newlines(subj, node, matchlen, 1, options);
     return node;
   }
diff --git a/src/iterator.c b/src/iterator.c
@@ -111,8 +111,9 @@ void cmark_consolidate_text_nodes(cmark_node *root) {
         cmark_node_free(tmp);
         tmp = next;
       }
-      cmark_chunk_free(iter->mem, &cur->as.literal);
-      cur->as.literal = cmark_chunk_buf_detach(&buf);
+      iter->mem->free(cur->as.literal.data);
+      cur->as.literal.len = buf.size;
+      cur->as.literal.data = cmark_strbuf_detach(&buf);
     }
   }
 
diff --git a/src/node.c b/src/node.c
@@ -116,7 +116,7 @@ static void S_free_nodes(cmark_node *e) {
     case CMARK_NODE_HTML_INLINE:
     case CMARK_NODE_CODE:
     case CMARK_NODE_HTML_BLOCK:
-      cmark_chunk_free(NODE_MEM(e), &e->as.literal);
+      NODE_MEM(e)->free(e->as.literal.data);
       break;
     case CMARK_NODE_LINK:
     case CMARK_NODE_IMAGE:
@@ -295,7 +295,7 @@ const char *cmark_node_get_literal(cmark_node *node) {
   case CMARK_NODE_TEXT:
   case CMARK_NODE_HTML_INLINE:
   case CMARK_NODE_CODE:
-    return cmark_chunk_to_cstr(NODE_MEM(node), &node->as.literal);
+    return node->as.literal.data ? (char *)node->as.literal.data : "";
 
   case CMARK_NODE_CODE_BLOCK:
     return (char *)node->as.code.literal;
@@ -317,7 +317,8 @@ int cmark_node_set_literal(cmark_node *node, const char *content) {
   case CMARK_NODE_TEXT:
   case CMARK_NODE_HTML_INLINE:
   case CMARK_NODE_CODE:
-    cmark_chunk_set_cstr(NODE_MEM(node), &node->as.literal, content);
+    node->as.literal.len = cmark_set_cstr(NODE_MEM(node),
+                                          &node->as.literal.data, content);
     return 1;
 
   case CMARK_NODE_CODE_BLOCK:
diff --git a/src/node.h b/src/node.h
@@ -10,7 +10,11 @@ extern "C" {
 
 #include "cmark.h"
 #include "buffer.h"
-#include "chunk.h"
+
+// Literal text owned by a node: a byte string plus an explicit length.
+typedef struct {
+  unsigned char *data; // string bytes; may be NULL, which cmark_node_get_literal reports as ""
+  bufsize_t len; // number of bytes in data — NOTE(review): appears to exclude the terminating zero; confirm
+} cmark_literal;
 
 typedef struct {
   cmark_list_type list_type;
@@ -72,7 +76,7 @@ struct cmark_node {
   uint16_t flags;
 
   union {
-    cmark_chunk literal;
+    cmark_literal literal;
     cmark_list list;
     cmark_code code;
     cmark_heading heading;