cmark

My personal build of CMark ✏️

Commit
01cb5c9563cc257e14a0093843d87621563d961f
Parent
1f8ea828409287b7901bf32d01f8ec662ffdc9ba
Author
John MacFarlane <jgm@berkeley.edu>
Date

Improved escaping in commonmark renderer.

We try not to escape punctuation unless we absolutely have to. So, `)` and `.` are no longer escaped whenever they occur after digits; now they are only escaped if they are genuinely in a position where they'd cause a list item.

This required a couple changes to render.c.

- `renderer->begin_content` is only set to false AFTER a string of digits at the beginning of the line. (This is slightly unprincipled.)
- We never break before a numeral (also slightly unprincipled).

Diffstat

2 files changed, 23 insertions, 6 deletions

Status File Name N° Changes Insertions Deletions
Modified src/commonmark.c 11 8 3
Modified src/render.c 18 15 3
diff --git a/src/commonmark.c b/src/commonmark.c
@@ -24,6 +24,8 @@ static inline void outc(cmark_renderer *renderer, cmark_escaping escape,
                         int32_t c, unsigned char nextc) {
   bool needs_escaping = false;
   char encoded[20];
+  bool follows_digit = renderer->buffer->size > 0 &&
+	  cmark_isdigit(renderer->buffer->ptr[renderer->buffer->size - 1]);
 
   needs_escaping =
       escape != LITERAL &&
@@ -31,9 +33,12 @@ static inline void outc(cmark_renderer *renderer, cmark_escaping escape,
         (c == '*' || c == '_' || c == '[' || c == ']' || c == '#' || c == '<' ||
          c == '>' || c == '\\' || c == '`' || c == '!' ||
          (c == '&' && isalpha(nextc)) || (c == '!' && nextc == '[') ||
-         (renderer->begin_content && (c == '-' || c == '+' || c == '=')) ||
-         ((c == '.' || c == ')') &&
-          isdigit(renderer->buffer->ptr[renderer->buffer->size - 1])))) ||
+         (renderer->begin_content && (c == '-' || c == '+' || c == '=') &&
+	  // begin_content doesn't get set to false til we've passed digits
+	  // at the beginning of line, so...
+	  !follows_digit) ||
+         (renderer->begin_content && (c == '.' || c == ')') && follows_digit &&
+	  (nextc == 0 || cmark_isspace(nextc))))) ||
        (escape == URL && (c == '`' || c == '<' || c == '>' || isspace(c) ||
                           c == '\\' || c == ')' || c == '(')) ||
        (escape == TITLE &&
diff --git a/src/render.c b/src/render.c
@@ -23,6 +23,7 @@ static void S_out(cmark_renderer *renderer, const char *source, bool wrap,
   unsigned char nextc;
   int32_t c;
   int i = 0;
+  int last_nonspace;
   int len;
   cmark_chunk remainder = cmark_chunk_literal("");
   int k = renderer->buffer->size - 1;
@@ -63,15 +64,20 @@ static void S_out(cmark_renderer *renderer, const char *source, bool wrap,
     nextc = source[i + len];
     if (c == 32 && wrap) {
       if (!renderer->begin_line) {
+	last_nonspace = renderer->buffer->size;
         cmark_strbuf_putc(renderer->buffer, ' ');
         renderer->column += 1;
         renderer->begin_line = false;
         renderer->begin_content = false;
-        renderer->last_breakable = renderer->buffer->size - 1;
         // skip following spaces
         while (source[i + 1] == ' ') {
           i++;
         }
+	// We don't allow breaks that make a digit the first character
+	// because this causes problems with commonmark output.
+	if (!cmark_isdigit(source[i + 1])) {
+          renderer->last_breakable = last_nonspace;
+	}
       }
 
     } else if (c == 10) {
@@ -83,11 +89,17 @@ static void S_out(cmark_renderer *renderer, const char *source, bool wrap,
     } else if (escape == LITERAL) {
       cmark_render_code_point(renderer, c);
       renderer->begin_line = false;
-      renderer->begin_content = false;
+      // we don't set 'begin_content' to false til we've
+      // finished parsing a digit.  Reason:  in commonmark
+      // we need to escape a potential list marker after
+      // a digit:
+      renderer->begin_content = renderer->begin_content &&
+	                          cmark_isdigit(c) == 1;
     } else {
       (renderer->outc)(renderer, escape, c, nextc);
       renderer->begin_line = false;
-      renderer->begin_content = false;
+      renderer->begin_content = renderer->begin_content &&
+	                          cmark_isdigit(c) == 1;
     }
 
     // If adding the character went beyond width, look for an