cmark

My personal build of CMark ✏️

Commit
2994011ce13f3a437c637dcac6bc841b22d80b6c
Parent
5acc7d48c73975769aeb4a5bc0f140d470d85a55
Author
Nick Wellnhofer <wellnhofer@aevum.de>
Date

Replace invalid characters in XML output

Control characters, U+FFFE and U+FFFF aren't allowed in XML 1.0, so replace them with U+FFFD (replacement character). This doesn't solve the problem of how to round-trip these characters, but at least we don't produce invalid XML. See #365.

Diffstat

2 files changed, 68 insertions, 6 deletions

Status File Name N° Changes Insertions Deletions
Modified api_test/main.c 13 11 2
Modified src/xml.c 61 57 4
diff --git a/api_test/main.c b/api_test/main.c
@@ -542,7 +542,10 @@ static void render_xml(test_batch_runner *runner) {
 
   static const char markdown[] = "foo *bar*\n"
                                  "\n"
-                                 "paragraph 2\n"
+                                 "control -\x0C-\n"
+                                 "fffe -\xEF\xBF\xBE-\n"
+                                 "ffff -\xEF\xBF\xBF-\n"
+                                 "escape <>&\"\n"
                                  "\n"
                                  "```\ncode\n```\n";
   cmark_node *doc =
@@ -559,7 +562,13 @@ static void render_xml(test_batch_runner *runner) {
                       "    </emph>\n"
                       "  </paragraph>\n"
                       "  <paragraph>\n"
-                      "    <text xml:space=\"preserve\">paragraph 2</text>\n"
+                      "    <text xml:space=\"preserve\">control -" UTF8_REPL "-</text>\n"
+                      "    <softbreak />\n"
+                      "    <text xml:space=\"preserve\">fffe -" UTF8_REPL "-</text>\n"
+                      "    <softbreak />\n"
+                      "    <text xml:space=\"preserve\">ffff -" UTF8_REPL "-</text>\n"
+                      "    <softbreak />\n"
+                      "    <text xml:space=\"preserve\">escape &lt;&gt;&amp;&quot;</text>\n"
                       "  </paragraph>\n"
                       "  <code_block xml:space=\"preserve\">code\n"
                       "</code_block>\n"
diff --git a/src/xml.c b/src/xml.c
@@ -7,16 +7,69 @@
 #include "cmark.h"
 #include "node.h"
 #include "buffer.h"
-#include "houdini.h"
 
 #define BUFFER_SIZE 100
 #define MAX_INDENT 40
 
 // Functions to convert cmark_nodes to XML strings.
 
-static void escape_xml(cmark_strbuf *dest, const unsigned char *source,
-                       bufsize_t length) {
-  houdini_escape_html0(dest, source, length, 0);
+// C0 control characters, U+FFFE and U+FFFF aren't allowed in XML.
+static const char XML_ESCAPE_TABLE[256] = {
+    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
+    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* 0x20 */ 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 5, 0,
+    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9,
+    /* 0xC0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0xD0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0xE0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0xF0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+// U+FFFD Replacement Character encoded in UTF-8
+#define UTF8_REPL "\xEF\xBF\xBD"
+
+static const char *XML_ESCAPES[] = {
+  "", UTF8_REPL, "&quot;", "&amp;", "&lt;", "&gt;"
+};
+
+static void escape_xml(cmark_strbuf *ob, const unsigned char *src,
+                       bufsize_t size) {
+  bufsize_t i = 0, org, esc = 0;
+
+  while (i < size) {
+    org = i;
+    while (i < size && (esc = XML_ESCAPE_TABLE[src[i]]) == 0)
+      i++;
+
+    if (i > org)
+      cmark_strbuf_put(ob, src + org, i - org);
+
+    if (i >= size)
+      break;
+
+    if (esc == 9) {
+      // To replace U+FFFE and U+FFFF with U+FFFD, only the last byte has to
+      // be changed.
+      // We know that src[i] is 0xBE or 0xBF.
+      if (i >= 2 && src[i-2] == 0xEF && src[i-1] == 0xBF) {
+        cmark_strbuf_putc(ob, 0xBD);
+      } else {
+        cmark_strbuf_putc(ob, src[i]);
+      }
+    } else {
+      cmark_strbuf_puts(ob, XML_ESCAPES[esc]);
+    }
+
+    i++;
+  }
 }
 
 static void escape_xml_str(cmark_strbuf *dest, const unsigned char *source) {