cmark
My personal build of CMark ✏️
git clone: git://git.pablopie.xyz/cmark
Commit
2994011ce13f3a437c637dcac6bc841b22d80b6c
Parent
5acc7d48c73975769aeb4a5bc0f140d470d85a55
Author
Nick Wellnhofer <wellnhofer@aevum.de>
Date
Wed, 3 Feb 2021 18:35:47 +0100
Replace invalid characters in XML output
Control characters, U+FFFE and U+FFFF aren't allowed in XML 1.0, so
replace them with U+FFFD (replacement character). This doesn't solve
the problem how to roundtrip these characters, but at least we don't
produce invalid XML. See #365.
Diffstat
2 files changed, 68 insertions, 6 deletions
diff --git a/src/xml.c b/src/xml.c
@@ -7,16 +7,69 @@
#include "cmark.h"
#include "node.h"
#include "buffer.h"
-#include "houdini.h"
#define BUFFER_SIZE 100
#define MAX_INDENT 40
// Functions to convert cmark_nodes to XML strings.
-static void escape_xml(cmark_strbuf *dest, const unsigned char *source,
- bufsize_t length) {
- houdini_escape_html0(dest, source, length, 0);
+// C0 control characters, U+FFFE and U+FFFF aren't allowed in XML.
+static const char XML_ESCAPE_TABLE[256] = {
+ /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
+ /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* 0x20 */ 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 5, 0,
+ /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9,
+ /* 0xC0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xD0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xE0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xF0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+// U+FFFD Replacement Character encoded in UTF-8
+#define UTF8_REPL "\xEF\xBF\xBD"
+
+static const char *XML_ESCAPES[] = {
+ "", UTF8_REPL, "&quot;", "&amp;", "&lt;", "&gt;"
+};
+
+static void escape_xml(cmark_strbuf *ob, const unsigned char *src,
+ bufsize_t size) {
+ bufsize_t i = 0, org, esc = 0;
+
+ while (i < size) {
+ org = i;
+ while (i < size && (esc = XML_ESCAPE_TABLE[src[i]]) == 0)
+ i++;
+
+ if (i > org)
+ cmark_strbuf_put(ob, src + org, i - org);
+
+ if (i >= size)
+ break;
+
+ if (esc == 9) {
+ // To replace U+FFFE and U+FFFF with U+FFFD, only the last byte has to
+ // be changed.
+ // We know that src[i] is 0xBE or 0xBF.
+ if (i >= 2 && src[i-2] == 0xEF && src[i-1] == 0xBF) {
+ cmark_strbuf_putc(ob, 0xBD);
+ } else {
+ cmark_strbuf_putc(ob, src[i]);
+ }
+ } else {
+ cmark_strbuf_puts(ob, XML_ESCAPES[esc]);
+ }
+
+ i++;
+ }
}
static void escape_xml_str(cmark_strbuf *dest, const unsigned char *source) {