cmark

My personal build of CMark ✏️

Commit
d57f3952ca8b9aac16db8243539f4c1c5dbf3c93
Parent
bf44064d09afd04039058a00c32c1532fb5e2b61
Author
John MacFarlane <jgm@berkeley.edu>
Date

Added xml writer, to dump the AST in XML.

This is a work-in-progress.

CommonMark.dtd gives the DTD for the generated XML.

Closes #53.

Diffstat

5 files changed, 198 insertions, 1 deletion

Status File Name N° Changes Insertions Deletions
Added CommonMark.dtd 45 45 0
Modified src/CMakeLists.txt 1 1 0
Modified src/cmark.h 5 5 0
Modified src/main.c 8 7 1
Added src/xml.c 140 140 0
diff --git a/CommonMark.dtd b/CommonMark.dtd
@@ -0,0 +1,45 @@
+<!-- DTD for CommonMark xml export format -->
+<!ELEMENT DOCUMENT
+(BLOCK_QUOTE|LIST|CODE_BLOCK|HTML|PARAGRAPH|HEADER|HRULE)*>
+<!ELEMENT BLOCK_QUOTE
+(BLOCK_QUOTE|LIST|CODE_BLOCK|HTML|PARAGRAPH|HEADER|HRULE)*>
+<!ELEMENT LIST (LIST_ITEM)+>
+<!ATTLIST LIST
+          type (bullet|ordered) #REQUIRED
+          start CDATA #IMPLIED
+          tight (true|false) #REQUIRED
+          delimiter (period|paren) #IMPLIED>
+<!ELEMENT LIST_ITEM
+(BLOCK_QUOTE|LIST|CODE_BLOCK|HTML|PARAGRAPH|HEADER|HRULE)*>
+<!ELEMENT CODE_BLOCK (#PCDATA)>
+<!ATTLIST CODE_BLOCK
+          xml:space CDATA #FIXED "preserve"
+          info CDATA #IMPLIED>
+<!ELEMENT HTML (#PCDATA)>
+<!ATTLIST HTML xml:space CDATA #FIXED "preserve">
+<!ELEMENT PARAGRAPH
+(TEXT|SOFTBREAK|LINEBREAK|CODE|INLINE_HTML|EMPH|STRONG|LINK|IMAGE)*>
+<!ELEMENT HEADER
+(TEXT|SOFTBREAK|LINEBREAK|CODE|INLINE_HTML|EMPH|STRONG|LINK|IMAGE)*>
+<!ATTLIST HEADER level (1|2|3|4|5|6) #REQUIRED>
+<!ELEMENT HRULE EMPTY>
+<!ELEMENT TEXT (#PCDATA)>
+<!ELEMENT SOFTBREAK EMPTY>
+<!ELEMENT LINEBREAK EMPTY>
+<!ELEMENT CODE (#PCDATA)>
+<!ATTLIST CODE xml:space CDATA #FIXED "preserve">
+<!ELEMENT INLINE_HTML (#PCDATA)>
+<!ATTLIST INLINE_HTML xml:space CDATA #FIXED "preserve">
+<!ELEMENT EMPH
+(TEXT|SOFTBREAK|LINEBREAK|CODE|INLINE_HTML|EMPH|STRONG|LINK|IMAGE)*>
+<!ELEMENT STRONG
+(TEXT|SOFTBREAK|LINEBREAK|CODE|INLINE_HTML|EMPH|STRONG|LINK|IMAGE)*>
+<!ELEMENT LINK
+(TEXT|SOFTBREAK|LINEBREAK|CODE|INLINE_HTML|EMPH|STRONG|LINK|IMAGE)*>
+<!ATTLIST LINK url CDATA #REQUIRED>
+<!ATTLIST LINK title CDATA #IMPLIED>
+<!ELEMENT IMAGE
+(TEXT|SOFTBREAK|LINEBREAK|CODE|INLINE_HTML|EMPH|STRONG|LINK|IMAGE)*>
+<!ATTLIST IMAGE url CDATA #REQUIRED>
+<!ATTLIST IMAGE title CDATA #IMPLIED>
+<!-- sourcepos is optional on every element.  An ATTLIST names exactly one
+     element type (there is no wildcard), so it is declared per element. -->
+<!ATTLIST DOCUMENT sourcepos CDATA #IMPLIED>
+<!ATTLIST BLOCK_QUOTE sourcepos CDATA #IMPLIED>
+<!ATTLIST LIST sourcepos CDATA #IMPLIED>
+<!ATTLIST LIST_ITEM sourcepos CDATA #IMPLIED>
+<!ATTLIST CODE_BLOCK sourcepos CDATA #IMPLIED>
+<!ATTLIST HTML sourcepos CDATA #IMPLIED>
+<!ATTLIST PARAGRAPH sourcepos CDATA #IMPLIED>
+<!ATTLIST HEADER sourcepos CDATA #IMPLIED>
+<!ATTLIST HRULE sourcepos CDATA #IMPLIED>
+<!ATTLIST TEXT sourcepos CDATA #IMPLIED>
+<!ATTLIST SOFTBREAK sourcepos CDATA #IMPLIED>
+<!ATTLIST LINEBREAK sourcepos CDATA #IMPLIED>
+<!ATTLIST CODE sourcepos CDATA #IMPLIED>
+<!ATTLIST INLINE_HTML sourcepos CDATA #IMPLIED>
+<!ATTLIST EMPH sourcepos CDATA #IMPLIED>
+<!ATTLIST STRONG sourcepos CDATA #IMPLIED>
+<!ATTLIST LINK sourcepos CDATA #IMPLIED>
+<!ATTLIST IMAGE sourcepos CDATA #IMPLIED>
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -29,6 +29,7 @@ set(LIBRARY_SOURCES
   buffer.c
   references.c
   man.c
+  xml.c
   html.c
   html_unescape.gperf
   houdini_href_e.c
diff --git a/src/cmark.h b/src/cmark.h
@@ -394,6 +394,11 @@ cmark_node *cmark_parse_file(FILE *f);
 CMARK_EXPORT
 char *cmark_render_ast(cmark_node *root);
 
+/** Render a 'node' tree as XML.
+ */
+CMARK_EXPORT
+char *cmark_render_xml(cmark_node *root);
+
 /** Render a 'node' tree as an HTML fragment.  It is up to the user
  * to add an appropriate header and footer.
  */
diff --git a/src/main.c b/src/main.c
@@ -10,6 +10,7 @@
 typedef enum {
 	FORMAT_NONE,
 	FORMAT_HTML,
+	FORMAT_XML,
 	FORMAT_MAN,
 	FORMAT_AST
 } writer_format;
@@ -18,7 +19,7 @@ void print_usage()
 {
 	printf("Usage:   cmark [FILE*]\n");
 	printf("Options:\n");
-	printf("  --to, -t FORMAT  Specify output format (html, man, ast)\n");
+	printf("  --to, -t FORMAT  Specify output format (html, xml, man, ast)\n");
 	printf("  --help, -h       Print usage information\n");
 	printf("  --version        Print version\n");
 }
@@ -33,6 +34,9 @@ static void print_document(cmark_node *document, writer_format writer)
 	case FORMAT_HTML:
 		result = cmark_render_html(document);
 		break;
+	case FORMAT_XML:
+		result = cmark_render_xml(document);
+		break;
 	case FORMAT_MAN:
 		result = cmark_render_man(document);
 		break;
@@ -74,6 +78,8 @@ int main(int argc, char *argv[])
 					writer = FORMAT_MAN;
 				} else if (strcmp(argv[i], "html") == 0) {
 					writer = FORMAT_HTML;
+				} else if (strcmp(argv[i], "xml") == 0) {
+					writer = FORMAT_XML;
 				} else if (strcmp(argv[i], "ast") == 0) {
 					writer = FORMAT_AST;
 				} else {
diff --git a/src/xml.c b/src/xml.c
@@ -0,0 +1,140 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "config.h"
+#include "cmark.h"
+#include "node.h"
+#include "buffer.h"
+#include "houdini.h"
+
+// Functions to convert cmark_nodes to XML strings.
+
+// Append 'source' to 'dest', escaped by houdini_escape_html0().
+// A NULL 'source' appends nothing.  A negative 'length' means 'source' is
+// NUL-terminated and its length is taken with strlen().
+static void escape_xml(cmark_strbuf *dest, const unsigned char *source, int length)
+{
+	if (source == NULL)
+		return;
+
+	if (length < 0)
+		length = strlen((char *)source);
+
+	houdini_escape_html0(dest, source, (size_t)length, 0);
+}
+
+// Rendering context passed to S_render_node() for every iterator event.
+struct render_state {
+	cmark_strbuf *xml; // buffer the XML text is appended to
+	int indent;        // number of spaces written before a tag
+};
+
+// Append state->indent space characters to the output buffer.
+static inline void indent(struct render_state *state)
+{
+	int remaining = state->indent;
+
+	while (remaining-- > 0) {
+		cmark_strbuf_putc(state->xml, ' ');
+	}
+}
+
+// Render one iterator event for 'node' into the buffer held by 'vstate'
+// (a struct render_state *).
+//
+// On CMARK_EVENT_ENTER the opening tag is written, with a sourcepos
+// attribute when start_line is non-zero.  Literal nodes (TEXT, CODE, HTML,
+// INLINE_HTML, CODE_BLOCK) are written whole -- opening tag, escaped
+// content, closing tag -- on this one event.  Any other node without
+// children is written as a self-closing "<NAME />" tag; a node with
+// children gets "<NAME>" and the indent grows by two for its children.
+//
+// On any other event the closing tag is written, but only for a node that
+// has children.
+//
+// Always returns 1.
+static int
+S_render_node(cmark_node *node, cmark_event_type ev_type, void *vstate)
+{
+	struct render_state *state = vstate;
+	cmark_strbuf *xml = state->xml;
+	bool literal = false;
+
+	if (ev_type != CMARK_EVENT_ENTER) {
+		// A childless node was already closed on entry, either by
+		// its self-closing tag or by its literal close tag; writing
+		// "</NAME>" again would make the document malformed.
+		// NOTE(review): whether the iterator delivers an exit event
+		// for such nodes is not visible here; if it never does, this
+		// guard is simply a no-op.
+		if (node->first_child) {
+			state->indent -= 2;
+			indent(state);
+			cmark_strbuf_printf(xml, "</%s>\n",
+					    cmark_node_type_string(node));
+		}
+		return 1;
+	}
+
+	indent(state);
+	cmark_strbuf_printf(xml, "<%s", cmark_node_type_string(node));
+
+	if (node->start_line != 0) {
+		cmark_strbuf_printf(xml, " sourcepos=\"%d:%d-%d\"",
+				    node->start_line,
+				    node->start_column,
+				    node->end_line);
+	}
+
+	switch (node->type) {
+	case CMARK_NODE_TEXT:
+	case CMARK_NODE_CODE:
+	case CMARK_NODE_HTML:
+	case CMARK_NODE_INLINE_HTML:
+		cmark_strbuf_puts(xml, ">");
+		escape_xml(xml, node->as.literal.data,
+			   node->as.literal.len);
+		// The final '>' of the close tag is appended below.
+		cmark_strbuf_puts(xml, "</");
+		cmark_strbuf_puts(xml, cmark_node_type_string(node));
+		literal = true;
+		break;
+	case CMARK_NODE_CODE_BLOCK:
+		if (node->as.code.info.len > 0) {
+			cmark_strbuf_puts(xml, " info=\"");
+			escape_xml(xml, node->as.code.info.data,
+				   node->as.code.info.len);
+			cmark_strbuf_putc(xml, '"');
+		}
+		cmark_strbuf_puts(xml, ">");
+		escape_xml(xml, node->as.code.literal.data,
+			   node->as.code.literal.len);
+		cmark_strbuf_puts(xml, "</");
+		cmark_strbuf_puts(xml, cmark_node_type_string(node));
+		literal = true;
+		break;
+	case CMARK_NODE_LINK:
+	case CMARK_NODE_IMAGE:
+		// escape_xml() appends nothing for a NULL url or title, so
+		// the attribute is then written with an empty value.
+		cmark_strbuf_puts(xml, " url=\"");
+		escape_xml(xml, node->as.link.url, -1);
+		cmark_strbuf_putc(xml, '"');
+		cmark_strbuf_puts(xml, " title=\"");
+		escape_xml(xml, node->as.link.title, -1);
+		cmark_strbuf_putc(xml, '"');
+		break;
+	default:
+		break;
+	}
+
+	if (node->first_child) {
+		state->indent += 2;
+	} else if (!literal) {
+		cmark_strbuf_puts(xml, " /");
+	}
+
+	cmark_strbuf_puts(xml, ">\n");
+
+	return 1;
+}
+
+// Render the tree rooted at 'root' as an XML document: an XML declaration,
+// a DOCTYPE referring to CommonMark.dtd, then one S_render_node() call per
+// iterator event.  Returns the string detached from the internal buffer.
+// NOTE(review): presumably the caller owns the returned string -- confirm
+// against cmark_strbuf_detach().
+char *cmark_render_xml(cmark_node *root)
+{
+	char *result;
+	cmark_strbuf xml = GH_BUF_INIT;
+	cmark_event_type ev_type;
+	cmark_node *cur;
+	struct render_state state = { &xml, 0 };
+	cmark_iter *iter = cmark_iter_new(root);
+
+	cmark_strbuf_puts(state.xml,
+			  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
+	// The DOCTYPE name must match the root element for the document to
+	// be valid; CommonMark.dtd defines that root element as DOCUMENT.
+	cmark_strbuf_puts(state.xml,
+			  "<!DOCTYPE DOCUMENT SYSTEM \"CommonMark.dtd\">\n");
+	while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
+		cur = cmark_iter_get_node(iter);
+		S_render_node(cur, ev_type, &state);
+	}
+	result = (char *)cmark_strbuf_detach(&xml);
+
+	cmark_iter_free(iter);
+	cmark_strbuf_free(&xml);
+	return result;
+}