cmark

My personal build of CMark ✏️

Commit: 7600cd859014bac31200d52b1c4f6e88136b3c97
Parent: c9875cbbbe293e6727a7a25b79e7ea4949ef5670
Author: John MacFarlane <jgm@berkeley.edu>
Date: Thu, 20 Nov 2014 08:57:20 -0800

runtests.py: Fixed normalization of declarations and CDATA.

If the input contains CDATA, we break it out and pass it through verbatim, without sending it through HTMLParser, which breaks on CDATA.

Improves on #161.

Diffstat

1 file changed, 10 insertions, 7 deletions

Status	File Name	N° Changes	Insertions	Deletions
Modified	runtests.py	17	10	7

diff --git a/runtests.py b/runtests.py
@@ -115,7 +115,7 @@ class MyHTMLParser(HTMLParser):
     def handle_decl(self, data):
         self.output += '<!' + data + '>'
         self.last = "decl"
-    def handle_unknown_decl(self, data):
+    def unknown_decl(self, data):
         self.output += '<!' + data + '>'
         self.last = "decl"
     def handle_pi(self,data):
@@ -174,15 +174,18 @@ def normalize_html(html):
     * Attributes are sorted and lowercased.
     * References are converted to unicode, except that '<', '>', '&', and
       '"' are rendered using entities.
-
-    Known limitations:
-
-    * HTMLParser just swallows CDATA.
-    * HTMLParser seems to treat unknown declarations as comments.
     """
+    html_chunk_re = re.compile("(\<!\[CDATA\[.*?\]\]\>|\<[^>]*\>|[^<]+)")
     try:
         parser = MyHTMLParser()
-        parser.feed(html.decode(encoding='UTF-8'))
+        # We work around HTMLParser's limitations parsing CDATA
+        # by breaking the input into chunks and passing CDATA chunks
+        # through verbatim.
+        for chunk in re.finditer(html_chunk_re, html):
+            if chunk.group(0)[:8] == "<![CDATA":
+                parser.output += chunk.group(0)
+            else:
+                parser.feed(chunk.group(0).decode(encoding='UTF-8'))
         parser.close()
         return parser.output
     except HTMLParseError as e: