cmark

My personal build of CMark ✏️

normalize.py (6508B)

  1 # -*- coding: utf-8 -*-
  2 from html.parser import HTMLParser
  3 import urllib
  4 
  5 try:
  6     from html.parser import HTMLParseError
  7 except ImportError:
  8     # HTMLParseError was removed in Python 3.5. It could never be
  9     # thrown, so we define a placeholder instead.
 10     class HTMLParseError(Exception):
 11         pass
 12 
 13 from html.entities import name2codepoint
 14 import sys
 15 import re
 16 import html
 17 
 18 # Normalization code, adapted from
 19 # https://github.com/karlcow/markdown-testsuite/
 20 significant_attrs = ["alt", "href", "src", "title"]
 21 whitespace_re = re.compile('\s+')
 22 class MyHTMLParser(HTMLParser):
 23     def __init__(self):
 24         HTMLParser.__init__(self)
 25         self.convert_charrefs = False
 26         self.last = "starttag"
 27         self.in_pre = False
 28         self.output = ""
 29         self.last_tag = ""
 30     def handle_data(self, data):
 31         after_tag = self.last == "endtag" or self.last == "starttag"
 32         after_block_tag = after_tag and self.is_block_tag(self.last_tag)
 33         if after_tag and self.last_tag == "br":
 34             data = data.lstrip('\n')
 35         if not self.in_pre:
 36             data = whitespace_re.sub(' ', data)
 37         if after_block_tag and not self.in_pre:
 38             if self.last == "starttag":
 39                 data = data.lstrip()
 40             elif self.last == "endtag":
 41                 data = data.strip()
 42         self.output += data
 43         self.last = "data"
 44     def handle_endtag(self, tag):
 45         if tag == "pre":
 46             self.in_pre = False
 47         elif self.is_block_tag(tag):
 48             self.output = self.output.rstrip()
 49         self.output += "</" + tag + ">"
 50         self.last_tag = tag
 51         self.last = "endtag"
 52     def handle_starttag(self, tag, attrs):
 53         if tag == "pre":
 54             self.in_pre = True
 55         if self.is_block_tag(tag):
 56             self.output = self.output.rstrip()
 57         self.output += "<" + tag
 58         # For now we don't strip out 'extra' attributes, because of
 59         # raw HTML test cases.
 60         # attrs = filter(lambda attr: attr[0] in significant_attrs, attrs)
 61         if attrs:
 62             attrs.sort()
 63             for (k,v) in attrs:
 64                 self.output += " " + k
 65                 if v in ['href','src']:
 66                     self.output += ("=" + '"' +
 67                             urllib.quote(urllib.unquote(v), safe='/') + '"')
 68                 elif v != None:
 69                     self.output += ("=" + '"' + html.escape(v,quote=True) + '"')
 70         self.output += ">"
 71         self.last_tag = tag
 72         self.last = "starttag"
 73     def handle_startendtag(self, tag, attrs):
 74         """Ignore closing tag for self-closing """
 75         self.handle_starttag(tag, attrs)
 76         self.last_tag = tag
 77         self.last = "endtag"
 78     def handle_comment(self, data):
 79         self.output += '<!--' + data + '-->'
 80         self.last = "comment"
 81     def handle_decl(self, data):
 82         self.output += '<!' + data + '>'
 83         self.last = "decl"
 84     def unknown_decl(self, data):
 85         self.output += '<!' + data + '>'
 86         self.last = "decl"
 87     def handle_pi(self,data):
 88         self.output += '<?' + data + '>'
 89         self.last = "pi"
 90     def handle_entityref(self, name):
 91         try:
 92             c = chr(name2codepoint[name])
 93         except KeyError:
 94             c = None
 95         self.output_char(c, '&' + name + ';')
 96         self.last = "ref"
 97     def handle_charref(self, name):
 98         try:
 99             if name.startswith("x"):
100                 c = chr(int(name[1:], 16))
101             else:
102                 c = chr(int(name))
103         except ValueError:
104                 c = None
105         self.output_char(c, '&' + name + ';')
106         self.last = "ref"
107     # Helpers.
108     def output_char(self, c, fallback):
109         if c == '<':
110             self.output += "&lt;"
111         elif c == '>':
112             self.output += "&gt;"
113         elif c == '&':
114             self.output += "&amp;"
115         elif c == '"':
116             self.output += "&quot;"
117         elif c == None:
118             self.output += fallback
119         else:
120             self.output += c
121 
122     def is_block_tag(self,tag):
123         return (tag in ['article', 'header', 'aside', 'hgroup', 'blockquote',
124             'hr', 'iframe', 'body', 'li', 'map', 'button', 'object', 'canvas',
125             'ol', 'caption', 'output', 'col', 'p', 'colgroup', 'pre', 'dd',
126             'progress', 'div', 'section', 'dl', 'table', 'td', 'dt',
127             'tbody', 'embed', 'textarea', 'fieldset', 'tfoot', 'figcaption',
128             'th', 'figure', 'thead', 'footer', 'tr', 'form', 'ul',
129             'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'video', 'script', 'style'])
130 
def normalize_html(html):
    r"""
    Return normalized form of HTML which ignores insignificant output
    differences:

    * Multiple inner whitespaces are collapsed to a single space (except
      in pre tags):

        >>> normalize_html("<p>a  \t b</p>")
        '<p>a b</p>'

        >>> normalize_html("<p>a  \t\nb</p>")
        '<p>a b</p>'

    * Whitespace surrounding block-level tags is removed.

        >>> normalize_html("<p>a  b</p>")
        '<p>a b</p>'

        >>> normalize_html(" <p>a  b</p>")
        '<p>a b</p>'

        >>> normalize_html("<p>a  b</p> ")
        '<p>a b</p>'

        >>> normalize_html("\n\t<p>\n\t\ta  b\t\t</p>\n\t")
        '<p>a b</p>'

        >>> normalize_html("<i>a  b</i> ")
        '<i>a b</i> '

    * Self-closing tags are converted to open tags.

        >>> normalize_html("<br />")
        '<br>'

    * Attributes are sorted and lowercased.

        >>> normalize_html('<a title="bar" HREF="foo">x</a>')
        '<a href="foo" title="bar">x</a>'

    * References are converted to unicode, except that '<', '>', '&', and
      '"' are rendered using entities.

        >>> normalize_html("&forall;&amp;&gt;&lt;&quot;")
        '∀&amp;&gt;&lt;&quot;'

    If the parser raises HTMLParseError, a message is written to stderr
    and the input is returned unchanged.
    """
    # Alternatives, tried in order: a CDATA section on a single line, any
    # other "<...>" run up to the first '>', or a run of text without '<'.
    html_chunk_re = re.compile(r"(<!\[CDATA\[.*?\]\]>|<[^>]*>|[^<]+)")
    try:
        parser = MyHTMLParser()
        # We work around HTMLParser's limitations parsing CDATA
        # by breaking the input into chunks and passing CDATA chunks
        # through verbatim.
        for chunk in html_chunk_re.finditer(html):
            text = chunk.group(0)
            if text.startswith("<![CDATA"):
                parser.output += text
            else:
                parser.feed(text)
        parser.close()
        return parser.output
    except HTMLParseError as e:
        # str(e) rather than e.msg: the placeholder HTMLParseError used on
        # Python >= 3.5 is a plain Exception subclass without a .msg.
        sys.stderr.write("Normalization error: " + str(e) + "\n")
        return html  # on error, return unnormalized HTML