cmark

My personal build of CMark ✏️

Commit
54c087d1272b4ce756e56de68e8e6dfac6d159fc
Parent
8418191a26b970c32c0396bf3580c0c7a374fb35
Author
John MacFarlane <jgm@berkeley.edu>
Date

make_entities_h.py: confirm there are no hash collisions.

At least with valid data.

Diffstat

1 file changed, 11 insertions, 3 deletions

| Status | File Name | N° Changes | Insertions | Deletions |
|---|---|---|---|---|
| Modified | tools/make_entities_h.py | 14 | 11 | 3 |
diff --git a/tools/make_entities_h.py b/tools/make_entities_h.py
@@ -15,11 +15,19 @@ def djb2(s):
 
 entities5 = html.entities.html5
 
+# remove keys without semicolons.  For some reason the list
+# has duplicates of a few things, like auml, one with and one
+# without a semicolon.
+entities = [(k[:-1], entities5[k]) for k in entities5.keys() if k[-1] == ';']
+
 # Note that most entries in the entity table end with ';', but in a few
 # cases we have both a version with ';' and one without, so we strip out
 # the latter to avoid duplicates:
-hashed_data = sorted([[int(djb2(s[:-1])), entities5[s].encode('utf-8'), s]
-                      for s in entities5.keys() if s[-1] == ';'])
+hashed_data = sorted([[int(djb2(k)), v.encode('utf-8'), k] for (k,v) in entities])
+
+# Confirm no hash collisions
+hashes = [x for [x,_,_] in hashed_data]
+assert(len(hashes) == len(set(hashes)))
 
 # indices is a dictionary - given a hash it spits out the ordering
 # of this entity in the list (the array index)
@@ -60,7 +68,7 @@ def to_binary_array(xs, mid):
     mg = indices[greaters[midgreaters][0]]
   lines[indices[x[0]]] = ("{" + str(x[0]) + ", (unsigned char*)\"" +
                           ''.join(map(toesc, x[1])) + "\", " + str(ml) +
-                          ", " + str(mg) + "}, /* &" + x[2] + " */")
+                          ", " + str(mg) + "}, /* &" + x[2] + "; */")
   if len(lesses) > 0:
     to_binary_array(lesses, midlesses)
   if len(greaters) > 0: