cmark

My personal build of CMark ✏️

Commit
846650bc717b908ef865d8c4095ee6fcd059c79b
Parent
5860d60cc79332c01da59f39e90fff2bcc8a5e9e
Author
John MacFarlane <jgm@berkeley.edu>
Date

Optimized --smart.

Previously we had 0.24s on the benchmark without `--smart`, 0.38s with it. Now we have 0.27s with `--smart`, so `--smart` has only a small performance impact.

Diffstat

1 file changed, 36 insertions, 9 deletions

Status File Name N° Changes Insertions Deletions
Modified src/smart.c 45 36 9
diff --git a/src/smart.c b/src/smart.c
@@ -10,6 +10,25 @@
 #include "buffer.h"
 #include "chunk.h"
 
+static const char SMART_PUNCT_TABLE[] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
 void escape_with_smart(cmark_strbuf *buf,
 		       cmark_node *node,
 		       void (*escape)(cmark_strbuf *, const unsigned char *, int),
@@ -24,11 +43,11 @@ void escape_with_smart(cmark_strbuf *buf,
 	int32_t c = 0;
 	int32_t after_char = 0;
 	int32_t before_char = 0;
-	int len;
 	bool left_flanking, right_flanking;
 	int lastout = 0;
-	int i = 0;
+	int i = 0, j = 0;
 	cmark_chunk lit = node->as.literal;
+	int len;
 
 	// set before_char based on previous text node if there is one:
 	if (node->prev) {
@@ -58,18 +77,26 @@ void escape_with_smart(cmark_strbuf *buf,
 	}
 
 	while (i < lit.len) {
-		len = utf8proc_iterate(lit.data + i, lit.len - i, &c);
-		i += len;
-
-		// replace with efficient lookup table:
-		if (!(c == 34 || c == 39 || c == 45 || c == 46)) {
-			before_char = c;
+		c = lit.data[i];
+		i++;
+		if (SMART_PUNCT_TABLE[c] == 0) {
 			continue;
 		}
-		(*escape)(buf, lit.data + lastout, i - len - lastout);
+
+		(*escape)(buf, lit.data + lastout, i - 1 - lastout);
 
 		if (c == 34 || c == 39) {
 
+			// if i == 0 then before_char was set before the while loop
+			if (i > 1) {
+				j = i - 2;
+				// walk back to the beginning of the UTF_8 sequence:
+				while (j >= 0 && lit.data[j] >> 6 == 2) {
+					j--;
+				}
+				utf8proc_iterate(lit.data + j, lit.len - j, &before_char);
+			}
+
 			if (i >= lit.len) {
 				if (node->next) {
 					if (node->next->type == CMARK_NODE_TEXT) {