cmark

My personal build of CMark ✏️

Commit
bc14d869323650e936c7143dcf941b28ccd5b57d
Parent
a173d0bb746b1afc6a4942a2536c9008da35b572
Author
John MacFarlane <jgm@berkeley.edu>
Date

Merge pull request #57 from nwellnhof/optimize_utf8proc_detab

Optimize utf8proc_detab

Diffstat

1 file changed, 40 insertions, 22 deletions

| Status | File Name | N° Changes | Insertions | Deletions |
|---|---|---|---|---|
| Modified | src/utf8.c | 62 | 40 | 22 |
diff --git a/src/utf8.c b/src/utf8.c
@@ -56,21 +56,18 @@ static int utf8proc_charlen(const uint8_t *str, bufsize_t str_len)
 // Validate a single UTF-8 character according to RFC 3629.
 static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 {
-	int length = utf8proc_charlen(str, str_len);
+	int length = utf8proc_utf8class[str[0]];
 
-	if (length <= 0)
-		return length;
+	if (!length)
+		return -1;
 
-	switch (length) {
-	case 1:
-		if (str[0] == 0x00) {
-			// ASCII NUL is technically valid but rejected
-			// for security reasons.
-			return -length;
-		}
-		break;
+	if ((bufsize_t)length > str_len)
+		return -str_len;
 
+	switch (length) {
 	case 2:
+		if ((str[1] & 0xC0) != 0x80)
+			return -1;
 		if (str[0] < 0xC2) {
 			// Overlong
 			return -length;
@@ -78,6 +75,10 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 		break;
 
 	case 3:
+		if ((str[1] & 0xC0) != 0x80)
+			return -1;
+		if ((str[2] & 0xC0) != 0x80)
+			return -2;
 		if (str[0] == 0xE0) {
 			if (str[1] < 0xA0) {
 				// Overlong
@@ -92,6 +93,12 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 		break;
 
 	case 4:
+		if ((str[1] & 0xC0) != 0x80)
+			return -1;
+		if ((str[2] & 0xC0) != 0x80)
+			return -2;
+		if ((str[3] & 0xC0) != 0x80)
+			return -3;
 		if (str[0] == 0xF0) {
 			if (str[1] < 0x90) {
 				// Overlong
@@ -117,10 +124,27 @@ void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
 
 	while (i < size) {
 		bufsize_t org = i;
+		int charlen = 0;
+
+		while (i < size && line[i] != '\t') {
+			if (line[i] >= 0x80) {
+				charlen = utf8proc_valid(line + i, size - i);
+				if (charlen < 0) {
+					charlen = -charlen;
+					break;
+				}
+				i += charlen;
+			}
+			else if (line[i] == '\0') {
+				// ASCII NUL is technically valid but rejected
+				// for security reasons.
+				charlen = 1;
+				break;
+			}
+			else {
+				i++;
+			}
 
-		while (i < size && line[i] != '\t' && line[i] != '\0'
-		       && line[i] < 0x80) {
-			i++;
 			tab++;
 		}
 
@@ -136,14 +160,8 @@ void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
 			i += 1;
 			tab += numspaces;
 		} else {
-			int charlen = utf8proc_valid(line + i, size - i);
-
-			if (charlen >= 0) {
-				cmark_strbuf_put(ob, line + i, charlen);
-			} else {
-				encode_unknown(ob);
-				charlen = -charlen;
-			}
+			// Invalid UTF-8
+			encode_unknown(ob);
 
 			i += charlen;
 			tab += 1;