cmark

My personal build of CMark ✏️

Commit
26537124a4070f7869db67317b90e08916050c8f
Parent
7f491b0bdf8e206458d284938efa8a0890c9d352
Author
John MacFarlane <jgm@berkeley.edu>
Date

Renamed utf8proc_detab to utf8proc_check and removed its detabbing behavior.

Now it just replaces invalid UTF-8 sequences and NUL bytes.

This restores benchmarks to near their previous levels.

Diffstat

3 files changed, 12 insertions, 23 deletions

Status File Name N° Changes Insertions Deletions
Modified src/blocks.c 2 1 1
Modified src/utf8.c 31 10 21
Modified src/utf8.h 2 1 1
diff --git a/src/blocks.c b/src/blocks.c
@@ -619,7 +619,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
 	cmark_chunk input;
 	bool maybe_lazy;
 
-	cmark_strbuf_put(parser->curline, buffer, bytes);
+	utf8proc_check(parser->curline, buffer, bytes);
 	parser->offset = 0;
 	parser->column = 0;
 	parser->blank = false;
diff --git a/src/utf8.c b/src/utf8.c
@@ -116,53 +116,42 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 	return length;
 }
 
-void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
+void utf8proc_check(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
 {
-	static const uint8_t whitespace[] = "    ";
-
-	bufsize_t i = 0, tab = 0;
+	bufsize_t i = 0;
 
 	while (i < size) {
 		bufsize_t org = i;
 		int charlen = 0;
 
-		while (i < size && line[i] != '\t') {
-			if (line[i] >= 0x80) {
+		while (i < size) {
+			if (line[i] < 0x80 && line[i] != 0) {
+				i++;
+			} else if (line[i] >= 0x80) {
 				charlen = utf8proc_valid(line + i, size - i);
 				if (charlen < 0) {
 					charlen = -charlen;
 					break;
 				}
 				i += charlen;
-			} else if (line[i] == '\0') {
+			} else if (line[i] == 0) {
 				// ASCII NUL is technically valid but rejected
 				// for security reasons.
 				charlen = 1;
 				break;
-			} else {
-				i++;
 			}
-
-			tab++;
 		}
 
-		if (i > org)
+		if (i > org) {
 			cmark_strbuf_put(ob, line + org, i - org);
+		}
 
-		if (i >= size)
+		if (i >= size) {
 			break;
-
-		if (line[i] == '\t') {
-			int numspaces = 4 - (tab % 4);
-			cmark_strbuf_put(ob, whitespace, numspaces);
-			i += 1;
-			tab += numspaces;
 		} else {
 			// Invalid UTF-8
 			encode_unknown(ob);
-
 			i += charlen;
-			tab += 1;
 		}
 	}
 }
diff --git a/src/utf8.h b/src/utf8.h
@@ -11,7 +11,7 @@ extern "C" {
 void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, bufsize_t len);
 void utf8proc_encode_char(int32_t uc, cmark_strbuf *buf);
 int utf8proc_iterate(const uint8_t *str, bufsize_t str_len, int32_t *dst);
-void utf8proc_detab(cmark_strbuf *dest, const uint8_t *line, bufsize_t size);
+void utf8proc_check(cmark_strbuf *dest, const uint8_t *line, bufsize_t size);
 int utf8proc_is_space(int32_t uc);
 int utf8proc_is_punctuation(int32_t uc);