cmark

My personal build of CMark ✏️

Commit
7f491b0bdf8e206458d284938efa8a0890c9d352
Parent
ef77d908553bfdd37b83ae4832d7e6ff36874f24
Author
John MacFarlane <jgm@berkeley.edu>
Date

Preliminary changes for new tab handling.

We no longer preprocess tabs to spaces before parsing.

Instead, we keep track of both the byte offset and the (virtual) column as we parse block starts.

This allows us to handle tabs without converting to spaces first. Tabs are left as tabs in the output.

Added `column` and `first_nonspace_column` fields to `parser`.

Added utility function to advance the offset, computing the virtual column too.

Note that we don't need to deal with UTF-8 here at all. Only ASCII occurs in block starts.

Significant performance improvement due to the fact that we're not doing UTF-8 validation -- though we might want to add that back in.

Diffstat

2 files changed, 71 insertions, 23 deletions

Status File Name N° Changes Insertions Deletions
Modified src/blocks.c 92 69 23
Modified src/parser.h 2 2 0
diff --git a/src/blocks.c b/src/blocks.c
@@ -16,6 +16,8 @@
 #include "debug.h"
 
 #define CODE_INDENT 4
+#define TAB_STOP 4
+
 #define peek_at(i, n) (i)->data[n]
 
 static inline bool
@@ -70,7 +72,9 @@ cmark_parser *cmark_parser_new(int options)
 	parser->current = document;
 	parser->line_number = 0;
 	parser->offset = 0;
+	parser->column = 0;
 	parser->first_nonspace = 0;
+	parser->first_nonspace_column = 0;
 	parser->indent = 0;
 	parser->blank = false;
 	parser->curline = line;
@@ -555,16 +559,53 @@ static void chop_trailing_hashtags(cmark_chunk *ch)
 static void
 S_find_first_nonspace(cmark_parser *parser, cmark_chunk *input)
 {
+	char c;
+	int chars_to_tab = TAB_STOP - (parser->column % TAB_STOP);
+
 	parser->first_nonspace = parser->offset;
-	while (peek_at(input, parser->first_nonspace) == ' ') {
-		parser->first_nonspace++;
+	parser->first_nonspace_column = parser->column;
+	while ((c = peek_at(input, parser->first_nonspace))) {
+		if (c == ' ') {
+			parser->first_nonspace += 1;
+			parser->first_nonspace_column += 1;
+			chars_to_tab = chars_to_tab - 1;
+			if (chars_to_tab == 0) {
+				chars_to_tab = TAB_STOP;
+			}
+		} else if (c == '\t') {
+			parser->first_nonspace += 1;
+			parser->first_nonspace_column += chars_to_tab;
+			chars_to_tab = TAB_STOP;
+		} else {
+			break;
+		}
 	}
 
-	parser->indent = parser->first_nonspace - parser->offset;
+	parser->indent = parser->first_nonspace_column - parser->column;
 	parser->blank = S_is_line_end_char(peek_at(input, parser->first_nonspace));
 }
 
 static void
+S_advance_offset(cmark_parser *parser, cmark_chunk *input, bufsize_t count, bool columns)
+{
+	char c;
+	int chars_to_tab;
+	while (count > 0 && (c = peek_at(input, parser->offset))) {
+		if (c == '\t') {
+			chars_to_tab = 4 - (parser->column % TAB_STOP);
+			parser->column += chars_to_tab;
+			parser->offset += 1;
+			count -= (columns ? chars_to_tab : 1);
+		} else {
+			parser->offset += 1;
+			parser->column += 1; // assume ascii; block starts are ascii
+			count -= 1;
+		}
+	}
+}
+
+
+static void
 S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t bytes)
 {
 	cmark_node* last_matched_container;
@@ -578,8 +619,9 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
 	cmark_chunk input;
 	bool maybe_lazy;
 
-	utf8proc_detab(parser->curline, buffer, bytes);
+	cmark_strbuf_put(parser->curline, buffer, bytes);
 	parser->offset = 0;
+	parser->column = 0;
 	parser->blank = false;
 
 	input.data = parser->curline->ptr;
@@ -601,7 +643,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
 		if (container->type == NODE_BLOCK_QUOTE) {
 			matched = parser->indent <= 3 && peek_at(&input, parser->first_nonspace) == '>';
 			if (matched) {
-				parser->offset = parser->first_nonspace + 1;
+				S_advance_offset(parser, &input, parser->indent + 1, true);
 				if (peek_at(&input, parser->offset) == ' ')
 					parser->offset++;
 			} else {
@@ -609,13 +651,14 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
 			}
 
 		} else if (container->type == NODE_ITEM) {
-
 			if (parser->indent >= container->as.list.marker_offset +
 			    container->as.list.padding) {
-				parser->offset += container->as.list.marker_offset +
-				                  container->as.list.padding;
+				S_advance_offset(parser, &input,
+						 container->as.list.marker_offset +
+						 container->as.list.padding, true);
 			} else if (parser->blank) {
-				parser->offset = parser->first_nonspace;
+				S_advance_offset(parser, &input,
+						 parser->first_nonspace - parser->offset, false);
 			} else {
 				all_matched = false;
 			}
@@ -624,9 +667,11 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
 
 			if (!container->as.code.fenced) { // indented
 				if (parser->indent >= CODE_INDENT) {
-					parser->offset += CODE_INDENT;
+					S_advance_offset(parser, &input, CODE_INDENT, true);
 				} else if (parser->blank) {
-					parser->offset = parser->first_nonspace;
+					S_advance_offset(parser, &input,
+							 parser->first_nonspace - parser->offset,
+							 false);
 				} else {
 					all_matched = false;
 				}
@@ -642,7 +687,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
 					// closing fence - and since we're at
 					// the end of a line, we can return:
 					all_matched = false;
-					parser->offset += matched;
+					S_advance_offset(parser, &input, matched, false);
 					parser->current = finalize(parser, container);
 					goto finished;
 				} else {
@@ -650,7 +695,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
 					i = container->as.code.fence_offset;
 					while (i > 0 &&
 					       peek_at(&input, parser->offset) == ' ') {
-						parser->offset++;
+						S_advance_offset(parser, &input, 1, false);
 						i--;
 					}
 				}
@@ -697,15 +742,16 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
 
 		if (!indented && peek_at(&input, parser->first_nonspace) == '>') {
 
-			parser->offset = parser->first_nonspace + 1;
+			S_advance_offset(parser, &input, parser->first_nonspace + 1 - parser->offset, false);
 			// optional following character
 			if (peek_at(&input, parser->offset) == ' ')
-				parser->offset++;
+				S_advance_offset(parser, &input, 1, false);
 			container = add_child(parser, container, NODE_BLOCK_QUOTE, parser->offset + 1);
 
 		} else if (!indented && (matched = scan_atx_header_start(&input, parser->first_nonspace))) {
 
-			parser->offset = parser->first_nonspace + matched;
+			S_advance_offset(parser, &input,
+					 parser->first_nonspace + matched - parser->offset, false);
 			container = add_child(parser, container, NODE_HEADER, parser->offset + 1);
 
 			bufsize_t hashpos = cmark_chunk_strchr(&input, '#', parser->first_nonspace);
@@ -726,7 +772,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
 			container->as.code.fence_length = matched;
 			container->as.code.fence_offset = parser->first_nonspace - parser->offset;
 			container->as.code.info = cmark_chunk_literal("");
-			parser->offset = parser->first_nonspace + matched;
+			S_advance_offset(parser, &input, parser->first_nonspace + matched - parser->offset, false);
 
 		} else if (!indented && (matched = scan_html_block_tag(&input, parser->first_nonspace))) {
 
@@ -743,7 +789,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
 			container->type = NODE_HEADER;
 			container->as.header.level = lev;
 			container->as.header.setext = true;
-			parser->offset = input.len - 1;
+			S_advance_offset(parser, &input, input.len - 1 - parser->offset, false);
 
 		} else if (!indented &&
 		           !(container->type == NODE_PARAGRAPH &&
@@ -753,7 +799,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
 			// it's only now that we know the line is not part of a setext header:
 			container = add_child(parser, container, NODE_HRULE, parser->first_nonspace + 1);
 			container = finalize(parser, container);
-			parser->offset = input.len - 1;
+		        S_advance_offset(parser, &input, input.len - 1 - parser->offset, false);
 
 		} else if ((matched = parse_list_marker(&input, parser->first_nonspace, &data)) &&
 		           (!indented || container->type == NODE_LIST)) {
@@ -761,7 +807,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
 			// spaces indent, as long as the list container is still open.
 
 			// compute padding:
-			parser->offset = parser->first_nonspace + matched;
+			S_advance_offset(parser, &input, parser->first_nonspace + matched - parser->offset, false);
 			i = 0;
 			while (i <= 5 && peek_at(&input, parser->offset + i) == ' ') {
 				i++;
@@ -771,11 +817,11 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
 			    S_is_line_end_char(peek_at(&input, parser->offset))) {
 				data->padding = matched + 1;
 				if (i > 0) {
-					parser->offset += 1;
+					S_advance_offset(parser, &input, 1, false);
 				}
 			} else {
 				data->padding = matched + i;
-				parser->offset += i;
+				S_advance_offset(parser, &input, i, true);
 			}
 
 			// check container; if it's a list, see if this list item
@@ -799,7 +845,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
 			free(data);
 
 		} else if (indented && !maybe_lazy && !parser->blank) {
-			parser->offset += CODE_INDENT;
+			S_advance_offset(parser, &input, CODE_INDENT, true);
 			container = add_child(parser, container, NODE_CODE_BLOCK, parser->offset + 1);
 			container->as.code.fenced = false;
 			container->as.code.fence_char = 0;
diff --git a/src/parser.h b/src/parser.h
@@ -17,7 +17,9 @@ struct cmark_parser {
 	struct cmark_node* current;
 	int line_number;
 	bufsize_t offset;
+	bufsize_t column;
 	bufsize_t first_nonspace;
+	bufsize_t first_nonspace_column;
 	int indent;
 	bool blank;
 	cmark_strbuf *curline;