diff --git a/src/blocks.c b/src/blocks.c
@@ -10,13 +10,10 @@
#include "inlines.h"
#include "html/houdini.h"
#include "buffer.h"
-#include "bench.h"
+#include "debug.h"
#define peek_at(i, n) (i)->data[n]
-static void incorporate_line(strbuf *ln, int line_number, node_block** curptr);
-static void finalize(node_block* b, int line_number);
-
static node_block* make_block(int tag, int start_line, int start_column)
{
node_block* e;
@@ -44,18 +41,42 @@ static node_block* make_document()
return e;
}
+cmark_doc_parser *cmark_new_doc_parser()
+{
+ cmark_doc_parser *parser = (cmark_doc_parser*)malloc(sizeof(cmark_doc_parser));
+ node_block *document = make_document();
+ strbuf *line = (strbuf*)malloc(sizeof(strbuf));
+ cmark_strbuf_init(line, 256);
+
+ parser->head = document;
+ parser->current = document;
+ parser->line_number = 0;
+ parser->curline = line;
+
+ return parser;
+}
+
+void cmark_free_doc_parser(cmark_doc_parser *parser)
+{
+ cmark_strbuf_free(parser->curline);
+ free(parser->curline);
+ free(parser);
+}
+
+static void finalize(node_block* b, int line_number);
+
// Returns true if line has only space characters, else false.
static bool is_blank(strbuf *s, int offset)
{
while (offset < s->size) {
switch (s->ptr[offset]) {
- case '\n':
- return true;
- case ' ':
- offset++;
- break;
- default:
- return false;
+ case '\n':
+ return true;
+ case ' ':
+ offset++;
+ break;
+ default:
+ return false;
}
}
@@ -65,17 +86,17 @@ static bool is_blank(strbuf *s, int offset)
static inline bool can_contain(int parent_type, int child_type)
{
return ( parent_type == BLOCK_DOCUMENT ||
- parent_type == BLOCK_BQUOTE ||
- parent_type == BLOCK_LIST_ITEM ||
- (parent_type == BLOCK_LIST && child_type == BLOCK_LIST_ITEM) );
+ parent_type == BLOCK_BQUOTE ||
+ parent_type == BLOCK_LIST_ITEM ||
+ (parent_type == BLOCK_LIST && child_type == BLOCK_LIST_ITEM) );
}
static inline bool accepts_lines(int block_type)
{
return (block_type == BLOCK_PARAGRAPH ||
- block_type == BLOCK_ATX_HEADER ||
- block_type == BLOCK_INDENTED_CODE ||
- block_type == BLOCK_FENCED_CODE);
+ block_type == BLOCK_ATX_HEADER ||
+ block_type == BLOCK_INDENTED_CODE ||
+ block_type == BLOCK_FENCED_CODE);
}
static void add_line(node_block* node_block, chunk *ch, int offset)
@@ -158,77 +179,77 @@ static void finalize(node_block* b, int line_number)
}
switch (b->tag) {
- case BLOCK_PARAGRAPH:
- pos = 0;
- while (strbuf_at(&b->string_content, 0) == '[' &&
- (pos = parse_reference_inline(&b->string_content, b->top->as.document.refmap))) {
+ case BLOCK_PARAGRAPH:
+ pos = 0;
+ while (strbuf_at(&b->string_content, 0) == '[' &&
+ (pos = parse_reference_inline(&b->string_content, b->top->as.document.refmap))) {
- strbuf_drop(&b->string_content, pos);
- }
- if (is_blank(&b->string_content, 0)) {
- b->tag = BLOCK_REFERENCE_DEF;
- }
- break;
-
- case BLOCK_INDENTED_CODE:
- remove_trailing_blank_lines(&b->string_content);
- strbuf_putc(&b->string_content, '\n');
- break;
-
- case BLOCK_FENCED_CODE:
- // first line of contents becomes info
- firstlinelen = strbuf_strchr(&b->string_content, '\n', 0);
-
- strbuf_init(&b->as.code.info, 0);
- houdini_unescape_html_f(
- &b->as.code.info,
- b->string_content.ptr,
- firstlinelen
- );
-
- strbuf_drop(&b->string_content, firstlinelen + 1);
-
- strbuf_trim(&b->as.code.info);
- strbuf_unescape(&b->as.code.info);
- break;
-
- case BLOCK_LIST: // determine tight/loose status
- b->as.list.tight = true; // tight by default
- item = b->children;
-
- while (item) {
- // check for non-final non-empty list item ending with blank line:
- if (item->last_line_blank && item->next) {
- b->as.list.tight = false;
- break;
+ strbuf_drop(&b->string_content, pos);
}
- // recurse into children of list item, to see if there are
- // spaces between them:
- subitem = item->children;
- while (subitem) {
- if (ends_with_blank_line(subitem) &&
- (item->next || subitem->next)) {
+ if (is_blank(&b->string_content, 0)) {
+ b->tag = BLOCK_REFERENCE_DEF;
+ }
+ break;
+
+ case BLOCK_INDENTED_CODE:
+ remove_trailing_blank_lines(&b->string_content);
+ strbuf_putc(&b->string_content, '\n');
+ break;
+
+ case BLOCK_FENCED_CODE:
+ // first line of contents becomes info
+ firstlinelen = strbuf_strchr(&b->string_content, '\n', 0);
+
+ strbuf_init(&b->as.code.info, 0);
+ houdini_unescape_html_f(
+ &b->as.code.info,
+ b->string_content.ptr,
+ firstlinelen
+ );
+
+ strbuf_drop(&b->string_content, firstlinelen + 1);
+
+ strbuf_trim(&b->as.code.info);
+ strbuf_unescape(&b->as.code.info);
+ break;
+
+ case BLOCK_LIST: // determine tight/loose status
+ b->as.list.tight = true; // tight by default
+ item = b->children;
+
+ while (item) {
+ // check for non-final non-empty list item ending with blank line:
+ if (item->last_line_blank && item->next) {
b->as.list.tight = false;
break;
}
- subitem = subitem->next;
- }
- if (!(b->as.list.tight)) {
- break;
+ // recurse into children of list item, to see if there are
+ // spaces between them:
+ subitem = item->children;
+ while (subitem) {
+ if (ends_with_blank_line(subitem) &&
+ (item->next || subitem->next)) {
+ b->as.list.tight = false;
+ break;
+ }
+ subitem = subitem->next;
+ }
+ if (!(b->as.list.tight)) {
+ break;
+ }
+ item = item->next;
}
- item = item->next;
- }
- break;
+ break;
- default:
- break;
+ default:
+ break;
}
}
// Add a node_block as child of another. Return pointer to child.
static node_block* add_child(node_block* parent,
- int block_type, int start_line, int start_column)
+ int block_type, int start_line, int start_column)
{
assert(parent);
@@ -269,14 +290,14 @@ static void process_inlines(node_block* cur, reference_map *refmap)
while (cur != NULL) {
switch (cur->tag) {
- case BLOCK_PARAGRAPH:
- case BLOCK_ATX_HEADER:
- case BLOCK_SETEXT_HEADER:
- cur->inline_content = parse_inlines(&cur->string_content, refmap);
- break;
+ case BLOCK_PARAGRAPH:
+ case BLOCK_ATX_HEADER:
+ case BLOCK_SETEXT_HEADER:
+ cur->inline_content = parse_inlines(&cur->string_content, refmap);
+ break;
- default:
- break;
+ default:
+ break;
}
if (cur->children) {
@@ -373,14 +394,13 @@ static int parse_list_marker(chunk *input, int pos, struct ListData ** dataptr)
static int lists_match(struct ListData *list_data, struct ListData *item_data)
{
return (list_data->list_type == item_data->list_type &&
- list_data->delimiter == item_data->delimiter &&
- // list_data->marker_offset == item_data.marker_offset &&
- list_data->bullet_char == item_data->bullet_char);
+ list_data->delimiter == item_data->delimiter &&
+ // list_data->marker_offset == item_data.marker_offset &&
+ list_data->bullet_char == item_data->bullet_char);
}
static node_block *finalize_document(node_block *document, int linenum)
{
- start_timer();
while (document != document->top) {
finalize(document, linenum);
document = document->parent;
@@ -388,56 +408,46 @@ static node_block *finalize_document(node_block *document, int linenum)
finalize(document, linenum);
process_inlines(document, document->as.document.refmap);
- end_timer("finalize_document");
return document;
}
extern node_block *cmark_parse_file(FILE *f)
{
- strbuf line = GH_BUF_INIT;
unsigned char buffer[4096];
- int linenum = 1;
- node_block *document = make_document();
+ cmark_doc_parser *parser = cmark_new_doc_parser();
+ size_t offset;
+ node_block *document;
- start_timer();
while (fgets((char *)buffer, sizeof(buffer), f)) {
- utf8proc_detab(&line, buffer, strlen((char *)buffer));
- incorporate_line(&line, linenum, &document);
- strbuf_clear(&line);
- linenum++;
+ offset = strlen((char *)buffer);
+ cmark_process_line(parser, buffer, offset);
}
- end_timer("incorporate_line(s)");
- strbuf_free(&line);
- return finalize_document(document, linenum);
+ document = cmark_finish(parser);
+ cmark_free_doc_parser(parser);
+ return document;
}
extern node_block *cmark_parse_document(const unsigned char *buffer, size_t len)
{
- strbuf line = GH_BUF_INIT;
int linenum = 1;
const unsigned char *end = buffer + len;
- node_block *document = make_document();
+ size_t offset;
+ cmark_doc_parser *parser = cmark_new_doc_parser();
+ node_block *document;
while (buffer < end) {
const unsigned char *eol = memchr(buffer, '\n', end - buffer);
-
- if (!eol) {
- utf8proc_detab(&line, buffer, end - buffer);
- buffer = end;
- } else {
- utf8proc_detab(&line, buffer, (eol - buffer) + 1);
- buffer += (eol - buffer) + 1;
- }
-
- incorporate_line(&line, linenum, &document);
- strbuf_clear(&line);
+ offset = eol ? (eol - buffer) + 1 : end - buffer; // no '\n' left: consume the remainder
+ cmark_process_line(parser, buffer, offset);
+ buffer += offset;
linenum++;
}
- strbuf_free(&line);
- return finalize_document(document, linenum);
+ document = cmark_finish(parser);
+ cmark_free_doc_parser(parser);
+ return document;
}
static void chop_trailing_hashtags(chunk *ch)
@@ -458,8 +468,8 @@ static void chop_trailing_hashtags(chunk *ch)
}
}
-// Process one line at a time, modifying a node_block.
-static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
+void cmark_process_line(cmark_doc_parser *parser, const unsigned char *buffer,
+ size_t bytes)
{
node_block* last_matched_container;
int offset = 0;
@@ -469,22 +479,27 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
struct ListData * data = NULL;
bool all_matched = true;
node_block* container;
- node_block* cur = *curptr;
+ node_block* cur = parser->current;
bool blank = false;
int first_nonspace;
int indent;
chunk input;
+ utf8proc_detab(parser->curline, buffer, bytes);
+
// Add a newline to the end if not present:
- if (line->ptr[line->size - 1] != '\n') {
- strbuf_putc(line, '\n');
+ // TODO this breaks abstraction:
+ if (parser->curline->ptr[parser->curline->size - 1] != '\n') {
+ strbuf_putc(parser->curline, '\n');
}
- input.data = line->ptr;
- input.len = line->size;
+ input.data = parser->curline->ptr;
+ input.len = parser->curline->size;
// container starts at the document root.
container = cur->top;
+ parser->line_number++;
+
// for each containing node_block, try to parse the associated line start.
// bail out on failure: container will point to the last matching node_block.
@@ -512,7 +527,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
} else if (container->tag == BLOCK_LIST_ITEM) {
if (indent >= container->as.list.marker_offset +
- container->as.list.padding) {
+ container->as.list.padding) {
offset += container->as.list.marker_offset +
container->as.list.padding;
} else if (blank) {
@@ -532,7 +547,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
}
} else if (container->tag == BLOCK_ATX_HEADER ||
- container->tag == BLOCK_SETEXT_HEADER) {
+ container->tag == BLOCK_SETEXT_HEADER) {
// a header can never contain more than one line
all_matched = false;
@@ -571,12 +586,12 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
// check to see if we've hit 2nd blank line, break out of list:
if (blank && container->last_line_blank) {
- break_out_of_lists(&container, line_number);
+ break_out_of_lists(&container, parser->line_number);
}
// unless last matched container is code node_block, try new container starts:
while (container->tag != BLOCK_FENCED_CODE && container->tag != BLOCK_INDENTED_CODE &&
- container->tag != BLOCK_HTML) {
+ container->tag != BLOCK_HTML) {
first_nonspace = offset;
while (peek_at(&input, first_nonspace) == ' ')
@@ -588,7 +603,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
if (indent >= CODE_INDENT) {
if (cur->tag != BLOCK_PARAGRAPH && !blank) {
offset += CODE_INDENT;
- container = add_child(container, BLOCK_INDENTED_CODE, line_number, offset + 1);
+ container = add_child(container, BLOCK_INDENTED_CODE, parser->line_number, offset + 1);
} else { // indent > 4 in lazy line
break;
}
@@ -599,12 +614,12 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
// optional following character
if (peek_at(&input, offset) == ' ')
offset++;
- container = add_child(container, BLOCK_BQUOTE, line_number, offset + 1);
+ container = add_child(container, BLOCK_BQUOTE, parser->line_number, offset + 1);
} else if ((matched = scan_atx_header_start(&input, first_nonspace))) {
offset = first_nonspace + matched;
- container = add_child(container, BLOCK_ATX_HEADER, line_number, offset + 1);
+ container = add_child(container, BLOCK_ATX_HEADER, parser->line_number, offset + 1);
int hashpos = chunk_strchr(&input, '#', first_nonspace);
int level = 0;
@@ -617,7 +632,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
} else if ((matched = scan_open_code_fence(&input, first_nonspace))) {
- container = add_child(container, BLOCK_FENCED_CODE, line_number, first_nonspace + 1);
+ container = add_child(container, BLOCK_FENCED_CODE, parser->line_number, first_nonspace + 1);
container->as.code.fence_char = peek_at(&input, first_nonspace);
container->as.code.fence_length = matched;
container->as.code.fence_offset = first_nonspace - offset;
@@ -625,25 +640,25 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
} else if ((matched = scan_html_block_tag(&input, first_nonspace))) {
- container = add_child(container, BLOCK_HTML, line_number, first_nonspace + 1);
+ container = add_child(container, BLOCK_HTML, parser->line_number, first_nonspace + 1);
// note, we don't adjust offset because the tag is part of the text
} else if (container->tag == BLOCK_PARAGRAPH &&
- (lev = scan_setext_header_line(&input, first_nonspace)) &&
- // check that there is only one line in the paragraph:
- strbuf_strrchr(&container->string_content, '\n',
- strbuf_len(&container->string_content) - 2) < 0) {
+ (lev = scan_setext_header_line(&input, first_nonspace)) &&
+ // check that there is only one line in the paragraph:
+ strbuf_strrchr(&container->string_content, '\n',
+ strbuf_len(&container->string_content) - 2) < 0) {
container->tag = BLOCK_SETEXT_HEADER;
container->as.header.level = lev;
offset = input.len - 1;
} else if (!(container->tag == BLOCK_PARAGRAPH && !all_matched) &&
- (matched = scan_hrule(&input, first_nonspace))) {
+ (matched = scan_hrule(&input, first_nonspace))) {
// it's only now that we know the line is not part of a setext header:
- container = add_child(container, BLOCK_HRULE, line_number, first_nonspace + 1);
- finalize(container, line_number);
+ container = add_child(container, BLOCK_HRULE, parser->line_number, first_nonspace + 1);
+ finalize(container, parser->line_number);
container = container->parent;
offset = input.len - 1;
@@ -672,16 +687,16 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
data->marker_offset = indent;
if (container->tag != BLOCK_LIST ||
- !lists_match(&container->as.list, data)) {
- container = add_child(container, BLOCK_LIST, line_number,
- first_nonspace + 1);
+ !lists_match(&container->as.list, data)) {
+ container = add_child(container, BLOCK_LIST, parser->line_number,
+ first_nonspace + 1);
memcpy(&container->as.list, data, sizeof(*data));
}
// add the list item
- container = add_child(container, BLOCK_LIST_ITEM, line_number,
- first_nonspace + 1);
+ container = add_child(container, BLOCK_LIST_ITEM, parser->line_number,
+ first_nonspace + 1);
/* TODO: static */
memcpy(&container->as.list, data, sizeof(*data));
free(data);
@@ -710,11 +725,11 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
// lists or breaking out of lists. we also don't set last_line_blank
// on an empty list item.
container->last_line_blank = (blank &&
- container->tag != BLOCK_BQUOTE &&
- container->tag != BLOCK_FENCED_CODE &&
- !(container->tag == BLOCK_LIST_ITEM &&
- container->children == NULL &&
- container->start_line == line_number));
+ container->tag != BLOCK_BQUOTE &&
+ container->tag != BLOCK_FENCED_CODE &&
+ !(container->tag == BLOCK_LIST_ITEM &&
+ container->children == NULL &&
+ container->start_line == parser->line_number));
node_block *cont = container;
while (cont->parent) {
@@ -723,10 +738,10 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
}
if (cur != last_matched_container &&
- container == last_matched_container &&
- !blank &&
- cur->tag == BLOCK_PARAGRAPH &&
- strbuf_len(&cur->string_content) > 0) {
+ container == last_matched_container &&
+ !blank &&
+ cur->tag == BLOCK_PARAGRAPH &&
+ strbuf_len(&cur->string_content) > 0) {
add_line(cur, &input, offset);
@@ -734,7 +749,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
// finalize any blocks that were not matched and set cur to container:
while (cur != last_matched_container) {
- finalize(cur, line_number);
+ finalize(cur, parser->line_number);
cur = cur->parent;
assert(cur != NULL);
}
@@ -747,7 +762,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
matched = 0;
if (indent <= 3 &&
- peek_at(&input, first_nonspace) == container->as.code.fence_char) {
+ peek_at(&input, first_nonspace) == container->as.code.fence_char) {
int fence_len = scan_close_code_fence(&input, first_nonspace);
if (fence_len > container->as.code.fence_length)
matched = 1;
@@ -755,7 +770,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
if (matched) {
// if closing fence, don't add line to container; instead, close it:
- finalize(container, line_number);
+ finalize(container, parser->line_number);
container = container->parent; // back up to parent
} else {
add_line(container, &input, offset);
@@ -773,7 +788,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
chop_trailing_hashtags(&input);
add_line(container, &input, first_nonspace);
- finalize(container, line_number);
+ finalize(container, parser->line_number);
container = container->parent;
} else if (accepts_lines(container->tag)) {
@@ -783,13 +798,23 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
} else if (container->tag != BLOCK_HRULE && container->tag != BLOCK_SETEXT_HEADER) {
// create paragraph container for line
- container = add_child(container, BLOCK_PARAGRAPH, line_number, first_nonspace + 1);
+ container = add_child(container, BLOCK_PARAGRAPH, parser->line_number, first_nonspace + 1);
add_line(container, &input, first_nonspace);
} else {
assert(false);
}
- *curptr = container;
+ parser->current = container;
}
+ strbuf_clear(parser->curline);
+
}
+
+node_block *cmark_finish(cmark_doc_parser *parser)
+{
+ finalize_document(parser->current, parser->line_number);
+ strbuf_free(parser->curline);
+ return parser->head;
+}
+