cmark
My personal build of CMark ✏️
blocks.c (40674B)
1 /** 2 * Block parsing implementation. 3 * 4 * For a high-level overview of the block parsing process, 5 * see http://spec.commonmark.org/0.24/#phase-1-block-structure 6 */ 7 8 #include <stdlib.h> 9 #include <assert.h> 10 #include <stdio.h> 11 #include <limits.h> 12 13 #include "cmark_ctype.h" 14 #include "config.h" 15 #include "parser.h" 16 #include "cmark.h" 17 #include "node.h" 18 #include "references.h" 19 #include "utf8.h" 20 #include "scanners.h" 21 #include "inlines.h" 22 #include "houdini.h" 23 #include "buffer.h" 24 #include "chunk.h" 25 26 #define CODE_INDENT 4 27 #define TAB_STOP 4 28 29 #ifndef MIN 30 #define MIN(x, y) ((x < y) ? x : y) 31 #endif 32 33 #define peek_at(i, n) (i)->data[n] 34 35 static bool S_last_line_blank(const cmark_node *node) { 36 return (node->flags & CMARK_NODE__LAST_LINE_BLANK) != 0; 37 } 38 39 static bool S_last_line_checked(const cmark_node *node) { 40 return (node->flags & CMARK_NODE__LAST_LINE_CHECKED) != 0; 41 } 42 43 static CMARK_INLINE cmark_node_type S_type(const cmark_node *node) { 44 return (cmark_node_type)node->type; 45 } 46 47 static void S_set_last_line_blank(cmark_node *node, bool is_blank) { 48 if (is_blank) 49 node->flags |= CMARK_NODE__LAST_LINE_BLANK; 50 else 51 node->flags &= ~CMARK_NODE__LAST_LINE_BLANK; 52 } 53 54 static void S_set_last_line_checked(cmark_node *node) { 55 node->flags |= CMARK_NODE__LAST_LINE_CHECKED; 56 } 57 58 static CMARK_INLINE bool S_is_line_end_char(char c) { 59 return (c == '\n' || c == '\r'); 60 } 61 62 static CMARK_INLINE bool S_is_space_or_tab(char c) { 63 return (c == ' ' || c == '\t'); 64 } 65 66 static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer, 67 size_t len, bool eof); 68 69 static void S_process_line(cmark_parser *parser, const unsigned char *buffer, 70 bufsize_t bytes); 71 72 static cmark_node *make_block(cmark_mem *mem, cmark_node_type tag, 73 int start_line, int start_column) { 74 cmark_node *e; 75 76 e = (cmark_node *)mem->calloc(1, sizeof(*e)); 77 e->mem = mem; 78 e->type = (uint16_t)tag; 79 e->flags = CMARK_NODE__OPEN; 80 e->start_line = start_line; 81 e->start_column = start_column; 82 e->end_line = start_line; 83 84 return e; 85 } 86 87 // Create a root document node. 88 static cmark_node *make_document(cmark_mem *mem) { 89 cmark_node *e = make_block(mem, CMARK_NODE_DOCUMENT, 1, 1); 90 return e; 91 } 92 93 cmark_parser *cmark_parser_new_with_mem(int options, cmark_mem *mem) { 94 cmark_parser *parser = (cmark_parser *)mem->calloc(1, sizeof(cmark_parser)); 95 parser->mem = mem; 96 97 cmark_node *document = make_document(mem); 98 99 cmark_strbuf_init(mem, &parser->curline, 256); 100 cmark_strbuf_init(mem, &parser->linebuf, 0); 101 cmark_strbuf_init(mem, &parser->content, 0); 102 103 parser->refmap = cmark_reference_map_new(mem); 104 parser->root = document; 105 parser->current = document; 106 parser->line_number = 0; 107 parser->offset = 0; 108 parser->column = 0; 109 parser->first_nonspace = 0; 110 parser->first_nonspace_column = 0; 111 parser->thematic_break_kill_pos = 0; 112 parser->indent = 0; 113 parser->blank = false; 114 parser->partially_consumed_tab = false; 115 parser->last_line_length = 0; 116 parser->options = options; 117 parser->last_buffer_ended_with_cr = false; 118 119 return parser; 120 } 121 122 cmark_parser *cmark_parser_new(int options) { 123 extern cmark_mem DEFAULT_MEM_ALLOCATOR; 124 return cmark_parser_new_with_mem(options, &DEFAULT_MEM_ALLOCATOR); 125 } 126 127 void cmark_parser_free(cmark_parser *parser) { 128 cmark_mem *mem = parser->mem; 129 cmark_strbuf_free(&parser->curline); 130 cmark_strbuf_free(&parser->linebuf); 131 cmark_reference_map_free(parser->refmap); 132 mem->free(parser); 133 } 134 135 static cmark_node *finalize(cmark_parser *parser, cmark_node *b); 136 137 // Returns true if line has only space characters, else false. 138 static bool is_blank(cmark_strbuf *s, bufsize_t offset) { 139 while (offset < s->size) { 140 switch (s->ptr[offset]) { 141 case '\r': 142 case '\n': 143 return true; 144 case ' ': 145 offset++; 146 break; 147 case '\t': 148 offset++; 149 break; 150 default: 151 return false; 152 } 153 } 154 155 return true; 156 } 157 158 static CMARK_INLINE bool can_contain(cmark_node_type parent_type, 159 cmark_node_type child_type) { 160 return (parent_type == CMARK_NODE_DOCUMENT || 161 parent_type == CMARK_NODE_BLOCK_QUOTE || 162 parent_type == CMARK_NODE_ITEM || 163 (parent_type == CMARK_NODE_LIST && child_type == CMARK_NODE_ITEM)); 164 } 165 166 static CMARK_INLINE bool accepts_lines(cmark_node_type block_type) { 167 return (block_type == CMARK_NODE_PARAGRAPH || 168 block_type == CMARK_NODE_HEADING || 169 block_type == CMARK_NODE_CODE_BLOCK); 170 } 171 172 static CMARK_INLINE bool contains_inlines(cmark_node_type block_type) { 173 return (block_type == CMARK_NODE_PARAGRAPH || 174 block_type == CMARK_NODE_HEADING); 175 } 176 177 static void add_line(cmark_chunk *ch, cmark_parser *parser) { 178 int chars_to_tab; 179 int i; 180 if (parser->partially_consumed_tab) { 181 parser->offset += 1; // skip over tab 182 // add space characters: 183 chars_to_tab = TAB_STOP - (parser->column % TAB_STOP); 184 for (i = 0; i < chars_to_tab; i++) { 185 cmark_strbuf_putc(&parser->content, ' '); 186 } 187 } 188 cmark_strbuf_put(&parser->content, ch->data + parser->offset, 189 ch->len - parser->offset); 190 } 191 192 static void remove_trailing_blank_lines(cmark_strbuf *ln) { 193 bufsize_t i; 194 unsigned char c; 195 196 for (i = ln->size - 1; i >= 0; --i) { 197 c = ln->ptr[i]; 198 199 if (c != ' ' && c != '\t' && !S_is_line_end_char(c)) 200 break; 201 } 202 203 if (i < 0) { 204 cmark_strbuf_clear(ln); 205 return; 206 } 207 208 for (; i < ln->size; ++i) { 209 c = ln->ptr[i]; 210 211 if (!S_is_line_end_char(c)) 212 continue; 213 214 cmark_strbuf_truncate(ln, i); 215 break; 216 } 217 } 218 219 // Check to see if a node ends with a blank line, descending 220 // if needed into lists and sublists. 221 static bool S_ends_with_blank_line(cmark_node *node) { 222 if (S_last_line_checked(node)) { 223 return(S_last_line_blank(node)); 224 } else if ((S_type(node) == CMARK_NODE_LIST || 225 S_type(node) == CMARK_NODE_ITEM) && node->last_child) { 226 S_set_last_line_checked(node); 227 return(S_ends_with_blank_line(node->last_child)); 228 } else { 229 S_set_last_line_checked(node); 230 return (S_last_line_blank(node)); 231 } 232 } 233 234 // returns true if content remains after link defs are resolved. 235 static bool resolve_reference_link_definitions(cmark_parser *parser) { 236 bufsize_t pos; 237 cmark_strbuf *node_content = &parser->content; 238 cmark_chunk chunk = {node_content->ptr, node_content->size}; 239 while (chunk.len && chunk.data[0] == '[' && 240 (pos = cmark_parse_reference_inline(parser->mem, &chunk, 241 parser->refmap))) { 242 243 chunk.data += pos; 244 chunk.len -= pos; 245 } 246 cmark_strbuf_drop(node_content, (node_content->size - chunk.len)); 247 return !is_blank(node_content, 0); 248 } 249 250 static cmark_node *finalize(cmark_parser *parser, cmark_node *b) { 251 bufsize_t pos; 252 cmark_node *item; 253 cmark_node *subitem; 254 cmark_node *parent; 255 bool has_content; 256 257 parent = b->parent; 258 assert(b->flags & 259 CMARK_NODE__OPEN); // shouldn't call finalize on closed blocks 260 b->flags &= ~CMARK_NODE__OPEN; 261 262 if (parser->curline.size == 0) { 263 // end of input - line number has not been incremented 264 b->end_line = parser->line_number; 265 b->end_column = parser->last_line_length; 266 } else if (S_type(b) == CMARK_NODE_DOCUMENT || 267 (S_type(b) == CMARK_NODE_CODE_BLOCK && b->as.code.fenced) || 268 (S_type(b) == CMARK_NODE_HEADING && b->as.heading.setext)) { 269 b->end_line = parser->line_number; 270 b->end_column = parser->curline.size; 271 if (b->end_column && parser->curline.ptr[b->end_column - 1] == '\n') 272 b->end_column -= 1; 273 if (b->end_column && parser->curline.ptr[b->end_column - 1] == '\r') 274 b->end_column -= 1; 275 } else { 276 b->end_line = parser->line_number - 1; 277 b->end_column = parser->last_line_length; 278 } 279 280 cmark_strbuf *node_content = &parser->content; 281 282 switch (S_type(b)) { 283 case CMARK_NODE_PARAGRAPH: 284 { 285 has_content = resolve_reference_link_definitions(parser); 286 if (!has_content) { 287 // remove blank node (former reference def) 288 cmark_node_free(b); 289 } else { 290 b->len = node_content->size; 291 b->data = cmark_strbuf_detach(node_content); 292 } 293 break; 294 } 295 296 case CMARK_NODE_CODE_BLOCK: 297 if (!b->as.code.fenced) { // indented code 298 remove_trailing_blank_lines(node_content); 299 cmark_strbuf_putc(node_content, '\n'); 300 } else { 301 // first line of contents becomes info 302 for (pos = 0; pos < node_content->size; ++pos) { 303 if (S_is_line_end_char(node_content->ptr[pos])) 304 break; 305 } 306 assert(pos < node_content->size); 307 308 if (pos == 0) { 309 b->as.code.info = NULL; 310 } else { 311 cmark_strbuf tmp = CMARK_BUF_INIT(parser->mem); 312 houdini_unescape_html_f(&tmp, node_content->ptr, pos); 313 cmark_strbuf_trim(&tmp); 314 cmark_strbuf_unescape(&tmp); 315 b->as.code.info = cmark_strbuf_detach(&tmp); 316 } 317 318 if (node_content->ptr[pos] == '\r') 319 pos += 1; 320 if (node_content->ptr[pos] == '\n') 321 pos += 1; 322 cmark_strbuf_drop(node_content, pos); 323 } 324 b->len = node_content->size; 325 b->data = cmark_strbuf_detach(node_content); 326 break; 327 328 case CMARK_NODE_HEADING: 329 case CMARK_NODE_HTML_BLOCK: 330 b->len = node_content->size; 331 b->data = cmark_strbuf_detach(node_content); 332 break; 333 334 case CMARK_NODE_LIST: // determine tight/loose status 335 b->as.list.tight = true; // tight by default 336 item = b->first_child; 337 338 while (item) { 339 // check for non-final non-empty list item ending with blank line: 340 if (S_last_line_blank(item) && item->next) { 341 b->as.list.tight = false; 342 break; 343 } 344 // recurse into children of list item, to see if there are 345 // spaces between them: 346 subitem = item->first_child; 347 while (subitem) { 348 if ((item->next || subitem->next) && 349 S_ends_with_blank_line(subitem)) { 350 b->as.list.tight = false; 351 break; 352 } 353 subitem = subitem->next; 354 } 355 if (!(b->as.list.tight)) { 356 break; 357 } 358 item = item->next; 359 } 360 361 break; 362 363 default: 364 break; 365 } 366 367 return parent; 368 } 369 370 // Add a node as child of another. Return pointer to child. 371 static cmark_node *add_child(cmark_parser *parser, cmark_node *parent, 372 cmark_node_type block_type, int start_column) { 373 assert(parent); 374 375 // if 'parent' isn't the kind of node that can accept this child, 376 // then back up til we hit a node that can. 377 while (!can_contain(S_type(parent), block_type)) { 378 parent = finalize(parser, parent); 379 } 380 381 cmark_node *child = 382 make_block(parser->mem, block_type, parser->line_number, start_column); 383 child->parent = parent; 384 385 if (parent->last_child) { 386 parent->last_child->next = child; 387 child->prev = parent->last_child; 388 } else { 389 parent->first_child = child; 390 child->prev = NULL; 391 } 392 parent->last_child = child; 393 return child; 394 } 395 396 // Walk through node and all children, recursively, parsing 397 // string content into inline content where appropriate. 398 static void process_inlines(cmark_mem *mem, cmark_node *root, 399 cmark_reference_map *refmap, int options) { 400 cmark_iter *iter = cmark_iter_new(root); 401 cmark_node *cur; 402 cmark_event_type ev_type; 403 404 while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) { 405 cur = cmark_iter_get_node(iter); 406 if (ev_type == CMARK_EVENT_ENTER) { 407 if (contains_inlines(S_type(cur))) { 408 cmark_parse_inlines(mem, cur, refmap, options); 409 mem->free(cur->data); 410 cur->data = NULL; 411 cur->len = 0; 412 } 413 } 414 } 415 416 cmark_iter_free(iter); 417 } 418 419 // Attempts to parse a list item marker (bullet or enumerated). 420 // On success, returns length of the marker, and populates 421 // data with the details. On failure, returns 0. 422 static bufsize_t parse_list_marker(cmark_mem *mem, cmark_chunk *input, 423 bufsize_t pos, bool interrupts_paragraph, 424 cmark_list **dataptr) { 425 unsigned char c; 426 bufsize_t startpos; 427 cmark_list *data; 428 bufsize_t i; 429 430 startpos = pos; 431 c = peek_at(input, pos); 432 433 if (c == '*' || c == '-' || c == '+') { 434 pos++; 435 if (!cmark_isspace(peek_at(input, pos))) { 436 return 0; 437 } 438 439 if (interrupts_paragraph) { 440 i = pos; 441 // require non-blank content after list marker: 442 while (S_is_space_or_tab(peek_at(input, i))) { 443 i++; 444 } 445 if (peek_at(input, i) == '\n') { 446 return 0; 447 } 448 } 449 450 data = (cmark_list *)mem->calloc(1, sizeof(*data)); 451 data->marker_offset = 0; // will be adjusted later 452 data->list_type = CMARK_BULLET_LIST; 453 data->bullet_char = c; 454 data->start = 0; 455 data->delimiter = CMARK_NO_DELIM; 456 data->tight = false; 457 } else if (cmark_isdigit(c)) { 458 int start = 0; 459 int digits = 0; 460 461 do { 462 start = (10 * start) + (peek_at(input, pos) - '0'); 463 pos++; 464 digits++; 465 // We limit to 9 digits to avoid overflow, 466 // assuming max int is 2^31 - 1 467 // This also seems to be the limit for 'start' in some browsers. 468 } while (digits < 9 && cmark_isdigit(peek_at(input, pos))); 469 470 if (interrupts_paragraph && start != 1) { 471 return 0; 472 } 473 c = peek_at(input, pos); 474 if (c == '.' || c == ')') { 475 pos++; 476 if (!cmark_isspace(peek_at(input, pos))) { 477 return 0; 478 } 479 if (interrupts_paragraph) { 480 // require non-blank content after list marker: 481 i = pos; 482 while (S_is_space_or_tab(peek_at(input, i))) { 483 i++; 484 } 485 if (S_is_line_end_char(peek_at(input, i))) { 486 return 0; 487 } 488 } 489 490 data = (cmark_list *)mem->calloc(1, sizeof(*data)); 491 data->marker_offset = 0; // will be adjusted later 492 data->list_type = CMARK_ORDERED_LIST; 493 data->bullet_char = 0; 494 data->start = start; 495 data->delimiter = (c == '.' ? CMARK_PERIOD_DELIM : CMARK_PAREN_DELIM); 496 data->tight = false; 497 } else { 498 return 0; 499 } 500 } else { 501 return 0; 502 } 503 504 *dataptr = data; 505 return (pos - startpos); 506 } 507 508 // Return 1 if list item belongs in list, else 0. 509 static int lists_match(cmark_list *list_data, cmark_list *item_data) { 510 return (list_data->list_type == item_data->list_type && 511 list_data->delimiter == item_data->delimiter && 512 // list_data->marker_offset == item_data.marker_offset && 513 list_data->bullet_char == item_data->bullet_char); 514 } 515 516 static cmark_node *finalize_document(cmark_parser *parser) { 517 while (parser->current != parser->root) { 518 parser->current = finalize(parser, parser->current); 519 } 520 521 finalize(parser, parser->root); 522 523 // Limit total size of extra content created from reference links to 524 // document size to avoid superlinear growth. Always allow 100KB. 525 if (parser->total_size > 100000) 526 parser->refmap->max_ref_size = parser->total_size; 527 else 528 parser->refmap->max_ref_size = 100000; 529 530 process_inlines(parser->mem, parser->root, parser->refmap, parser->options); 531 532 cmark_strbuf_free(&parser->content); 533 534 return parser->root; 535 } 536 537 cmark_node *cmark_parse_file(FILE *f, int options) { 538 unsigned char buffer[4096]; 539 cmark_parser *parser = cmark_parser_new(options); 540 size_t bytes; 541 cmark_node *document; 542 543 while ((bytes = fread(buffer, 1, sizeof(buffer), f)) > 0) { 544 bool eof = bytes < sizeof(buffer); 545 S_parser_feed(parser, buffer, bytes, eof); 546 if (eof) { 547 break; 548 } 549 } 550 551 document = cmark_parser_finish(parser); 552 cmark_parser_free(parser); 553 return document; 554 } 555 556 cmark_node *cmark_parse_document(const char *buffer, size_t len, int options) { 557 cmark_parser *parser = cmark_parser_new(options); 558 cmark_node *document; 559 560 S_parser_feed(parser, (const unsigned char *)buffer, len, true); 561 562 document = cmark_parser_finish(parser); 563 cmark_parser_free(parser); 564 return document; 565 } 566 567 void cmark_parser_feed(cmark_parser *parser, const char *buffer, size_t len) { 568 S_parser_feed(parser, (const unsigned char *)buffer, len, false); 569 } 570 571 static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer, 572 size_t len, bool eof) { 573 const unsigned char *end = buffer + len; 574 static const uint8_t repl[] = {239, 191, 189}; 575 576 if (len > UINT_MAX - parser->total_size) 577 parser->total_size = UINT_MAX; 578 else 579 parser->total_size += len; 580 581 // Skip UTF-8 BOM if present; see #334 582 if (parser->line_number == 0 && parser->column == 0 && len >= 3 && 583 *buffer == 0xEF && *(buffer + 1) == 0xBB && 584 *(buffer + 2) == 0xBF) { 585 buffer += 3; 586 } else if (parser->last_buffer_ended_with_cr && *buffer == '\n') { 587 // skip NL if last buffer ended with CR ; see #117 588 buffer++; 589 } 590 591 parser->last_buffer_ended_with_cr = false; 592 while (buffer < end) { 593 const unsigned char *eol; 594 bufsize_t chunk_len; 595 bool process = false; 596 for (eol = buffer; eol < end; ++eol) { 597 if (S_is_line_end_char(*eol)) { 598 process = true; 599 break; 600 } 601 if (*eol == '\0' && eol < end) { 602 break; 603 } 604 } 605 if (eol >= end && eof) { 606 process = true; 607 } 608 609 chunk_len = (eol - buffer); 610 if (process) { 611 if (parser->linebuf.size > 0) { 612 cmark_strbuf_put(&parser->linebuf, buffer, chunk_len); 613 S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size); 614 cmark_strbuf_clear(&parser->linebuf); 615 } else { 616 S_process_line(parser, buffer, chunk_len); 617 } 618 } else { 619 if (eol < end && *eol == '\0') { 620 // omit NULL byte 621 cmark_strbuf_put(&parser->linebuf, buffer, chunk_len); 622 // add replacement character 623 cmark_strbuf_put(&parser->linebuf, repl, 3); 624 } else { 625 cmark_strbuf_put(&parser->linebuf, buffer, chunk_len); 626 } 627 } 628 629 buffer += chunk_len; 630 if (buffer < end) { 631 if (*buffer == '\0') { 632 // skip over NULL 633 buffer++; 634 } else { 635 // skip over line ending characters 636 if (*buffer == '\r') { 637 buffer++; 638 if (buffer == end) 639 parser->last_buffer_ended_with_cr = true; 640 } 641 if (buffer < end && *buffer == '\n') 642 buffer++; 643 } 644 } 645 } 646 } 647 648 static void chop_trailing_hashtags(cmark_chunk *ch) { 649 bufsize_t n, orig_n; 650 651 cmark_chunk_rtrim(ch); 652 orig_n = n = ch->len - 1; 653 654 // if string ends in space followed by #s, remove these: 655 while (n >= 0 && peek_at(ch, n) == '#') 656 n--; 657 658 // Check for a space before the final #s: 659 if (n != orig_n && n >= 0 && S_is_space_or_tab(peek_at(ch, n))) { 660 ch->len = n; 661 cmark_chunk_rtrim(ch); 662 } 663 } 664 665 // Check for thematic break. On failure, return 0 and update 666 // thematic_break_kill_pos with the index at which the 667 // parse fails. On success, return length of match. 668 // "...three or more hyphens, asterisks, 669 // or underscores on a line by themselves. If you wish, you may use 670 // spaces between the hyphens or asterisks." 671 static int S_scan_thematic_break(cmark_parser *parser, cmark_chunk *input, 672 bufsize_t offset) { 673 bufsize_t i; 674 char c; 675 char nextc = '\0'; 676 int count; 677 i = offset; 678 c = peek_at(input, i); 679 if (!(c == '*' || c == '_' || c == '-')) { 680 parser->thematic_break_kill_pos = i; 681 return 0; 682 } 683 count = 1; 684 while ((nextc = peek_at(input, ++i))) { 685 if (nextc == c) { 686 count++; 687 } else if (nextc != ' ' && nextc != '\t') { 688 break; 689 } 690 } 691 if (count >= 3 && (nextc == '\r' || nextc == '\n')) { 692 return (i - offset) + 1; 693 } else { 694 parser->thematic_break_kill_pos = i; 695 return 0; 696 } 697 } 698 699 // Find first nonspace character from current offset, setting 700 // parser->first_nonspace, parser->first_nonspace_column, 701 // parser->indent, and parser->blank. Does not advance parser->offset. 702 static void S_find_first_nonspace(cmark_parser *parser, cmark_chunk *input) { 703 char c; 704 int chars_to_tab = TAB_STOP - (parser->column % TAB_STOP); 705 706 if (parser->first_nonspace <= parser->offset) { 707 parser->first_nonspace = parser->offset; 708 parser->first_nonspace_column = parser->column; 709 while ((c = peek_at(input, parser->first_nonspace))) { 710 if (c == ' ') { 711 parser->first_nonspace += 1; 712 parser->first_nonspace_column += 1; 713 chars_to_tab = chars_to_tab - 1; 714 if (chars_to_tab == 0) { 715 chars_to_tab = TAB_STOP; 716 } 717 } else if (c == '\t') { 718 parser->first_nonspace += 1; 719 parser->first_nonspace_column += chars_to_tab; 720 chars_to_tab = TAB_STOP; 721 } else { 722 break; 723 } 724 } 725 } 726 727 parser->indent = parser->first_nonspace_column - parser->column; 728 parser->blank = S_is_line_end_char(peek_at(input, parser->first_nonspace)); 729 } 730 731 // Advance parser->offset and parser->column. parser->offset is the 732 // byte position in input; parser->column is a virtual column number 733 // that takes into account tabs. (Multibyte characters are not taken 734 // into account, because the Markdown line prefixes we are interested in 735 // analyzing are entirely ASCII.) The count parameter indicates 736 // how far to advance the offset. If columns is true, then count 737 // indicates a number of columns; otherwise, a number of bytes. 738 // If advancing a certain number of columns partially consumes 739 // a tab character, parser->partially_consumed_tab is set to true. 740 static void S_advance_offset(cmark_parser *parser, cmark_chunk *input, 741 bufsize_t count, bool columns) { 742 char c; 743 int chars_to_tab; 744 int chars_to_advance; 745 while (count > 0 && (c = peek_at(input, parser->offset))) { 746 if (c == '\t') { 747 chars_to_tab = TAB_STOP - (parser->column % TAB_STOP); 748 if (columns) { 749 parser->partially_consumed_tab = chars_to_tab > count; 750 chars_to_advance = MIN(count, chars_to_tab); 751 parser->column += chars_to_advance; 752 parser->offset += (parser->partially_consumed_tab ? 0 : 1); 753 count -= chars_to_advance; 754 } else { 755 parser->partially_consumed_tab = false; 756 parser->column += chars_to_tab; 757 parser->offset += 1; 758 count -= 1; 759 } 760 } else { 761 parser->partially_consumed_tab = false; 762 parser->offset += 1; 763 parser->column += 1; // assume ascii; block starts are ascii 764 count -= 1; 765 } 766 } 767 } 768 769 static bool S_last_child_is_open(cmark_node *container) { 770 return container->last_child && 771 (container->last_child->flags & CMARK_NODE__OPEN); 772 } 773 774 static bool parse_block_quote_prefix(cmark_parser *parser, cmark_chunk *input) { 775 bool res = false; 776 bufsize_t matched = 0; 777 778 matched = 779 parser->indent <= 3 && peek_at(input, parser->first_nonspace) == '>'; 780 if (matched) { 781 782 S_advance_offset(parser, input, parser->indent + 1, true); 783 784 if (S_is_space_or_tab(peek_at(input, parser->offset))) { 785 S_advance_offset(parser, input, 1, true); 786 } 787 788 res = true; 789 } 790 return res; 791 } 792 793 static bool parse_node_item_prefix(cmark_parser *parser, cmark_chunk *input, 794 cmark_node *container) { 795 bool res = false; 796 797 if (parser->indent >= 798 container->as.list.marker_offset + container->as.list.padding) { 799 S_advance_offset(parser, input, container->as.list.marker_offset + 800 container->as.list.padding, 801 true); 802 res = true; 803 } else if (parser->blank && container->first_child != NULL) { 804 // if container->first_child is NULL, then the opening line 805 // of the list item was blank after the list marker; in this 806 // case, we are done with the list item. 807 S_advance_offset(parser, input, parser->first_nonspace - parser->offset, 808 false); 809 res = true; 810 } 811 return res; 812 } 813 814 static bool parse_code_block_prefix(cmark_parser *parser, cmark_chunk *input, 815 cmark_node *container, 816 bool *should_continue) { 817 bool res = false; 818 819 if (!container->as.code.fenced) { // indented 820 if (parser->indent >= CODE_INDENT) { 821 S_advance_offset(parser, input, CODE_INDENT, true); 822 res = true; 823 } else if (parser->blank) { 824 S_advance_offset(parser, input, parser->first_nonspace - parser->offset, 825 false); 826 res = true; 827 } 828 } else { // fenced 829 bufsize_t matched = 0; 830 831 if (parser->indent <= 3 && (peek_at(input, parser->first_nonspace) == 832 container->as.code.fence_char)) { 833 matched = scan_close_code_fence(input, parser->first_nonspace); 834 } 835 836 if (matched >= container->as.code.fence_length) { 837 // closing fence - and since we're at 838 // the end of a line, we can stop processing it: 839 *should_continue = false; 840 S_advance_offset(parser, input, matched, false); 841 parser->current = finalize(parser, container); 842 } else { 843 // skip opt. spaces of fence parser->offset 844 int i = container->as.code.fence_offset; 845 846 while (i > 0 && S_is_space_or_tab(peek_at(input, parser->offset))) { 847 S_advance_offset(parser, input, 1, true); 848 i--; 849 } 850 res = true; 851 } 852 } 853 854 return res; 855 } 856 857 static bool parse_html_block_prefix(cmark_parser *parser, 858 cmark_node *container) { 859 bool res = false; 860 int html_block_type = container->as.html_block_type; 861 862 assert(html_block_type >= 1 && html_block_type <= 7); 863 switch (html_block_type) { 864 case 1: 865 case 2: 866 case 3: 867 case 4: 868 case 5: 869 // these types of blocks can accept blanks 870 res = true; 871 break; 872 case 6: 873 case 7: 874 res = !parser->blank; 875 break; 876 } 877 878 return res; 879 } 880 881 /** 882 * For each containing node, try to parse the associated line start. 883 * 884 * Will not close unmatched blocks, as we may have a lazy continuation 885 * line -> http://spec.commonmark.org/0.24/#lazy-continuation-line 886 * 887 * Returns: The last matching node, or NULL 888 */ 889 static cmark_node *check_open_blocks(cmark_parser *parser, cmark_chunk *input, 890 bool *all_matched) { 891 bool should_continue = true; 892 *all_matched = false; 893 cmark_node *container = parser->root; 894 cmark_node_type cont_type; 895 896 while (S_last_child_is_open(container)) { 897 container = container->last_child; 898 cont_type = S_type(container); 899 900 S_find_first_nonspace(parser, input); 901 902 switch (cont_type) { 903 case CMARK_NODE_BLOCK_QUOTE: 904 if (!parse_block_quote_prefix(parser, input)) 905 goto done; 906 break; 907 case CMARK_NODE_ITEM: 908 if (!parse_node_item_prefix(parser, input, container)) 909 goto done; 910 break; 911 case CMARK_NODE_CODE_BLOCK: 912 if (!parse_code_block_prefix(parser, input, container, &should_continue)) 913 goto done; 914 break; 915 case CMARK_NODE_HEADING: 916 // a heading can never contain more than one line 917 goto done; 918 case CMARK_NODE_HTML_BLOCK: 919 if (!parse_html_block_prefix(parser, container)) 920 goto done; 921 break; 922 case CMARK_NODE_PARAGRAPH: 923 if (parser->blank) 924 goto done; 925 break; 926 default: 927 break; 928 } 929 } 930 931 *all_matched = true; 932 933 done: 934 if (!*all_matched) { 935 container = container->parent; // back up to last matching node 936 } 937 938 if (!should_continue) { 939 container = NULL; 940 } 941 942 return container; 943 } 944 945 static void open_new_blocks(cmark_parser *parser, cmark_node **container, 946 cmark_chunk *input, bool all_matched) { 947 bool indented; 948 cmark_list *data = NULL; 949 bool maybe_lazy = S_type(parser->current) == CMARK_NODE_PARAGRAPH; 950 cmark_node_type cont_type = S_type(*container); 951 bufsize_t matched = 0; 952 int lev = 0; 953 bool save_partially_consumed_tab; 954 bool has_content; 955 int save_offset; 956 int save_column; 957 958 while (cont_type != CMARK_NODE_CODE_BLOCK && 959 cont_type != CMARK_NODE_HTML_BLOCK) { 960 961 S_find_first_nonspace(parser, input); 962 indented = parser->indent >= CODE_INDENT; 963 964 if (!indented && peek_at(input, parser->first_nonspace) == '>') { 965 966 bufsize_t blockquote_startpos = parser->first_nonspace; 967 968 S_advance_offset(parser, input, 969 parser->first_nonspace + 1 - parser->offset, false); 970 // optional following character 971 if (S_is_space_or_tab(peek_at(input, parser->offset))) { 972 S_advance_offset(parser, input, 1, true); 973 } 974 *container = add_child(parser, *container, CMARK_NODE_BLOCK_QUOTE, 975 blockquote_startpos + 1); 976 977 } else if (!indented && (matched = scan_atx_heading_start( 978 input, parser->first_nonspace))) { 979 bufsize_t hashpos; 980 int level = 0; 981 bufsize_t heading_startpos = parser->first_nonspace; 982 983 S_advance_offset(parser, input, 984 parser->first_nonspace + matched - parser->offset, 985 false); 986 *container = add_child(parser, *container, CMARK_NODE_HEADING, 987 heading_startpos + 1); 988 989 hashpos = cmark_chunk_strchr(input, '#', parser->first_nonspace); 990 991 while (peek_at(input, hashpos) == '#') { 992 level++; 993 hashpos++; 994 } 995 996 (*container)->as.heading.level = level; 997 (*container)->as.heading.setext = false; 998 (*container)->internal_offset = matched; 999 1000 } else if (!indented && (matched = scan_open_code_fence( 1001 input, parser->first_nonspace))) { 1002 *container = add_child(parser, *container, CMARK_NODE_CODE_BLOCK, 1003 parser->first_nonspace + 1); 1004 (*container)->as.code.fenced = true; 1005 (*container)->as.code.fence_char = peek_at(input, parser->first_nonspace); 1006 (*container)->as.code.fence_length = (matched > 255) ? 255 : matched; 1007 (*container)->as.code.fence_offset = 1008 (int8_t)(parser->first_nonspace - parser->offset); 1009 (*container)->as.code.info = NULL; 1010 S_advance_offset(parser, input, 1011 parser->first_nonspace + matched - parser->offset, 1012 false); 1013 1014 } else if (!indented && ((matched = scan_html_block_start( 1015 input, parser->first_nonspace)) || 1016 (cont_type != CMARK_NODE_PARAGRAPH && 1017 !maybe_lazy && 1018 (matched = scan_html_block_start_7( 1019 input, parser->first_nonspace))))) { 1020 *container = add_child(parser, *container, CMARK_NODE_HTML_BLOCK, 1021 parser->first_nonspace + 1); 1022 (*container)->as.html_block_type = matched; 1023 // note, we don't adjust parser->offset because the tag is part of the 1024 // text 1025 } else if (!indented && cont_type == CMARK_NODE_PARAGRAPH && 1026 (lev = 1027 scan_setext_heading_line(input, parser->first_nonspace))) { 1028 // finalize paragraph, resolving reference links 1029 has_content = resolve_reference_link_definitions(parser); 1030 1031 if (has_content) { 1032 1033 (*container)->type = (uint16_t)CMARK_NODE_HEADING; 1034 (*container)->as.heading.level = lev; 1035 (*container)->as.heading.setext = true; 1036 S_advance_offset(parser, input, input->len - 1 - parser->offset, false); 1037 } 1038 } else if (!indented && 1039 !(cont_type == CMARK_NODE_PARAGRAPH && !all_matched) && 1040 (parser->thematic_break_kill_pos <= parser->first_nonspace) && 1041 (matched = S_scan_thematic_break(parser, input, parser->first_nonspace))) { 1042 // it's only now that we know the line is not part of a setext heading: 1043 *container = add_child(parser, *container, CMARK_NODE_THEMATIC_BREAK, 1044 parser->first_nonspace + 1); 1045 S_advance_offset(parser, input, input->len - 1 - parser->offset, false); 1046 } else if ((!indented || cont_type == CMARK_NODE_LIST) && 1047 parser->indent < 4 && 1048 (matched = parse_list_marker( 1049 parser->mem, input, parser->first_nonspace, 1050 (*container)->type == CMARK_NODE_PARAGRAPH, &data))) { 1051 1052 // Note that we can have new list items starting with >= 4 1053 // spaces indent, as long as the list container is still open. 1054 int i = 0; 1055 1056 // compute padding: 1057 S_advance_offset(parser, input, 1058 parser->first_nonspace + matched - parser->offset, 1059 false); 1060 1061 save_partially_consumed_tab = parser->partially_consumed_tab; 1062 save_offset = parser->offset; 1063 save_column = parser->column; 1064 1065 while (parser->column - save_column <= 5 && 1066 S_is_space_or_tab(peek_at(input, parser->offset))) { 1067 S_advance_offset(parser, input, 1, true); 1068 } 1069 1070 i = parser->column - save_column; 1071 if (i >= 5 || i < 1 || 1072 // only spaces after list marker: 1073 S_is_line_end_char(peek_at(input, parser->offset))) { 1074 data->padding = matched + 1; 1075 parser->offset = save_offset; 1076 parser->column = save_column; 1077 parser->partially_consumed_tab = save_partially_consumed_tab; 1078 if (i > 0) { 1079 S_advance_offset(parser, input, 1, true); 1080 } 1081 } else { 1082 data->padding = matched + i; 1083 } 1084 1085 // check container; if it's a list, see if this list item 1086 // can continue the list; otherwise, create a list container. 1087 1088 data->marker_offset = parser->indent; 1089 1090 if (cont_type != CMARK_NODE_LIST || 1091 !lists_match(&((*container)->as.list), data)) { 1092 *container = add_child(parser, *container, CMARK_NODE_LIST, 1093 parser->first_nonspace + 1); 1094 1095 memcpy(&((*container)->as.list), data, sizeof(*data)); 1096 } 1097 1098 // add the list item 1099 *container = add_child(parser, *container, CMARK_NODE_ITEM, 1100 parser->first_nonspace + 1); 1101 /* TODO: static */ 1102 memcpy(&((*container)->as.list), data, sizeof(*data)); 1103 parser->mem->free(data); 1104 } else if (indented && !maybe_lazy && !parser->blank) { 1105 S_advance_offset(parser, input, CODE_INDENT, true); 1106 *container = add_child(parser, *container, CMARK_NODE_CODE_BLOCK, 1107 parser->offset + 1); 1108 (*container)->as.code.fenced = false; 1109 (*container)->as.code.fence_char = 0; 1110 (*container)->as.code.fence_length = 0; 1111 (*container)->as.code.fence_offset = 0; 1112 (*container)->as.code.info = NULL; 1113 1114 } else { 1115 break; 1116 } 1117 1118 if (accepts_lines(S_type(*container))) { 1119 // if it's a line container, it can't contain other containers 1120 break; 1121 } 1122 1123 cont_type = S_type(*container); 1124 maybe_lazy = false; 1125 } 1126 } 1127 1128 static void add_text_to_container(cmark_parser *parser, cmark_node *container, 1129 cmark_node *last_matched_container, 1130 cmark_chunk *input) { 1131 cmark_node *tmp; 1132 // what remains at parser->offset is a text line. add the text to the 1133 // appropriate container. 1134 1135 S_find_first_nonspace(parser, input); 1136 1137 if (parser->blank && container->last_child) 1138 S_set_last_line_blank(container->last_child, true); 1139 1140 // block quote lines are never blank as they start with > 1141 // and we don't count blanks in fenced code for purposes of tight/loose 1142 // lists or breaking out of lists. we also don't set last_line_blank 1143 // on an empty list item. 1144 const cmark_node_type ctype = S_type(container); 1145 const bool last_line_blank = 1146 (parser->blank && ctype != CMARK_NODE_BLOCK_QUOTE && 1147 ctype != CMARK_NODE_HEADING && ctype != CMARK_NODE_THEMATIC_BREAK && 1148 !(ctype == CMARK_NODE_CODE_BLOCK && container->as.code.fenced) && 1149 !(ctype == CMARK_NODE_ITEM && container->first_child == NULL && 1150 container->start_line == parser->line_number)); 1151 1152 S_set_last_line_blank(container, last_line_blank); 1153 1154 tmp = container; 1155 while (tmp->parent) { 1156 S_set_last_line_blank(tmp->parent, false); 1157 tmp = tmp->parent; 1158 } 1159 1160 // If the last line processed belonged to a paragraph node, 1161 // and we didn't match all of the line prefixes for the open containers, 1162 // and we didn't start any new containers, 1163 // and the line isn't blank, 1164 // then treat this as a "lazy continuation line" and add it to 1165 // the open paragraph. 1166 if (parser->current != last_matched_container && 1167 container == last_matched_container && !parser->blank && 1168 S_type(parser->current) == CMARK_NODE_PARAGRAPH) { 1169 add_line(input, parser); 1170 } else { // not a lazy continuation 1171 // Finalize any blocks that were not matched and set cur to container: 1172 while (parser->current != last_matched_container) { 1173 parser->current = finalize(parser, parser->current); 1174 assert(parser->current != NULL); 1175 } 1176 1177 if (S_type(container) == CMARK_NODE_CODE_BLOCK) { 1178 add_line(input, parser); 1179 } else if (S_type(container) == CMARK_NODE_HTML_BLOCK) { 1180 add_line(input, parser); 1181 1182 int matches_end_condition; 1183 switch (container->as.html_block_type) { 1184 case 1: 1185 // </script>, </style>, </textarea>, </pre> 1186 matches_end_condition = 1187 scan_html_block_end_1(input, parser->first_nonspace); 1188 break; 1189 case 2: 1190 // --> 1191 matches_end_condition = 1192 scan_html_block_end_2(input, parser->first_nonspace); 1193 break; 1194 case 3: 1195 // ?> 1196 matches_end_condition = 1197 scan_html_block_end_3(input, parser->first_nonspace); 1198 break; 1199 case 4: 1200 // > 1201 matches_end_condition = 1202 scan_html_block_end_4(input, parser->first_nonspace); 1203 break; 1204 case 5: 1205 // ]]> 1206 matches_end_condition = 1207 scan_html_block_end_5(input, parser->first_nonspace); 1208 break; 1209 default: 1210 matches_end_condition = 0; 1211 break; 1212 } 1213 1214 if (matches_end_condition) { 1215 container = finalize(parser, container); 1216 assert(parser->current != NULL); 1217 } 1218 } else if (parser->blank) { 1219 // ??? do nothing 1220 } else if (accepts_lines(S_type(container))) { 1221 if (S_type(container) == CMARK_NODE_HEADING && 1222 container->as.heading.setext == false) { 1223 chop_trailing_hashtags(input); 1224 } 1225 S_advance_offset(parser, input, parser->first_nonspace - parser->offset, 1226 false); 1227 add_line(input, parser); 1228 } else { 1229 // create paragraph container for line 1230 container = add_child(parser, container, CMARK_NODE_PARAGRAPH, 1231 parser->first_nonspace + 1); 1232 S_advance_offset(parser, input, parser->first_nonspace - parser->offset, 1233 false); 1234 add_line(input, parser); 1235 } 1236 1237 parser->current = container; 1238 } 1239 } 1240 1241 /* See http://spec.commonmark.org/0.24/#phase-1-block-structure */ 1242 static void S_process_line(cmark_parser *parser, const unsigned char *buffer, 1243 bufsize_t bytes) { 1244 cmark_node *last_matched_container; 1245 bool all_matched = true; 1246 cmark_node *container; 1247 cmark_chunk input; 1248 1249 if (parser->options & CMARK_OPT_VALIDATE_UTF8) 1250 cmark_utf8proc_check(&parser->curline, buffer, bytes); 1251 else 1252 cmark_strbuf_put(&parser->curline, buffer, bytes); 1253 1254 bytes = parser->curline.size; 1255 1256 // ensure line ends with a newline: 1257 if (bytes == 0 || !S_is_line_end_char(parser->curline.ptr[bytes - 1])) 1258 cmark_strbuf_putc(&parser->curline, '\n'); 1259 1260 parser->offset = 0; 1261 parser->column = 0; 1262 parser->first_nonspace = 0; 1263 parser->first_nonspace_column = 0; 1264 parser->thematic_break_kill_pos = 0; 1265 parser->indent = 0; 1266 parser->blank = false; 1267 parser->partially_consumed_tab = false; 1268 1269 input.data = parser->curline.ptr; 1270 input.len = parser->curline.size; 1271 1272 parser->line_number++; 1273 1274 last_matched_container = check_open_blocks(parser, &input, &all_matched); 1275 1276 if (!last_matched_container) 1277 goto finished; 1278 1279 container = last_matched_container; 1280 1281 open_new_blocks(parser, &container, &input, all_matched); 1282 1283 add_text_to_container(parser, container, last_matched_container, &input); 1284 1285 finished: 1286 parser->last_line_length = input.len; 1287 if (parser->last_line_length && 1288 input.data[parser->last_line_length - 1] == '\n') 1289 parser->last_line_length -= 1; 1290 if (parser->last_line_length && 1291 input.data[parser->last_line_length - 1] == '\r') 1292 parser->last_line_length -= 1; 1293 1294 cmark_strbuf_clear(&parser->curline); 1295 } 1296 1297 cmark_node *cmark_parser_finish(cmark_parser *parser) { 1298 if (parser->linebuf.size) { 1299 S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size); 1300 cmark_strbuf_clear(&parser->linebuf); 1301 } 1302 1303 finalize_document(parser); 1304 1305 cmark_consolidate_text_nodes(parser->root); 1306 1307 cmark_strbuf_free(&parser->curline); 1308 1309 #if CMARK_DEBUG_NODES 1310 if (cmark_node_check(parser->root, stderr)) { 1311 abort(); 1312 } 1313 #endif 1314 return parser->root; 1315 }