cmark

My personal build of CMark ✏️

blocks.c (40674B)

   1 /**
   2  * Block parsing implementation.
   3  *
   4  * For a high-level overview of the block parsing process,
   5  * see http://spec.commonmark.org/0.24/#phase-1-block-structure
   6  */
   7 
   8 #include <stdlib.h>
   9 #include <assert.h>
  10 #include <stdio.h>
  11 #include <limits.h>
  12 
  13 #include "cmark_ctype.h"
  14 #include "config.h"
  15 #include "parser.h"
  16 #include "cmark.h"
  17 #include "node.h"
  18 #include "references.h"
  19 #include "utf8.h"
  20 #include "scanners.h"
  21 #include "inlines.h"
  22 #include "houdini.h"
  23 #include "buffer.h"
  24 #include "chunk.h"
  25 
  26 #define CODE_INDENT 4
  27 #define TAB_STOP 4
  28 
  29 #ifndef MIN
  30 #define MIN(x, y) ((x < y) ? x : y)
  31 #endif
  32 
  33 #define peek_at(i, n) (i)->data[n]
  34 
  35 static bool S_last_line_blank(const cmark_node *node) {
  36   return (node->flags & CMARK_NODE__LAST_LINE_BLANK) != 0;
  37 }
  38 
  39 static bool S_last_line_checked(const cmark_node *node) {
  40   return (node->flags & CMARK_NODE__LAST_LINE_CHECKED) != 0;
  41 }
  42 
  43 static CMARK_INLINE cmark_node_type S_type(const cmark_node *node) {
  44   return (cmark_node_type)node->type;
  45 }
  46 
  47 static void S_set_last_line_blank(cmark_node *node, bool is_blank) {
  48   if (is_blank)
  49     node->flags |= CMARK_NODE__LAST_LINE_BLANK;
  50   else
  51     node->flags &= ~CMARK_NODE__LAST_LINE_BLANK;
  52 }
  53 
  54 static void S_set_last_line_checked(cmark_node *node) {
  55   node->flags |= CMARK_NODE__LAST_LINE_CHECKED;
  56 }
  57 
  58 static CMARK_INLINE bool S_is_line_end_char(char c) {
  59   return (c == '\n' || c == '\r');
  60 }
  61 
  62 static CMARK_INLINE bool S_is_space_or_tab(char c) {
  63   return (c == ' ' || c == '\t');
  64 }
  65 
// Forward declaration; the definition appears later in this file.
// Splits `len` bytes of `buffer` into lines and processes them; `eof`
// indicates that a trailing line without a line ending should be
// processed as well.
static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer,
                          size_t len, bool eof);

// Forward declaration of the per-line handler that S_parser_feed calls
// with each line's bytes and byte count.
static void S_process_line(cmark_parser *parser, const unsigned char *buffer,
                           bufsize_t bytes);
  71 
  72 static cmark_node *make_block(cmark_mem *mem, cmark_node_type tag,
  73                               int start_line, int start_column) {
  74   cmark_node *e;
  75 
  76   e = (cmark_node *)mem->calloc(1, sizeof(*e));
  77   e->mem = mem;
  78   e->type = (uint16_t)tag;
  79   e->flags = CMARK_NODE__OPEN;
  80   e->start_line = start_line;
  81   e->start_column = start_column;
  82   e->end_line = start_line;
  83 
  84   return e;
  85 }
  86 
  87 // Create a root document node.
  88 static cmark_node *make_document(cmark_mem *mem) {
  89   cmark_node *e = make_block(mem, CMARK_NODE_DOCUMENT, 1, 1);
  90   return e;
  91 }
  92 
  93 cmark_parser *cmark_parser_new_with_mem(int options, cmark_mem *mem) {
  94   cmark_parser *parser = (cmark_parser *)mem->calloc(1, sizeof(cmark_parser));
  95   parser->mem = mem;
  96 
  97   cmark_node *document = make_document(mem);
  98 
  99   cmark_strbuf_init(mem, &parser->curline, 256);
 100   cmark_strbuf_init(mem, &parser->linebuf, 0);
 101   cmark_strbuf_init(mem, &parser->content, 0);
 102 
 103   parser->refmap = cmark_reference_map_new(mem);
 104   parser->root = document;
 105   parser->current = document;
 106   parser->line_number = 0;
 107   parser->offset = 0;
 108   parser->column = 0;
 109   parser->first_nonspace = 0;
 110   parser->first_nonspace_column = 0;
 111   parser->thematic_break_kill_pos = 0;
 112   parser->indent = 0;
 113   parser->blank = false;
 114   parser->partially_consumed_tab = false;
 115   parser->last_line_length = 0;
 116   parser->options = options;
 117   parser->last_buffer_ended_with_cr = false;
 118 
 119   return parser;
 120 }
 121 
 122 cmark_parser *cmark_parser_new(int options) {
 123   extern cmark_mem DEFAULT_MEM_ALLOCATOR;
 124   return cmark_parser_new_with_mem(options, &DEFAULT_MEM_ALLOCATOR);
 125 }
 126 
 127 void cmark_parser_free(cmark_parser *parser) {
 128   cmark_mem *mem = parser->mem;
 129   cmark_strbuf_free(&parser->curline);
 130   cmark_strbuf_free(&parser->linebuf);
 131   cmark_reference_map_free(parser->refmap);
 132   mem->free(parser);
 133 }
 134 
// Forward declaration; the definition appears later in this file.
// Closes block `b` and returns its parent.
static cmark_node *finalize(cmark_parser *parser, cmark_node *b);
 136 
 137 // Returns true if line has only space characters, else false.
 138 static bool is_blank(cmark_strbuf *s, bufsize_t offset) {
 139   while (offset < s->size) {
 140     switch (s->ptr[offset]) {
 141     case '\r':
 142     case '\n':
 143       return true;
 144     case ' ':
 145       offset++;
 146       break;
 147     case '\t':
 148       offset++;
 149       break;
 150     default:
 151       return false;
 152     }
 153   }
 154 
 155   return true;
 156 }
 157 
 158 static CMARK_INLINE bool can_contain(cmark_node_type parent_type,
 159                                      cmark_node_type child_type) {
 160   return (parent_type == CMARK_NODE_DOCUMENT ||
 161           parent_type == CMARK_NODE_BLOCK_QUOTE ||
 162           parent_type == CMARK_NODE_ITEM ||
 163           (parent_type == CMARK_NODE_LIST && child_type == CMARK_NODE_ITEM));
 164 }
 165 
 166 static CMARK_INLINE bool accepts_lines(cmark_node_type block_type) {
 167   return (block_type == CMARK_NODE_PARAGRAPH ||
 168           block_type == CMARK_NODE_HEADING ||
 169           block_type == CMARK_NODE_CODE_BLOCK);
 170 }
 171 
 172 static CMARK_INLINE bool contains_inlines(cmark_node_type block_type) {
 173   return (block_type == CMARK_NODE_PARAGRAPH ||
 174           block_type == CMARK_NODE_HEADING);
 175 }
 176 
 177 static void add_line(cmark_chunk *ch, cmark_parser *parser) {
 178   int chars_to_tab;
 179   int i;
 180   if (parser->partially_consumed_tab) {
 181     parser->offset += 1; // skip over tab
 182     // add space characters:
 183     chars_to_tab = TAB_STOP - (parser->column % TAB_STOP);
 184     for (i = 0; i < chars_to_tab; i++) {
 185       cmark_strbuf_putc(&parser->content, ' ');
 186     }
 187   }
 188   cmark_strbuf_put(&parser->content, ch->data + parser->offset,
 189                    ch->len - parser->offset);
 190 }
 191 
 192 static void remove_trailing_blank_lines(cmark_strbuf *ln) {
 193   bufsize_t i;
 194   unsigned char c;
 195 
 196   for (i = ln->size - 1; i >= 0; --i) {
 197     c = ln->ptr[i];
 198 
 199     if (c != ' ' && c != '\t' && !S_is_line_end_char(c))
 200       break;
 201   }
 202 
 203   if (i < 0) {
 204     cmark_strbuf_clear(ln);
 205     return;
 206   }
 207 
 208   for (; i < ln->size; ++i) {
 209     c = ln->ptr[i];
 210 
 211     if (!S_is_line_end_char(c))
 212       continue;
 213 
 214     cmark_strbuf_truncate(ln, i);
 215     break;
 216   }
 217 }
 218 
 219 // Check to see if a node ends with a blank line, descending
 220 // if needed into lists and sublists.
 221 static bool S_ends_with_blank_line(cmark_node *node) {
 222   if (S_last_line_checked(node)) {
 223     return(S_last_line_blank(node));
 224   } else if ((S_type(node) == CMARK_NODE_LIST ||
 225               S_type(node) == CMARK_NODE_ITEM) && node->last_child) {
 226     S_set_last_line_checked(node);
 227     return(S_ends_with_blank_line(node->last_child));
 228   } else {
 229     S_set_last_line_checked(node);
 230     return (S_last_line_blank(node));
 231   }
 232 }
 233 
 234 // returns true if content remains after link defs are resolved.
 235 static bool resolve_reference_link_definitions(cmark_parser *parser) {
 236   bufsize_t pos;
 237   cmark_strbuf *node_content = &parser->content;
 238   cmark_chunk chunk = {node_content->ptr, node_content->size};
 239   while (chunk.len && chunk.data[0] == '[' &&
 240          (pos = cmark_parse_reference_inline(parser->mem, &chunk,
 241 					     parser->refmap))) {
 242 
 243     chunk.data += pos;
 244     chunk.len -= pos;
 245   }
 246   cmark_strbuf_drop(node_content, (node_content->size - chunk.len));
 247   return !is_blank(node_content, 0);
 248 }
 249 
// Close block `b`: clear its OPEN flag, record where it ends, and move the
// text accumulated in parser->content into the node in the form its type
// requires. A paragraph that consisted only of link reference definitions is
// freed. Returns the parent of `b`.
static cmark_node *finalize(cmark_parser *parser, cmark_node *b) {
  bufsize_t pos;
  cmark_node *item;
  cmark_node *subitem;
  cmark_node *parent;
  bool has_content;

  // Captured first because `b` may be freed in the paragraph case below.
  parent = b->parent;
  assert(b->flags &
         CMARK_NODE__OPEN); // shouldn't call finalize on closed blocks
  b->flags &= ~CMARK_NODE__OPEN;

  if (parser->curline.size == 0) {
    // end of input - line number has not been incremented
    b->end_line = parser->line_number;
    b->end_column = parser->last_line_length;
  } else if (S_type(b) == CMARK_NODE_DOCUMENT ||
             (S_type(b) == CMARK_NODE_CODE_BLOCK && b->as.code.fenced) ||
             (S_type(b) == CMARK_NODE_HEADING && b->as.heading.setext)) {
    // These blocks end on the current line; the end column is the line
    // length without a trailing "\n" and/or "\r".
    b->end_line = parser->line_number;
    b->end_column = parser->curline.size;
    if (b->end_column && parser->curline.ptr[b->end_column - 1] == '\n')
      b->end_column -= 1;
    if (b->end_column && parser->curline.ptr[b->end_column - 1] == '\r')
      b->end_column -= 1;
  } else {
    // Every other block ended on the previous line.
    b->end_line = parser->line_number - 1;
    b->end_column = parser->last_line_length;
  }

  cmark_strbuf *node_content = &parser->content;

  switch (S_type(b)) {
  case CMARK_NODE_PARAGRAPH:
  {
    // Strip leading link reference definitions; keep the paragraph only if
    // something other than blank space is left.
    has_content = resolve_reference_link_definitions(parser);
    if (!has_content) {
      // remove blank node (former reference def)
      cmark_node_free(b);
    } else {
      b->len = node_content->size;
      b->data = cmark_strbuf_detach(node_content);
    }
    break;
  }

  case CMARK_NODE_CODE_BLOCK:
    if (!b->as.code.fenced) { // indented code
      remove_trailing_blank_lines(node_content);
      cmark_strbuf_putc(node_content, '\n');
    } else {
      // first line of contents becomes info
      for (pos = 0; pos < node_content->size; ++pos) {
        if (S_is_line_end_char(node_content->ptr[pos]))
          break;
      }
      assert(pos < node_content->size);

      if (pos == 0) {
        // Empty first line: no info string.
        b->as.code.info = NULL;
      } else {
        // Unescape HTML entities, trim, then apply cmark_strbuf_unescape
        // before storing the info string on the node.
        cmark_strbuf tmp = CMARK_BUF_INIT(parser->mem);
        houdini_unescape_html_f(&tmp, node_content->ptr, pos);
        cmark_strbuf_trim(&tmp);
        cmark_strbuf_unescape(&tmp);
        b->as.code.info = cmark_strbuf_detach(&tmp);
      }

      // Skip the line ending ("\r", "\n" or "\r\n") that follows the info
      // line, then drop the info line from the content.
      if (node_content->ptr[pos] == '\r')
        pos += 1;
      if (node_content->ptr[pos] == '\n')
        pos += 1;
      cmark_strbuf_drop(node_content, pos);
    }
    b->len = node_content->size;
    b->data = cmark_strbuf_detach(node_content);
    break;

  case CMARK_NODE_HEADING:
  case CMARK_NODE_HTML_BLOCK:
    // Content is attached to the node unchanged.
    b->len = node_content->size;
    b->data = cmark_strbuf_detach(node_content);
    break;

  case CMARK_NODE_LIST:      // determine tight/loose status
    b->as.list.tight = true; // tight by default
    item = b->first_child;

    while (item) {
      // check for non-final non-empty list item ending with blank line:
      if (S_last_line_blank(item) && item->next) {
        b->as.list.tight = false;
        break;
      }
      // recurse into children of list item, to see if there are
      // spaces between them:
      subitem = item->first_child;
      while (subitem) {
        // The last child of the last item is exempt from the check.
        if ((item->next || subitem->next) &&
            S_ends_with_blank_line(subitem)) {
          b->as.list.tight = false;
          break;
        }
        subitem = subitem->next;
      }
      if (!(b->as.list.tight)) {
        break;
      }
      item = item->next;
    }

    break;

  default:
    break;
  }

  return parent;
}
 369 
 370 // Add a node as child of another.  Return pointer to child.
 371 static cmark_node *add_child(cmark_parser *parser, cmark_node *parent,
 372                              cmark_node_type block_type, int start_column) {
 373   assert(parent);
 374 
 375   // if 'parent' isn't the kind of node that can accept this child,
 376   // then back up til we hit a node that can.
 377   while (!can_contain(S_type(parent), block_type)) {
 378     parent = finalize(parser, parent);
 379   }
 380 
 381   cmark_node *child =
 382       make_block(parser->mem, block_type, parser->line_number, start_column);
 383   child->parent = parent;
 384 
 385   if (parent->last_child) {
 386     parent->last_child->next = child;
 387     child->prev = parent->last_child;
 388   } else {
 389     parent->first_child = child;
 390     child->prev = NULL;
 391   }
 392   parent->last_child = child;
 393   return child;
 394 }
 395 
 396 // Walk through node and all children, recursively, parsing
 397 // string content into inline content where appropriate.
 398 static void process_inlines(cmark_mem *mem, cmark_node *root,
 399                             cmark_reference_map *refmap, int options) {
 400   cmark_iter *iter = cmark_iter_new(root);
 401   cmark_node *cur;
 402   cmark_event_type ev_type;
 403 
 404   while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
 405     cur = cmark_iter_get_node(iter);
 406     if (ev_type == CMARK_EVENT_ENTER) {
 407       if (contains_inlines(S_type(cur))) {
 408         cmark_parse_inlines(mem, cur, refmap, options);
 409         mem->free(cur->data);
 410         cur->data = NULL;
 411         cur->len = 0;
 412       }
 413     }
 414   }
 415 
 416   cmark_iter_free(iter);
 417 }
 418 
 419 // Attempts to parse a list item marker (bullet or enumerated).
 420 // On success, returns length of the marker, and populates
 421 // data with the details.  On failure, returns 0.
 422 static bufsize_t parse_list_marker(cmark_mem *mem, cmark_chunk *input,
 423                                    bufsize_t pos, bool interrupts_paragraph,
 424                                    cmark_list **dataptr) {
 425   unsigned char c;
 426   bufsize_t startpos;
 427   cmark_list *data;
 428   bufsize_t i;
 429 
 430   startpos = pos;
 431   c = peek_at(input, pos);
 432 
 433   if (c == '*' || c == '-' || c == '+') {
 434     pos++;
 435     if (!cmark_isspace(peek_at(input, pos))) {
 436       return 0;
 437     }
 438 
 439     if (interrupts_paragraph) {
 440       i = pos;
 441       // require non-blank content after list marker:
 442       while (S_is_space_or_tab(peek_at(input, i))) {
 443         i++;
 444       }
 445       if (peek_at(input, i) == '\n') {
 446         return 0;
 447       }
 448     }
 449 
 450     data = (cmark_list *)mem->calloc(1, sizeof(*data));
 451     data->marker_offset = 0; // will be adjusted later
 452     data->list_type = CMARK_BULLET_LIST;
 453     data->bullet_char = c;
 454     data->start = 0;
 455     data->delimiter = CMARK_NO_DELIM;
 456     data->tight = false;
 457   } else if (cmark_isdigit(c)) {
 458     int start = 0;
 459     int digits = 0;
 460 
 461     do {
 462       start = (10 * start) + (peek_at(input, pos) - '0');
 463       pos++;
 464       digits++;
 465       // We limit to 9 digits to avoid overflow,
 466       // assuming max int is 2^31 - 1
 467       // This also seems to be the limit for 'start' in some browsers.
 468     } while (digits < 9 && cmark_isdigit(peek_at(input, pos)));
 469 
 470     if (interrupts_paragraph && start != 1) {
 471       return 0;
 472     }
 473     c = peek_at(input, pos);
 474     if (c == '.' || c == ')') {
 475       pos++;
 476       if (!cmark_isspace(peek_at(input, pos))) {
 477         return 0;
 478       }
 479       if (interrupts_paragraph) {
 480         // require non-blank content after list marker:
 481         i = pos;
 482         while (S_is_space_or_tab(peek_at(input, i))) {
 483           i++;
 484         }
 485         if (S_is_line_end_char(peek_at(input, i))) {
 486           return 0;
 487         }
 488       }
 489 
 490       data = (cmark_list *)mem->calloc(1, sizeof(*data));
 491       data->marker_offset = 0; // will be adjusted later
 492       data->list_type = CMARK_ORDERED_LIST;
 493       data->bullet_char = 0;
 494       data->start = start;
 495       data->delimiter = (c == '.' ? CMARK_PERIOD_DELIM : CMARK_PAREN_DELIM);
 496       data->tight = false;
 497     } else {
 498       return 0;
 499     }
 500   } else {
 501     return 0;
 502   }
 503 
 504   *dataptr = data;
 505   return (pos - startpos);
 506 }
 507 
 508 // Return 1 if list item belongs in list, else 0.
 509 static int lists_match(cmark_list *list_data, cmark_list *item_data) {
 510   return (list_data->list_type == item_data->list_type &&
 511           list_data->delimiter == item_data->delimiter &&
 512           // list_data->marker_offset == item_data.marker_offset &&
 513           list_data->bullet_char == item_data->bullet_char);
 514 }
 515 
 516 static cmark_node *finalize_document(cmark_parser *parser) {
 517   while (parser->current != parser->root) {
 518     parser->current = finalize(parser, parser->current);
 519   }
 520 
 521   finalize(parser, parser->root);
 522 
 523   // Limit total size of extra content created from reference links to
 524   // document size to avoid superlinear growth. Always allow 100KB.
 525   if (parser->total_size > 100000)
 526     parser->refmap->max_ref_size = parser->total_size;
 527   else
 528     parser->refmap->max_ref_size = 100000;
 529 
 530   process_inlines(parser->mem, parser->root, parser->refmap, parser->options);
 531 
 532   cmark_strbuf_free(&parser->content);
 533 
 534   return parser->root;
 535 }
 536 
 537 cmark_node *cmark_parse_file(FILE *f, int options) {
 538   unsigned char buffer[4096];
 539   cmark_parser *parser = cmark_parser_new(options);
 540   size_t bytes;
 541   cmark_node *document;
 542 
 543   while ((bytes = fread(buffer, 1, sizeof(buffer), f)) > 0) {
 544     bool eof = bytes < sizeof(buffer);
 545     S_parser_feed(parser, buffer, bytes, eof);
 546     if (eof) {
 547       break;
 548     }
 549   }
 550 
 551   document = cmark_parser_finish(parser);
 552   cmark_parser_free(parser);
 553   return document;
 554 }
 555 
 556 cmark_node *cmark_parse_document(const char *buffer, size_t len, int options) {
 557   cmark_parser *parser = cmark_parser_new(options);
 558   cmark_node *document;
 559 
 560   S_parser_feed(parser, (const unsigned char *)buffer, len, true);
 561 
 562   document = cmark_parser_finish(parser);
 563   cmark_parser_free(parser);
 564   return document;
 565 }
 566 
 567 void cmark_parser_feed(cmark_parser *parser, const char *buffer, size_t len) {
 568   S_parser_feed(parser, (const unsigned char *)buffer, len, false);
 569 }
 570 
 571 static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer,
 572                           size_t len, bool eof) {
 573   const unsigned char *end = buffer + len;
 574   static const uint8_t repl[] = {239, 191, 189};
 575 
 576   if (len > UINT_MAX - parser->total_size)
 577     parser->total_size = UINT_MAX;
 578   else
 579     parser->total_size += len;
 580 
 581   // Skip UTF-8 BOM if present; see #334
 582   if (parser->line_number == 0 && parser->column == 0 && len >= 3 &&
 583       *buffer == 0xEF && *(buffer + 1) == 0xBB &&
 584       *(buffer + 2) == 0xBF) {
 585     buffer += 3;
 586   } else if (parser->last_buffer_ended_with_cr && *buffer == '\n') {
 587     // skip NL if last buffer ended with CR ; see #117
 588     buffer++;
 589   }
 590 
 591   parser->last_buffer_ended_with_cr = false;
 592   while (buffer < end) {
 593     const unsigned char *eol;
 594     bufsize_t chunk_len;
 595     bool process = false;
 596     for (eol = buffer; eol < end; ++eol) {
 597       if (S_is_line_end_char(*eol)) {
 598         process = true;
 599         break;
 600       }
 601       if (*eol == '\0' && eol < end) {
 602         break;
 603       }
 604     }
 605     if (eol >= end && eof) {
 606       process = true;
 607     }
 608 
 609     chunk_len = (eol - buffer);
 610     if (process) {
 611       if (parser->linebuf.size > 0) {
 612         cmark_strbuf_put(&parser->linebuf, buffer, chunk_len);
 613         S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size);
 614         cmark_strbuf_clear(&parser->linebuf);
 615       } else {
 616         S_process_line(parser, buffer, chunk_len);
 617       }
 618     } else {
 619       if (eol < end && *eol == '\0') {
 620         // omit NULL byte
 621         cmark_strbuf_put(&parser->linebuf, buffer, chunk_len);
 622         // add replacement character
 623         cmark_strbuf_put(&parser->linebuf, repl, 3);
 624       } else {
 625         cmark_strbuf_put(&parser->linebuf, buffer, chunk_len);
 626       }
 627     }
 628 
 629     buffer += chunk_len;
 630     if (buffer < end) {
 631       if (*buffer == '\0') {
 632         // skip over NULL
 633         buffer++;
 634       } else {
 635         // skip over line ending characters
 636         if (*buffer == '\r') {
 637           buffer++;
 638           if (buffer == end)
 639             parser->last_buffer_ended_with_cr = true;
 640         }
 641         if (buffer < end && *buffer == '\n')
 642           buffer++;
 643       }
 644     }
 645   }
 646 }
 647 
 648 static void chop_trailing_hashtags(cmark_chunk *ch) {
 649   bufsize_t n, orig_n;
 650 
 651   cmark_chunk_rtrim(ch);
 652   orig_n = n = ch->len - 1;
 653 
 654   // if string ends in space followed by #s, remove these:
 655   while (n >= 0 && peek_at(ch, n) == '#')
 656     n--;
 657 
 658   // Check for a space before the final #s:
 659   if (n != orig_n && n >= 0 && S_is_space_or_tab(peek_at(ch, n))) {
 660     ch->len = n;
 661     cmark_chunk_rtrim(ch);
 662   }
 663 }
 664 
 665 // Check for thematic break.  On failure, return 0 and update
 666 // thematic_break_kill_pos with the index at which the
 667 // parse fails.  On success, return length of match.
 668 // "...three or more hyphens, asterisks,
 669 // or underscores on a line by themselves. If you wish, you may use
 670 // spaces between the hyphens or asterisks."
 671 static int S_scan_thematic_break(cmark_parser *parser, cmark_chunk *input,
 672                                  bufsize_t offset) {
 673   bufsize_t i;
 674   char c;
 675   char nextc = '\0';
 676   int count;
 677   i = offset;
 678   c = peek_at(input, i);
 679   if (!(c == '*' || c == '_' || c == '-')) {
 680     parser->thematic_break_kill_pos = i;
 681     return 0;
 682   }
 683   count = 1;
 684   while ((nextc = peek_at(input, ++i))) {
 685     if (nextc == c) {
 686       count++;
 687     } else if (nextc != ' ' && nextc != '\t') {
 688       break;
 689     }
 690   }
 691   if (count >= 3 && (nextc == '\r' || nextc == '\n')) {
 692     return (i - offset) + 1;
 693   } else {
 694     parser->thematic_break_kill_pos = i;
 695     return 0;
 696   }
 697 }
 698 
 699 // Find first nonspace character from current offset, setting
 700 // parser->first_nonspace, parser->first_nonspace_column,
 701 // parser->indent, and parser->blank. Does not advance parser->offset.
 702 static void S_find_first_nonspace(cmark_parser *parser, cmark_chunk *input) {
 703   char c;
 704   int chars_to_tab = TAB_STOP - (parser->column % TAB_STOP);
 705 
 706   if (parser->first_nonspace <= parser->offset) {
 707     parser->first_nonspace = parser->offset;
 708     parser->first_nonspace_column = parser->column;
 709     while ((c = peek_at(input, parser->first_nonspace))) {
 710       if (c == ' ') {
 711         parser->first_nonspace += 1;
 712         parser->first_nonspace_column += 1;
 713         chars_to_tab = chars_to_tab - 1;
 714         if (chars_to_tab == 0) {
 715           chars_to_tab = TAB_STOP;
 716         }
 717       } else if (c == '\t') {
 718         parser->first_nonspace += 1;
 719         parser->first_nonspace_column += chars_to_tab;
 720         chars_to_tab = TAB_STOP;
 721       } else {
 722         break;
 723       }
 724     }
 725   }
 726 
 727   parser->indent = parser->first_nonspace_column - parser->column;
 728   parser->blank = S_is_line_end_char(peek_at(input, parser->first_nonspace));
 729 }
 730 
 731 // Advance parser->offset and parser->column.  parser->offset is the
 732 // byte position in input; parser->column is a virtual column number
 733 // that takes into account tabs. (Multibyte characters are not taken
 734 // into account, because the Markdown line prefixes we are interested in
 735 // analyzing are entirely ASCII.)  The count parameter indicates
 736 // how far to advance the offset.  If columns is true, then count
 737 // indicates a number of columns; otherwise, a number of bytes.
 738 // If advancing a certain number of columns partially consumes
 739 // a tab character, parser->partially_consumed_tab is set to true.
 740 static void S_advance_offset(cmark_parser *parser, cmark_chunk *input,
 741                              bufsize_t count, bool columns) {
 742   char c;
 743   int chars_to_tab;
 744   int chars_to_advance;
 745   while (count > 0 && (c = peek_at(input, parser->offset))) {
 746     if (c == '\t') {
 747       chars_to_tab = TAB_STOP - (parser->column % TAB_STOP);
 748       if (columns) {
 749         parser->partially_consumed_tab = chars_to_tab > count;
 750         chars_to_advance = MIN(count, chars_to_tab);
 751         parser->column += chars_to_advance;
 752         parser->offset += (parser->partially_consumed_tab ? 0 : 1);
 753         count -= chars_to_advance;
 754       } else {
 755         parser->partially_consumed_tab = false;
 756         parser->column += chars_to_tab;
 757         parser->offset += 1;
 758         count -= 1;
 759       }
 760     } else {
 761       parser->partially_consumed_tab = false;
 762       parser->offset += 1;
 763       parser->column += 1; // assume ascii; block starts are ascii
 764       count -= 1;
 765     }
 766   }
 767 }
 768 
 769 static bool S_last_child_is_open(cmark_node *container) {
 770   return container->last_child &&
 771          (container->last_child->flags & CMARK_NODE__OPEN);
 772 }
 773 
 774 static bool parse_block_quote_prefix(cmark_parser *parser, cmark_chunk *input) {
 775   bool res = false;
 776   bufsize_t matched = 0;
 777 
 778   matched =
 779       parser->indent <= 3 && peek_at(input, parser->first_nonspace) == '>';
 780   if (matched) {
 781 
 782     S_advance_offset(parser, input, parser->indent + 1, true);
 783 
 784     if (S_is_space_or_tab(peek_at(input, parser->offset))) {
 785       S_advance_offset(parser, input, 1, true);
 786     }
 787 
 788     res = true;
 789   }
 790   return res;
 791 }
 792 
 793 static bool parse_node_item_prefix(cmark_parser *parser, cmark_chunk *input,
 794                                    cmark_node *container) {
 795   bool res = false;
 796 
 797   if (parser->indent >=
 798       container->as.list.marker_offset + container->as.list.padding) {
 799     S_advance_offset(parser, input, container->as.list.marker_offset +
 800                                         container->as.list.padding,
 801                      true);
 802     res = true;
 803   } else if (parser->blank && container->first_child != NULL) {
 804     // if container->first_child is NULL, then the opening line
 805     // of the list item was blank after the list marker; in this
 806     // case, we are done with the list item.
 807     S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
 808                      false);
 809     res = true;
 810   }
 811   return res;
 812 }
 813 
 814 static bool parse_code_block_prefix(cmark_parser *parser, cmark_chunk *input,
 815                                     cmark_node *container,
 816                                     bool *should_continue) {
 817   bool res = false;
 818 
 819   if (!container->as.code.fenced) { // indented
 820     if (parser->indent >= CODE_INDENT) {
 821       S_advance_offset(parser, input, CODE_INDENT, true);
 822       res = true;
 823     } else if (parser->blank) {
 824       S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
 825                        false);
 826       res = true;
 827     }
 828   } else { // fenced
 829     bufsize_t matched = 0;
 830 
 831     if (parser->indent <= 3 && (peek_at(input, parser->first_nonspace) ==
 832                                 container->as.code.fence_char)) {
 833       matched = scan_close_code_fence(input, parser->first_nonspace);
 834     }
 835 
 836     if (matched >= container->as.code.fence_length) {
 837       // closing fence - and since we're at
 838       // the end of a line, we can stop processing it:
 839       *should_continue = false;
 840       S_advance_offset(parser, input, matched, false);
 841       parser->current = finalize(parser, container);
 842     } else {
 843       // skip opt. spaces of fence parser->offset
 844       int i = container->as.code.fence_offset;
 845 
 846       while (i > 0 && S_is_space_or_tab(peek_at(input, parser->offset))) {
 847         S_advance_offset(parser, input, 1, true);
 848         i--;
 849       }
 850       res = true;
 851     }
 852   }
 853 
 854   return res;
 855 }
 856 
 857 static bool parse_html_block_prefix(cmark_parser *parser,
 858                                     cmark_node *container) {
 859   bool res = false;
 860   int html_block_type = container->as.html_block_type;
 861 
 862   assert(html_block_type >= 1 && html_block_type <= 7);
 863   switch (html_block_type) {
 864   case 1:
 865   case 2:
 866   case 3:
 867   case 4:
 868   case 5:
 869     // these types of blocks can accept blanks
 870     res = true;
 871     break;
 872   case 6:
 873   case 7:
 874     res = !parser->blank;
 875     break;
 876   }
 877 
 878   return res;
 879 }
 880 
 881 /**
 882  * For each containing node, try to parse the associated line start.
 883  *
 884  * Will not close unmatched blocks, as we may have a lazy continuation
 885  * line -> http://spec.commonmark.org/0.24/#lazy-continuation-line
 886  *
 887  * Returns: The last matching node, or NULL
 888  */
 889 static cmark_node *check_open_blocks(cmark_parser *parser, cmark_chunk *input,
 890                                      bool *all_matched) {
 891   bool should_continue = true;
 892   *all_matched = false;
 893   cmark_node *container = parser->root;
 894   cmark_node_type cont_type;
 895 
 896   while (S_last_child_is_open(container)) {
 897     container = container->last_child;
 898     cont_type = S_type(container);
 899 
 900     S_find_first_nonspace(parser, input);
 901 
 902     switch (cont_type) {
 903     case CMARK_NODE_BLOCK_QUOTE:
 904       if (!parse_block_quote_prefix(parser, input))
 905         goto done;
 906       break;
 907     case CMARK_NODE_ITEM:
 908       if (!parse_node_item_prefix(parser, input, container))
 909         goto done;
 910       break;
 911     case CMARK_NODE_CODE_BLOCK:
 912       if (!parse_code_block_prefix(parser, input, container, &should_continue))
 913         goto done;
 914       break;
 915     case CMARK_NODE_HEADING:
 916       // a heading can never contain more than one line
 917       goto done;
 918     case CMARK_NODE_HTML_BLOCK:
 919       if (!parse_html_block_prefix(parser, container))
 920         goto done;
 921       break;
 922     case CMARK_NODE_PARAGRAPH:
 923       if (parser->blank)
 924         goto done;
 925       break;
 926     default:
 927       break;
 928     }
 929   }
 930 
 931   *all_matched = true;
 932 
 933 done:
 934   if (!*all_matched) {
 935     container = container->parent; // back up to last matching node
 936   }
 937 
 938   if (!should_continue) {
 939     container = NULL;
 940   }
 941 
 942   return container;
 943 }
 944 
 945 static void open_new_blocks(cmark_parser *parser, cmark_node **container,
 946                             cmark_chunk *input, bool all_matched) {
 947   bool indented;
 948   cmark_list *data = NULL;
 949   bool maybe_lazy = S_type(parser->current) == CMARK_NODE_PARAGRAPH;
 950   cmark_node_type cont_type = S_type(*container);
 951   bufsize_t matched = 0;
 952   int lev = 0;
 953   bool save_partially_consumed_tab;
 954   bool has_content;
 955   int save_offset;
 956   int save_column;
 957 
 958   while (cont_type != CMARK_NODE_CODE_BLOCK &&
 959          cont_type != CMARK_NODE_HTML_BLOCK) {
 960 
 961     S_find_first_nonspace(parser, input);
 962     indented = parser->indent >= CODE_INDENT;
 963 
 964     if (!indented && peek_at(input, parser->first_nonspace) == '>') {
 965 
 966       bufsize_t blockquote_startpos = parser->first_nonspace;
 967 
 968       S_advance_offset(parser, input,
 969                        parser->first_nonspace + 1 - parser->offset, false);
 970       // optional following character
 971       if (S_is_space_or_tab(peek_at(input, parser->offset))) {
 972         S_advance_offset(parser, input, 1, true);
 973       }
 974       *container = add_child(parser, *container, CMARK_NODE_BLOCK_QUOTE,
 975                              blockquote_startpos + 1);
 976 
 977     } else if (!indented && (matched = scan_atx_heading_start(
 978                                  input, parser->first_nonspace))) {
 979       bufsize_t hashpos;
 980       int level = 0;
 981       bufsize_t heading_startpos = parser->first_nonspace;
 982 
 983       S_advance_offset(parser, input,
 984                        parser->first_nonspace + matched - parser->offset,
 985                        false);
 986       *container = add_child(parser, *container, CMARK_NODE_HEADING,
 987                              heading_startpos + 1);
 988 
 989       hashpos = cmark_chunk_strchr(input, '#', parser->first_nonspace);
 990 
 991       while (peek_at(input, hashpos) == '#') {
 992         level++;
 993         hashpos++;
 994       }
 995 
 996       (*container)->as.heading.level = level;
 997       (*container)->as.heading.setext = false;
 998       (*container)->internal_offset = matched;
 999 
1000     } else if (!indented && (matched = scan_open_code_fence(
1001                                  input, parser->first_nonspace))) {
1002       *container = add_child(parser, *container, CMARK_NODE_CODE_BLOCK,
1003                              parser->first_nonspace + 1);
1004       (*container)->as.code.fenced = true;
1005       (*container)->as.code.fence_char = peek_at(input, parser->first_nonspace);
1006       (*container)->as.code.fence_length = (matched > 255) ? 255 : matched;
1007       (*container)->as.code.fence_offset =
1008           (int8_t)(parser->first_nonspace - parser->offset);
1009       (*container)->as.code.info = NULL;
1010       S_advance_offset(parser, input,
1011                        parser->first_nonspace + matched - parser->offset,
1012                        false);
1013 
1014     } else if (!indented && ((matched = scan_html_block_start(
1015                                   input, parser->first_nonspace)) ||
1016                              (cont_type != CMARK_NODE_PARAGRAPH &&
1017 			      !maybe_lazy &&
1018                               (matched = scan_html_block_start_7(
1019                                    input, parser->first_nonspace))))) {
1020       *container = add_child(parser, *container, CMARK_NODE_HTML_BLOCK,
1021                              parser->first_nonspace + 1);
1022       (*container)->as.html_block_type = matched;
1023       // note, we don't adjust parser->offset because the tag is part of the
1024       // text
1025     } else if (!indented && cont_type == CMARK_NODE_PARAGRAPH &&
1026                (lev =
1027                     scan_setext_heading_line(input, parser->first_nonspace))) {
1028       // finalize paragraph, resolving reference links
1029       has_content = resolve_reference_link_definitions(parser);
1030 
1031       if (has_content) {
1032 
1033         (*container)->type = (uint16_t)CMARK_NODE_HEADING;
1034         (*container)->as.heading.level = lev;
1035         (*container)->as.heading.setext = true;
1036         S_advance_offset(parser, input, input->len - 1 - parser->offset, false);
1037       }
1038     } else if (!indented &&
1039                !(cont_type == CMARK_NODE_PARAGRAPH && !all_matched) &&
1040 	       (parser->thematic_break_kill_pos <= parser->first_nonspace) &&
1041                (matched = S_scan_thematic_break(parser, input, parser->first_nonspace))) {
1042       // it's only now that we know the line is not part of a setext heading:
1043       *container = add_child(parser, *container, CMARK_NODE_THEMATIC_BREAK,
1044                              parser->first_nonspace + 1);
1045       S_advance_offset(parser, input, input->len - 1 - parser->offset, false);
1046     } else if ((!indented || cont_type == CMARK_NODE_LIST) &&
1047 	       parser->indent < 4 &&
1048                (matched = parse_list_marker(
1049                     parser->mem, input, parser->first_nonspace,
1050                     (*container)->type == CMARK_NODE_PARAGRAPH, &data))) {
1051 
1052       // Note that we can have new list items starting with >= 4
1053       // spaces indent, as long as the list container is still open.
1054       int i = 0;
1055 
1056       // compute padding:
1057       S_advance_offset(parser, input,
1058                        parser->first_nonspace + matched - parser->offset,
1059                        false);
1060 
1061       save_partially_consumed_tab = parser->partially_consumed_tab;
1062       save_offset = parser->offset;
1063       save_column = parser->column;
1064 
1065       while (parser->column - save_column <= 5 &&
1066              S_is_space_or_tab(peek_at(input, parser->offset))) {
1067         S_advance_offset(parser, input, 1, true);
1068       }
1069 
1070       i = parser->column - save_column;
1071       if (i >= 5 || i < 1 ||
1072           // only spaces after list marker:
1073           S_is_line_end_char(peek_at(input, parser->offset))) {
1074         data->padding = matched + 1;
1075         parser->offset = save_offset;
1076         parser->column = save_column;
1077         parser->partially_consumed_tab = save_partially_consumed_tab;
1078         if (i > 0) {
1079           S_advance_offset(parser, input, 1, true);
1080         }
1081       } else {
1082         data->padding = matched + i;
1083       }
1084 
1085       // check container; if it's a list, see if this list item
1086       // can continue the list; otherwise, create a list container.
1087 
1088       data->marker_offset = parser->indent;
1089 
1090       if (cont_type != CMARK_NODE_LIST ||
1091           !lists_match(&((*container)->as.list), data)) {
1092         *container = add_child(parser, *container, CMARK_NODE_LIST,
1093                                parser->first_nonspace + 1);
1094 
1095         memcpy(&((*container)->as.list), data, sizeof(*data));
1096       }
1097 
1098       // add the list item
1099       *container = add_child(parser, *container, CMARK_NODE_ITEM,
1100                              parser->first_nonspace + 1);
1101       /* TODO: static */
1102       memcpy(&((*container)->as.list), data, sizeof(*data));
1103       parser->mem->free(data);
1104     } else if (indented && !maybe_lazy && !parser->blank) {
1105       S_advance_offset(parser, input, CODE_INDENT, true);
1106       *container = add_child(parser, *container, CMARK_NODE_CODE_BLOCK,
1107                              parser->offset + 1);
1108       (*container)->as.code.fenced = false;
1109       (*container)->as.code.fence_char = 0;
1110       (*container)->as.code.fence_length = 0;
1111       (*container)->as.code.fence_offset = 0;
1112       (*container)->as.code.info = NULL;
1113 
1114     } else {
1115       break;
1116     }
1117 
1118     if (accepts_lines(S_type(*container))) {
1119       // if it's a line container, it can't contain other containers
1120       break;
1121     }
1122 
1123     cont_type = S_type(*container);
1124     maybe_lazy = false;
1125   }
1126 }
1127 
1128 static void add_text_to_container(cmark_parser *parser, cmark_node *container,
1129                                   cmark_node *last_matched_container,
1130                                   cmark_chunk *input) {
1131   cmark_node *tmp;
1132   // what remains at parser->offset is a text line.  add the text to the
1133   // appropriate container.
1134 
1135   S_find_first_nonspace(parser, input);
1136 
1137   if (parser->blank && container->last_child)
1138     S_set_last_line_blank(container->last_child, true);
1139 
1140   // block quote lines are never blank as they start with >
1141   // and we don't count blanks in fenced code for purposes of tight/loose
1142   // lists or breaking out of lists.  we also don't set last_line_blank
1143   // on an empty list item.
1144   const cmark_node_type ctype = S_type(container);
1145   const bool last_line_blank =
1146       (parser->blank && ctype != CMARK_NODE_BLOCK_QUOTE &&
1147        ctype != CMARK_NODE_HEADING && ctype != CMARK_NODE_THEMATIC_BREAK &&
1148        !(ctype == CMARK_NODE_CODE_BLOCK && container->as.code.fenced) &&
1149        !(ctype == CMARK_NODE_ITEM && container->first_child == NULL &&
1150          container->start_line == parser->line_number));
1151 
1152   S_set_last_line_blank(container, last_line_blank);
1153 
1154   tmp = container;
1155   while (tmp->parent) {
1156     S_set_last_line_blank(tmp->parent, false);
1157     tmp = tmp->parent;
1158   }
1159 
1160   // If the last line processed belonged to a paragraph node,
1161   // and we didn't match all of the line prefixes for the open containers,
1162   // and we didn't start any new containers,
1163   // and the line isn't blank,
1164   // then treat this as a "lazy continuation line" and add it to
1165   // the open paragraph.
1166   if (parser->current != last_matched_container &&
1167       container == last_matched_container && !parser->blank &&
1168       S_type(parser->current) == CMARK_NODE_PARAGRAPH) {
1169     add_line(input, parser);
1170   } else { // not a lazy continuation
1171     // Finalize any blocks that were not matched and set cur to container:
1172     while (parser->current != last_matched_container) {
1173       parser->current = finalize(parser, parser->current);
1174       assert(parser->current != NULL);
1175     }
1176 
1177     if (S_type(container) == CMARK_NODE_CODE_BLOCK) {
1178       add_line(input, parser);
1179     } else if (S_type(container) == CMARK_NODE_HTML_BLOCK) {
1180       add_line(input, parser);
1181 
1182       int matches_end_condition;
1183       switch (container->as.html_block_type) {
1184       case 1:
1185         // </script>, </style>, </textarea>, </pre>
1186         matches_end_condition =
1187             scan_html_block_end_1(input, parser->first_nonspace);
1188         break;
1189       case 2:
1190         // -->
1191         matches_end_condition =
1192             scan_html_block_end_2(input, parser->first_nonspace);
1193         break;
1194       case 3:
1195         // ?>
1196         matches_end_condition =
1197             scan_html_block_end_3(input, parser->first_nonspace);
1198         break;
1199       case 4:
1200         // >
1201         matches_end_condition =
1202             scan_html_block_end_4(input, parser->first_nonspace);
1203         break;
1204       case 5:
1205         // ]]>
1206         matches_end_condition =
1207             scan_html_block_end_5(input, parser->first_nonspace);
1208         break;
1209       default:
1210         matches_end_condition = 0;
1211         break;
1212       }
1213 
1214       if (matches_end_condition) {
1215         container = finalize(parser, container);
1216         assert(parser->current != NULL);
1217       }
1218     } else if (parser->blank) {
1219       // ??? do nothing
1220     } else if (accepts_lines(S_type(container))) {
1221       if (S_type(container) == CMARK_NODE_HEADING &&
1222           container->as.heading.setext == false) {
1223         chop_trailing_hashtags(input);
1224       }
1225       S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
1226                        false);
1227       add_line(input, parser);
1228     } else {
1229       // create paragraph container for line
1230       container = add_child(parser, container, CMARK_NODE_PARAGRAPH,
1231                             parser->first_nonspace + 1);
1232       S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
1233                        false);
1234       add_line(input, parser);
1235     }
1236 
1237     parser->current = container;
1238   }
1239 }
1240 
1241 /* See http://spec.commonmark.org/0.24/#phase-1-block-structure */
1242 static void S_process_line(cmark_parser *parser, const unsigned char *buffer,
1243                            bufsize_t bytes) {
1244   cmark_node *last_matched_container;
1245   bool all_matched = true;
1246   cmark_node *container;
1247   cmark_chunk input;
1248 
1249   if (parser->options & CMARK_OPT_VALIDATE_UTF8)
1250     cmark_utf8proc_check(&parser->curline, buffer, bytes);
1251   else
1252     cmark_strbuf_put(&parser->curline, buffer, bytes);
1253 
1254   bytes = parser->curline.size;
1255 
1256   // ensure line ends with a newline:
1257   if (bytes == 0 || !S_is_line_end_char(parser->curline.ptr[bytes - 1]))
1258     cmark_strbuf_putc(&parser->curline, '\n');
1259 
1260   parser->offset = 0;
1261   parser->column = 0;
1262   parser->first_nonspace = 0;
1263   parser->first_nonspace_column = 0;
1264   parser->thematic_break_kill_pos = 0;
1265   parser->indent = 0;
1266   parser->blank = false;
1267   parser->partially_consumed_tab = false;
1268 
1269   input.data = parser->curline.ptr;
1270   input.len = parser->curline.size;
1271 
1272   parser->line_number++;
1273 
1274   last_matched_container = check_open_blocks(parser, &input, &all_matched);
1275 
1276   if (!last_matched_container)
1277     goto finished;
1278 
1279   container = last_matched_container;
1280 
1281   open_new_blocks(parser, &container, &input, all_matched);
1282 
1283   add_text_to_container(parser, container, last_matched_container, &input);
1284 
1285 finished:
1286   parser->last_line_length = input.len;
1287   if (parser->last_line_length &&
1288       input.data[parser->last_line_length - 1] == '\n')
1289     parser->last_line_length -= 1;
1290   if (parser->last_line_length &&
1291       input.data[parser->last_line_length - 1] == '\r')
1292     parser->last_line_length -= 1;
1293 
1294   cmark_strbuf_clear(&parser->curline);
1295 }
1296 
1297 cmark_node *cmark_parser_finish(cmark_parser *parser) {
1298   if (parser->linebuf.size) {
1299     S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size);
1300     cmark_strbuf_clear(&parser->linebuf);
1301   }
1302 
1303   finalize_document(parser);
1304 
1305   cmark_consolidate_text_nodes(parser->root);
1306 
1307   cmark_strbuf_free(&parser->curline);
1308 
1309 #if CMARK_DEBUG_NODES
1310   if (cmark_node_check(parser->root, stderr)) {
1311     abort();
1312   }
1313 #endif
1314   return parser->root;
1315 }