cmark

My personal build of CMark ✏️

commonmark.c (12853B)

  1 #include <stdlib.h>
  2 #include <stdio.h>
  3 #include <string.h>
  4 #include <stdint.h>
  5 #include <assert.h>
  6 
  7 #include "config.h"
  8 #include "cmark.h"
  9 #include "node.h"
 10 #include "buffer.h"
 11 #include "utf8.h"
 12 #include "scanners.h"
 13 #include "render.h"
 14 
// Shorthand wrappers around the renderer callbacks. Each of the first four
// expands to a call through a variable named `renderer`, so they can only be
// used where such a variable is in scope.

// Emit string `s` through renderer->out with the given wrap flag and
// escaping mode.
#define OUT(s, wrap, escaping) renderer->out(renderer, s, wrap, escaping)
// Emit string `s` with wrapping disabled and LITERAL escaping.
#define LIT(s) renderer->out(renderer, s, false, LITERAL)
// Invoke the renderer's cr callback.
#define CR() renderer->cr(renderer)
// Invoke the renderer's blankline callback.
#define BLANKLINE() renderer->blankline(renderer)
// Size in bytes of the scratch buffer outc() formats escapes into.
#define ENCODED_SIZE 20
// Size in bytes of the buffer S_render_node() formats ordered-list markers
// into.
#define LISTMARKER_SIZE 20
 21 
 22 // Functions to convert cmark_nodes to commonmark strings.
 23 
 24 static CMARK_INLINE void outc(cmark_renderer *renderer, cmark_escaping escape,
 25                               int32_t c, unsigned char nextc) {
 26   bool needs_escaping = false;
 27   bool follows_digit =
 28       renderer->buffer->size > 0 &&
 29       cmark_isdigit(renderer->buffer->ptr[renderer->buffer->size - 1]);
 30   char encoded[ENCODED_SIZE];
 31   int options = renderer->options;
 32 
 33   needs_escaping =
 34       c < 0x80 && escape != LITERAL &&
 35       ((escape == NORMAL &&
 36         (c < 0x20 ||
 37 	 c == '*' || c == '_' || c == '[' || c == ']' || c == '#' || c == '<' ||
 38          c == '>' || c == '\\' || c == '`' || c == '!' ||
 39          (c == '&' && cmark_isalpha(nextc)) || (c == '!' && nextc == '[') ||
 40 	 ((CMARK_OPT_SMART & options) &&
 41 	    ((c == '-' && nextc == '-') ||
 42 	     (c == '.' && nextc == '.') ||
 43 	     c == '"' || c == '\'')) ||
 44          (renderer->begin_content && (c == '-' || c == '+' || c == '=') &&
 45           // begin_content doesn't get set to false til we've passed digits
 46           // at the beginning of line, so...
 47           !follows_digit) ||
 48          (renderer->begin_content && (c == '.' || c == ')') && follows_digit &&
 49           (nextc == 0 || cmark_isspace(nextc))))) ||
 50        (escape == URL &&
 51         (c == '`' || c == '<' || c == '>' || cmark_isspace(c) || c == '\\' ||
 52          c == ')' || c == '(')) ||
 53        (escape == TITLE &&
 54         (c == '`' || c == '<' || c == '>' || c == '"' || c == '\\')));
 55 
 56   if (needs_escaping) {
 57     if (escape == URL && cmark_isspace(c)) {
 58       // use percent encoding for spaces
 59       snprintf(encoded, ENCODED_SIZE, "%%%2X", c);
 60       cmark_strbuf_puts(renderer->buffer, encoded);
 61       renderer->column += 3;
 62     } else if (cmark_ispunct(c)) {
 63       cmark_render_ascii(renderer, "\\");
 64       cmark_render_code_point(renderer, c);
 65     } else { // render as entity
 66       snprintf(encoded, ENCODED_SIZE, "&#%d;", c);
 67       cmark_strbuf_puts(renderer->buffer, encoded);
 68       renderer->column += strlen(encoded);
 69     }
 70   } else {
 71     cmark_render_code_point(renderer, c);
 72   }
 73 }
 74 
 75 static int longest_backtick_sequence(const char *code) {
 76   int longest = 0;
 77   int current = 0;
 78   size_t i = 0;
 79   size_t code_len = strlen(code);
 80   while (i <= code_len) {
 81     if (code[i] == '`') {
 82       current++;
 83     } else {
 84       if (current > longest) {
 85         longest = current;
 86       }
 87       current = 0;
 88     }
 89     i++;
 90   }
 91   return longest;
 92 }
 93 
 94 static int shortest_unused_backtick_sequence(const char *code) {
 95   // note: if the shortest sequence is >= 32, this returns 32
 96   // so as not to overflow the bit array.
 97   uint32_t used = 1;
 98   int current = 0;
 99   size_t i = 0;
100   size_t code_len = strlen(code);
101   while (i <= code_len) {
102     if (code[i] == '`') {
103       current++;
104     } else {
105       if (current > 0 && current < 32) {
106         used |= (1U << current);
107       }
108       current = 0;
109     }
110     i++;
111   }
112   // return number of first bit that is 0:
113   i = 0;
114   while (i < 32 && used & 1) {
115     used = used >> 1;
116     i++;
117   }
118   return (int)i;
119 }
120 
121 static bool is_autolink(cmark_node *node) {
122   const unsigned char *title;
123   const unsigned char *url;
124   cmark_node *link_text;
125 
126   if (node->type != CMARK_NODE_LINK) {
127     return false;
128   }
129 
130   url = node->as.link.url;
131   if (url == NULL || _scan_scheme(url) == 0) {
132     return false;
133   }
134 
135   title = node->as.link.title;
136   // if it has a title, we can't treat it as an autolink:
137   if (title && title[0]) {
138     return false;
139   }
140 
141   link_text = node->first_child;
142   if (link_text == NULL) {
143     return false;
144   }
145   cmark_consolidate_text_nodes(link_text);
146   if (strncmp((const char *)url, "mailto:", 7) == 0) {
147     url += 7;
148   }
149   return link_text->data != NULL &&
150          strcmp((const char *)url, (char *)link_text->data) == 0;
151 }
152 
153 // if node is a block node, returns node.
154 // otherwise returns first block-level node that is an ancestor of node.
155 // if there is no block-level ancestor, returns NULL.
156 static cmark_node *get_containing_block(cmark_node *node) {
157   while (node) {
158     if (node->type >= CMARK_NODE_FIRST_BLOCK &&
159         node->type <= CMARK_NODE_LAST_BLOCK) {
160       return node;
161     } else {
162       node = node->parent;
163     }
164   }
165   return NULL;
166 }
167 
// Renders a single node as CommonMark text for one traversal event.
//
//   renderer - output state; this function writes through the OUT/LIT/CR/
//              BLANKLINE macros and updates renderer->prefix,
//              renderer->begin_content, renderer->no_linebreaks and
//              renderer->in_tight_list_item.
//   node     - the node being visited.
//   ev_type  - CMARK_EVENT_ENTER when the node is being entered; any other
//              value is treated as leaving the node.
//   options  - bitmask of CMARK_OPT_* flags.
//
// Returns 1 normally. Returns 0 after writing an autolink on entry, which
// signals that the contents of the node should be skipped.
static int S_render_node(cmark_renderer *renderer, cmark_node *node,
                         cmark_event_type ev_type, int options) {
  cmark_node *tmp;
  int list_number;
  cmark_delim_type list_delim;
  size_t numticks;
  bool extra_spaces;
  size_t i;
  bool entering = (ev_type == CMARK_EVENT_ENTER);
  const char *info, *code, *title;
  // one-character, NUL-terminated string holding the fence character
  char fencechar[2] = {'\0', '\0'};
  size_t code_len;
  char listmarker[LISTMARKER_SIZE];
  const char *emph_delim;
  bool first_in_list_item;
  bufsize_t marker_width;
  bool has_nonspace;
  // Wrapping is only allowed when a width is set and neither NOBREAKS nor
  // HARDBREAKS is requested.
  bool allow_wrap = renderer->width > 0 && !(CMARK_OPT_NOBREAKS & options) &&
                    !(CMARK_OPT_HARDBREAKS & options);

  // Don't adjust tight list status until we've started the list.
  // Otherwise we lose the blank line between a paragraph and
  // a following list.
  if (!(node->type == CMARK_NODE_ITEM && node->prev == NULL && entering)) {
    tmp = get_containing_block(node);
    // Tight if the containing block is an item of a tight list, or is a
    // direct child of such an item.
    renderer->in_tight_list_item =
        tmp && // tmp might be NULL if there is no containing block
        ((tmp->type == CMARK_NODE_ITEM &&
          cmark_node_get_list_tight(tmp->parent)) ||
         (tmp && tmp->parent && tmp->parent->type == CMARK_NODE_ITEM &&
          cmark_node_get_list_tight(tmp->parent->parent)));
  }

  switch (node->type) {
  case CMARK_NODE_DOCUMENT:
    break;

  case CMARK_NODE_BLOCK_QUOTE:
    if (entering) {
      // write the marker now and add it to the prefix as well
      LIT("> ");
      renderer->begin_content = true;
      cmark_strbuf_puts(renderer->prefix, "> ");
    } else {
      // drop the two characters ("> ") added to the prefix on entry
      cmark_strbuf_truncate(renderer->prefix, renderer->prefix->size - 2);
      BLANKLINE();
    }
    break;

  case CMARK_NODE_LIST:
    if (!entering && node->next && (node->next->type == CMARK_NODE_LIST)) {
      // this ensures that a following indented code block or list will be
      // interpreted correctly.
      CR();
      LIT("<!-- end list -->");
      BLANKLINE();
    }
    break;

  case CMARK_NODE_ITEM:
    // marker_width is computed on both enter and exit: on enter that many
    // spaces are appended to the prefix, on exit the same number of
    // characters is removed again.
    if (cmark_node_get_list_type(node->parent) == CMARK_BULLET_LIST) {
      // width of the bullet marker "  - " written below
      marker_width = 4;
    } else {
      list_number = cmark_node_get_list_start(node->parent);
      list_delim = cmark_node_get_list_delim(node->parent);
      // this item's number is the list start plus the count of preceding
      // siblings
      tmp = node;
      while (tmp->prev) {
        tmp = tmp->prev;
        list_number += 1;
      }
      // we ensure a width of at least 4 so
      // we get nice transition from single digits
      // to double
      snprintf(listmarker, LISTMARKER_SIZE, "%d%s%s", list_number,
               list_delim == CMARK_PAREN_DELIM ? ")" : ".",
               list_number < 10 ? "  " : " ");
      marker_width = strlen(listmarker);
    }
    if (entering) {
      if (cmark_node_get_list_type(node->parent) == CMARK_BULLET_LIST) {
        LIT("  - ");
        renderer->begin_content = true;
      } else {
        LIT(listmarker);
        renderer->begin_content = true;
      }
      // extend the prefix by marker_width spaces
      for (i = marker_width; i--;) {
        cmark_strbuf_putc(renderer->prefix, ' ');
      }
    } else {
      cmark_strbuf_truncate(renderer->prefix,
                            renderer->prefix->size - marker_width);
      CR();
    }
    break;

  case CMARK_NODE_HEADING:
    if (entering) {
      // ATX-style heading: one '#' per heading level, then a space
      for (i = cmark_node_get_heading_level(node); i > 0; i--) {
        LIT("#");
      }
      LIT(" ");
      renderer->begin_content = true;
      renderer->no_linebreaks = true;
    } else {
      renderer->no_linebreaks = false;
      BLANKLINE();
    }
    break;

  case CMARK_NODE_CODE_BLOCK:

    // true when this code block is the first child of a list item; in that
    // case no BLANKLINE() is issued before the fence
    first_in_list_item = node->prev == NULL && node->parent &&
                         node->parent->type == CMARK_NODE_ITEM;

    if (!first_in_list_item) {
      BLANKLINE();
    }
    info = cmark_node_get_fence_info(node);
    // fence with backticks unless the info string itself contains a
    // backtick, in which case use tildes
    fencechar[0] = strchr(info, '`') == NULL ? '`' : '~';
    code = cmark_node_get_literal(node);

    // the fence is one character longer than the longest backtick run in
    // the code, and at least three characters long
    numticks = longest_backtick_sequence(code) + 1;
    if (numticks < 3) {
      numticks = 3;
    }
    for (i = 0; i < numticks; i++) {
      LIT(fencechar);
    }
    // a space is written after the opening fence, followed by the info
    // string
    LIT(" ");
    OUT(info, false, LITERAL);
    CR();
    OUT(cmark_node_get_literal(node), false, LITERAL);
    CR();
    for (i = 0; i < numticks; i++) {
      LIT(fencechar);
    }

    BLANKLINE();
    break;

  case CMARK_NODE_HTML_BLOCK:
    BLANKLINE();
    OUT(cmark_node_get_literal(node), false, LITERAL);
    BLANKLINE();
    break;

  case CMARK_NODE_CUSTOM_BLOCK:
    // the on_enter text is written when entering, the on_exit text otherwise
    BLANKLINE();
    OUT(entering ? cmark_node_get_on_enter(node) : cmark_node_get_on_exit(node),
        false, LITERAL);
    BLANKLINE();
    break;

  case CMARK_NODE_THEMATIC_BREAK:
    BLANKLINE();
    LIT("-----");
    BLANKLINE();
    break;

  case CMARK_NODE_PARAGRAPH:
    if (!entering) {
      BLANKLINE();
    }
    break;

  case CMARK_NODE_TEXT:
    OUT(cmark_node_get_literal(node), allow_wrap, NORMAL);
    break;

  case CMARK_NODE_LINEBREAK:
    // the two trailing spaces are omitted when HARDBREAKS is set
    if (!(CMARK_OPT_HARDBREAKS & options)) {
      LIT("  ");
    }
    CR();
    break;

  case CMARK_NODE_SOFTBREAK:
    if (CMARK_OPT_HARDBREAKS & options) {
      // with HARDBREAKS, a soft break is written as two spaces plus CR()
      LIT("  ");
      CR();
    } else if (!renderer->no_linebreaks && renderer->width == 0 &&
               !(CMARK_OPT_HARDBREAKS & options) &&
               !(CMARK_OPT_NOBREAKS & options)) {
      // no width set and line breaks permitted: keep the break
      CR();
    } else {
      // otherwise the soft break becomes a single space
      OUT(" ", allow_wrap, LITERAL);
    }
    break;

  case CMARK_NODE_CODE:
    code = cmark_node_get_literal(node);
    code_len = strlen(code);
    // delimiter length: the shortest backtick run that does not occur in
    // the code itself
    numticks = shortest_unused_backtick_sequence(code);
    has_nonspace = false;
    for (i=0; i < code_len; i++) {
      if (code[i] != ' ') {
        has_nonspace = true;
        break;
      }
    }
    // pad the content with one space on each side when it is empty, starts
    // or ends with a backtick, or (not being all spaces) both starts and
    // ends with a space
    extra_spaces = code_len == 0 ||
	    code[0] == '`' || code[code_len - 1] == '`' ||
	    (has_nonspace && code[0] == ' ' && code[code_len - 1] == ' ');
    for (i = 0; i < numticks; i++) {
      LIT("`");
    }
    if (extra_spaces) {
      LIT(" ");
    }
    OUT(cmark_node_get_literal(node), allow_wrap, LITERAL);
    if (extra_spaces) {
      LIT(" ");
    }
    for (i = 0; i < numticks; i++) {
      LIT("`");
    }
    break;

  case CMARK_NODE_HTML_INLINE:
    OUT(cmark_node_get_literal(node), false, LITERAL);
    break;

  case CMARK_NODE_CUSTOM_INLINE:
    // the on_enter text is written when entering, the on_exit text otherwise
    OUT(entering ? cmark_node_get_on_enter(node) : cmark_node_get_on_exit(node),
        false, LITERAL);
    break;

  case CMARK_NODE_STRONG:
    // the same delimiter is written on enter and on exit
    if (entering) {
      LIT("**");
    } else {
      LIT("**");
    }
    break;

  case CMARK_NODE_EMPH:
    // If we have EMPH(EMPH(x)), we need to use *_x_*
    // because **x** is STRONG(x):
    if (node->parent && node->parent->type == CMARK_NODE_EMPH &&
        node->next == NULL && node->prev == NULL) {
      emph_delim = "_";
    } else {
      emph_delim = "*";
    }
    // the same delimiter is written on enter and on exit
    if (entering) {
      LIT(emph_delim);
    } else {
      LIT(emph_delim);
    }
    break;

  case CMARK_NODE_LINK:
    if (is_autolink(node)) {
      if (entering) {
        // autolink form: <url>, with a leading "mailto:" stripped
        LIT("<");
        if (strncmp(cmark_node_get_url(node), "mailto:", 7) == 0) {
          LIT((const char *)cmark_node_get_url(node) + 7);
        } else {
          LIT((const char *)cmark_node_get_url(node));
        }
        LIT(">");
        // return signal to skip contents of node...
        return 0;
      }
    } else {
      // inline form: [text](url "title"); the title part is written only
      // when the title is non-empty
      if (entering) {
        LIT("[");
      } else {
        LIT("](");
        OUT(cmark_node_get_url(node), false, URL);
        title = cmark_node_get_title(node);
        if (strlen(title) > 0) {
          LIT(" \"");
          OUT(title, false, TITLE);
          LIT("\"");
        }
        LIT(")");
      }
    }
    break;

  case CMARK_NODE_IMAGE:
    // ![text](url "title"); the title part is written only when the title
    // is non-empty
    if (entering) {
      LIT("![");
    } else {
      LIT("](");
      OUT(cmark_node_get_url(node), false, URL);
      title = cmark_node_get_title(node);
      if (strlen(title) > 0) {
        OUT(" \"", allow_wrap, LITERAL);
        OUT(title, false, TITLE);
        LIT("\"");
      }
      LIT(")");
    }
    break;

  default:
    // no other node type is expected here
    assert(false);
    break;
  }

  return 1;
}
472 
473 char *cmark_render_commonmark(cmark_node *root, int options, int width) {
474   if (options & CMARK_OPT_HARDBREAKS) {
475     // disable breaking on width, since it has
476     // a different meaning with OPT_HARDBREAKS
477     width = 0;
478   }
479   return cmark_render(root, options, width, outc, S_render_node);
480 }