cmark
My personal build of CMark ✏️
scanners.re (8662B)
1 #include <stdlib.h> 2 #include "chunk.h" 3 #include "scanners.h" 4 5 bufsize_t _scan_at(bufsize_t (*scanner)(const unsigned char *), cmark_chunk *c, bufsize_t offset) 6 { 7 bufsize_t res; 8 unsigned char *ptr = (unsigned char *)c->data; 9 10 if (ptr == NULL || offset > c->len) { 11 return 0; 12 } else { 13 unsigned char lim = ptr[c->len]; 14 15 ptr[c->len] = '\0'; 16 res = scanner(ptr + offset); 17 ptr[c->len] = lim; 18 } 19 20 return res; 21 } 22 23 /*!re2c 24 re2c:define:YYCTYPE = "unsigned char"; 25 re2c:define:YYCURSOR = p; 26 re2c:define:YYMARKER = marker; 27 re2c:define:YYCTXMARKER = marker; 28 re2c:yyfill:enable = 0; 29 30 wordchar = [^\x00-\x20]; 31 32 spacechar = [ \t\v\f\r\n]; 33 34 reg_char = [^\\()\x00-\x20]; 35 36 escaped_char = [\\][!"#$%&'()*+,./:;<=>?@[\\\]^_`{|}~-]; 37 38 tagname = [A-Za-z][A-Za-z0-9-]*; 39 40 blocktagname = 'address'|'article'|'aside'|'base'|'basefont'|'blockquote'|'body'|'caption'|'center'|'col'|'colgroup'|'dd'|'details'|'dialog'|'dir'|'div'|'dl'|'dt'|'fieldset'|'figcaption'|'figure'|'footer'|'form'|'frame'|'frameset'|'h1'|'h2'|'h3'|'h4'|'h5'|'h6'|'head'|'header'|'hr'|'html'|'iframe'|'legend'|'li'|'link'|'main'|'menu'|'menuitem'|'nav'|'noframes'|'ol'|'optgroup'|'option'|'p'|'param'|'section'|'source'|'title'|'summary'|'table'|'tbody'|'td'|'tfoot'|'th'|'thead'|'title'|'tr'|'track'|'ul'; 41 42 attributename = [a-zA-Z_:][a-zA-Z0-9:._-]*; 43 44 unquotedvalue = [^ \t\r\n\v\f"'=<>`\x00]+; 45 singlequotedvalue = ['][^'\x00]*[']; 46 doublequotedvalue = ["][^"\x00]*["]; 47 48 attributevalue = unquotedvalue | singlequotedvalue | doublequotedvalue; 49 50 attributevaluespec = spacechar* [=] spacechar* attributevalue; 51 52 attribute = spacechar+ attributename attributevaluespec?; 53 54 opentag = tagname attribute* spacechar* [/]? [>]; 55 closetag = [/] tagname spacechar* [>]; 56 57 htmlcomment = "!---->" | ("!--" ([-]? [^\x00>-]) ([-]? [^\x00-])* "-->"); 58 59 processinginstruction = "?" ([^?>\x00]+ | [?][^>\x00] | [>])* "?>"; 60 61 declaration = "!" [A-Z]+ spacechar+ [^>\x00]* ">"; 62 63 cdata = "![CDATA[" ([^\]\x00]+ | "]" [^\]\x00] | "]]" [^>\x00])* "]]>"; 64 65 htmltag = opentag | closetag | htmlcomment | processinginstruction | 66 declaration | cdata; 67 68 in_parens_nosp = [(] (reg_char|escaped_char|[\\])* [)]; 69 70 in_double_quotes = ["] (escaped_char|[^"\x00])* ["]; 71 in_single_quotes = ['] (escaped_char|[^'\x00])* [']; 72 in_parens = [(] (escaped_char|[^)\x00])* [)]; 73 74 scheme = [A-Za-z][A-Za-z0-9.+-]{1,31}; 75 */ 76 77 // Try to match a scheme including colon. 78 bufsize_t _scan_scheme(const unsigned char *p) 79 { 80 const unsigned char *marker = NULL; 81 const unsigned char *start = p; 82 /*!re2c 83 scheme [:] { return (bufsize_t)(p - start); } 84 * { return 0; } 85 */ 86 } 87 88 // Try to match URI autolink after first <, returning number of chars matched. 89 bufsize_t _scan_autolink_uri(const unsigned char *p) 90 { 91 const unsigned char *marker = NULL; 92 const unsigned char *start = p; 93 /*!re2c 94 scheme [:][^\x00-\x20<>]*[>] { return (bufsize_t)(p - start); } 95 * { return 0; } 96 */ 97 } 98 99 // Try to match email autolink after first <, returning num of chars matched. 100 bufsize_t _scan_autolink_email(const unsigned char *p) 101 { 102 const unsigned char *marker = NULL; 103 const unsigned char *start = p; 104 /*!re2c 105 [a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+ 106 [@] 107 [a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])? 108 ([.][a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)* 109 [>] { return (bufsize_t)(p - start); } 110 * { return 0; } 111 */ 112 } 113 114 // Try to match an HTML tag after first <, returning num of chars matched. 115 bufsize_t _scan_html_tag(const unsigned char *p) 116 { 117 const unsigned char *marker = NULL; 118 const unsigned char *start = p; 119 /*!re2c 120 htmltag { return (bufsize_t)(p - start); } 121 * { return 0; } 122 */ 123 } 124 125 // Try to match an HTML block tag start line, returning 126 // an integer code for the type of block (1-6, matching the spec). 127 // #7 is handled by a separate function, below. 128 bufsize_t _scan_html_block_start(const unsigned char *p) 129 { 130 const unsigned char *marker = NULL; 131 /*!re2c 132 [<] ('script'|'pre'|'textarea'|'style') (spacechar | [>]) { return 1; } 133 '<!--' { return 2; } 134 '<?' { return 3; } 135 '<!' [A-Z] { return 4; } 136 '<![CDATA[' { return 5; } 137 [<] [/]? blocktagname (spacechar | [/]? [>]) { return 6; } 138 * { return 0; } 139 */ 140 } 141 142 // Try to match an HTML block tag start line of type 7, returning 143 // 7 if successful, 0 if not. 144 bufsize_t _scan_html_block_start_7(const unsigned char *p) 145 { 146 const unsigned char *marker = NULL; 147 /*!re2c 148 [<] (opentag | closetag) [\t\n\f ]* [\r\n] { return 7; } 149 * { return 0; } 150 */ 151 } 152 153 // Try to match an HTML block end line of type 1 154 bufsize_t _scan_html_block_end_1(const unsigned char *p) 155 { 156 const unsigned char *marker = NULL; 157 const unsigned char *start = p; 158 /*!re2c 159 [^\n\x00]* [<] [/] ('script'|'pre'|'textarea'|'style') [>] { return (bufsize_t)(p - start); } 160 * { return 0; } 161 */ 162 } 163 164 // Try to match an HTML block end line of type 2 165 bufsize_t _scan_html_block_end_2(const unsigned char *p) 166 { 167 const unsigned char *marker = NULL; 168 const unsigned char *start = p; 169 /*!re2c 170 [^\n\x00]* '-->' { return (bufsize_t)(p - start); } 171 * { return 0; } 172 */ 173 } 174 175 // Try to match an HTML block end line of type 3 176 bufsize_t _scan_html_block_end_3(const unsigned char *p) 177 { 178 const unsigned char *marker = NULL; 179 const unsigned char *start = p; 180 /*!re2c 181 [^\n\x00]* '?>' { return (bufsize_t)(p - start); } 182 * { return 0; } 183 */ 184 } 185 186 // Try to match an HTML block end line of type 4 187 bufsize_t _scan_html_block_end_4(const unsigned char *p) 188 { 189 const unsigned char *marker = NULL; 190 const unsigned char *start = p; 191 /*!re2c 192 [^\n\x00]* '>' { return (bufsize_t)(p - start); } 193 * { return 0; } 194 */ 195 } 196 197 // Try to match an HTML block end line of type 5 198 bufsize_t _scan_html_block_end_5(const unsigned char *p) 199 { 200 const unsigned char *marker = NULL; 201 const unsigned char *start = p; 202 /*!re2c 203 [^\n\x00]* ']]>' { return (bufsize_t)(p - start); } 204 * { return 0; } 205 */ 206 } 207 208 // Try to match a link title (in single quotes, in double quotes, or 209 // in parentheses), returning number of chars matched. Allow one 210 // level of internal nesting (quotes within quotes). 211 bufsize_t _scan_link_title(const unsigned char *p) 212 { 213 const unsigned char *marker = NULL; 214 const unsigned char *start = p; 215 /*!re2c 216 ["] (escaped_char|[^"\x00])* ["] { return (bufsize_t)(p - start); } 217 ['] (escaped_char|[^'\x00])* ['] { return (bufsize_t)(p - start); } 218 [(] (escaped_char|[^()\x00])* [)] { return (bufsize_t)(p - start); } 219 * { return 0; } 220 */ 221 } 222 223 // Match space characters, including newlines. 224 bufsize_t _scan_spacechars(const unsigned char *p) 225 { 226 const unsigned char *start = p; \ 227 /*!re2c 228 [ \t\v\f\r\n]+ { return (bufsize_t)(p - start); } 229 * { return 0; } 230 */ 231 } 232 233 // Match ATX heading start. 234 bufsize_t _scan_atx_heading_start(const unsigned char *p) 235 { 236 const unsigned char *marker = NULL; 237 const unsigned char *start = p; 238 /*!re2c 239 [#]{1,6} ([ \t]+|[\r\n]) { return (bufsize_t)(p - start); } 240 * { return 0; } 241 */ 242 } 243 244 // Match setext heading line. Return 1 for level-1 heading, 245 // 2 for level-2, 0 for no match. 246 bufsize_t _scan_setext_heading_line(const unsigned char *p) 247 { 248 const unsigned char *marker = NULL; 249 /*!re2c 250 [=]+ [ \t]* [\r\n] { return 1; } 251 [-]+ [ \t]* [\r\n] { return 2; } 252 * { return 0; } 253 */ 254 } 255 256 // Scan an opening code fence. 257 bufsize_t _scan_open_code_fence(const unsigned char *p) 258 { 259 const unsigned char *marker = NULL; 260 const unsigned char *start = p; 261 /*!re2c 262 [`]{3,} / [^`\r\n\x00]*[\r\n] { return (bufsize_t)(p - start); } 263 [~]{3,} / [^\r\n\x00]*[\r\n] { return (bufsize_t)(p - start); } 264 * { return 0; } 265 */ 266 } 267 268 // Scan a closing code fence with length at least len. 269 bufsize_t _scan_close_code_fence(const unsigned char *p) 270 { 271 const unsigned char *marker = NULL; 272 const unsigned char *start = p; 273 /*!re2c 274 [`]{3,} / [ \t]*[\r\n] { return (bufsize_t)(p - start); } 275 [~]{3,} / [ \t]*[\r\n] { return (bufsize_t)(p - start); } 276 * { return 0; } 277 */ 278 } 279 280 // Scans an entity. 281 // Returns number of chars matched. 282 bufsize_t _scan_entity(const unsigned char *p) 283 { 284 const unsigned char *marker = NULL; 285 const unsigned char *start = p; 286 /*!re2c 287 [&] ([#] ([Xx][A-Fa-f0-9]{1,6}|[0-9]{1,7}) |[A-Za-z][A-Za-z0-9]{1,31} ) [;] 288 { return (bufsize_t)(p - start); } 289 * { return 0; } 290 */ 291 } 292 293 // Returns positive value if a URL begins in a way that is potentially 294 // dangerous, with javascript:, vbscript:, file:, or data:, otherwise 0. 295 bufsize_t _scan_dangerous_url(const unsigned char *p) 296 { 297 const unsigned char *marker = NULL; 298 const unsigned char *start = p; 299 /*!re2c 300 'data:image/' ('png'|'gif'|'jpeg'|'webp') { return 0; } 301 'javascript:' | 'vbscript:' | 'file:' | 'data:' { return (bufsize_t)(p - start); } 302 * { return 0; } 303 */ 304 } 305