cmark

My personal build of CMark ✏️

scanners.re (8662B)

  1 #include <stdlib.h>
  2 #include "chunk.h"
  3 #include "scanners.h"
  4 
  5 bufsize_t _scan_at(bufsize_t (*scanner)(const unsigned char *), cmark_chunk *c, bufsize_t offset)
  6 {
  7 	bufsize_t res;
  8 	unsigned char *ptr = (unsigned char *)c->data;
  9 
 10         if (ptr == NULL || offset > c->len) {
 11           return 0;
 12         } else {
 13 	  unsigned char lim = ptr[c->len];
 14 
 15 	  ptr[c->len] = '\0';
 16 	  res = scanner(ptr + offset);
 17 	  ptr[c->len] = lim;
 18         }
 19 
 20 	return res;
 21 }
 22 
 23 /*!re2c
 24   re2c:define:YYCTYPE  = "unsigned char";
 25   re2c:define:YYCURSOR = p;
 26   re2c:define:YYMARKER = marker;
 27   re2c:define:YYCTXMARKER = marker;
 28   re2c:yyfill:enable = 0;
 29 
 30   wordchar = [^\x00-\x20];
 31 
 32   spacechar = [ \t\v\f\r\n];
 33 
 34   reg_char     = [^\\()\x00-\x20];
 35 
 36   escaped_char = [\\][!"#$%&'()*+,./:;<=>?@[\\\]^_`{|}~-];
 37 
 38   tagname = [A-Za-z][A-Za-z0-9-]*;
 39 
 40   blocktagname = 'address'|'article'|'aside'|'base'|'basefont'|'blockquote'|'body'|'caption'|'center'|'col'|'colgroup'|'dd'|'details'|'dialog'|'dir'|'div'|'dl'|'dt'|'fieldset'|'figcaption'|'figure'|'footer'|'form'|'frame'|'frameset'|'h1'|'h2'|'h3'|'h4'|'h5'|'h6'|'head'|'header'|'hr'|'html'|'iframe'|'legend'|'li'|'link'|'main'|'menu'|'menuitem'|'nav'|'noframes'|'ol'|'optgroup'|'option'|'p'|'param'|'section'|'source'|'title'|'summary'|'table'|'tbody'|'td'|'tfoot'|'th'|'thead'|'title'|'tr'|'track'|'ul';
 41 
 42   attributename = [a-zA-Z_:][a-zA-Z0-9:._-]*;
 43 
 44   unquotedvalue = [^ \t\r\n\v\f"'=<>`\x00]+;
 45   singlequotedvalue = ['][^'\x00]*['];
 46   doublequotedvalue = ["][^"\x00]*["];
 47 
 48   attributevalue = unquotedvalue | singlequotedvalue | doublequotedvalue;
 49 
 50   attributevaluespec = spacechar* [=] spacechar* attributevalue;
 51 
 52   attribute = spacechar+ attributename attributevaluespec?;
 53 
 54   opentag = tagname attribute* spacechar* [/]? [>];
 55   closetag = [/] tagname spacechar* [>];
 56 
 57   htmlcomment = "!---->" | ("!--" ([-]? [^\x00>-]) ([-]? [^\x00-])* "-->");
 58 
 59   processinginstruction = "?" ([^?>\x00]+ | [?][^>\x00] | [>])* "?>";
 60 
 61   declaration = "!" [A-Z]+ spacechar+ [^>\x00]* ">";
 62 
 63   cdata = "![CDATA[" ([^\]\x00]+ | "]" [^\]\x00] | "]]" [^>\x00])* "]]>";
 64 
 65   htmltag = opentag | closetag | htmlcomment | processinginstruction |
 66             declaration | cdata;
 67 
 68   in_parens_nosp   = [(] (reg_char|escaped_char|[\\])* [)];
 69 
 70   in_double_quotes = ["] (escaped_char|[^"\x00])* ["];
 71   in_single_quotes = ['] (escaped_char|[^'\x00])* ['];
 72   in_parens        = [(] (escaped_char|[^)\x00])* [)];
 73 
 74   scheme           = [A-Za-z][A-Za-z0-9.+-]{1,31};
 75 */
 76 
 77 // Try to match a scheme including colon.
 78 bufsize_t _scan_scheme(const unsigned char *p)
 79 {
 80   const unsigned char *marker = NULL;
 81   const unsigned char *start = p;
 82 /*!re2c
 83   scheme [:] { return (bufsize_t)(p - start); }
 84   * { return 0; }
 85 */
 86 }
 87 
 88 // Try to match URI autolink after first <, returning number of chars matched.
 89 bufsize_t _scan_autolink_uri(const unsigned char *p)
 90 {
 91   const unsigned char *marker = NULL;
 92   const unsigned char *start = p;
 93 /*!re2c
 94   scheme [:][^\x00-\x20<>]*[>]  { return (bufsize_t)(p - start); }
 95   * { return 0; }
 96 */
 97 }
 98 
 99 // Try to match email autolink after first <, returning num of chars matched.
100 bufsize_t _scan_autolink_email(const unsigned char *p)
101 {
102   const unsigned char *marker = NULL;
103   const unsigned char *start = p;
104 /*!re2c
105   [a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+
106     [@]
107     [a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
108     ([.][a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*
109     [>] { return (bufsize_t)(p - start); }
110   * { return 0; }
111 */
112 }
113 
114 // Try to match an HTML tag after first <, returning num of chars matched.
115 bufsize_t _scan_html_tag(const unsigned char *p)
116 {
117   const unsigned char *marker = NULL;
118   const unsigned char *start = p;
119 /*!re2c
120   htmltag { return (bufsize_t)(p - start); }
121   * { return 0; }
122 */
123 }
124 
125 // Try to match an HTML block tag start line, returning
126 // an integer code for the type of block (1-6, matching the spec).
127 // #7 is handled by a separate function, below.
128 bufsize_t _scan_html_block_start(const unsigned char *p)
129 {
130   const unsigned char *marker = NULL;
131 /*!re2c
132   [<] ('script'|'pre'|'textarea'|'style') (spacechar | [>]) { return 1; }
133   '<!--' { return 2; }
134   '<?' { return 3; }
135   '<!' [A-Z] { return 4; }
136   '<![CDATA[' { return 5; }
137   [<] [/]? blocktagname (spacechar | [/]? [>])  { return 6; }
138   * { return 0; }
139 */
140 }
141 
142 // Try to match an HTML block tag start line of type 7, returning
143 // 7 if successful, 0 if not.
144 bufsize_t _scan_html_block_start_7(const unsigned char *p)
145 {
146   const unsigned char *marker = NULL;
147 /*!re2c
148   [<] (opentag | closetag) [\t\n\f ]* [\r\n] { return 7; }
149   * { return 0; }
150 */
151 }
152 
153 // Try to match an HTML block end line of type 1
154 bufsize_t _scan_html_block_end_1(const unsigned char *p)
155 {
156   const unsigned char *marker = NULL;
157   const unsigned char *start = p;
158 /*!re2c
159   [^\n\x00]* [<] [/] ('script'|'pre'|'textarea'|'style') [>] { return (bufsize_t)(p - start); }
160   * { return 0; }
161 */
162 }
163 
164 // Try to match an HTML block end line of type 2
165 bufsize_t _scan_html_block_end_2(const unsigned char *p)
166 {
167   const unsigned char *marker = NULL;
168   const unsigned char *start = p;
169 /*!re2c
170   [^\n\x00]* '-->' { return (bufsize_t)(p - start); }
171   * { return 0; }
172 */
173 }
174 
175 // Try to match an HTML block end line of type 3
176 bufsize_t _scan_html_block_end_3(const unsigned char *p)
177 {
178   const unsigned char *marker = NULL;
179   const unsigned char *start = p;
180 /*!re2c
181   [^\n\x00]* '?>' { return (bufsize_t)(p - start); }
182   * { return 0; }
183 */
184 }
185 
186 // Try to match an HTML block end line of type 4
187 bufsize_t _scan_html_block_end_4(const unsigned char *p)
188 {
189   const unsigned char *marker = NULL;
190   const unsigned char *start = p;
191 /*!re2c
192   [^\n\x00]* '>' { return (bufsize_t)(p - start); }
193   * { return 0; }
194 */
195 }
196 
197 // Try to match an HTML block end line of type 5
198 bufsize_t _scan_html_block_end_5(const unsigned char *p)
199 {
200   const unsigned char *marker = NULL;
201   const unsigned char *start = p;
202 /*!re2c
203   [^\n\x00]* ']]>' { return (bufsize_t)(p - start); }
204   * { return 0; }
205 */
206 }
207 
208 // Try to match a link title (in single quotes, in double quotes, or
209 // in parentheses), returning number of chars matched.  Allow one
210 // level of internal nesting (quotes within quotes).
211 bufsize_t _scan_link_title(const unsigned char *p)
212 {
213   const unsigned char *marker = NULL;
214   const unsigned char *start = p;
215 /*!re2c
216   ["] (escaped_char|[^"\x00])* ["]   { return (bufsize_t)(p - start); }
217   ['] (escaped_char|[^'\x00])* ['] { return (bufsize_t)(p - start); }
218   [(] (escaped_char|[^()\x00])* [)]  { return (bufsize_t)(p - start); }
219   * { return 0; }
220 */
221 }
222 
223 // Match space characters, including newlines.
224 bufsize_t _scan_spacechars(const unsigned char *p)
225 {
226   const unsigned char *start = p; \
227 /*!re2c
228   [ \t\v\f\r\n]+ { return (bufsize_t)(p - start); }
229   * { return 0; }
230 */
231 }
232 
233 // Match ATX heading start.
234 bufsize_t _scan_atx_heading_start(const unsigned char *p)
235 {
236   const unsigned char *marker = NULL;
237   const unsigned char *start = p;
238 /*!re2c
239   [#]{1,6} ([ \t]+|[\r\n])  { return (bufsize_t)(p - start); }
240   * { return 0; }
241 */
242 }
243 
244 // Match setext heading line.  Return 1 for level-1 heading,
245 // 2 for level-2, 0 for no match.
246 bufsize_t _scan_setext_heading_line(const unsigned char *p)
247 {
248   const unsigned char *marker = NULL;
249 /*!re2c
250   [=]+ [ \t]* [\r\n] { return 1; }
251   [-]+ [ \t]* [\r\n] { return 2; }
252   * { return 0; }
253 */
254 }
255 
256 // Scan an opening code fence.
257 bufsize_t _scan_open_code_fence(const unsigned char *p)
258 {
259   const unsigned char *marker = NULL;
260   const unsigned char *start = p;
261 /*!re2c
262   [`]{3,} / [^`\r\n\x00]*[\r\n] { return (bufsize_t)(p - start); }
263   [~]{3,} / [^\r\n\x00]*[\r\n] { return (bufsize_t)(p - start); }
264   * { return 0; }
265 */
266 }
267 
268 // Scan a closing code fence with length at least len.
269 bufsize_t _scan_close_code_fence(const unsigned char *p)
270 {
271   const unsigned char *marker = NULL;
272   const unsigned char *start = p;
273 /*!re2c
274   [`]{3,} / [ \t]*[\r\n] { return (bufsize_t)(p - start); }
275   [~]{3,} / [ \t]*[\r\n] { return (bufsize_t)(p - start); }
276   * { return 0; }
277 */
278 }
279 
280 // Scans an entity.
281 // Returns number of chars matched.
282 bufsize_t _scan_entity(const unsigned char *p)
283 {
284   const unsigned char *marker = NULL;
285   const unsigned char *start = p;
286 /*!re2c
287   [&] ([#] ([Xx][A-Fa-f0-9]{1,6}|[0-9]{1,7}) |[A-Za-z][A-Za-z0-9]{1,31} ) [;]
288      { return (bufsize_t)(p - start); }
289   * { return 0; }
290 */
291 }
292 
293 // Returns positive value if a URL begins in a way that is potentially
294 // dangerous, with javascript:, vbscript:, file:, or data:, otherwise 0.
295 bufsize_t _scan_dangerous_url(const unsigned char *p)
296 {
297   const unsigned char *marker = NULL;
298   const unsigned char *start = p;
299 /*!re2c
300   'data:image/' ('png'|'gif'|'jpeg'|'webp') { return 0; }
301   'javascript:' | 'vbscript:' | 'file:' | 'data:' { return (bufsize_t)(p - start); }
302   * { return 0; }
303 */
304 }
305