cmark

My personal build of CMark ✏️

houdini_href_e.c (3016B)

  1 #include <assert.h>
  2 #include <stdio.h>
  3 #include <string.h>
  4 
  5 #include "houdini.h"
  6 
  7 /*
  8  * The following characters will not be escaped:
  9  *
 10  *		-_.+!*'(),%#@?=;:/,+&$ alphanum
 11  *
 12  * Note that this character set is the addition of:
 13  *
 14  *	- The characters which are safe to be in an URL
 15  *	- The characters which are *not* safe to be in
 16  *	an URL because they are RESERVED characters.
 17  *
 18  * We assume (lazily) that any RESERVED char that
 19  * appears inside an URL is actually meant to
 20  * have its native function (i.e. as an URL
 21  * component/separator) and hence needs no escaping.
 22  *
 23  * There are two exceptions: the characters & (amp)
 24  * and ' (single quote) do not appear in the table.
 25  * They are meant to appear in the URL as components,
 26  * yet they require special HTML-entity escaping
 27  * to generate valid HTML markup.
 28  *
 29  * All other characters will be escaped to %XX.
 30  *
 31  */
 32 static const char HREF_SAFE[] = {
 33     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 34     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
 35     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 36     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
 37     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 38     1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 39     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 40     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 41     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 42     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 43     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 44 };
 45 
 46 int houdini_escape_href(cmark_strbuf *ob, const uint8_t *src, bufsize_t size) {
 47   static const uint8_t hex_chars[] = "0123456789ABCDEF";
 48   bufsize_t i = 0, org;
 49   uint8_t hex_str[3];
 50 
 51   hex_str[0] = '%';
 52 
 53   while (i < size) {
 54     org = i;
 55     while (i < size && HREF_SAFE[src[i]] != 0)
 56       i++;
 57 
 58     if (likely(i > org))
 59       cmark_strbuf_put(ob, src + org, i - org);
 60 
 61     /* escaping */
 62     if (i >= size)
 63       break;
 64 
 65     switch (src[i]) {
 66     /* amp appears all the time in URLs, but needs
 67      * HTML-entity escaping to be inside an href */
 68     case '&':
 69       cmark_strbuf_puts(ob, "&amp;");
 70       break;
 71 
 72     /* the single quote is a valid URL character
 73      * according to the standard; it needs HTML
 74      * entity escaping too */
 75     case '\'':
 76       cmark_strbuf_puts(ob, "&#x27;");
 77       break;
 78 
 79 /* the space can be escaped to %20 or a plus
 80  * sign. we're going with the generic escape
 81  * for now. the plus thing is more commonly seen
 82  * when building GET strings */
 83 #if 0
 84 		case ' ':
 85 			cmark_strbuf_putc(ob, '+');
 86 			break;
 87 #endif
 88 
 89     /* every other character goes with a %XX escaping */
 90     default:
 91       hex_str[1] = hex_chars[(src[i] >> 4) & 0xF];
 92       hex_str[2] = hex_chars[src[i] & 0xF];
 93       cmark_strbuf_put(ob, hex_str, 3);
 94     }
 95 
 96     i++;
 97   }
 98 
 99   return 1;
100 }