cmark
My personal build of CMark ✏️
houdini_html_u.c (3649B)
1 #include <assert.h> 2 #include <stdio.h> 3 #include <string.h> 4 5 #include "buffer.h" 6 #include "houdini.h" 7 #include "utf8.h" 8 #include "entities.inc" 9 10 /* Binary tree lookup code for entities added by JGM */ 11 12 static const unsigned char *S_lookup(int i, int low, int hi, 13 const unsigned char *s, int len) { 14 int j; 15 int cmp = 16 strncmp((const char *)s, (const char *)cmark_entities[i].entity, len); 17 if (cmp == 0 && cmark_entities[i].entity[len] == 0) { 18 return (const unsigned char *)cmark_entities[i].bytes; 19 } else if (cmp <= 0 && i > low) { 20 j = i - ((i - low) / 2); 21 if (j == i) 22 j -= 1; 23 return S_lookup(j, low, i - 1, s, len); 24 } else if (cmp > 0 && i < hi) { 25 j = i + ((hi - i) / 2); 26 if (j == i) 27 j += 1; 28 return S_lookup(j, i + 1, hi, s, len); 29 } else { 30 return NULL; 31 } 32 } 33 34 static const unsigned char *S_lookup_entity(const unsigned char *s, int len) { 35 return S_lookup(CMARK_NUM_ENTITIES / 2, 0, CMARK_NUM_ENTITIES - 1, s, len); 36 } 37 38 bufsize_t houdini_unescape_ent(cmark_strbuf *ob, const uint8_t *src, 39 bufsize_t size) { 40 bufsize_t i = 0; 41 42 if (size >= 3 && src[0] == '#') { 43 int codepoint = 0; 44 int num_digits = 0; 45 int max_digits = 7; 46 47 if (_isdigit(src[1])) { 48 for (i = 1; i < size && _isdigit(src[i]); ++i) { 49 codepoint = (codepoint * 10) + (src[i] - '0'); 50 51 if (codepoint >= 0x110000) { 52 // Keep counting digits but 53 // avoid integer overflow. 54 codepoint = 0x110000; 55 } 56 } 57 58 num_digits = i - 1; 59 max_digits = 7; 60 } 61 62 else if (src[1] == 'x' || src[1] == 'X') { 63 for (i = 2; i < size && _isxdigit(src[i]); ++i) { 64 codepoint = (codepoint * 16) + ((src[i] | 32) % 39 - 9); 65 66 if (codepoint >= 0x110000) { 67 // Keep counting digits but 68 // avoid integer overflow. 69 codepoint = 0x110000; 70 } 71 } 72 73 num_digits = i - 2; 74 max_digits = 6; 75 } 76 77 if (num_digits >= 1 && num_digits <= max_digits && 78 i < size && src[i] == ';') { 79 if (codepoint == 0 || (codepoint >= 0xD800 && codepoint < 0xE000) || 80 codepoint >= 0x110000) { 81 codepoint = 0xFFFD; 82 } 83 cmark_utf8proc_encode_char(codepoint, ob); 84 return i + 1; 85 } 86 } 87 88 else { 89 if (size > CMARK_ENTITY_MAX_LENGTH) 90 size = CMARK_ENTITY_MAX_LENGTH; 91 92 for (i = CMARK_ENTITY_MIN_LENGTH; i < size; ++i) { 93 if (src[i] == ' ') 94 break; 95 96 if (src[i] == ';') { 97 const unsigned char *entity = S_lookup_entity(src, i); 98 99 if (entity != NULL) { 100 cmark_strbuf_puts(ob, (const char *)entity); 101 return i + 1; 102 } 103 104 break; 105 } 106 } 107 } 108 109 return 0; 110 } 111 112 int houdini_unescape_html(cmark_strbuf *ob, const uint8_t *src, 113 bufsize_t size) { 114 bufsize_t i = 0, org, ent; 115 116 while (i < size) { 117 org = i; 118 while (i < size && src[i] != '&') 119 i++; 120 121 if (likely(i > org)) { 122 if (unlikely(org == 0)) { 123 if (i >= size) 124 return 0; 125 126 cmark_strbuf_grow(ob, HOUDINI_UNESCAPED_SIZE(size)); 127 } 128 129 cmark_strbuf_put(ob, src + org, i - org); 130 } 131 132 /* escaping */ 133 if (i >= size) 134 break; 135 136 i++; 137 138 ent = houdini_unescape_ent(ob, src + i, size - i); 139 i += ent; 140 141 /* not really an entity */ 142 if (ent == 0) 143 cmark_strbuf_putc(ob, '&'); 144 } 145 146 return 1; 147 } 148 149 void houdini_unescape_html_f(cmark_strbuf *ob, const uint8_t *src, 150 bufsize_t size) { 151 if (!houdini_unescape_html(ob, src, size)) 152 cmark_strbuf_put(ob, src, size); 153 }