cmark

My personal build of CMark ✏️

houdini_html_u.c (3649B)

  1 #include <assert.h>
  2 #include <stdio.h>
  3 #include <string.h>
  4 
  5 #include "buffer.h"
  6 #include "houdini.h"
  7 #include "utf8.h"
  8 #include "entities.inc"
  9 
 10 /* Binary search lookup code for entities added by JGM */
 11 
 12 static const unsigned char *S_lookup(int i, int low, int hi,
 13                                      const unsigned char *s, int len) {
 14   int j;
 15   int cmp =
 16       strncmp((const char *)s, (const char *)cmark_entities[i].entity, len);
 17   if (cmp == 0 && cmark_entities[i].entity[len] == 0) {
 18     return (const unsigned char *)cmark_entities[i].bytes;
 19   } else if (cmp <= 0 && i > low) {
 20     j = i - ((i - low) / 2);
 21     if (j == i)
 22       j -= 1;
 23     return S_lookup(j, low, i - 1, s, len);
 24   } else if (cmp > 0 && i < hi) {
 25     j = i + ((hi - i) / 2);
 26     if (j == i)
 27       j += 1;
 28     return S_lookup(j, i + 1, hi, s, len);
 29   } else {
 30     return NULL;
 31   }
 32 }
 33 
 34 static const unsigned char *S_lookup_entity(const unsigned char *s, int len) {
 35   return S_lookup(CMARK_NUM_ENTITIES / 2, 0, CMARK_NUM_ENTITIES - 1, s, len);
 36 }
 37 
 38 bufsize_t houdini_unescape_ent(cmark_strbuf *ob, const uint8_t *src,
 39                                bufsize_t size) {
 40   bufsize_t i = 0;
 41 
 42   if (size >= 3 && src[0] == '#') {
 43     int codepoint = 0;
 44     int num_digits = 0;
 45     int max_digits = 7;
 46 
 47     if (_isdigit(src[1])) {
 48       for (i = 1; i < size && _isdigit(src[i]); ++i) {
 49         codepoint = (codepoint * 10) + (src[i] - '0');
 50 
 51         if (codepoint >= 0x110000) {
 52           // Keep counting digits but
 53           // avoid integer overflow.
 54           codepoint = 0x110000;
 55         }
 56       }
 57 
 58       num_digits = i - 1;
 59       max_digits = 7;
 60     }
 61 
 62     else if (src[1] == 'x' || src[1] == 'X') {
 63       for (i = 2; i < size && _isxdigit(src[i]); ++i) {
 64         codepoint = (codepoint * 16) + ((src[i] | 32) % 39 - 9);
 65 
 66         if (codepoint >= 0x110000) {
 67           // Keep counting digits but
 68           // avoid integer overflow.
 69           codepoint = 0x110000;
 70         }
 71       }
 72 
 73       num_digits = i - 2;
 74       max_digits = 6;
 75     }
 76 
 77     if (num_digits >= 1 && num_digits <= max_digits &&
 78 		    i < size && src[i] == ';') {
 79       if (codepoint == 0 || (codepoint >= 0xD800 && codepoint < 0xE000) ||
 80           codepoint >= 0x110000) {
 81         codepoint = 0xFFFD;
 82       }
 83       cmark_utf8proc_encode_char(codepoint, ob);
 84       return i + 1;
 85     }
 86   }
 87 
 88   else {
 89     if (size > CMARK_ENTITY_MAX_LENGTH)
 90       size = CMARK_ENTITY_MAX_LENGTH;
 91 
 92     for (i = CMARK_ENTITY_MIN_LENGTH; i < size; ++i) {
 93       if (src[i] == ' ')
 94         break;
 95 
 96       if (src[i] == ';') {
 97         const unsigned char *entity = S_lookup_entity(src, i);
 98 
 99         if (entity != NULL) {
100           cmark_strbuf_puts(ob, (const char *)entity);
101           return i + 1;
102         }
103 
104         break;
105       }
106     }
107   }
108 
109   return 0;
110 }
111 
112 int houdini_unescape_html(cmark_strbuf *ob, const uint8_t *src,
113                           bufsize_t size) {
114   bufsize_t i = 0, org, ent;
115 
116   while (i < size) {
117     org = i;
118     while (i < size && src[i] != '&')
119       i++;
120 
121     if (likely(i > org)) {
122       if (unlikely(org == 0)) {
123         if (i >= size)
124           return 0;
125 
126         cmark_strbuf_grow(ob, HOUDINI_UNESCAPED_SIZE(size));
127       }
128 
129       cmark_strbuf_put(ob, src + org, i - org);
130     }
131 
132     /* escaping */
133     if (i >= size)
134       break;
135 
136     i++;
137 
138     ent = houdini_unescape_ent(ob, src + i, size - i);
139     i += ent;
140 
141     /* not really an entity */
142     if (ent == 0)
143       cmark_strbuf_putc(ob, '&');
144   }
145 
146   return 1;
147 }
148 
149 void houdini_unescape_html_f(cmark_strbuf *ob, const uint8_t *src,
150                              bufsize_t size) {
151   if (!houdini_unescape_html(ob, src, size))
152     cmark_strbuf_put(ob, src, size);
153 }