diff --git a/src/utf8.c b/src/utf8.c
@@ -56,21 +56,18 @@ static int utf8proc_charlen(const uint8_t *str, bufsize_t str_len)
// Validate a single UTF-8 character according to RFC 3629.
static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
{
- int length = utf8proc_charlen(str, str_len);
+ int length = utf8proc_utf8class[str[0]];
- if (length <= 0)
- return length;
+ if (!length)
+ return -1;
- switch (length) {
- case 1:
- if (str[0] == 0x00) {
- // ASCII NUL is technically valid but rejected
- // for security reasons.
- return -length;
- }
- break;
+ if ((bufsize_t)length > str_len)
+ return -str_len;
+ switch (length) {
case 2:
+ if ((str[1] & 0xC0) != 0x80)
+ return -1;
if (str[0] < 0xC2) {
// Overlong
return -length;
@@ -78,6 +75,10 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
break;
case 3:
+ if ((str[1] & 0xC0) != 0x80)
+ return -1;
+ if ((str[2] & 0xC0) != 0x80)
+ return -2;
if (str[0] == 0xE0) {
if (str[1] < 0xA0) {
// Overlong
@@ -92,6 +93,12 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
break;
case 4:
+ if ((str[1] & 0xC0) != 0x80)
+ return -1;
+ if ((str[2] & 0xC0) != 0x80)
+ return -2;
+ if ((str[3] & 0xC0) != 0x80)
+ return -3;
if (str[0] == 0xF0) {
if (str[1] < 0x90) {
// Overlong
@@ -117,10 +124,27 @@ void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
while (i < size) {
bufsize_t org = i;
+ int charlen = 0;
+
+ while (i < size && line[i] != '\t') {
+ if (line[i] >= 0x80) {
+ charlen = utf8proc_valid(line + i, size - i);
+ if (charlen < 0) {
+ charlen = -charlen;
+ break;
+ }
+ i += charlen;
+ }
+ else if (line[i] == '\0') {
+ // ASCII NUL is technically valid but rejected
+ // for security reasons.
+ charlen = 1;
+ break;
+ }
+ else {
+ i++;
+ }
- while (i < size && line[i] != '\t' && line[i] != '\0'
- && line[i] < 0x80) {
- i++;
tab++;
}
@@ -136,14 +160,8 @@ void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
i += 1;
tab += numspaces;
} else {
- int charlen = utf8proc_valid(line + i, size - i);
-
- if (charlen >= 0) {
- cmark_strbuf_put(ob, line + i, charlen);
- } else {
- encode_unknown(ob);
- charlen = -charlen;
- }
+ // Invalid UTF-8, or an ASCII NUL (valid but rejected)
+ encode_unknown(ob);
i += charlen;
tab += 1;