cmark

My personal build of CMark ✏️

Commit
54d1249c2caebf45a24d691dc765fb93c9a5e594
Parent
bc14d869323650e936c7143dcf941b28ccd5b57d
Author
John MacFarlane <jgm@berkeley.edu>
Date

Merge pull request #58 from nwellnhof/optimize_utf8proc_detab

Further optimize utf8proc_valid

Diffstat

1 file changed, 34 insertions, 37 deletions

| Status   | File Name  | N° Changes | Insertions | Deletions |
|----------|------------|------------|------------|-----------|
| Modified | src/utf8.c | 71         | 34         | 37        |
diff --git a/src/utf8.c b/src/utf8.c
@@ -54,9 +54,11 @@ static int utf8proc_charlen(const uint8_t *str, bufsize_t str_len)
 }
 
 // Validate a single UTF-8 character according to RFC 3629.
+// Assumes a multi-byte UTF-8 sequence.
 static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 {
 	int length = utf8proc_utf8class[str[0]];
+	assert(length != 1);
 
 	if (!length)
 		return -1;
@@ -64,53 +66,48 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 	if ((bufsize_t)length > str_len)
 		return -str_len;
 
-	switch (length) {
-	case 2:
-		if ((str[1] & 0xC0) != 0x80)
-			return -1;
+	if ((str[1] & 0xC0) != 0x80)
+		return -1;
+
+	if (length == 2) {
 		if (str[0] < 0xC2) {
 			// Overlong
 			return -length;
 		}
-		break;
-
-	case 3:
-		if ((str[1] & 0xC0) != 0x80)
-			return -1;
+	}
+	else {
 		if ((str[2] & 0xC0) != 0x80)
 			return -2;
-		if (str[0] == 0xE0) {
-			if (str[1] < 0xA0) {
-				// Overlong
-				return -length;
-			}
-		} else if (str[0] == 0xED) {
-			if (str[1] >= 0xA0) {
-				// Surrogate
-				return -length;
-			}
-		}
-		break;
 
-	case 4:
-		if ((str[1] & 0xC0) != 0x80)
-			return -1;
-		if ((str[2] & 0xC0) != 0x80)
-			return -2;
-		if ((str[3] & 0xC0) != 0x80)
-			return -3;
-		if (str[0] == 0xF0) {
-			if (str[1] < 0x90) {
-				// Overlong
-				return -length;
+		if (length == 3) {
+			if (str[0] == 0xE0) {
+				if (str[1] < 0xA0) {
+					// Overlong
+					return -length;
+				}
+			} else if (str[0] == 0xED) {
+				if (str[1] >= 0xA0) {
+					// Surrogate
+					return -length;
+				}
 			}
-		} else if (str[0] >= 0xF4) {
-			if (str[0] > 0xF4 || str[1] >= 0x90) {
-				// Above 0x10FFFF
-				return -length;
+		}
+		else {
+			if ((str[3] & 0xC0) != 0x80)
+				return -3;
+
+			if (str[0] == 0xF0) {
+				if (str[1] < 0x90) {
+					// Overlong
+					return -length;
+				}
+			} else if (str[0] >= 0xF4) {
+				if (str[0] > 0xF4 || str[1] >= 0x90) {
+					// Above 0x10FFFF
+					return -length;
+				}
 			}
 		}
-		break;
 	}
 
 	return length;