cmark

My personal build of CMark ✏️

Commit
86fda06897ccd4d610410f920923c6e1f3e2bf3d
Parent
d943eed9db668bb3399264d5c978e20882bc6098
Author
John MacFarlane <jgm@berkeley.edu>
Date

Added cmark_ctype.h with locale-independent isspace, ispunct, etc.

Otherwise cmark's behavior varies unpredictably with the locale.

`is_punctuation` in utf8.h has also been adjusted so that all ASCII symbol characters count as punctuation, even though some of them are not in the P* character classes.

Diffstat

8 files changed, 52 insertions, 15 deletions

Status File Name N° Changes Insertions Deletions
Modified src/CMakeLists.txt 2 2 0
Modified src/blocks.c 2 1 1
Modified src/buffer.c 2 1 1
Modified src/chunk.h 2 1 1
Added src/cmark_ctype.c 33 33 0
Added src/cmark_ctype.h 11 11 0
Modified src/inlines.c 2 1 1
Modified src/utf8.c 13 2 11
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -15,6 +15,7 @@ set(HEADERS
   inlines.h
   html_unescape.h
   houdini.h
+  cmark_ctype.h
   )
 set(LIBRARY_SOURCES
   cmark.c
@@ -34,6 +35,7 @@ set(LIBRARY_SOURCES
   houdini_href_e.c
   houdini_html_e.c
   houdini_html_u.c
+  cmark_ctype.c
   ${HEADERS}
   )
 
diff --git a/src/blocks.c b/src/blocks.c
@@ -1,8 +1,8 @@
 #include <stdlib.h>
 #include <assert.h>
 #include <stdio.h>
-#include <ctype.h>
 
+#include "cmark_ctype.h"
 #include "config.h"
 #include "parser.h"
 #include "cmark.h"
diff --git a/src/buffer.c b/src/buffer.c
@@ -1,11 +1,11 @@
 #include <stdarg.h>
-#include <ctype.h>
 #include <string.h>
 #include <assert.h>
 #include <string.h>
 #include <stdio.h>
 #include <stdlib.h>
 
+#include "cmark_ctype.h"
 #include "buffer.h"
 
 /* Used as default value for cmark_strbuf->ptr so that people can always
diff --git a/src/chunk.h b/src/chunk.h
@@ -2,9 +2,9 @@
 #define CMARK_CHUNK_H
 
 #include <string.h>
-#include <ctype.h>
 #include <stdlib.h>
 #include <assert.h>
+#include "cmark_ctype.h"
 #include "buffer.h"
 
 typedef struct {
diff --git a/src/cmark_ctype.c b/src/cmark_ctype.c
@@ -0,0 +1,33 @@
+/**
+ * Returns 1 if c is a "whitespace" character as defined by the spec.
+ */
+int isspace(char c)
+{
+	return (c == 0x09 ||
+		c == 0x20 ||
+		c == 0x0a ||
+		c == 0x0d);
+}
+
+/**
+ * Returns 1 if c is an ascii punctuation character.
+ */
+int ispunct(char c)
+{
+	return ((c >= 33 && c <= 47) ||
+		(c >= 58 && c <= 64) ||
+		(c >= 91 && c <= 96) ||
+		(c >= 123 && c <= 126));
+}
+
+int isalnum(char c)
+{
+	return ((c >= 48 && c <= 57) ||
+		(c >= 65 && c <= 90) ||
+		(c >= 97 && c <= 122));
+}
+
+int isdigit(char c)
+{
+	return (c >= 48 && c <= 57);
+}
diff --git a/src/cmark_ctype.h b/src/cmark_ctype.h
@@ -0,0 +1,11 @@
+/** Locale-independent versions of functions from ctype.h.
+ * We want cmark to behave the same no matter what the system locale.
+ */
+
+int isspace(char c);
+
+int ispunct(char c);
+
+int isalnum(char c);
+
+int isdigit(char c);
diff --git a/src/inlines.c b/src/inlines.c
@@ -1,8 +1,8 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
-#include <ctype.h>
 
+#include "cmark_ctype.h"
 #include "config.h"
 #include "node.h"
 #include "parser.h"
diff --git a/src/utf8.c b/src/utf8.c
@@ -2,6 +2,7 @@
 #include <stdint.h>
 #include <assert.h>
 
+#include "cmark_ctype.h"
 #include "utf8.h"
 
 static const int8_t utf8proc_utf8class[256] = {
@@ -268,17 +269,7 @@ int utf8proc_is_space(int32_t uc)
 // matches anything in the P[cdefios] classes.
 int utf8proc_is_punctuation(int32_t uc)
 {
-	return ((uc >= 33 && uc <= 35) ||
-		(uc >= 37 && uc <= 42) ||
-		(uc >= 44 && uc <= 47) ||
-		uc == 58 ||
-		uc == 59 ||
-		uc == 63 ||
-		uc == 64 ||
-		(uc >= 91 && uc <= 93) ||
-		uc == 95 ||
-		uc == 123 ||
-		uc == 125 ||
+	return ((uc < 128 && ispunct((char)uc)) ||
 		uc == 161 ||
 		uc == 167 ||
 		uc == 171 ||