cmark
My personal build of CMark ✏️
- Commit
- 04726a7089e44e7ff4e6c552524841579a1053da
- Parent
- fb7af2f0d6ca845b33364c6ce9a704a458e31ff9
- Author
- John MacFarlane <jgm@berkeley.edu>
- Date
Added `CMARK_OPT_VALIDATE_UTF8` option.
Also added the command-line option `--validate-utf8`.
This option causes cmark to check for valid UTF-8,
replacing invalid sequences with the replacement
character, U+FFFD.
Reinstated the API tests for UTF-8.
Diffstat
6 files changed, 36 insertions, 9 deletions
diff --git a/api_test/main.c b/api_test/main.c
@@ -658,7 +658,7 @@ test_continuation_byte(test_batch_runner *runner, const char *utf8)
strcat(expected, "))))</p>\n");
char *html = cmark_markdown_to_html(buf, strlen(buf),
- CMARK_OPT_DEFAULT);
+ CMARK_OPT_VALIDATE_UTF8);
STR_EQ(runner, html, expected,
"invalid utf8 continuation byte %d/%d", pos, len);
free(html);
@@ -718,7 +718,7 @@ test_md_to_html(test_batch_runner *runner, const char *markdown,
const char *expected_html, const char *msg)
{
char *html = cmark_markdown_to_html(markdown, strlen(markdown),
- CMARK_OPT_DEFAULT);
+ CMARK_OPT_VALIDATE_UTF8);
STR_EQ(runner, html, expected_html, msg);
free(html);
}
@@ -737,7 +737,7 @@ int main() {
hierarchy(runner);
parser(runner);
render_html(runner);
- // utf8(runner);
+ utf8(runner);
line_endings(runner);
numeric_entities(runner);
test_cplusplus(runner);
diff --git a/man/man3/cmark.3 b/man/man3/cmark.3
@@ -1,4 +1,4 @@
-.TH cmark 3 "June 07, 2015" "LOCAL" "Library Functions Manual"
+.TH cmark 3 "June 16, 2015" "LOCAL" "Library Functions Manual"
.SH
NAME
.PP
@@ -403,10 +403,10 @@ Streaming interface:
cmark_parser *parser = cmark_parser_new(CMARK_OPT_DEFAULT);
FILE *fp = fopen("myfile.md", "r");
while ((bytes = fread(buffer, 1, sizeof(buffer), fp)) > 0) {
- cmark_parser_feed(parser, buffer, bytes);
- if (bytes < sizeof(buffer)) {
- break;
- }
+ cmark_parser_feed(parser, buffer, bytes);
+ if (bytes < sizeof(buffer)) {
+ break;
+ }
}
document = cmark_parser_finish(parser);
cmark_parser_free(parser);
@@ -539,6 +539,19 @@ Normalize tree by consolidating adjacent text nodes.
.PP
Convert straight quotes to curly, \-\-\- to em dashes, \-\- to en dashes.
+.PP
+.nf
+\fC
+.RS 0n
+#define CMARK_OPT_VALIDATE_UTF8 16
+.RE
+\f[]
+.fi
+
+.PP
+Validate UTF\-8 in the input before parsing, replacing illegal
+sequences with the replacement character U+FFFD.
+
.SS
Version information
diff --git a/src/blocks.c b/src/blocks.c
@@ -631,7 +631,11 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
cmark_chunk input;
bool maybe_lazy;
- cmark_strbuf_put(parser->curline, buffer, bytes);
+ if (parser->options & CMARK_OPT_VALIDATE_UTF8) {
+ utf8proc_check(parser->curline, buffer, bytes);
+ } else {
+ cmark_strbuf_put(parser->curline, buffer, bytes);
+ }
parser->offset = 0;
parser->column = 0;
parser->blank = false;