cmark

My personal build of CMark ✏️

Commit
bb26b18173df983c57459809e8b1691b89907a58
Parent
c6417fc0b9cd240eb175501d44f68ea9d4406ec4
Author
John MacFarlane <jgm@berkeley.edu>
Date

Added CMARK_OPT_SMARTPUNCT and --smart option.

So far this is only implemented for the HTML renderer.

Ultimately some of this should be factored out into a form that can be used in other renderers.

Diffstat

4 files changed, 106 insertions, 3 deletions

Status File Name N° Changes Insertions Deletions
Modified man/man3/cmark.3 14 13 1
Modified src/cmark.h 4 4 0
Modified src/html.c 88 86 2
Modified src/main.c 3 3 0
diff --git a/man/man3/cmark.3 b/man/man3/cmark.3
@@ -1,4 +1,4 @@
-.TH cmark 3 "January 28, 2015" "LOCAL" "Library Functions Manual"
+.TH cmark 3 "February 14, 2015" "LOCAL" "Library Functions Manual"
 .SH
 NAME
 .PP
@@ -520,6 +520,18 @@ Render \f[C]softbreak\f[] elements as hard line breaks.
 .PP
 Normalize tree by consolidating adjacent text nodes.
 
+.PP
+.nf
+\fC
+.RS 0n
+#define CMARK_OPT_SMARTPUNCT 8
+.RE
+\f[]
+.fi
+
+.PP
+Convert straight quotes to curly, \-\-\- to em dashes, \-\- to en dashes.
+
 .SS
 Version information
 
diff --git a/src/cmark.h b/src/cmark.h
@@ -496,6 +496,10 @@ char *cmark_render_man(cmark_node *root, long options);
  */
 #define CMARK_OPT_NORMALIZE 4
 
+/** Convert straight quotes to curly, --- to em dashes, -- to en dashes.
+ */
+#define CMARK_OPT_SMARTPUNCT 8
+
 /**
  * ## Version information
  */
diff --git a/src/html.c b/src/html.c
@@ -6,6 +6,7 @@
 #include "config.h"
 #include "cmark.h"
 #include "node.h"
+#include "utf8.h"
 #include "buffer.h"
 #include "houdini.h"
 
@@ -60,6 +61,10 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,
 	char start_header[] = "<h0";
 	char end_header[] = "</h0";
 	bool tight;
+	int lastout, i;
+	cmark_chunk lit;
+	char before_char, after_char, c;
+	bool left_flanking, right_flanking;
 
 	bool entering = (ev_type == CMARK_EVENT_ENTER);
 
@@ -217,8 +222,87 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,
 		break;
 
 	case CMARK_NODE_TEXT:
-		escape_html(html, node->as.literal.data,
-		            node->as.literal.len);
+		if (options & CMARK_OPT_SMARTPUNCT) {
+			lastout = 0;
+			i = 0;
+			lit = node->as.literal;
+			while (i < lit.len) {
+				c = lit.data[i];
+				// replace with efficient lookup table:
+				if (c != '"' && c != '-' && c != '\'' && c != '.') {
+					i++;
+					continue;
+				}
+				escape_html(html, lit.data + lastout,
+					    i - lastout);
+				if (c == '\'' || c == '"') {
+					before_char = i == 0 ? ',' : lit.data[i - 1];
+					after_char = i == lit.len - 1 ? ',' : lit.data[i + 1];
+					left_flanking = !utf8proc_is_space(after_char) &&
+						!(utf8proc_is_punctuation(after_char) &&
+						  !utf8proc_is_space(before_char) &&
+						  !utf8proc_is_punctuation(before_char));
+					right_flanking = !utf8proc_is_space(before_char) &&
+						!(utf8proc_is_punctuation(before_char) &&
+						  !utf8proc_is_space(after_char) &&
+						  !utf8proc_is_punctuation(after_char));
+				}
+				switch (lit.data[i]) {
+				case '"':
+					if (right_flanking) {
+						cmark_strbuf_puts(html, "&rdquo;");
+					} else {
+						cmark_strbuf_puts(html, "&ldquo;");
+					}
+					i += 1;
+					break;
+				case '\'':
+					if (left_flanking && !right_flanking) {
+						cmark_strbuf_puts(html, "&lsquo;");
+					} else {
+						cmark_strbuf_puts(html, "&rsquo;");
+					}
+					i += 1;
+					break;
+				case '-':
+					if (i < lit.len - 1 && lit.data[i + 1] == '-') {
+						if (lit.data[i + 2] == '-') {
+							cmark_strbuf_puts(html,
+									  "&mdash;");
+							i += 3;
+						} else {
+							cmark_strbuf_puts(html, "&ndash;");
+							i += 2;
+						}
+					} else {
+						cmark_strbuf_putc(html, c);
+						i += 1;
+					}
+					break;
+				case '.':
+					if (i < lit.len - 2 && lit.data[i + 1] == '.' &&
+					    lit.data[i + 2] == '.') {
+							cmark_strbuf_puts(html,
+									  "&hellip;");
+							i += 3;
+					} else {
+						cmark_strbuf_putc(html, c);
+						i += 1;
+					}
+					break;
+				default:
+					cmark_strbuf_putc(html, c);
+					i++;
+				}
+				lastout = i;
+			}
+			escape_html(html, node->as.literal.data + lastout,
+				    i - lastout);
+
+		} else {
+			escape_html(html, node->as.literal.data,
+				    node->as.literal.len);
+		}
 		break;
 
 	case CMARK_NODE_LINEBREAK:
diff --git a/src/main.c b/src/main.c
@@ -26,6 +26,7 @@ void print_usage()
 	printf("  --to, -t FORMAT  Specify output format (html, xml, man)\n");
 	printf("  --sourcepos      Include source position attribute\n");
 	printf("  --hardbreaks     Treat newlines as hard line breaks\n");
+	printf("  --smart          Use smart punctuation\n");
 	printf("  --normalize      Consolidate adjacent text nodes\n");
 	printf("  --help, -h       Print usage information\n");
 	printf("  --version        Print version\n");
@@ -80,6 +81,8 @@ int main(int argc, char *argv[])
 			options |= CMARK_OPT_SOURCEPOS;
 		} else if (strcmp(argv[i], "--hardbreaks") == 0) {
 			options |= CMARK_OPT_HARDBREAKS;
+		} else if (strcmp(argv[i], "--smart") == 0) {
+			options |= CMARK_OPT_SMARTPUNCT;
 		} else if (strcmp(argv[i], "--normalize") == 0) {
 			options |= CMARK_OPT_NORMALIZE;
 		} else if ((strcmp(argv[i], "--help") == 0) ||