cmark

My personal build of CMark ✏️

Commit
27373892cb98a2a6a1d35fba28798d9117fff58f
Parent
376f81ab8aa017ab01040e10d393d7682674562d
Author
John MacFarlane <jgm@berkeley.edu>
Date

Moved handling of --smart from renderer to parser.

This allows backslash escapes to disable smart quote transformations in particular cases.

Closes #8.

Diffstat

7 files changed, 113 insertions, 247 deletions

Status File Name N° Changes Insertions Deletions
Modified src/CMakeLists.txt 2 0 2
Modified src/html.c 10 1 9
Modified src/inlines.c 105 94 11
Modified src/man.c 17 6 11
Deleted src/smart.c 174 0 174
Deleted src/smart.h 28 0 28
Modified test/smart_punct.txt 24 12 12
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -16,7 +16,6 @@ set(HEADERS
   html_unescape.h
   houdini.h
   cmark_ctype.h
-  smart.h
   )
 set(LIBRARY_SOURCES
   cmark.c
@@ -37,7 +36,6 @@ set(LIBRARY_SOURCES
   houdini_html_e.c
   houdini_html_u.c
   cmark_ctype.c
-  smart.c
   ${HEADERS}
   )
 
diff --git a/src/html.c b/src/html.c
@@ -9,7 +9,6 @@
 #include "utf8.h"
 #include "buffer.h"
 #include "houdini.h"
-#include "smart.h"
 
 // Functions to convert cmark_nodes to HTML strings.
 
@@ -219,14 +218,7 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,
 		break;
 
 	case CMARK_NODE_TEXT:
-		if (options & CMARK_OPT_SMARTPUNCT) {
-			escape_with_smart(html, node, escape_html,
-					  "&ldquo;", "&rdquo;", "&lsquo;", "&rsquo;",
-					  "&mdash;", "&ndash;", "&hellip;");
-		} else {
-			escape_html(html, node->as.literal.data,
-			            node->as.literal.len);
-		}
+		escape_html(html, node->as.literal.data, node->as.literal.len);
 		break;
 
 	case CMARK_NODE_LINEBREAK:
diff --git a/src/inlines.c b/src/inlines.c
@@ -271,6 +271,9 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close)
 	while (peek_char(subj) == c) {
 		numdelims++;
 		advance(subj);
+		if (c == '\'' || c == '"') {
+			break;  // limit to 1 delim for quotes
+		}
 	}
 
 	len = utf8proc_iterate(subj->input.data + subj->pos,
@@ -289,6 +292,9 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close)
 	if (c == '_') {
 		*can_open = left_flanking && !right_flanking;
 		*can_close = right_flanking && !left_flanking;
+	} else if (c == '\'' || c == '"') {
+		*can_open = left_flanking && !right_flanking;
+		*can_close = right_flanking;
 	} else {
 		*can_open = left_flanking;
 		*can_close = right_flanking;
@@ -349,25 +355,68 @@ static void push_delimiter(subject *subj, unsigned char c, bool can_open,
 	subj->last_delim = delim;
 }
 
-// Parse strong/emph or a fallback.
-// Assumes the subject has '_' or '*' at the current position.
-static cmark_node* handle_strong_emph(subject* subj, unsigned char c)
+// Assumes the subject has a c at the current position.
+static cmark_node* handle_delim(subject* subj, unsigned char c, bool smart)
 {
 	int numdelims;
 	cmark_node * inl_text;
 	bool can_open, can_close;
+	cmark_chunk contents;
 
 	numdelims = scan_delims(subj, c, &can_open, &can_close);
 
-	inl_text = make_str(cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims));
+	if (c == '\'' && smart) {
+		contents = cmark_chunk_literal("’");
+	} else if (c == '"' && smart) {
+		contents = cmark_chunk_literal("”");
+	} else {
+		contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims);
+	}
+
+	inl_text = make_str(contents);
 
-	if (can_open || can_close) {
+	if ((can_open || can_close) &&
+	    (!(c == '\'' || c == '"') || smart)) {
 		push_delimiter(subj, c, can_open, can_close, inl_text);
 	}
 
 	return inl_text;
 }
 
+// Assumes we have a hyphen at the current position.
+static cmark_node* handle_hyphen(subject* subj, bool smart)
+{
+	advance(subj);
+	if (smart && peek_char(subj) == '-') {
+		advance(subj);
+		if (peek_char(subj) == '-') {
+			advance(subj);
+			return make_str(cmark_chunk_literal("—"));
+		} else {
+			return make_str(cmark_chunk_literal("–"));
+		}
+	} else {
+		return make_str(cmark_chunk_literal("-"));
+	}
+}
+
+// Assumes we have a period at the current position.
+static cmark_node* handle_period(subject* subj, bool smart)
+{
+	advance(subj);
+	if (smart && peek_char(subj) == '.') {
+		advance(subj);
+		if (peek_char(subj) == '.') {
+			advance(subj);
+			return make_str(cmark_chunk_literal("…"));
+		} else {
+			return make_str(cmark_chunk_literal(".."));
+		}
+	} else {
+		return make_str(cmark_chunk_literal("."));
+	}
+}
+
 static void process_emphasis(subject *subj, delimiter *start_delim)
 {
 	delimiter *closer = subj->last_delim;
@@ -381,7 +430,8 @@ static void process_emphasis(subject *subj, delimiter *start_delim)
 	// now move forward, looking for closers, and handling each
 	while (closer != NULL) {
 		if (closer->can_close &&
-		    (closer->delim_char == '*' || closer->delim_char == '_')) {
+		    (closer->delim_char == '*' || closer->delim_char == '_' ||
+		     closer->delim_char == '"' || closer->delim_char == '\'')) {
 			// Now look backwards for first matching opener:
 			opener = closer->previous;
 			while (opener != NULL && opener != start_delim) {
@@ -391,9 +441,31 @@ static void process_emphasis(subject *subj, delimiter *start_delim)
 				}
 				opener = opener->previous;
 			}
-			if (opener != NULL && opener != start_delim) {
-				closer = S_insert_emph(subj, opener, closer);
-			} else {
+			if (closer->delim_char == '*' || closer->delim_char == '_') {
+				if (opener != NULL && opener != start_delim) {
+					closer = S_insert_emph(subj, opener, closer);
+				} else {
+					closer = closer->next;
+				}
+			} else if (closer->delim_char == '\'') {
+				cmark_chunk_free(&closer->inl_text->as.literal);
+				closer->inl_text->as.literal =
+					cmark_chunk_literal("’");
+				if (opener != NULL && opener != start_delim) {
+					cmark_chunk_free(&opener->inl_text->as.literal);
+					opener->inl_text->as.literal =
+						cmark_chunk_literal("‘");
+				}
+				closer = closer->next;
+			} else if (closer->delim_char == '"') {
+				cmark_chunk_free(&closer->inl_text->as.literal);
+				closer->inl_text->as.literal =
+					cmark_chunk_literal("”");
+				if (opener != NULL && opener != start_delim) {
+					cmark_chunk_free(&opener->inl_text->as.literal);
+					opener->inl_text->as.literal =
+						cmark_chunk_literal("“");
+				}
 				closer = closer->next;
 			}
 		} else {
@@ -866,7 +938,7 @@ static int subject_find_special_char(subject *subj, long options)
 	};
 
 	// " ' . -
-	static const char SMART_PUNCT_TABLE[] = {
+	static const char SMART_PUNCT_CHARS[] = {
 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 		0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
@@ -890,6 +962,9 @@ static int subject_find_special_char(subject *subj, long options)
 	while (n < subj->input.len) {
 		if (SPECIAL_CHARS[subj->input.data[n]])
 			return n;
+		if (options & CMARK_OPT_SMARTPUNCT &&
+		    SMART_PUNCT_CHARS[subj->input.data[n]])
+			return n;
 		n++;
 	}
 
@@ -926,7 +1001,15 @@ static int parse_inline(subject* subj, cmark_node * parent, long options)
 		break;
 	case '*':
 	case '_':
-		new_inl = handle_strong_emph(subj, c);
+	case '\'':
+	case '"':
+		new_inl = handle_delim(subj, c, options & CMARK_OPT_SMARTPUNCT);
+		break;
+	case '-':
+		new_inl = handle_hyphen(subj, options & CMARK_OPT_SMARTPUNCT);
+		break;
+	case '.':
+		new_inl = handle_period(subj, options & CMARK_OPT_SMARTPUNCT);
 		break;
 	case '[':
 		advance(subj);
diff --git a/src/man.c b/src/man.c
@@ -7,10 +7,11 @@
 #include "cmark.h"
 #include "node.h"
 #include "buffer.h"
-#include "smart.h"
 
 // Functions to convert cmark_nodes to groff man strings.
 
+// TODO:  properly escape unicode punctuation used in smart mode:
+// "\\[lq]", "\\[rq]", "\\[oq]", "\\[cq]", "\\[em]", "\\[en]", "..."
 static void escape_man(cmark_strbuf *dest, const unsigned char *source, int length)
 {
 	int i;
@@ -47,7 +48,7 @@ struct render_state {
 
 static int
 S_render_node(cmark_node *node, cmark_event_type ev_type,
-              struct render_state *state, long options)
+              struct render_state *state)
 {
 	cmark_node *tmp;
 	cmark_strbuf *man = state->man;
@@ -166,14 +167,8 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,
 		break;
 
 	case CMARK_NODE_TEXT:
-		if (options & CMARK_OPT_SMARTPUNCT) {
-			escape_with_smart(man, node, escape_man,
-					  "\\[lq]", "\\[rq]", "\\[oq]", "\\[cq]",
-					  "\\[em]", "\\[en]", "...");
-		} else {
-			escape_man(man, node->as.literal.data,
-				   node->as.literal.len);
-		}
+		escape_man(man, node->as.literal.data,
+			   node->as.literal.len);
 		break;
 
 	case CMARK_NODE_LINEBREAK:
@@ -248,7 +243,7 @@ char *cmark_render_man(cmark_node *root, long options)
 
 	while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
 		cur = cmark_iter_get_node(iter);
-		S_render_node(cur, ev_type, &state, options);
+		S_render_node(cur, ev_type, &state);
 	}
 	result = (char *)cmark_strbuf_detach(&man);
 
diff --git a/src/smart.c b/src/smart.c
@@ -1,174 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-
-#include "config.h"
-#include "cmark.h"
-#include "node.h"
-#include "utf8.h"
-#include "buffer.h"
-#include "chunk.h"
-
-static const char SMART_PUNCT_TABLE[] = {
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-};
-
-void escape_with_smart(cmark_strbuf *buf,
-		       cmark_node *node,
-		       void (*escape)(cmark_strbuf *, const unsigned char *, int),
-		       const char *left_double_quote,
-		       const char *right_double_quote,
-		       const char *left_single_quote,
-		       const char *right_single_quote,
-		       const char *em_dash,
-		       const char *en_dash,
-		       const char *ellipses)
-{
-	char c;
-	int32_t after_char = 0;
-	int32_t before_char = 0;
-	bool left_flanking, right_flanking;
-	int lastout = 0;
-	int i = 0, j = 0;
-	cmark_chunk lit = node->as.literal;
-	int len;
-
-	while (i < lit.len) {
-		c = lit.data[i];
-		i++;
-		if (SMART_PUNCT_TABLE[(int)c] == 0) {
-			continue;
-		}
-
-		if (i - 1 - lastout > 0) {
-			(*escape)(buf, lit.data + lastout, i - 1 - lastout);
-		}
-
-		if (c == 34 || c == 39) {
-			if (i == 1) {
-                                // set before_char based on previous text node if there is one:
-				if (node->prev) {
-					if (node->prev->type == CMARK_NODE_TEXT) {
-
-						// walk to the beginning of the UTF_8 sequence:
-						j = node->prev->as.literal.len - 1;
-						while (j > 0 &&
-						       node->prev->as.literal.data[j] >> 6 == 2) {
-							j--;
-						}
-						len = utf8proc_iterate(node->prev->as.literal.data + i,
-								       node->prev->as.literal.len - i,
-								       &before_char);
-						if (len == -1) {
-							before_char = 10;
-						}
-
-					} else if (node->prev->type == CMARK_NODE_SOFTBREAK ||
-						   node->prev->type == CMARK_NODE_LINEBREAK) {
-						before_char = 10;
-
-					} else {
-						before_char = 65;
-					}
-				} else {
-					before_char = 10;
-				}
-			} else {
-				j = i - 2;
-				// walk back to the beginning of the UTF_8 sequence:
-				while (j > 0 && lit.data[j] >> 6 == 2) {
-					j--;
-				}
-				utf8proc_iterate(lit.data + j, lit.len - j, &before_char);
-			}
-
-			if (i >= lit.len) {
-				if (node->next) {
-					if (node->next->type == CMARK_NODE_TEXT) {
-						utf8proc_iterate(node->next->as.literal.data,
-								 node->next->as.literal.len,
-								 &after_char);
-					} else if (node->next->type == CMARK_NODE_SOFTBREAK ||
-						   node->next->type == CMARK_NODE_LINEBREAK) {
-						after_char = 10;
-					} else {
-						after_char = 65;
-					}
-				} else {
-					after_char = 10;
-				}
-			} else {
-				utf8proc_iterate(lit.data + i, lit.len - i, &after_char);
-			}
-
-			left_flanking = !utf8proc_is_space(after_char) &&
-				!(utf8proc_is_punctuation(after_char) &&
-				  !utf8proc_is_space(before_char) &&
-				  !utf8proc_is_punctuation(before_char));
-			right_flanking = !utf8proc_is_space(before_char) &&
-				!(utf8proc_is_punctuation(before_char) &&
-				  !utf8proc_is_space(after_char) &&
-				  !utf8proc_is_punctuation(after_char));
-		}
-
-		switch (c) {
-		case '"':
-			if (right_flanking) {
-				cmark_strbuf_puts(buf, right_double_quote);
-			} else {
-				cmark_strbuf_puts(buf, left_double_quote);
-			}
-			break;
-		case '\'':
-			if (left_flanking && !right_flanking) {
-				cmark_strbuf_puts(buf, left_single_quote);
-			} else {
-				cmark_strbuf_puts(buf, right_single_quote);
-			}
-			break;
-		case '-':
-			if (i < lit.len && lit.data[i] == '-') {
-				if (lit.data[i + 1] == '-') {
-					cmark_strbuf_puts(buf, em_dash);
-					i += 2;
-				} else {
-					cmark_strbuf_puts(buf, en_dash);
-					i += 1;
-				}
-			} else {
-				cmark_strbuf_putc(buf, c);
-			}
-			break;
-		case '.':
-			if (i < lit.len - 1 && lit.data[i] == '.' &&
-			    lit.data[i + 1] == '.') {
-				cmark_strbuf_puts(buf, ellipses);
-				i += 2;
-			} else {
-				cmark_strbuf_putc(buf, c);
-			}
-			break;
-		default:
-			cmark_strbuf_putc(buf, c);
-		}
-		lastout = i;
-	}
-	(*escape)(buf, node->as.literal.data + lastout, lit.len - lastout);
-
-}
diff --git a/src/smart.h b/src/smart.h
@@ -1,28 +0,0 @@
-#ifndef CMARK_SMART_H
-#define CMARK_SMART_H
-
-#include <stddef.h>
-#include <stdarg.h>
-#include "config.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void escape_with_smart(cmark_strbuf *buf,
-		       cmark_node *node,
-		       void (*escape)(cmark_strbuf *, const unsigned char *, int),
-		       const char *left_double_quote,
-		       const char *right_double_quote,
-		       const char *left_single_quote,
-		       const char *right_single_quote,
-		       const char *em_dash,
-		       const char *en_dash,
-		       const char *ellipses);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
-
diff --git a/test/smart_punct.txt b/test/smart_punct.txt
@@ -4,58 +4,58 @@
 "Hello," said the spider.
 "'Shelob' is my name."
 .
-<p>&ldquo;Hello,&rdquo; said the spider.
-&ldquo;&lsquo;Shelob&rsquo; is my name.&rdquo;</p>
+<p>“Hello,” said the spider.
+“‘Shelob’ is my name.”</p>
 .
 
 .
 'A', 'B', and 'C' are letters.
 .
-<p>&lsquo;A&rsquo;, &lsquo;B&rsquo;, and &lsquo;C&rsquo; are letters.</p>
+<p>‘A’, ‘B’, and ‘C’ are letters.</p>
 .
 
 .
 'Oak,' 'elm,' and 'beech' are names of trees.
 So is 'pine.'
 .
-<p>&lsquo;Oak,&rsquo; &lsquo;elm,&rsquo; and &lsquo;beech&rsquo; are names of trees.
-So is &lsquo;pine.&rsquo;</p>
+<p>‘Oak,’ ‘elm,’ and ‘beech’ are names of trees.
+So is ‘pine.’</p>
 .
 
 .
 'He said, "I want to go."'
 .
-<p>&lsquo;He said, &ldquo;I want to go.&rdquo;&rsquo;</p>
+<p>‘He said, “I want to go.”’</p>
 .
 
 .
 Were you alive in the 70's?
 .
-<p>Were you alive in the 70&rsquo;s?</p>
+<p>Were you alive in the 70’s?</p>
 .
 
 .
 Here is some quoted '`code`' and a "[quoted link](url)".
 .
-<p>Here is some quoted &lsquo;<code>code</code>&rsquo; and a &ldquo;<a href="url">quoted link</a>&rdquo;.</p>
+<p>Here is some quoted ‘<code>code</code>’ and a “<a href="url">quoted link</a>”.</p>
 .
 
 .
 Some dashes:  one---two ---
 three---four --- five.
 .
-<p>Some dashes:  one&mdash;two &mdash;
-three&mdash;four &mdash; five.</p>
+<p>Some dashes:  one—two —
+three—four — five.</p>
 .
 
 .
 Dashes between numbers: 5--7, 255--66, 1987--1999.
 .
-<p>Dashes between numbers: 5&ndash;7, 255&ndash;66, 1987&ndash;1999.</p>
+<p>Dashes between numbers: 5–7, 255–66, 1987–1999.</p>
 .
 
 .
 Ellipses...and...and....
 .
-<p>Ellipses&hellip;and&hellip;and&hellip;.</p>
+<p>Ellipses…and…and….</p>
 .