cmark

My personal build of CMark ✏️

Commit
14b997d9350b3ee3f6d67fb12b470bf406d4a31b
Parent
b34e19cd2f32342fafd6ae76de4e537240784f71
Author
John MacFarlane <jgm@berkeley.edu>
Date

Changed rule for `_` emphasis and strong emphasis.

To prevent intra-word emphasis, we used to check whether an opening `_` delimiter run was preceded, or a closing one followed, by an ASCII alphanumeric character.

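We now do something more elegant: whereas an opening `*` must be left-flanking, an opening `_` must be left-flanking *and not right-flanking*. Symmetrically, a closing `_` must be right-flanking and not left-flanking, and the same conditions apply to `__` for strong emphasis.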

All the original tests pass except a few containing Russian text with word-internal `_`: these inputs formerly produced emphasis but no longer do under the new rule. Those tests have been adjusted, and a few new test cases have been added to illustrate the rule.

The C and JS implementations have both been updated.

Diffstat

3 files changed, 46 insertions, 24 deletions

Status File Name N° Changes Insertions Deletions
Modified js/lib/inlines.js 16 10 6
Modified spec.txt 40 28 12
Modified src/inlines.c 14 8 6
diff --git a/js/lib/inlines.js b/js/lib/inlines.js
@@ -87,8 +87,6 @@ var reFinalSpace = / *$/;
 
 var reInitialSpace = /^ */;
 
-var reAsciiAlnum = /[a-z0-9]/i;
-
 var reLinkLabel = /^\[(?:[^\\\[\]]|\\[\[\]]){0,1000}\]/;
 
 // Matches a string of non-special characters.
@@ -238,6 +236,7 @@ var scanDelims = function(cc) {
     var numdelims = 0;
     var char_before, char_after, cc_after;
     var startpos = this.pos;
+    var left_flanking, right_flanking, can_open, can_close;
 
     char_before = this.pos === 0 ? '\n' :
         this.subject.charAt(this.pos - 1);
@@ -254,17 +253,22 @@ var scanDelims = function(cc) {
         char_after = fromCodePoint(cc_after);
     }
 
-    var can_open = numdelims > 0 && !(reWhitespaceChar.test(char_after)) &&
+    left_flanking = numdelims > 0 &&
+            !(reWhitespaceChar.test(char_after)) &&
             !(rePunctuation.test(char_after) &&
              !(/\s/.test(char_before)) &&
              !(rePunctuation.test(char_before)));
-    var can_close = numdelims > 0 && !(reWhitespaceChar.test(char_before)) &&
+    right_flanking = numdelims > 0 &&
+            !(reWhitespaceChar.test(char_before)) &&
             !(rePunctuation.test(char_before) &&
               !(reWhitespaceChar.test(char_after)) &&
               !(rePunctuation.test(char_after)));
     if (cc === C_UNDERSCORE) {
-        can_open = can_open && !((reAsciiAlnum).test(char_before));
-        can_close = can_close && !((reAsciiAlnum).test(char_after));
+        can_open = left_flanking && !right_flanking;
+        can_close = right_flanking && !left_flanking;
+    } else {
+        can_open = left_flanking;
+        can_close = right_flanking;
     }
     this.pos = startpos;
     return { numdelims: numdelims,
diff --git a/spec.txt b/spec.txt
@@ -4547,28 +4547,28 @@ The following rules define emphasis and strong emphasis:
 
 2.  A single `_` character [can open emphasis] iff
     it is part of a [left-flanking delimiter run]
-    and is not preceded by an ASCII alphanumeric character.
+    and not part of a [right-flanking delimiter run].
 
 3.  A single `*` character [can close emphasis](@can-close-emphasis)
     iff it is part of a [right-flanking delimiter run].
 
 4.  A single `_` character [can close emphasis]
-    iff it is part of a [right-flanking delimiter run].
-    and it is not followed by an ASCII alphanumeric character.
+    iff it is part of a [right-flanking delimiter run]
+    and not part of a [left-flanking delimiter run].
 
 5.  A double `**` [can open strong emphasis](@can-open-strong-emphasis)
     iff it is part of a [left-flanking delimiter run].
 
 6.  A double `__` [can open strong emphasis]
     iff it is part of a [left-flanking delimiter run]
-    and is not preceded by an ASCII alphanumeric character.
+    and not part of a [right-flanking delimiter run].
 
 7.  A double `**` [can close strong emphasis](@can-close-strong-emphasis)
     iff it is part of a [right-flanking delimiter run].
 
 8.  A double `__` [can close strong emphasis]
     iff it is part of a [right-flanking delimiter run]
-    and is not followed by an ASCII alphanumeric character.
+    and not part of a [left-flanking delimiter run].
 
 9.  Emphasis begins with a delimiter that [can open emphasis] and ends
     with a delimiter that [can close emphasis], and that uses the same
@@ -4701,7 +4701,7 @@ a_"foo"_
 <p>a_&quot;foo&quot;_</p>
 .
 
-Emphasis with `_` is not allowed inside ASCII words:
+Emphasis with `_` is not allowed inside words:
 
 .
 foo_bar_
@@ -4715,12 +4715,28 @@ foo_bar_
 <p>5_6_78</p>
 .
 
-But it is permitted inside non-ASCII words:
-
 .
 пристаням_стремятся_
 .
-<p>пристаням<em>стремятся</em></p>
+<p>пристаням_стремятся_</p>
+.
+
+Here `_` does not generate emphasis, because the first delimiter run
+is right-flanking and the second left-flanking:
+
+.
+aa_"bb"_cc
+.
+<p>aa_&quot;bb&quot;_cc</p>
+.
+
+Here there is no emphasis, because the delimiter runs are
+both left- and right-flanking:
+
+.
+"aa"_"bb"_"cc"
+.
+<p>&quot;aa&quot;_&quot;bb&quot;_&quot;cc&quot;</p>
 .
 
 Rule 3:
@@ -4810,7 +4826,7 @@ _foo_bar
 .
 _пристаням_стремятся
 .
-<p><em>пристаням</em>стремятся</p>
+<p>_пристаням_стремятся</p>
 .
 
 .
@@ -4897,7 +4913,7 @@ foo__bar__
 .
 пристаням__стремятся__
 .
-<p>пристаням<strong>стремятся</strong></p>
+<p>пристаням__стремятся__</p>
 .
 
 .
@@ -5000,7 +5016,7 @@ __foo__bar
 .
 __пристаням__стремятся
 .
-<p><strong>пристаням</strong>стремятся</p>
+<p>__пристаням__стремятся</p>
 .
 
 .
diff --git a/src/inlines.c b/src/inlines.c
@@ -250,6 +250,7 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close)
 	int32_t after_char = 0;
 	int32_t before_char = 0;
 	int len;
+	bool left_flanking, right_flanking;
 
 	if (subj->pos == 0) {
 		before_char = 10;
@@ -277,19 +278,20 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close)
 	if (len == -1) {
 		after_char = 10;
 	}
-	*can_open = numdelims > 0 && !utf8proc_is_space(after_char) &&
+	left_flanking = numdelims > 0 && !utf8proc_is_space(after_char) &&
 	            !(utf8proc_is_punctuation(after_char) &&
 	              !utf8proc_is_space(before_char) &&
 	              !utf8proc_is_punctuation(before_char));
-	*can_close = numdelims > 0 && !utf8proc_is_space(before_char) &&
+	right_flanking = numdelims > 0 && !utf8proc_is_space(before_char) &&
 	             !(utf8proc_is_punctuation(before_char) &&
 	               !utf8proc_is_space(after_char) &&
 	               !utf8proc_is_punctuation(after_char));
 	if (c == '_') {
-		*can_open = *can_open && !(before_char < 128 &&
-		                           cmark_isalnum((char)before_char));
-		*can_close = *can_close && !(before_char < 128 &&
-		                             cmark_isalnum((char)after_char));
+		*can_open = left_flanking && !right_flanking;
+		*can_close = right_flanking && !left_flanking;
+	} else {
+		*can_open = left_flanking;
+		*can_close = right_flanking;
 	}
 	return numdelims;
 }