cmark

My personal build of CMark ✏️

Commit
158bbebe1a0eede2122feecd6f6b5aee9a53468d
Parent
a5fa2d573185bcc565da89effcfbfdc2967ef939
Author
John MacFarlane <jgm@berkeley.edu>
Date

Removed artificial rule for emph/strong markers.

Previously there was a rule that nothing in a string of more than 3 `*` or `_` characters could close or start emphasis. This was artificial and led to strange asymmetries, e.g. you could have `*a *b**` emph within emph but not `**a **b****` strong within strong.

The new parsing strategy makes it easy to remove this limitation.

The spec and the JS and C implementations have been updated. The spec might need some further grooming.

Diffstat

3 files changed, 82 insertions, 76 deletions

Status File Name N° Changes Insertions Deletions
Modified js/lib/inlines.js 40 18 22
Modified spec.txt 104 55 49
Modified src/inlines.c 14 9 5
diff --git a/js/lib/inlines.js b/js/lib/inlines.js
@@ -235,8 +235,8 @@ var scanDelims = function(cc) {
         char_after = fromCodePoint(cc_after);
     }
 
-    var can_open = numdelims > 0 && numdelims <= 3 && !(/\s/.test(char_after));
-    var can_close = numdelims > 0 && numdelims <= 3 && !(/\s/.test(char_before));
+    var can_open = numdelims > 0 && !(/\s/.test(char_after));
+    var can_close = numdelims > 0 && !(/\s/.test(char_before));
     if (cc === C_UNDERSCORE) {
         can_open = can_open && !((/[a-z0-9]/i).test(char_before));
         can_close = can_close && !((/[a-z0-9]/i).test(char_after));
@@ -265,6 +265,7 @@ var parseEmphasis = function(cc,inlines) {
 
     var res = this.scanDelims(cc);
     var numdelims = res.numdelims;
+    var usedelims;
 
     if (numdelims === 0) {
         this.pos = startpos;
@@ -279,41 +280,36 @@ var parseEmphasis = function(cc,inlines) {
 
         if (opener.cc === cc) { // we have a match!
 
-          if (opener.numdelims <= numdelims) { // all openers used
-
-            this.pos += opener.numdelims;
-            var X;
-            switch (opener.numdelims) {
-            case 3:
-                X = function(x) { return Strong([Emph(x)]); };
-                break;
-            case 2:
-                X = Strong;
-                break;
-            case 1:
-            default:
-                X = Emph;
-                break;
-            }
+          if (numdelims < 3 || opener.numdelims < 3) {
+                usedelims = numdelims <= opener.numdelims ? numdelims : opener.numdelims;
+          } else { // numdelims >= 3 && opener.numdelims >= 3
+                usedelims = numdelims % 2 === 0 ? 2 : 1;
+          }
+          var X = usedelims === 1 ? Emph : Strong;
+
+          if (opener.numdelims == usedelims) { // all openers used
+
+            this.pos += usedelims;
             inlines[opener.pos] = X(inlines.slice(opener.pos + 1));
             inlines.splice(opener.pos + 1, inlines.length - (opener.pos + 1));
             // Remove entries after this, to prevent overlapping nesting:
             this.emphasis_openers = opener.previous;
             return true;
 
-          } else if (opener.numdelims > numdelims) { // only some openers used
+          } else if (opener.numdelims > usedelims) { // only some openers used
 
-            this.pos += numdelims;
-            opener.numdelims -= numdelims;
+            this.pos += usedelims;
+            opener.numdelims -= usedelims;
             inlines[opener.pos].c =
               inlines[opener.pos].c.slice(0, opener.numdelims);
-            var X = numdelims === 2 ? Strong : Emph;
             inlines[opener.pos + 1] = X(inlines.slice(opener.pos + 1));
             inlines.splice(opener.pos + 2, inlines.length - (opener.pos + 2));
             // Remove entries after this, to prevent overlapping nesting:
             this.emphasis_openers = opener;
             return true;
 
+          } else { // usedelims > opener.numdelims, should never happen
+            throw new Error("Logic error: usedelims > opener.numdelims");
           }
 
         }
diff --git a/spec.txt b/spec.txt
@@ -4250,60 +4250,52 @@ for efficient parsing strategies that do not backtrack:
 1.  A single `*` character [can open emphasis](#can-open-emphasis)
     <a id="can-open-emphasis"></a> iff
 
-    (a) it is not part of a sequence of four or more unescaped `*`s,
-    (b) it is not followed by whitespace, and
-    (c) either it is not followed by a `*` character or it is
+    (a) it is not followed by whitespace, and
+    (b) either it is not followed by a `*` character or it is
         followed immediately by emphasis or strong emphasis.
 
 2.  A single `_` character [can open emphasis](#can-open-emphasis) iff
 
-    (a) it is not part of a sequence of four or more unescaped `_`s,
-    (b) it is not followed by whitespace,
-    (c) it is not preceded by an ASCII alphanumeric character, and
-    (d) either it is not followed by a `_` character or it is
+    (a) it is not followed by whitespace,
+    (b) it is not preceded by an ASCII alphanumeric character, and
+    (c) either it is not followed by a `_` character or it is
         followed immediately by emphasis or strong emphasis.
 
 3.  A single `*` character [can close emphasis](#can-close-emphasis)
     <a id="can-close-emphasis"></a> iff
 
-    (a) it is not part of a sequence of four or more unescaped `*`s, and
     (b) it is not preceded by whitespace.
 
 4.  A single `_` character [can close emphasis](#can-close-emphasis) iff
 
-    (a) it is not part of a sequence of four or more unescaped `_`s,
-    (b) it is not preceded by whitespace, and
-    (c) it is not followed by an ASCII alphanumeric character.
+    (a) it is not preceded by whitespace, and
+    (b) it is not followed by an ASCII alphanumeric character.
 
 5.  A double `**` [can open strong emphasis](#can-open-strong-emphasis)
     <a id="can-open-strong-emphasis" ></a> iff
 
-    (a) it is not part of a sequence of four or more unescaped `*`s,
-    (b) it is not followed by whitespace, and
-    (c) either it is not followed by a `*` character or it is
+    (a) it is not followed by whitespace, and
+    (b) either it is not followed by a `*` character or it is
         followed immediately by emphasis.
 
 6.  A double `__` [can open strong emphasis](#can-open-strong-emphasis)
     iff
 
-    (a) it is not part of a sequence of four or more unescaped `_`s,
-    (b) it is not followed by whitespace, and
-    (c) it is not preceded by an ASCII alphanumeric character, and
-    (d) either it is not followed by a `_` character or it is
+    (a) it is not followed by whitespace, and
+    (b) it is not preceded by an ASCII alphanumeric character, and
+    (c) either it is not followed by a `_` character or it is
         followed immediately by emphasis.
 
 7.  A double `**` [can close strong emphasis](#can-close-strong-emphasis)
     <a id="can-close-strong-emphasis" ></a> iff
 
-    (a) it is not part of a sequence of four or more unescaped `*`s, and
-    (b) it is not preceded by whitespace.
+    (a) it is not preceded by whitespace.
 
 8.  A double `__` [can close strong emphasis](#can-close-strong-emphasis)
     iff
 
-    (a) it is not part of a sequence of four or more unescaped `_`s,
-    (b) it is not preceded by whitespace, and
-    (c) it is not followed by an ASCII alphanumeric character.
+    (a) it is not preceded by whitespace, and
+    (b) it is not followed by an ASCII alphanumeric character.
 
 9.  Emphasis begins with a delimiter that [can open
     emphasis](#can-open-emphasis) and ends with a delimiter that [can close
@@ -4544,19 +4536,13 @@ and __foo bar __
 <p>and __foo bar __</p>
 .
 
-The rules imply that a sequence of four or more unescaped `*` or
-`_` characters will always be parsed as a literal string:
-
-.
-****hi****
-.
-<p>****hi****</p>
-.
+The rules imply that a sequence of `*` or `_` characters
+surrounded by whitespace will be parsed as a literal string:
 
 .
-_____hi_____
+foo ********
 .
-<p>_____hi_____</p>
+<p>foo ********</p>
 .
 
 .
@@ -4827,8 +4813,7 @@ the internal delimiters [can close emphasis](#can-close-emphasis),
 while in the cases with spaces, they cannot.
 
 Note that you cannot nest emphasis directly inside emphasis
-using the same delimeter, or strong emphasis directly inside
-strong emphasis:
+using the same delimiter:
 
 .
 **foo**
@@ -4836,22 +4821,25 @@ strong emphasis:
 <p><strong>foo</strong></p>
 .
 
+For this, you need to switch delimiters:
+
 .
-****foo****
+*_foo_*
 .
-<p>****foo****</p>
+<p><em><em>foo</em></em></p>
 .
 
-For these nestings, you need to switch delimiters:
+Strong within strong is possible without switching
+delimiters:
 
 .
-*_foo_*
+****foo****
 .
-<p><em><em>foo</em></em></p>
+<p><strong><strong>foo</strong></strong></p>
 .
 
 .
-**__foo__**
+____foo____
 .
 <p><strong><strong>foo</strong></strong></p>
 .
@@ -4890,21 +4878,19 @@ similarly for `_` and `__`):
 <p><em><strong>foo</strong> bar</em>**</p>
 .
 
-The following contains no strong emphasis, because the opening
-delimiter is closed by the first `*` before `bar`:
-
 .
-*foo**bar***
+*foo****
 .
-<p><em>foo</em><em>bar</em>**</p>
+<p><em>foo</em>***</p>
 .
 
-However, a string of four or more `****` can never close emphasis:
+The following contains no strong emphasis, because the opening
+delimiter is closed by the first `*` before `bar`:
 
 .
-*foo****
+*foo**bar***
 .
-<p>*foo****</p>
+<p><em>foo</em><em>bar</em>**</p>
 .
 
 We retain symmetry in these cases:
@@ -4927,6 +4913,26 @@ We retain symmetry in these cases:
 <p><em><em>foo</em> bar</em></p>
 .
 
+.
+**foo***
+
+***foo**
+.
+<p><strong>foo</strong>*</p>
+<p>*<strong>foo</strong></p>
+.
+
+.
+**foo **bar****
+
+****foo** bar**
+.
+<p><strong>foo <strong>bar</strong></strong></p>
+<p><strong><strong>foo</strong> bar</strong></p>
+.
+
+
+
 More cases with mismatched delimiters:
 
 .
diff --git a/src/inlines.c b/src/inlines.c
@@ -297,8 +297,8 @@ static int scan_delims(subject* subj, unsigned char c, bool * can_open, bool * c
 		advance(subj);
 	}
 	char_after = peek_char(subj);
-	*can_open = numdelims > 0 && numdelims <= 3 && !isspace(char_after);
-	*can_close = numdelims > 0 && numdelims <= 3 && !isspace(char_before);
+	*can_open = numdelims > 0 && !isspace(char_after);
+	*can_close = numdelims > 0 && !isspace(char_before);
 	if (c == '_') {
 		*can_open = *can_open && !isalnum(char_before);
 		*can_close = *can_close && !isalnum(char_after);
@@ -324,6 +324,7 @@ static node_inl* handle_strong_emph(subject* subj, unsigned char c, node_inl **l
 	bool can_open, can_close;
 	int numdelims;
 	int useDelims;
+	int openerDelims;
 	inline_stack * istack;
 	node_inl * inl;
 	node_inl * emph;
@@ -347,9 +348,12 @@ static node_inl* handle_strong_emph(subject* subj, unsigned char c, node_inl **l
 		}
 
 		// calculate the actual number of delimeters used from this closer
-		useDelims = istack->delim_count;
-		if (useDelims == 3) useDelims = numdelims == 3 ? 1 : numdelims;
-		else if (useDelims > numdelims) useDelims = 1;
+		openerDelims = istack->delim_count;
+		if (numdelims < 3 || openerDelims < 3) {
+		    useDelims = numdelims <= openerDelims ? numdelims : openerDelims;
+		} else { // (numdelims >= 3 && openerDelims >= 3)
+		    useDelims = numdelims % 2 == 0 ? 2 : 1;
+		}
 
 		if (istack->delim_count == useDelims)
 		{