cmark

My personal build of CMark ✏️

Commit
e9f5a586938b926da932a9e957f801281dde4730
Parent
25f65e91293f1bfd74f81a78e2dac2cdbaa55e98
Author
John MacFarlane <jgm@berkeley.edu>
Date

New parseEmphasis algorithm.

- State machine for emphasis parsing.

- This would require some adjustments to the spec and spec examples.

- It currently blows the stack on `tricky'.

- Memoization code has been commented out.

- Inline parsers return arrays.

Diffstat

1 file changed, 228 insertions, 65 deletions

Status File Name N° Changes Insertions Deletions
Modified js/stmd.js 293 228 65
diff --git a/js/stmd.js b/js/stmd.js
@@ -166,15 +166,15 @@
         var match;
         while (!foundCode && (match = this.match(/`+/m))) {
             if (match == ticks) {
-                return { t: 'Code', c: this.subject.slice(afterOpenTicks,
+                return [{ t: 'Code', c: this.subject.slice(afterOpenTicks,
                                                           this.pos - ticks.length)
                          .replace(/[ \n]+/g,' ')
-                         .trim() };
+                          .trim() }];
             }
         }
         // If we got here, we didn't match a closing backtick sequence.
         this.pos = afterOpenTicks;
-        return { t: 'Str', c: ticks };
+        return [{ t: 'Str', c: ticks }];
     };
 
     // Parse a backslash-escaped special character, adding either the escaped
@@ -186,13 +186,13 @@
         if (subj[pos] === '\\') {
             if (subj[pos + 1] === '\n') {
                 this.pos = this.pos + 2;
-                return { t: 'Hardbreak' };
+                return [{ t: 'Hardbreak' }];
             } else if (reEscapable.test(subj[pos + 1])) {
                 this.pos = this.pos + 2;
-                return { t: 'Str', c: subj[pos + 1] };
+                return [{ t: 'Str', c: subj[pos + 1] }];
             } else {
                 this.pos++;
-                return {t: 'Str', c: '\\'};
+                return [{t: 'Str', c: '\\'}];
             }
         } else {
             return null;
@@ -205,14 +205,14 @@
         var dest;
         if ((m = this.match(/^<([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>/))) {  // email autolink
             dest = m.slice(1,-1);
-            return {t: 'Link',
-                    label: [{ t: 'Str', c: dest }],
-                    destination: 'mailto:' + dest };
+            return [{t: 'Link',
+                     label: [{ t: 'Str', c: dest }],
+                     destination: 'mailto:' + dest }];
         } else if ((m = this.match(/^<(?:coap|doi|javascript|aaa|aaas|about|acap|cap|cid|crid|data|dav|dict|dns|file|ftp|geo|go|gopher|h323|http|https|iax|icap|im|imap|info|ipp|iris|iris.beep|iris.xpc|iris.xpcs|iris.lwz|ldap|mailto|mid|msrp|msrps|mtqp|mupdate|news|nfs|ni|nih|nntp|opaquelocktoken|pop|pres|rtsp|service|session|shttp|sieve|sip|sips|sms|snmp|soap.beep|soap.beeps|tag|tel|telnet|tftp|thismessage|tn3270|tip|tv|urn|vemmi|ws|wss|xcon|xcon-userid|xmlrpc.beep|xmlrpc.beeps|xmpp|z39.50r|z39.50s|adiumxtra|afp|afs|aim|apt|attachment|aw|beshare|bitcoin|bolo|callto|chrome|chrome-extension|com-eventbrite-attendee|content|cvs|dlna-playsingle|dlna-playcontainer|dtn|dvb|ed2k|facetime|feed|finger|fish|gg|git|gizmoproject|gtalk|hcp|icon|ipn|irc|irc6|ircs|itms|jar|jms|keyparc|lastfm|ldaps|magnet|maps|market|message|mms|ms-help|msnim|mumble|mvn|notes|oid|palm|paparazzi|platform|proxy|psyc|query|res|resource|rmi|rsync|rtmp|secondlife|sftp|sgn|skype|smb|soldat|spotify|ssh|steam|svn|teamspeak|things|udp|unreal|ut2004|ventrilo|view-source|webcal|wtai|wyciwyg|xfire|xri|ymsgr):[^<>\x00-\x20]*>/i))) {
             dest = m.slice(1,-1);
-            return { t: 'Link',
-                     label: [{ t: 'Str', c: dest }],
-                     destination: dest };
+            return [{ t: 'Link',
+                      label: [{ t: 'Str', c: dest }],
+                      destination: dest }];
         } else {
             return null;
         }
@@ -222,7 +222,7 @@
     var parseHtmlTag = function() {
         var m = this.match(reHtmlTag);
         if (m) {
-            return { t: 'Html', c: m };
+            return [{ t: 'Html', c: m }];
         } else {
             return null;
         }
@@ -285,60 +285,219 @@
 
         if (numdelims >= 4 || !res.can_open) {
             this.pos += numdelims;
-            return {t: 'Str', c: this.subject.slice(startpos, startpos + numdelims)};
+            return [{t: 'Str', c: this.subject.slice(startpos, startpos + numdelims)}];
         }
 
         this.pos += numdelims;
 
         var next_inline;
-        var last_emphasis_closer = null;
+        var first = [];
+        var second = [];
+        var current = first;
+        var state = 0;
 
-        var delims_to_match = numdelims;
+        if (numdelims === 3) {
+            state = 1;
+        } else if (numdelims === 2) {
+            state = 2;
+        } else if (numdelims === 1) {
+            state = 3;
+        }
 
-        // We need not look for closers if we have already recorded that
-        // there are no closers past this point.
-        while (this.last_emphasis_closer === null ||
-               this.last_emphasis_closer >= this.pos) {
+        while (true) {
             res = this.scanDelims(c);
-            numclosedelims = res.numdelims;
-            if (res.can_close) {
-                if (last_emphasis_closer === null ||
-                    last_emphasis_closer < this.pos) {
-                    last_emphasis_closer = this.pos;
+
+            switch (state) {
+            case 1: // ***a
+                if (res.numdelims === 3 && res.can_close) {
+                    this.pos += 3;
+                    return [{t: 'Strong', c: [{t: 'Emph', c: first}]}];
+                } else if (res.numdelims === 2 && res.can_close) {
+                    this.pos += 2;
+                    current = second;
+                    state = res.can_open ? 4 : 6;
+                    continue;
+                } else if (res.numdelims === 1 && res.can_close) {
+                    this.pos += 1;
+                    current = second;
+                    state = res.can_open ? 5 : 7;
+                    continue;
+                 }
+                break;
+            case 2: // **a
+                if (res.numdelims === 2 && res.can_close) {
+                    this.pos += 2;
+                    return [{t: 'Strong', c: first}];
+                } else if (res.numdelims === 1 && res.can_open) {
+                    this.pos += 1;
+                    current = second;
+                    state = 8;
+                    continue;
                 }
-                if (numclosedelims === 3 && delims_to_match === 3) {
+                break;
+            case 3: // *a
+                if (res.numdelims === 1 && res.can_close) {
+                    this.pos += 1;
+                    return [{t: 'Emph', c: first}];
+                } else if (res.numdelims === 2 && res.can_open) {
+                    this.pos += 2;
+                    current = second;
+                    state = 9;
+                    continue;
+                }
+                break;
+            case 4: // ***a**b
+                if (res.numdelims === 3 && res.can_close) {
                     this.pos += 3;
-                    this.last_emphasis_closer = null;
-                    return {t: 'Strong', c: [{t: 'Emph', c: inlines}]};
-                } else if (numclosedelims >= 2 && delims_to_match >= 2) {
-                    delims_to_match -= 2;
+                    return [{t: 'Strong',
+                             c: [{t: 'Emph',
+                                  c: first.concat(
+                                      [{t: 'Str', c: c+c}],
+                                       second)}]}];
+                } else if (res.numdelims === 2 && res.can_close) {
                     this.pos += 2;
-                    inlines = [{t: 'Strong', c: inlines}];
-                } else if (numclosedelims >= 1 && delims_to_match >= 1) {
-                    delims_to_match -= 1;
+                    return [{t: 'Strong',
+                             c: [{t: 'Str', c: c+c+c}].concat(
+                                 first,
+                                 [{t: 'Strong', c: second}])}];
+                } else if (res.numdelims === 1 && res.can_close) {
                     this.pos += 1;
-                    inlines = [{t: 'Emph', c: inlines}];
+                    return [{t: 'Emph',
+                             c: [{t: 'Strong', c: first}].concat(second)}];
                 }
-                if (delims_to_match === 0) {
-                    this.last_emphasis_closer = null;
-                    return inlines[0];
+                break;
+            case 5: // ***a*b
+                if (res.numdelims === 3 && res.can_close) {
+                    this.pos += 3;
+                    return [{t: 'Strong',
+                             c: [{t: 'Emph',
+                                  c: first.concat(
+                                      [{t: 'Str', c: c}],
+                                       second)}]}];
+                } else if (res.numdelims === 2 && res.can_close) {
+                    this.pos += 2;
+                    return [{t: 'Strong',
+                             c: [{t: 'Emph', c: first}].concat(second)}];
+                } else if (res.numdelims === 1 && res.can_close) {
+                    this.pos += 1;
+                    return [{t: 'Strong',
+                             c: [{t: 'Str', c: c+c+c}].concat(
+                                 first,
+                                 [{t: 'Emph', c: second}])}];
                 }
-            } else if ((next_inline = this.parseInline())) {
-                inlines.push(next_inline);
+                 break;
+            case 6: // ***a** b
+                if (res.numdelims === 3 && res.can_close) {
+                    this.pos += 3;
+                    return [{t: 'Strong',
+                             c: [{t: 'Emph',
+                                  c: first.concat(
+                                      [{t: 'Str', c: c+c}],
+                                       second)}]}];
+                } else if (res.numdelims === 1 && res.can_close) {
+                    this.pos += 1;
+                    return [{t: 'Emph',
+                             c: [{t: 'Strong', c: first}].concat(second)}];
+                }
+                break;
+            case 7: // ***a* b
+                if (res.numdelims === 3 && res.can_close) {
+                    this.pos += 3;
+                    return [{t: 'Strong',
+                             c: [{t: 'Emph',
+                                  c: first.concat(
+                                      [{t: 'Str', c: c}],
+                                       second)}]}];
+                } else if (res.numdelims === 2 && res.can_close) {
+                    this.pos += 2;
+                    return [{t: 'Strong',
+                             c: [{t: 'Emph', c: first}].concat(second)}];
+                }
+                break;
+            case 8: // **a *b
+                if (res.numdelims === 3 && res.can_close) {
+                    this.pos += 3;
+                    return [{t: 'Strong',
+                             c: first.concat([{t: 'Emph',
+                                               c: second}])}];
+                } else if (res.numdelims === 2 && res.can_close) {
+                    this.pos += 2;
+                    return [{t: 'Strong',
+                             c: first.concat(
+                                 [{t: 'Str', c: c}],
+                                 second)}];
+                } else if (res.numdelims === 1 && res.can_close) {
+                    this.pos += 1;
+                    return [{t: 'Str', c: c+c}].concat(
+                        first,
+                        [{t: 'Emph', c: second}]);
+                }
+                break;
+            case 9: // *a **b
+                if (res.numdelims === 3 && res.can_close) {
+                    this.pos += 3;
+                    return [{t: 'Emph',
+                             c: first.concat([{t: 'Strong',
+                                               c: second}])}];
+                } else if (res.numdelims === 2 && res.can_close) {
+                    this.pos += 2;
+                    return [{t: 'Str', c: c}].concat(
+                        first,
+                        [{t: 'Strong', c: second}]);
+                } else if (res.numdelims === 1 && res.can_close) {
+                    this.pos += 1;
+                    return [{t: 'Emph',
+                             c: first.concat(
+                                 [{t: 'Str', c: c+c}],
+                                 second)}];
+                }
+                break;
+            default:
+                break;
+            }
+
+            if ((next_inline = this.parseInline())) {
+                Array.prototype.push.apply(current, next_inline);
             } else {
                 break;
             }
+
         }
 
-        // didn't find closing delimiter
-        this.pos = startpos + numdelims;
-        if (last_emphasis_closer === null) {
-            // we know there are no closers after startpos, so:
-            this.last_emphasis_closer = startpos;
-        } else {
-            this.last_emphasis_closer = last_emphasis_closer;
+        switch (state) {
+        case 1: // ***a
+            return [{t: 'Str', c: c+c+c}].concat(first);
+        case 2: // **a
+            return [{t: 'Str', c: c+c}].concat(first);
+        case 3: // *a
+            return [{t: 'Str', c: c}].concat(first);
+        case 4: // ***a**b
+        case 6: // ***a** b
+            return [{t: 'Str', c: c+c+c}]
+                .concat(first,
+                        [{t: 'Str', c: c+c}],
+                        second);
+        case 5: // ***a*b
+        case 7: // ***a* b
+            return [{t: 'Str', c: c+c+c}]
+                .concat(first,
+                        [{t: 'Str', c: c}],
+                        second);
+        case 8: // **a *b
+            return [{t: 'Str', c: c+c}]
+                .concat(first,
+                        [{t: 'Str', c: c}],
+                        second);
+        case 9: // *a **b
+            return [{t: 'Str', c: c}]
+                .concat(first,
+                        [{t: 'Str', c: c+c}],
+                        second);
+        default:
+            console.log("Unknown state, parseEmphasis");
+            // shouldn't happen
         }
-        return {t: 'Str', c: this.subject.slice(startpos, startpos + numdelims)};
+
     };
 
     // Attempt to parse link title (sans quotes), returning the string
@@ -461,10 +620,10 @@
                  (title = this.parseLinkTitle() || '') || true) &&
                 this.spnl() &&
                 this.match(/^\)/)) {
-                return { t: 'Link',
-                         destination: dest,
-                         title: title,
-                         label: parseRawLabel(rawlabel) };
+                return [{ t: 'Link',
+                          destination: dest,
+                          title: title,
+                          label: parseRawLabel(rawlabel) }];
             } else {
                 this.pos = startpos;
                 return null;
@@ -488,10 +647,10 @@
         // lookup rawlabel in refmap
         var link = this.refmap[normalizeReference(reflabel)];
         if (link) {
-            return {t: 'Link',
-                    destination: link.destination,
-                    title: link.title,
-                    label: parseRawLabel(rawlabel) };
+            return [{t: 'Link',
+                     destination: link.destination,
+                     title: link.title,
+                     label: parseRawLabel(rawlabel) }];
         } else {
             this.pos = startpos;
             return null;
@@ -505,7 +664,7 @@
     var parseEntity = function() {
         var m;
         if ((m = this.match(/^&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});/i))) {
-            return { t: 'Entity', c: m };
+            return [{ t: 'Entity', c: m }];
         } else {
             return  null;
         }
@@ -516,7 +675,7 @@
     var parseString = function() {
         var m;
         if ((m = this.match(reMain))) {
-            return { t: 'Str', c: m };
+            return [{ t: 'Str', c: m }];
         } else {
             return null;
         }
@@ -528,9 +687,9 @@
         var m = this.match(/^ *\n/);
         if (m) {
             if (m.length > 2) {
-                return { t: 'Hardbreak' };
+                return [{ t: 'Hardbreak' }];
             } else if (m.length > 0) {
-                return { t: 'Softbreak' };
+                return [{ t: 'Softbreak' }];
             }
         }
         return null;
@@ -542,10 +701,10 @@
         if (this.match(/^!/)) {
             var link = this.parseLink();
             if (link) {
-                link.t = 'Image';
+                link[0].t = 'Image';
                 return link;
             } else {
-                return { t: 'Str', c: '!' };
+                return [{ t: 'Str', c: '!' }];
             }
         } else {
             return null;
@@ -615,11 +774,13 @@
     // and returning the inline parsed.
     var parseInline = function() {
         var startpos = this.pos;
+        /*
         var memoized = this.memo[startpos];
         if (memoized) {
             this.pos = memoized.endpos;
             return memoized.inline;
         }
+        */
         var c = this.peek();
         if (!c) {
             return null;
@@ -658,12 +819,14 @@
         }
         if (res === null) {
             this.pos += 1;
-            res = {t: 'Str', c: c};
+            res = [{t: 'Str', c: c}];
         }
+        /*
         if (res) {
             this.memo[startpos] = { inline: res,
                                     endpos: this.pos };
         }
+         */
         return res;
     };
 
@@ -672,12 +835,12 @@
         this.subject = s;
         this.pos = 0;
         this.refmap = refmap || {};
-        this.memo = {};
+        // this.memo = {};
         this.last_emphasis_closer = null;
         var inlines = [];
         var next_inline;
         while ((next_inline = this.parseInline())) {
-            inlines.push(next_inline);
+            Array.prototype.push.apply(inlines, next_inline);
         }
         return inlines;
     };
@@ -690,7 +853,7 @@
             last_emphasis_closer: null,  // used by parseEmphasis method
             pos: 0,
             refmap: {},
-            memo: {},
+            // memo: {},
             match: match,
             peek: peek,
             spnl: spnl,