cmark

My personal build of CMark ✏️

Commit
ffde0612e7e59a4d19e6c2ee0ac062ae4d049f5e
Parent
1ea9cd8ceaec181dd945d82af812a7712f232573
Author
John MacFarlane <jgm@berkeley.edu>
Date

Merge pull request #219 from github/upstream/left-angle-link

Fix URL scanner

Diffstat

1 file changed, 52 insertions, 38 deletions

Status | File Name | N° Changes | Insertions | Deletions
Modified | src/inlines.c | 90 | 52 | 38
diff --git a/src/inlines.c b/src/inlines.c
@@ -695,8 +695,8 @@ static cmark_node *handle_entity(subject *subj) {
   return make_str(subj->mem, cmark_chunk_buf_detach(&ent));
 }
 
-// Clean a URL: remove surrounding whitespace and surrounding <>,
-// and remove \ that escape punctuation.
+// Clean a URL: remove surrounding whitespace, and remove \ that escape
+// punctuation.
 cmark_chunk cmark_clean_url(cmark_mem *mem, cmark_chunk *url) {
   cmark_strbuf buf = CMARK_BUF_INIT(mem);
 
@@ -707,11 +707,7 @@ cmark_chunk cmark_clean_url(cmark_mem *mem, cmark_chunk *url) {
     return result;
   }
 
-  if (url->data[0] == '<' && url->data[url->len - 1] == '>') {
-    houdini_unescape_html_f(&buf, url->data + 1, url->len - 2);
-  } else {
-    houdini_unescape_html_f(&buf, url->data, url->len);
-  }
+  houdini_unescape_html_f(&buf, url->data, url->len);
 
   cmark_strbuf_unescape(&buf);
   return cmark_chunk_buf_detach(&buf);
@@ -824,10 +820,43 @@ noMatch:
   subj->pos = startpos; // rewind
   return 0;
 }
-static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset) {
+
+static bufsize_t manual_scan_link_url_2(cmark_chunk *input, bufsize_t offset, cmark_chunk *output) {
   bufsize_t i = offset;
   size_t nb_p = 0;
 
+  while (i < input->len) {
+    if (input->data[i] == '\\' &&
+        i + 1 < input-> len &&
+        cmark_ispunct(input->data[i+1]))
+      i += 2;
+    else if (input->data[i] == '(') {
+      ++nb_p;
+      ++i;
+    } else if (input->data[i] == ')') {
+      if (nb_p == 0)
+        break;
+      --nb_p;
+      ++i;
+    } else if (cmark_isspace(input->data[i]))
+      break;
+    else
+      ++i;
+  }
+
+  if (i >= input->len)
+    return -1;
+
+  {
+    cmark_chunk result = {input->data + offset, i - offset, 0};
+    *output = result;
+  }
+  return i - offset;
+}
+
+static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset, cmark_chunk *output) {
+  bufsize_t i = offset;
+
   if (i < input->len && input->data[i] == '<') {
     ++i;
     while (i < input->len) {
@@ -836,42 +865,30 @@ static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset) {
         break;
       } else if (input->data[i] == '\\')
         i += 2;
-      else if (cmark_isspace(input->data[i]))
-        return -1;
+      else if (cmark_isspace(input->data[i]) || input->data[i] == '<')
+        return manual_scan_link_url_2(input, offset, output);
       else
         ++i;
     }
   } else {
-    while (i < input->len) {
-      if (input->data[i] == '\\' &&
-	  i + 1 < input-> len &&
-          cmark_ispunct(input->data[i+1]))
-        i += 2;
-      else if (input->data[i] == '(') {
-        ++nb_p;
-        ++i;
-      } else if (input->data[i] == ')') {
-        if (nb_p == 0)
-          break;
-        --nb_p;
-        ++i;
-      } else if (cmark_isspace(input->data[i]))
-        break;
-      else
-        ++i;
-    }
+    return manual_scan_link_url_2(input, offset, output);
   }
 
   if (i >= input->len)
     return -1;
+
+  {
+    cmark_chunk result = {input->data + offset + 1, i - 2 - offset, 0};
+    *output = result;
+  }
   return i - offset;
 }
+
 // Return a link, an image, or a literal close bracket.
 static cmark_node *handle_close_bracket(subject *subj) {
   bufsize_t initial_pos, after_link_text_pos;
-  bufsize_t starturl, endurl, starttitle, endtitle, endall;
-  bufsize_t n;
-  bufsize_t sps;
+  bufsize_t endurl, starttitle, endtitle, endall;
+  bufsize_t sps, n;
   cmark_reference *ref = NULL;
   cmark_chunk url_chunk, title_chunk;
   cmark_chunk url, title;
@@ -907,11 +924,10 @@ static cmark_node *handle_close_bracket(subject *subj) {
   // First, look for an inline link.
   if (peek_char(subj) == '(' &&
       ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) &&
-      ((n = manual_scan_link_url(&subj->input, subj->pos + 1 + sps)) > -1)) {
+      ((n = manual_scan_link_url(&subj->input, subj->pos + 1 + sps, &url_chunk)) > -1)) {
 
     // try to parse an explicit link:
-    starturl = subj->pos + 1 + sps; // after (
-    endurl = starturl + n;
+    endurl = subj->pos + 1 + sps + n;
     starttitle = endurl + scan_spacechars(&subj->input, endurl);
 
     // ensure there are spaces btw url and title
@@ -924,7 +940,6 @@ static cmark_node *handle_close_bracket(subject *subj) {
     if (peek_at(subj, endall) == ')') {
       subj->pos = endall + 1;
 
-      url_chunk = cmark_chunk_dup(&subj->input, starturl, endurl - starturl);
       title_chunk =
           cmark_chunk_dup(&subj->input, starttitle, endtitle - starttitle);
       url = cmark_clean_url(subj->mem, &url_chunk);
@@ -1213,9 +1228,8 @@ bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input,
 
   // parse link url:
   spnl(&subj);
-  matchlen = manual_scan_link_url(&subj.input, subj.pos);
-  if (matchlen > 0) {
-    url = cmark_chunk_dup(&subj.input, subj.pos, matchlen);
+  if ((matchlen = manual_scan_link_url(&subj.input, subj.pos, &url)) > -1 &&
+      url.len > 0) {
     subj.pos += matchlen;
   } else {
     return 0;