cmark

My personal build of CMark ✏️

Commit: b2107ae4f05d7fe4388d37025105005592d32ef7
Parent: 258a88d6c903234831281d4495eff7382932b617
Author: Vicent Marti <tanoku@gmail.com>
Date: Mon, 23 May 2016 16:43:13 +0200

buffer: proper safety checks for unbounded memory

The previous work for unbounded memory usage and overflows on the buffer API had several shortcomings:

1. The total size of the buffer was limited by arbitrarily small precision on the storage type for buffer indexes (typedef'd as `bufsize_t`). This is not a good design pattern in secure applications, particualarly since it requires the addition of helper functions to cast to/from the native `size` types and the custom type for the buffer, and check for overflows.

2. The library was calling `abort` on overflow and memory allocation failures. This is not a good practice for production libraries, since it turns a potential RCE into a trivial, guaranteed DoS to the whole application that is linked against the library. It defeats the whole point of performing overflow or allocation checks when the checks will crash the library and the enclosing program anyway.

3. The default size limits for buffers were essentially unbounded (capped to the precision of the storage type) and could lead to DoS attacks by simple memory exhaustion (particularly critical in 32-bit platforms). This is not a good practice for a library that handles arbitrary user input.

Hence, this patchset provides slight (but in my opinion critical) improvements on this area, copying some of the patterns we've used in the past for high throughput, security sensitive Markdown parsers:

1. The storage type for buffer sizes is now platform native (`ssize_t`). Ideally, this would be a `size_t`, but several parts of the code expect buffer indexes to be possibly negative. Either way, switching to a `size` type is an strict improvement, particularly in 64-bit platforms. All the helpers that assured that values cannot escape the `size` range have been removed, since they are superfluous.

2. The overflow checks have been removed. Instead, the maximum size for a buffer has been set to a safe value for production usage (32mb) that can be proven not to overflow in practice. Users that need to parse particularly large Markdown documents can increase this value. A static, compile-time check has been added to ensure that the maximum buffer size cannot overflow on any growth operations.

3. The library no longer aborts on buffer overflow. The CMark library now follows the convention of other Markdown implementations (such as Hoedown and Sundown) and silently handles buffer overflows and allocation failures by dropping data from the buffer. The result is that pathological Markdown documents that try to exploit the library will instead generate truncated (but valid, and safe) outputs.

All tests after these small refactorings have been verified to pass.

---

NOTE: Regarding 32 bit overflows, generating test cases that crash the library is trivial (any input document larger than 2gb will crash CMark), but most Python implementations have issues with large strings to begin with, so a test case cannot be added to the pathological tests suite, since it's written in Python.

Diffstat

7 files changed, 71 insertions, 89 deletions

Status	File Name	N° Changes	Insertions	Deletions
Modified	src/blocks.c	2	1	1
Modified	src/buffer.c	93	37	56
Modified	src/buffer.h	37	20	17
Modified	src/chunk.h	4	2	2
Modified	src/commonmark.c	17	8	9
Modified	src/latex.c	5	2	3
Modified	src/render.c	2	1	1

diff --git a/src/blocks.c b/src/blocks.c
@@ -528,7 +528,7 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer,
       process = true;
     }
 
-    chunk_len = cmark_strbuf_check_bufsize(eol - buffer);
+    chunk_len = (eol - buffer);
     if (process) {
       if (parser->linebuf->size > 0) {
         cmark_strbuf_put(parser->linebuf, buffer, chunk_len);

diff --git a/src/buffer.c b/src/buffer.c
@@ -5,6 +5,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
+#include <limits.h>
 
 #include "config.h"
 #include "cmark_ctype.h"
@@ -28,68 +29,48 @@ void cmark_strbuf_init(cmark_strbuf *buf, bufsize_t initial_size) {
     cmark_strbuf_grow(buf, initial_size);
 }
 
-void cmark_strbuf_overflow_err() {
-  fprintf(stderr, "String buffer overflow");
-  abort();
+static CMARK_INLINE bool S_strbuf_grow_by(cmark_strbuf *buf, bufsize_t add) {
+  return cmark_strbuf_grow(buf, buf->size + add);
 }
 
-static CMARK_INLINE void S_strbuf_grow_by(cmark_strbuf *buf, size_t add) {
-  size_t target_size = (size_t)buf->size + add;
-
-  if (target_size < add            /* Integer overflow. */
-      || target_size > BUFSIZE_MAX /* Truncation overflow. */
-      ) {
-    cmark_strbuf_overflow_err();
-    return; /* unreachable */
-  }
-
-  if ((bufsize_t)target_size >= buf->asize)
-    cmark_strbuf_grow(buf, (bufsize_t)target_size);
-}
+#if BUFSIZE_MAX > (SIZE_MAX / 4)
+#  error "unsafe value for BUFSIZE_MAX"
+#endif
 
-void cmark_strbuf_grow(cmark_strbuf *buf, bufsize_t target_size) {
-  unsigned char *new_ptr;
+bool cmark_strbuf_grow(cmark_strbuf *buf, bufsize_t target_size) {
+  assert(target_size > 0);
 
   if (target_size < buf->asize)
-    return;
-
-  if (buf->asize == 0) {
-    new_ptr = NULL;
-  } else {
-    new_ptr = buf->ptr;
-  }
+    return true;
+
+  /*
+   * Do not allow string buffers to grow past this "safe" value.
+   *
+   * Note that this is a soft cap to prevent unbounded memory growth:
+   * in practice, the buffer can get larger than this value because we
+   * overgrow it by 50%
+   *
+   * Note that there are no overflow checks for the realloc because
+   * the value of BUFSIZE_MAX is always assured to be impossible
+   * to overflow on both 32 and 64 bit systems, since it will never
+   * be larger than 1/4th of our address space.
+   */
+  if (target_size > BUFSIZE_MAX)
+    return false;
 
   /* Oversize the buffer by 50% to guarantee amortized linear time
    * complexity on append operations. */
-  size_t new_size = (size_t)target_size + (size_t)target_size / 2;
-
-  /* Account for terminating null byte. */
+  bufsize_t new_size = target_size + target_size / 2;
   new_size += 1;
-
-  /* round allocation up to multiple of 8 */
   new_size = (new_size + 7) & ~7;
 
-  if (new_size < (size_t)target_size /* Integer overflow. */
-      || new_size > BUFSIZE_MAX      /* Truncation overflow. */
-      ) {
-    if (target_size >= BUFSIZE_MAX) {
-      /* No space for terminating null byte. */
-      cmark_strbuf_overflow_err();
-      return; /* unreachable */
-    }
-    /* Oversize by the maximum possible amount. */
-    new_size = BUFSIZE_MAX;
-  }
+  unsigned char *new_ptr = realloc(buf->asize ? buf->ptr : NULL, new_size);
+  if (!new_ptr)
+    return false;
 
-  new_ptr = (unsigned char *)realloc(new_ptr, new_size);
-
-  if (!new_ptr) {
-    perror("realloc in cmark_strbuf_grow");
-    abort();
-  }
-
-  buf->asize = (bufsize_t)new_size;
+  buf->asize = new_size;
   buf->ptr = new_ptr;
+  return true;
 }
 
 bufsize_t cmark_strbuf_len(const cmark_strbuf *buf) { return buf->size; }
@@ -117,8 +98,8 @@ void cmark_strbuf_set(cmark_strbuf *buf, const unsigned char *data,
     cmark_strbuf_clear(buf);
   } else {
     if (data != buf->ptr) {
-      if (len >= buf->asize)
-        cmark_strbuf_grow(buf, len);
+      if (len >= buf->asize && !cmark_strbuf_grow(buf, len))
+        return;
       memmove(buf->ptr, data, len);
     }
     buf->size = len;
@@ -128,21 +109,21 @@ void cmark_strbuf_set(cmark_strbuf *buf, const unsigned char *data,
 
 void cmark_strbuf_sets(cmark_strbuf *buf, const char *string) {
   cmark_strbuf_set(buf, (const unsigned char *)string,
-                   string ? cmark_strbuf_safe_strlen(string) : 0);
+                   string ? strlen(string) : 0);
 }
 
 void cmark_strbuf_putc(cmark_strbuf *buf, int c) {
-  S_strbuf_grow_by(buf, 1);
+  if (!S_strbuf_grow_by(buf, 1))
+    return;
   buf->ptr[buf->size++] = (unsigned char)(c & 0xFF);
   buf->ptr[buf->size] = '\0';
 }
 
 void cmark_strbuf_put(cmark_strbuf *buf, const unsigned char *data,
                       bufsize_t len) {
-  if (len <= 0)
+  if (len <= 0 || !S_strbuf_grow_by(buf, len))
     return;
 
-  S_strbuf_grow_by(buf, len);
   memmove(buf->ptr + buf->size, data, len);
   buf->size += len;
   buf->ptr[buf->size] = '\0';
@@ -150,7 +131,7 @@ void cmark_strbuf_put(cmark_strbuf *buf, const unsigned char *data,
 
 void cmark_strbuf_puts(cmark_strbuf *buf, const char *string) {
   cmark_strbuf_put(buf, (const unsigned char *)string,
-                   cmark_strbuf_safe_strlen(string));
+                   strlen(string));
 }
 
 void cmark_strbuf_copy_cstr(char *data, bufsize_t datasize,

diff --git a/src/buffer.h b/src/buffer.h
@@ -5,13 +5,15 @@
 #include <stdarg.h>
 #include <string.h>
 #include <limits.h>
+#include <unistd.h>
+#include <stdbool.h>
 #include "config.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-typedef int bufsize_t;
+typedef ssize_t bufsize_t;
 
 typedef struct {
   unsigned char *ptr;
@@ -22,7 +24,22 @@ extern unsigned char cmark_strbuf__initbuf[];
 
 #define GH_BUF_INIT                                                            \
   { cmark_strbuf__initbuf, 0, 0 }
-#define BUFSIZE_MAX INT_MAX
+
+/*
+ * Maximum size for memory storage on any given `cmark_strbuf` object.
+ *
+ * This is a "safe" value to prevent unbounded memory growth when
+ * parsing arbitrarily large (and potentially malicious) documents.
+ *
+ * It is currently set to 32mb, which is a reasonable default for
+ * production applications. If you need to parse documents larger than
+ * that, you can increase this value up to `SSIZE_MAX / 2` (which in
+ * practice resolves to 1/4th of the total address space for the program).
+ *
+ * Anything larger than that is a security threat and hence static checks
+ * will prevent CMark from compiling.
+ */
+#define BUFSIZE_MAX (32 * 1024 * 1024)
 
 /**
  * Initialize a cmark_strbuf structure.
@@ -35,7 +52,7 @@ void cmark_strbuf_init(cmark_strbuf *buf, bufsize_t initial_size);
 /**
  * Grow the buffer to hold at least `target_size` bytes.
  */
-void cmark_strbuf_grow(cmark_strbuf *buf, bufsize_t target_size);
+bool cmark_strbuf_grow(cmark_strbuf *buf, bufsize_t target_size);
 
 void cmark_strbuf_free(cmark_strbuf *buf);
 void cmark_strbuf_swap(cmark_strbuf *buf_a, cmark_strbuf *buf_b);
@@ -72,20 +89,6 @@ void cmark_strbuf_trim(cmark_strbuf *buf);
 void cmark_strbuf_normalize_whitespace(cmark_strbuf *s);
 void cmark_strbuf_unescape(cmark_strbuf *s);
 
-/* Print error and abort. */
-void cmark_strbuf_overflow_err(void);
-
-static CMARK_INLINE bufsize_t cmark_strbuf_check_bufsize(size_t size) {
-  if (size > BUFSIZE_MAX) {
-    cmark_strbuf_overflow_err();
-  }
-  return (bufsize_t)size;
-}
-
-static CMARK_INLINE bufsize_t cmark_strbuf_safe_strlen(const char *str) {
-  return cmark_strbuf_check_bufsize(strlen(str));
-}
-
 #ifdef __cplusplus
 }
 #endif

diff --git a/src/chunk.h b/src/chunk.h
@@ -83,7 +83,7 @@ static CMARK_INLINE void cmark_chunk_set_cstr(cmark_chunk *c, const char *str) {
     c->data = NULL;
     c->alloc = 0;
   } else {
-    c->len = cmark_strbuf_safe_strlen(str);
+    c->len = strlen(str);
     c->data = (unsigned char *)malloc(c->len + 1);
     c->alloc = 1;
     memcpy(c->data, str, c->len + 1);
@@ -91,7 +91,7 @@ static CMARK_INLINE void cmark_chunk_set_cstr(cmark_chunk *c, const char *str) {
 }
 
 static CMARK_INLINE cmark_chunk cmark_chunk_literal(const char *data) {
-  bufsize_t len = data ? cmark_strbuf_safe_strlen(data) : 0;
+  bufsize_t len = data ? strlen(data) : 0;
   cmark_chunk c = {(unsigned char *)data, len, 0};
   return c;
 }

diff --git a/src/commonmark.c b/src/commonmark.c
@@ -11,7 +11,6 @@
 #include "scanners.h"
 #include "render.h"
 
-#define safe_strlen(s) cmark_strbuf_safe_strlen(s)
 #define OUT(s, wrap, escaping) renderer->out(renderer, s, wrap, escaping)
 #define LIT(s) renderer->out(renderer, s, false, LITERAL)
 #define CR() renderer->cr(renderer)
@@ -67,7 +66,7 @@ static int longest_backtick_sequence(const char *code) {
   int longest = 0;
   int current = 0;
   size_t i = 0;
-  size_t code_len = safe_strlen(code);
+  size_t code_len = strlen(code);
   while (i <= code_len) {
     if (code[i] == '`') {
       current++;
@@ -86,7 +85,7 @@ static int shortest_unused_backtick_sequence(const char *code) {
   int32_t used = 1;
   int current = 0;
   size_t i = 0;
-  size_t code_len = safe_strlen(code);
+  size_t code_len = strlen(code);
   while (i <= code_len) {
     if (code[i] == '`') {
       current++;
@@ -233,7 +232,7 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node,
       snprintf(listmarker, LISTMARKER_SIZE, "%d%s%s", list_number,
                list_delim == CMARK_PAREN_DELIM ? ")" : ".",
                list_number < 10 ? "  " : " ");
-      marker_width = safe_strlen(listmarker);
+      marker_width = strlen(listmarker);
     }
     if (entering) {
       if (cmark_node_get_list_type(node->parent) == CMARK_BULLET_LIST) {
@@ -275,9 +274,9 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node,
       BLANKLINE();
     }
     info = cmark_node_get_fence_info(node);
-    info_len = safe_strlen(info);
+    info_len = strlen(info);
     code = cmark_node_get_literal(node);
-    code_len = safe_strlen(code);
+    code_len = strlen(code);
     // use indented form if no info, and code doesn't
     // begin or end with a blank line, and code isn't
     // first thing in a list item
@@ -361,7 +360,7 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node,
 
   case CMARK_NODE_CODE:
     code = cmark_node_get_literal(node);
-    code_len = safe_strlen(code);
+    code_len = strlen(code);
     numticks = shortest_unused_backtick_sequence(code);
     for (i = 0; i < numticks; i++) {
       LIT("`");
@@ -431,7 +430,7 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node,
         LIT("](");
         OUT(cmark_node_get_url(node), false, URL);
         title = cmark_node_get_title(node);
-        if (safe_strlen(title) > 0) {
+        if (strlen(title) > 0) {
           LIT(" \"");
           OUT(title, false, TITLE);
           LIT("\"");
@@ -448,7 +447,7 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node,
       LIT("](");
       OUT(cmark_node_get_url(node), false, URL);
       title = cmark_node_get_title(node);
-      if (safe_strlen(title) > 0) {
+      if (strlen(title) > 0) {
         OUT(" \"", allow_wrap, LITERAL);
         OUT(title, false, TITLE);
         LIT("\"");

diff --git a/src/latex.c b/src/latex.c
@@ -11,7 +11,6 @@
 #include "scanners.h"
 #include "render.h"
 
-#define safe_strlen(s) cmark_strbuf_safe_strlen(s)
 #define OUT(s, wrap, escaping) renderer->out(renderer, s, wrap, escaping)
 #define LIT(s) renderer->out(renderer, s, false, LITERAL)
 #define CR() renderer->cr(renderer)
@@ -168,13 +167,13 @@ static link_type get_link_type(cmark_node *node) {
     return INTERNAL_LINK;
   }
 
-  url_len = (int)safe_strlen(url);
+  url_len = strlen(url);
   if (url_len == 0 || scan_scheme(&url_chunk, 0) == 0) {
     return NO_LINK;
   }
 
   const char *title = cmark_node_get_title(node);
-  title_len = safe_strlen(title);
+  title_len = strlen(title);
   // if it has a title, we can't treat it as an autolink:
   if (title_len == 0) {

diff --git a/src/render.c b/src/render.c
@@ -19,7 +19,7 @@ static CMARK_INLINE void S_blankline(cmark_renderer *renderer) {
 
 static void S_out(cmark_renderer *renderer, const char *source, bool wrap,
                   cmark_escaping escape) {
-  int length = cmark_strbuf_safe_strlen(source);
+  int length = strlen(source);
   unsigned char nextc;
   int32_t c;
   int i = 0;