Skip to content

Commit

Permalink
Fix #220 (hash collisions for references).
Browse files Browse the repository at this point in the history
This commit ports Vicent Marti's fix in cmark-gfm.
(384cc9d)

His commit message follows:

As explained on the previous commit, it is trivial to DoS the CMark
parser by generating a document where all the link reference names hash
to the same bucket in the hash table.

This will cause the lookup process for each reference to take linear
time on the amount of references in the document, and with enough link
references to lookup, the end result is a pathological O(N^2) that
causes medium-sized documents to finish parsing in 5+ minutes.

To avoid this issue, we propose the present commit.

Based on the fact that all reference lookup/resolution in a Markdown
document is always performed as a last step during the parse process,
we've reimplemented reference storage as follows:

1. New references are always inserted at the end of a linked list. This
is an O(1) operation, and does not check whether an existing (duplicate)
reference with the same label already exists in the document.

2. Upon the first call to `cmark_reference_lookup` (when it is expected
that no further references will be added to the reference map), the
linked list of references is written into a fixed-size array.

3. The fixed size array can then be efficiently sorted in-place in O(n
log n). This operation only happens once. We perform this sort in a
_stable_ manner to ensure that the earliest link reference in the
document always has preference, as the spec dictates. To accomplish
this, every reference is tagged with a generation number when initially
inserted in the linked list.

4. The sorted array is then compacted in O(n). Since it was sorted in a
stable way, the first reference for each label is preserved and the
duplicates are removed, matching the spec.

5. We can now simply perform a binary search for the current
`cmark_reference_lookup` query in O(log n). Any further lookup calls
will also be O(log n), since the sorted references table only needs to
be generated once.

The resulting implementation is notably simple (as it uses standard
library builtins `qsort` and `bsearch`), whilst performing better than
the fixed size hash table in documents that have a high number of
references and never becoming pathological regardless of the input.
  • Loading branch information
jgm committed Feb 16, 2020
1 parent 04936d6 commit b2378e4
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 53 deletions.
108 changes: 59 additions & 49 deletions src/references.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,6 @@
#include "inlines.h"
#include "chunk.h"

static unsigned int refhash(const unsigned char *link_ref) {
unsigned int hash = 0;

while (*link_ref)
hash = (*link_ref++) + (hash << 6) + (hash << 16) - hash;

return hash;
}

static void reference_free(cmark_reference_map *map, cmark_reference *ref) {
cmark_mem *mem = map->mem;
if (ref != NULL) {
Expand Down Expand Up @@ -53,21 +44,6 @@ static unsigned char *normalize_reference(cmark_mem *mem, cmark_chunk *ref) {
return result;
}

static void add_reference(cmark_reference_map *map, cmark_reference *ref) {
cmark_reference *t = ref->next = map->table[ref->hash % REFMAP_SIZE];

while (t) {
if (t->hash == ref->hash && !strcmp((char *)t->label, (char *)ref->label)) {
reference_free(map, ref);
return;
}

t = t->next;
}

map->table[ref->hash % REFMAP_SIZE] = ref;
}

void cmark_reference_create(cmark_reference_map *map, cmark_chunk *label,
cmark_chunk *url, cmark_chunk *title) {
cmark_reference *ref;
Expand All @@ -77,64 +53,98 @@ void cmark_reference_create(cmark_reference_map *map, cmark_chunk *label,
if (reflabel == NULL)
return;

assert(map->sorted == NULL);

ref = (cmark_reference *)map->mem->calloc(1, sizeof(*ref));
ref->label = reflabel;
ref->hash = refhash(ref->label);
ref->url = cmark_clean_url(map->mem, url);
ref->title = cmark_clean_title(map->mem, title);
ref->next = NULL;
ref->age = map->size;
ref->next = map->refs;

map->refs = ref;
map->size++;
}

static int
labelcmp(const unsigned char *a, const unsigned char *b) {
return strcmp((const char *)a, (const char *)b);
}

add_reference(map, ref);
static int
refcmp(const void *p1, const void *p2) {
cmark_reference *r1 = *(cmark_reference **)p1;
cmark_reference *r2 = *(cmark_reference **)p2;
int res = labelcmp(r1->label, r2->label);
return res ? res : ((int)r1->age - (int)r2->age);
}

static int
refsearch(const void *label, const void *p2) {
cmark_reference *ref = *(cmark_reference **)p2;
return labelcmp(label, ref->label);
}

static void sort_references(cmark_reference_map *map) {
unsigned int i = 0, last = 0, size = map->size;
cmark_reference *r = map->refs, **sorted = NULL;

sorted = map->mem->calloc(size, sizeof(cmark_reference *));
while (r) {
sorted[i++] = r;
r = r->next;
}

qsort(sorted, size, sizeof(cmark_reference *), refcmp);

for (i = 1; i < size; i++) {
if (labelcmp(sorted[i]->label, sorted[last]->label) != 0)
sorted[++last] = sorted[i];
}
map->sorted = sorted;
map->size = last + 1;
}

// Returns reference if refmap contains a reference with matching
// label, otherwise NULL.
cmark_reference *cmark_reference_lookup(cmark_reference_map *map,
cmark_chunk *label) {
cmark_reference *ref = NULL;
cmark_reference **ref = NULL;
unsigned char *norm;
unsigned int hash;

if (label->len < 1 || label->len > MAX_LINK_LABEL_LENGTH)
return NULL;

if (map == NULL)
if (map == NULL || !map->size)
return NULL;

norm = normalize_reference(map->mem, label);
if (norm == NULL)
return NULL;

hash = refhash(norm);
ref = map->table[hash % REFMAP_SIZE];

while (ref) {
if (ref->hash == hash && !strcmp((char *)ref->label, (char *)norm))
break;
ref = ref->next;
}
if (!map->sorted)
sort_references(map);

ref = bsearch(norm, map->sorted, map->size, sizeof(cmark_reference *),
refsearch);
map->mem->free(norm);
return ref;
return ref ? ref[0] : NULL;
}

void cmark_reference_map_free(cmark_reference_map *map) {
unsigned int i;
cmark_reference *ref;

if (map == NULL)
return;

for (i = 0; i < REFMAP_SIZE; ++i) {
cmark_reference *ref = map->table[i];
cmark_reference *next;

while (ref) {
next = ref->next;
reference_free(map, ref);
ref = next;
}
ref = map->refs;
while (ref) {
cmark_reference *next = ref->next;
reference_free(map, ref);
ref = next;
}

map->mem->free(map->sorted);
map->mem->free(map);
}

Expand Down
8 changes: 4 additions & 4 deletions src/references.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,21 @@
extern "C" {
#endif

#define REFMAP_SIZE 16

struct cmark_reference {
struct cmark_reference *next;
unsigned char *label;
unsigned char *url;
unsigned char *title;
unsigned int hash;
unsigned int age;
};

typedef struct cmark_reference cmark_reference;

struct cmark_reference_map {
cmark_mem *mem;
cmark_reference *table[REFMAP_SIZE];
cmark_reference *refs;
cmark_reference **sorted;
unsigned int size;
};

typedef struct cmark_reference_map cmark_reference_map;
Expand Down

0 comments on commit b2378e4

Please sign in to comment.