1/**
2 * Block parsing implementation.
3 *
4 * For a high-level overview of the block parsing process,
5 * see http://spec.commonmark.org/0.24/#phase-1-block-structure
6 */
7
8#include <stdlib.h>
9#include <assert.h>
10#include <stdio.h>
11#include <limits.h>
12
13#include "cmark_ctype.h"
14#include "config.h"
15#include "parser.h"
16#include "cmark.h"
17#include "node.h"
18#include "references.h"
19#include "utf8.h"
20#include "scanners.h"
21#include "inlines.h"
22#include "houdini.h"
23#include "buffer.h"
24#include "chunk.h"
25
// Columns of indentation at which a line counts as indented code.
#define CODE_INDENT 4
// Width of a tab stop, in columns.
#define TAB_STOP 4

// Smaller of two values. Every argument and the whole expansion are
// parenthesized so that compound expressions (e.g. `a & b`) expand as
// intended. An argument may still be evaluated twice, so do not pass
// expressions with side effects.
#ifndef MIN
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#endif

// Byte at index n of the chunk pointed to by i. No bounds check is done.
#define peek_at(i, n) ((i)->data[(n)])
34
35static bool S_last_line_blank(const cmark_node *node) {
36 return (node->flags & CMARK_NODE__LAST_LINE_BLANK) != 0;
37}
38
39static bool S_last_line_checked(const cmark_node *node) {
40 return (node->flags & CMARK_NODE__LAST_LINE_CHECKED) != 0;
41}
42
43static CMARK_INLINE cmark_node_type S_type(const cmark_node *node) {
44 return (cmark_node_type)node->type;
45}
46
47static void S_set_last_line_blank(cmark_node *node, bool is_blank) {
48 if (is_blank)
49 node->flags |= CMARK_NODE__LAST_LINE_BLANK;
50 else
51 node->flags &= ~CMARK_NODE__LAST_LINE_BLANK;
52}
53
54static void S_set_last_line_checked(cmark_node *node) {
55 node->flags |= CMARK_NODE__LAST_LINE_CHECKED;
56}
57
58static CMARK_INLINE bool S_is_line_end_char(char c) {
59 return (c == '\n' || c == '\r');
60}
61
62static CMARK_INLINE bool S_is_space_or_tab(char c) {
63 return (c == ' ' || c == '\t');
64}
65
66static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer,
67 size_t len, bool eof);
68
69static void S_process_line(cmark_parser *parser, const unsigned char *buffer,
70 bufsize_t bytes);
71
72static cmark_node *make_block(cmark_mem *mem, cmark_node_type tag,
73 int start_line, int start_column) {
74 cmark_node *e;
75
76 e = (cmark_node *)mem->calloc(1, sizeof(*e));
77 e->mem = mem;
78 e->type = (uint16_t)tag;
79 e->flags = CMARK_NODE__OPEN;
80 e->start_line = start_line;
81 e->start_column = start_column;
82 e->end_line = start_line;
83
84 return e;
85}
86
87// Create a root document node.
88static cmark_node *make_document(cmark_mem *mem) {
89 cmark_node *e = make_block(mem, CMARK_NODE_DOCUMENT, 1, 1);
90 return e;
91}
92
93cmark_parser *cmark_parser_new_with_mem(int options, cmark_mem *mem) {
94 cmark_parser *parser = (cmark_parser *)mem->calloc(1, sizeof(cmark_parser));
95 parser->mem = mem;
96
97 cmark_node *document = make_document(mem);
98
99 cmark_strbuf_init(mem, &parser->curline, 256);
100 cmark_strbuf_init(mem, &parser->linebuf, 0);
101 cmark_strbuf_init(mem, &parser->content, 0);
102
103 parser->refmap = cmark_reference_map_new(mem);
104 parser->root = document;
105 parser->current = document;
106 parser->line_number = 0;
107 parser->offset = 0;
108 parser->column = 0;
109 parser->first_nonspace = 0;
110 parser->first_nonspace_column = 0;
111 parser->thematic_break_kill_pos = 0;
112 parser->indent = 0;
113 parser->blank = false;
114 parser->partially_consumed_tab = false;
115 parser->last_line_length = 0;
116 parser->options = options;
117 parser->last_buffer_ended_with_cr = false;
118
119 return parser;
120}
121
122cmark_parser *cmark_parser_new(int options) {
123 extern cmark_mem DEFAULT_MEM_ALLOCATOR;
124 return cmark_parser_new_with_mem(options, &DEFAULT_MEM_ALLOCATOR);
125}
126
127void cmark_parser_free(cmark_parser *parser) {
128 cmark_mem *mem = parser->mem;
129 cmark_strbuf_free(&parser->curline);
130 cmark_strbuf_free(&parser->linebuf);
131 cmark_reference_map_free(parser->refmap);
132 mem->free(parser);
133}
134
135static cmark_node *finalize(cmark_parser *parser, cmark_node *b);
136
137// Returns true if line has only space characters, else false.
138static bool is_blank(cmark_strbuf *s, bufsize_t offset) {
139 while (offset < s->size) {
140 switch (s->ptr[offset]) {
141 case '\r':
142 case '\n':
143 return true;
144 case ' ':
145 offset++;
146 break;
147 case '\t':
148 offset++;
149 break;
150 default:
151 return false;
152 }
153 }
154
155 return true;
156}
157
158static CMARK_INLINE bool can_contain(cmark_node_type parent_type,
159 cmark_node_type child_type) {
160 return (parent_type == CMARK_NODE_DOCUMENT ||
161 parent_type == CMARK_NODE_BLOCK_QUOTE ||
162 parent_type == CMARK_NODE_ITEM ||
163 (parent_type == CMARK_NODE_LIST && child_type == CMARK_NODE_ITEM));
164}
165
166static CMARK_INLINE bool accepts_lines(cmark_node_type block_type) {
167 return (block_type == CMARK_NODE_PARAGRAPH ||
168 block_type == CMARK_NODE_HEADING ||
169 block_type == CMARK_NODE_CODE_BLOCK);
170}
171
172static CMARK_INLINE bool contains_inlines(cmark_node_type block_type) {
173 return (block_type == CMARK_NODE_PARAGRAPH ||
174 block_type == CMARK_NODE_HEADING);
175}
176
177static void add_line(cmark_chunk *ch, cmark_parser *parser) {
178 int chars_to_tab;
179 int i;
180 if (parser->partially_consumed_tab) {
181 parser->offset += 1; // skip over tab
182 // add space characters:
183 chars_to_tab = TAB_STOP - (parser->column % TAB_STOP);
184 for (i = 0; i < chars_to_tab; i++) {
185 cmark_strbuf_putc(&parser->content, ' ');
186 }
187 }
188 cmark_strbuf_put(&parser->content, ch->data + parser->offset,
189 ch->len - parser->offset);
190}
191
192static void remove_trailing_blank_lines(cmark_strbuf *ln) {
193 bufsize_t i;
194 unsigned char c;
195
196 for (i = ln->size - 1; i >= 0; --i) {
197 c = ln->ptr[i];
198
199 if (c != ' ' && c != '\t' && !S_is_line_end_char(c))
200 break;
201 }
202
203 if (i < 0) {
204 cmark_strbuf_clear(ln);
205 return;
206 }
207
208 for (; i < ln->size; ++i) {
209 c = ln->ptr[i];
210
211 if (!S_is_line_end_char(c))
212 continue;
213
214 cmark_strbuf_truncate(ln, i);
215 break;
216 }
217}
218
219// Check to see if a node ends with a blank line, descending
220// if needed into lists and sublists.
221static bool S_ends_with_blank_line(cmark_node *node) {
222 if (S_last_line_checked(node)) {
223 return(S_last_line_blank(node));
224 } else if ((S_type(node) == CMARK_NODE_LIST ||
225 S_type(node) == CMARK_NODE_ITEM) && node->last_child) {
226 S_set_last_line_checked(node);
227 return(S_ends_with_blank_line(node->last_child));
228 } else {
229 S_set_last_line_checked(node);
230 return (S_last_line_blank(node));
231 }
232}
233
234// returns true if content remains after link defs are resolved.
235static bool resolve_reference_link_definitions(cmark_parser *parser) {
236 bufsize_t pos;
237 cmark_strbuf *node_content = &parser->content;
238 cmark_chunk chunk = {node_content->ptr, node_content->size};
239 while (chunk.len && chunk.data[0] == '[' &&
240 (pos = cmark_parse_reference_inline(parser->mem, &chunk,
241 parser->refmap))) {
242
243 chunk.data += pos;
244 chunk.len -= pos;
245 }
246 cmark_strbuf_drop(node_content, (node_content->size - chunk.len));
247 return !is_blank(node_content, 0);
248}
249
250static cmark_node *finalize(cmark_parser *parser, cmark_node *b) {
251 bufsize_t pos;
252 cmark_node *item;
253 cmark_node *subitem;
254 cmark_node *parent;
255 bool has_content;
256
257 parent = b->parent;
258 assert(b->flags &
259 CMARK_NODE__OPEN); // shouldn't call finalize on closed blocks
260 b->flags &= ~CMARK_NODE__OPEN;
261
262 if (parser->curline.size == 0) {
263 // end of input - line number has not been incremented
264 b->end_line = parser->line_number;
265 b->end_column = parser->last_line_length;
266 } else if (S_type(b) == CMARK_NODE_DOCUMENT ||
267 (S_type(b) == CMARK_NODE_CODE_BLOCK && b->as.code.fenced) ||
268 (S_type(b) == CMARK_NODE_HEADING && b->as.heading.setext)) {
269 b->end_line = parser->line_number;
270 b->end_column = parser->curline.size;
271 if (b->end_column && parser->curline.ptr[b->end_column - 1] == '\n')
272 b->end_column -= 1;
273 if (b->end_column && parser->curline.ptr[b->end_column - 1] == '\r')
274 b->end_column -= 1;
275 } else {
276 b->end_line = parser->line_number - 1;
277 b->end_column = parser->last_line_length;
278 }
279
280 cmark_strbuf *node_content = &parser->content;
281
282 switch (S_type(b)) {
283 case CMARK_NODE_PARAGRAPH:
284 {
285 has_content = resolve_reference_link_definitions(parser);
286 if (!has_content) {
287 // remove blank node (former reference def)
288 cmark_node_free(b);
289 } else {
290 b->len = node_content->size;
291 b->data = cmark_strbuf_detach(node_content);
292 }
293 break;
294 }
295
296 case CMARK_NODE_CODE_BLOCK:
297 if (!b->as.code.fenced) { // indented code
298 remove_trailing_blank_lines(node_content);
299 cmark_strbuf_putc(node_content, '\n');
300 } else {
301 // first line of contents becomes info
302 for (pos = 0; pos < node_content->size; ++pos) {
303 if (S_is_line_end_char(node_content->ptr[pos]))
304 break;
305 }
306 assert(pos < node_content->size);
307
308 if (pos == 0) {
309 b->as.code.info = NULL;
310 } else {
311 cmark_strbuf tmp = CMARK_BUF_INIT(parser->mem);
312 houdini_unescape_html_f(&tmp, node_content->ptr, pos);
313 cmark_strbuf_trim(&tmp);
314 cmark_strbuf_unescape(&tmp);
315 b->as.code.info = cmark_strbuf_detach(&tmp);
316 }
317
318 if (node_content->ptr[pos] == '\r')
319 pos += 1;
320 if (node_content->ptr[pos] == '\n')
321 pos += 1;
322 cmark_strbuf_drop(node_content, pos);
323 }
324 b->len = node_content->size;
325 b->data = cmark_strbuf_detach(node_content);
326 break;
327
328 case CMARK_NODE_HEADING:
329 case CMARK_NODE_HTML_BLOCK:
330 b->len = node_content->size;
331 b->data = cmark_strbuf_detach(node_content);
332 break;
333
334 case CMARK_NODE_LIST: // determine tight/loose status
335 b->as.list.tight = true; // tight by default
336 item = b->first_child;
337
338 while (item) {
339 // check for non-final non-empty list item ending with blank line:
340 if (S_last_line_blank(item) && item->next) {
341 b->as.list.tight = false;
342 break;
343 }
344 // recurse into children of list item, to see if there are
345 // spaces between them:
346 subitem = item->first_child;
347 while (subitem) {
348 if ((item->next || subitem->next) &&
349 S_ends_with_blank_line(subitem)) {
350 b->as.list.tight = false;
351 break;
352 }
353 subitem = subitem->next;
354 }
355 if (!(b->as.list.tight)) {
356 break;
357 }
358 item = item->next;
359 }
360
361 break;
362
363 default:
364 break;
365 }
366
367 return parent;
368}
369
370// Add a node as child of another. Return pointer to child.
371static cmark_node *add_child(cmark_parser *parser, cmark_node *parent,
372 cmark_node_type block_type, int start_column) {
373 assert(parent);
374
375 // if 'parent' isn't the kind of node that can accept this child,
376 // then back up til we hit a node that can.
377 while (!can_contain(S_type(parent), block_type)) {
378 parent = finalize(parser, parent);
379 }
380
381 cmark_node *child =
382 make_block(parser->mem, block_type, parser->line_number, start_column);
383 child->parent = parent;
384
385 if (parent->last_child) {
386 parent->last_child->next = child;
387 child->prev = parent->last_child;
388 } else {
389 parent->first_child = child;
390 child->prev = NULL;
391 }
392 parent->last_child = child;
393 return child;
394}
395
396// Walk through node and all children, recursively, parsing
397// string content into inline content where appropriate.
398static void process_inlines(cmark_mem *mem, cmark_node *root,
399 cmark_reference_map *refmap, int options) {
400 cmark_iter *iter = cmark_iter_new(root);
401 cmark_node *cur;
402 cmark_event_type ev_type;
403
404 while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
405 cur = cmark_iter_get_node(iter);
406 if (ev_type == CMARK_EVENT_ENTER) {
407 if (contains_inlines(S_type(cur))) {
408 cmark_parse_inlines(mem, cur, refmap, options);
409 mem->free(cur->data);
410 cur->data = NULL;
411 cur->len = 0;
412 }
413 }
414 }
415
416 cmark_iter_free(iter);
417}
418
419// Attempts to parse a list item marker (bullet or enumerated).
420// On success, returns length of the marker, and populates
421// data with the details. On failure, returns 0.
422static bufsize_t parse_list_marker(cmark_mem *mem, cmark_chunk *input,
423 bufsize_t pos, bool interrupts_paragraph,
424 cmark_list **dataptr) {
425 unsigned char c;
426 bufsize_t startpos;
427 cmark_list *data;
428 bufsize_t i;
429
430 startpos = pos;
431 c = peek_at(input, pos);
432
433 if (c == '*' || c == '-' || c == '+') {
434 pos++;
435 if (!cmark_isspace(peek_at(input, pos))) {
436 return 0;
437 }
438
439 if (interrupts_paragraph) {
440 i = pos;
441 // require non-blank content after list marker:
442 while (S_is_space_or_tab(peek_at(input, i))) {
443 i++;
444 }
445 if (peek_at(input, i) == '\n') {
446 return 0;
447 }
448 }
449
450 data = (cmark_list *)mem->calloc(1, sizeof(*data));
451 data->marker_offset = 0; // will be adjusted later
452 data->list_type = CMARK_BULLET_LIST;
453 data->bullet_char = c;
454 data->start = 0;
455 data->delimiter = CMARK_NO_DELIM;
456 data->tight = false;
457 } else if (cmark_isdigit(c)) {
458 int start = 0;
459 int digits = 0;
460
461 do {
462 start = (10 * start) + (peek_at(input, pos) - '0');
463 pos++;
464 digits++;
465 // We limit to 9 digits to avoid overflow,
466 // assuming max int is 2^31 - 1
467 // This also seems to be the limit for 'start' in some browsers.
468 } while (digits < 9 && cmark_isdigit(peek_at(input, pos)));
469
470 if (interrupts_paragraph && start != 1) {
471 return 0;
472 }
473 c = peek_at(input, pos);
474 if (c == '.' || c == ')') {
475 pos++;
476 if (!cmark_isspace(peek_at(input, pos))) {
477 return 0;
478 }
479 if (interrupts_paragraph) {
480 // require non-blank content after list marker:
481 i = pos;
482 while (S_is_space_or_tab(peek_at(input, i))) {
483 i++;
484 }
485 if (S_is_line_end_char(peek_at(input, i))) {
486 return 0;
487 }
488 }
489
490 data = (cmark_list *)mem->calloc(1, sizeof(*data));
491 data->marker_offset = 0; // will be adjusted later
492 data->list_type = CMARK_ORDERED_LIST;
493 data->bullet_char = 0;
494 data->start = start;
495 data->delimiter = (c == '.' ? CMARK_PERIOD_DELIM : CMARK_PAREN_DELIM);
496 data->tight = false;
497 } else {
498 return 0;
499 }
500 } else {
501 return 0;
502 }
503
504 *dataptr = data;
505 return (pos - startpos);
506}
507
508// Return 1 if list item belongs in list, else 0.
509static int lists_match(cmark_list *list_data, cmark_list *item_data) {
510 return (list_data->list_type == item_data->list_type &&
511 list_data->delimiter == item_data->delimiter &&
512 // list_data->marker_offset == item_data.marker_offset &&
513 list_data->bullet_char == item_data->bullet_char);
514}
515
516static cmark_node *finalize_document(cmark_parser *parser) {
517 while (parser->current != parser->root) {
518 parser->current = finalize(parser, parser->current);
519 }
520
521 finalize(parser, parser->root);
522
523 // Limit total size of extra content created from reference links to
524 // document size to avoid superlinear growth. Always allow 100KB.
525 if (parser->total_size > 100000)
526 parser->refmap->max_ref_size = parser->total_size;
527 else
528 parser->refmap->max_ref_size = 100000;
529
530 process_inlines(parser->mem, parser->root, parser->refmap, parser->options);
531
532 cmark_strbuf_free(&parser->content);
533
534 return parser->root;
535}
536
537cmark_node *cmark_parse_file(FILE *f, int options) {
538 unsigned char buffer[4096];
539 cmark_parser *parser = cmark_parser_new(options);
540 size_t bytes;
541 cmark_node *document;
542
543 while ((bytes = fread(buffer, 1, sizeof(buffer), f)) > 0) {
544 bool eof = bytes < sizeof(buffer);
545 S_parser_feed(parser, buffer, bytes, eof);
546 if (eof) {
547 break;
548 }
549 }
550
551 document = cmark_parser_finish(parser);
552 cmark_parser_free(parser);
553 return document;
554}
555
556cmark_node *cmark_parse_document(const char *buffer, size_t len, int options) {
557 cmark_parser *parser = cmark_parser_new(options);
558 cmark_node *document;
559
560 S_parser_feed(parser, (const unsigned char *)buffer, len, true);
561
562 document = cmark_parser_finish(parser);
563 cmark_parser_free(parser);
564 return document;
565}
566
567void cmark_parser_feed(cmark_parser *parser, const char *buffer, size_t len) {
568 S_parser_feed(parser, (const unsigned char *)buffer, len, false);
569}
570
571static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer,
572 size_t len, bool eof) {
573 const unsigned char *end = buffer + len;
574 static const uint8_t repl[] = {239, 191, 189};
575
576 if (len > UINT_MAX - parser->total_size)
577 parser->total_size = UINT_MAX;
578 else
579 parser->total_size += len;
580
581 // Skip UTF-8 BOM if present; see #334
582 if (parser->line_number == 0 && parser->column == 0 && len >= 3 &&
583 *buffer == 0xEF && *(buffer + 1) == 0xBB &&
584 *(buffer + 2) == 0xBF) {
585 buffer += 3;
586 } else if (parser->last_buffer_ended_with_cr && *buffer == '\n') {
587 // skip NL if last buffer ended with CR ; see #117
588 buffer++;
589 }
590
591 parser->last_buffer_ended_with_cr = false;
592 while (buffer < end) {
593 const unsigned char *eol;
594 bufsize_t chunk_len;
595 bool process = false;
596 for (eol = buffer; eol < end; ++eol) {
597 if (S_is_line_end_char(*eol)) {
598 process = true;
599 break;
600 }
601 if (*eol == '\0' && eol < end) {
602 break;
603 }
604 }
605 if (eol >= end && eof) {
606 process = true;
607 }
608
609 chunk_len = (eol - buffer);
610 if (process) {
611 if (parser->linebuf.size > 0) {
612 cmark_strbuf_put(&parser->linebuf, buffer, chunk_len);
613 S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size);
614 cmark_strbuf_clear(&parser->linebuf);
615 } else {
616 S_process_line(parser, buffer, chunk_len);
617 }
618 } else {
619 if (eol < end && *eol == '\0') {
620 // omit NULL byte
621 cmark_strbuf_put(&parser->linebuf, buffer, chunk_len);
622 // add replacement character
623 cmark_strbuf_put(&parser->linebuf, repl, 3);
624 } else {
625 cmark_strbuf_put(&parser->linebuf, buffer, chunk_len);
626 }
627 }
628
629 buffer += chunk_len;
630 if (buffer < end) {
631 if (*buffer == '\0') {
632 // skip over NULL
633 buffer++;
634 } else {
635 // skip over line ending characters
636 if (*buffer == '\r') {
637 buffer++;
638 if (buffer == end)
639 parser->last_buffer_ended_with_cr = true;
640 }
641 if (buffer < end && *buffer == '\n')
642 buffer++;
643 }
644 }
645 }
646}
647
648static void chop_trailing_hashtags(cmark_chunk *ch) {
649 bufsize_t n, orig_n;
650
651 cmark_chunk_rtrim(ch);
652 orig_n = n = ch->len - 1;
653
654 // if string ends in space followed by #s, remove these:
655 while (n >= 0 && peek_at(ch, n) == '#')
656 n--;
657
658 // Check for a space before the final #s:
659 if (n != orig_n && n >= 0 && S_is_space_or_tab(peek_at(ch, n))) {
660 ch->len = n;
661 cmark_chunk_rtrim(ch);
662 }
663}
664
665// Check for thematic break. On failure, return 0 and update
666// thematic_break_kill_pos with the index at which the
667// parse fails. On success, return length of match.
668// "...three or more hyphens, asterisks,
669// or underscores on a line by themselves. If you wish, you may use
670// spaces between the hyphens or asterisks."
671static int S_scan_thematic_break(cmark_parser *parser, cmark_chunk *input,
672 bufsize_t offset) {
673 bufsize_t i;
674 char c;
675 char nextc = '\0';
676 int count;
677 i = offset;
678 c = peek_at(input, i);
679 if (!(c == '*' || c == '_' || c == '-')) {
680 parser->thematic_break_kill_pos = i;
681 return 0;
682 }
683 count = 1;
684 while ((nextc = peek_at(input, ++i))) {
685 if (nextc == c) {
686 count++;
687 } else if (nextc != ' ' && nextc != '\t') {
688 break;
689 }
690 }
691 if (count >= 3 && (nextc == '\r' || nextc == '\n')) {
692 return (i - offset) + 1;
693 } else {
694 parser->thematic_break_kill_pos = i;
695 return 0;
696 }
697}
698
699// Find first nonspace character from current offset, setting
700// parser->first_nonspace, parser->first_nonspace_column,
701// parser->indent, and parser->blank. Does not advance parser->offset.
702static void S_find_first_nonspace(cmark_parser *parser, cmark_chunk *input) {
703 char c;
704 int chars_to_tab = TAB_STOP - (parser->column % TAB_STOP);
705
706 if (parser->first_nonspace <= parser->offset) {
707 parser->first_nonspace = parser->offset;
708 parser->first_nonspace_column = parser->column;
709 while ((c = peek_at(input, parser->first_nonspace))) {
710 if (c == ' ') {
711 parser->first_nonspace += 1;
712 parser->first_nonspace_column += 1;
713 chars_to_tab = chars_to_tab - 1;
714 if (chars_to_tab == 0) {
715 chars_to_tab = TAB_STOP;
716 }
717 } else if (c == '\t') {
718 parser->first_nonspace += 1;
719 parser->first_nonspace_column += chars_to_tab;
720 chars_to_tab = TAB_STOP;
721 } else {
722 break;
723 }
724 }
725 }
726
727 parser->indent = parser->first_nonspace_column - parser->column;
728 parser->blank = S_is_line_end_char(peek_at(input, parser->first_nonspace));
729}
730
731// Advance parser->offset and parser->column. parser->offset is the
732// byte position in input; parser->column is a virtual column number
733// that takes into account tabs. (Multibyte characters are not taken
734// into account, because the Markdown line prefixes we are interested in
735// analyzing are entirely ASCII.) The count parameter indicates
736// how far to advance the offset. If columns is true, then count
737// indicates a number of columns; otherwise, a number of bytes.
738// If advancing a certain number of columns partially consumes
739// a tab character, parser->partially_consumed_tab is set to true.
740static void S_advance_offset(cmark_parser *parser, cmark_chunk *input,
741 bufsize_t count, bool columns) {
742 char c;
743 int chars_to_tab;
744 int chars_to_advance;
745 while (count > 0 && (c = peek_at(input, parser->offset))) {
746 if (c == '\t') {
747 chars_to_tab = TAB_STOP - (parser->column % TAB_STOP);
748 if (columns) {
749 parser->partially_consumed_tab = chars_to_tab > count;
750 chars_to_advance = MIN(count, chars_to_tab);
751 parser->column += chars_to_advance;
752 parser->offset += (parser->partially_consumed_tab ? 0 : 1);
753 count -= chars_to_advance;
754 } else {
755 parser->partially_consumed_tab = false;
756 parser->column += chars_to_tab;
757 parser->offset += 1;
758 count -= 1;
759 }
760 } else {
761 parser->partially_consumed_tab = false;
762 parser->offset += 1;
763 parser->column += 1; // assume ascii; block starts are ascii
764 count -= 1;
765 }
766 }
767}
768
769static bool S_last_child_is_open(cmark_node *container) {
770 return container->last_child &&
771 (container->last_child->flags & CMARK_NODE__OPEN);
772}
773
774static bool parse_block_quote_prefix(cmark_parser *parser, cmark_chunk *input) {
775 bool res = false;
776 bufsize_t matched = 0;
777
778 matched =
779 parser->indent <= 3 && peek_at(input, parser->first_nonspace) == '>';
780 if (matched) {
781
782 S_advance_offset(parser, input, parser->indent + 1, true);
783
784 if (S_is_space_or_tab(peek_at(input, parser->offset))) {
785 S_advance_offset(parser, input, 1, true);
786 }
787
788 res = true;
789 }
790 return res;
791}
792
793static bool parse_node_item_prefix(cmark_parser *parser, cmark_chunk *input,
794 cmark_node *container) {
795 bool res = false;
796
797 if (parser->indent >=
798 container->as.list.marker_offset + container->as.list.padding) {
799 S_advance_offset(parser, input, container->as.list.marker_offset +
800 container->as.list.padding,
801 true);
802 res = true;
803 } else if (parser->blank && container->first_child != NULL) {
804 // if container->first_child is NULL, then the opening line
805 // of the list item was blank after the list marker; in this
806 // case, we are done with the list item.
807 S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
808 false);
809 res = true;
810 }
811 return res;
812}
813
814static bool parse_code_block_prefix(cmark_parser *parser, cmark_chunk *input,
815 cmark_node *container,
816 bool *should_continue) {
817 bool res = false;
818
819 if (!container->as.code.fenced) { // indented
820 if (parser->indent >= CODE_INDENT) {
821 S_advance_offset(parser, input, CODE_INDENT, true);
822 res = true;
823 } else if (parser->blank) {
824 S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
825 false);
826 res = true;
827 }
828 } else { // fenced
829 bufsize_t matched = 0;
830
831 if (parser->indent <= 3 && (peek_at(input, parser->first_nonspace) ==
832 container->as.code.fence_char)) {
833 matched = scan_close_code_fence(input, parser->first_nonspace);
834 }
835
836 if (matched >= container->as.code.fence_length) {
837 // closing fence - and since we're at
838 // the end of a line, we can stop processing it:
839 *should_continue = false;
840 S_advance_offset(parser, input, matched, false);
841 parser->current = finalize(parser, container);
842 } else {
843 // skip opt. spaces of fence parser->offset
844 int i = container->as.code.fence_offset;
845
846 while (i > 0 && S_is_space_or_tab(peek_at(input, parser->offset))) {
847 S_advance_offset(parser, input, 1, true);
848 i--;
849 }
850 res = true;
851 }
852 }
853
854 return res;
855}
856
857static bool parse_html_block_prefix(cmark_parser *parser,
858 cmark_node *container) {
859 bool res = false;
860 int html_block_type = container->as.html_block_type;
861
862 assert(html_block_type >= 1 && html_block_type <= 7);
863 switch (html_block_type) {
864 case 1:
865 case 2:
866 case 3:
867 case 4:
868 case 5:
869 // these types of blocks can accept blanks
870 res = true;
871 break;
872 case 6:
873 case 7:
874 res = !parser->blank;
875 break;
876 }
877
878 return res;
879}
880
881/**
882 * For each containing node, try to parse the associated line start.
883 *
884 * Will not close unmatched blocks, as we may have a lazy continuation
885 * line -> http://spec.commonmark.org/0.24/#lazy-continuation-line
886 *
887 * Returns: The last matching node, or NULL
888 */
889static cmark_node *check_open_blocks(cmark_parser *parser, cmark_chunk *input,
890 bool *all_matched) {
891 bool should_continue = true;
892 *all_matched = false;
893 cmark_node *container = parser->root;
894 cmark_node_type cont_type;
895
896 while (S_last_child_is_open(container)) {
897 container = container->last_child;
898 cont_type = S_type(container);
899
900 S_find_first_nonspace(parser, input);
901
902 switch (cont_type) {
903 case CMARK_NODE_BLOCK_QUOTE:
904 if (!parse_block_quote_prefix(parser, input))
905 goto done;
906 break;
907 case CMARK_NODE_ITEM:
908 if (!parse_node_item_prefix(parser, input, container))
909 goto done;
910 break;
911 case CMARK_NODE_CODE_BLOCK:
912 if (!parse_code_block_prefix(parser, input, container, &should_continue))
913 goto done;
914 break;
915 case CMARK_NODE_HEADING:
916 // a heading can never contain more than one line
917 goto done;
918 case CMARK_NODE_HTML_BLOCK:
919 if (!parse_html_block_prefix(parser, container))
920 goto done;
921 break;
922 case CMARK_NODE_PARAGRAPH:
923 if (parser->blank)
924 goto done;
925 break;
926 default:
927 break;
928 }
929 }
930
931 *all_matched = true;
932
933done:
934 if (!*all_matched) {
935 container = container->parent; // back up to last matching node
936 }
937
938 if (!should_continue) {
939 container = NULL;
940 }
941
942 return container;
943}
944
945static void open_new_blocks(cmark_parser *parser, cmark_node **container,
946 cmark_chunk *input, bool all_matched) {
947 bool indented;
948 cmark_list *data = NULL;
949 bool maybe_lazy = S_type(parser->current) == CMARK_NODE_PARAGRAPH;
950 cmark_node_type cont_type = S_type(*container);
951 bufsize_t matched = 0;
952 int lev = 0;
953 bool save_partially_consumed_tab;
954 bool has_content;
955 int save_offset;
956 int save_column;
957
958 while (cont_type != CMARK_NODE_CODE_BLOCK &&
959 cont_type != CMARK_NODE_HTML_BLOCK) {
960
961 S_find_first_nonspace(parser, input);
962 indented = parser->indent >= CODE_INDENT;
963
964 if (!indented && peek_at(input, parser->first_nonspace) == '>') {
965
966 bufsize_t blockquote_startpos = parser->first_nonspace;
967
968 S_advance_offset(parser, input,
969 parser->first_nonspace + 1 - parser->offset, false);
970 // optional following character
971 if (S_is_space_or_tab(peek_at(input, parser->offset))) {
972 S_advance_offset(parser, input, 1, true);
973 }
974 *container = add_child(parser, *container, CMARK_NODE_BLOCK_QUOTE,
975 blockquote_startpos + 1);
976
977 } else if (!indented && (matched = scan_atx_heading_start(
978 input, parser->first_nonspace))) {
979 bufsize_t hashpos;
980 int level = 0;
981 bufsize_t heading_startpos = parser->first_nonspace;
982
983 S_advance_offset(parser, input,
984 parser->first_nonspace + matched - parser->offset,
985 false);
986 *container = add_child(parser, *container, CMARK_NODE_HEADING,
987 heading_startpos + 1);
988
989 hashpos = cmark_chunk_strchr(input, '#', parser->first_nonspace);
990
991 while (peek_at(input, hashpos) == '#') {
992 level++;
993 hashpos++;
994 }
995
996 (*container)->as.heading.level = level;
997 (*container)->as.heading.setext = false;
998 (*container)->internal_offset = matched;
999
1000 } else if (!indented && (matched = scan_open_code_fence(
1001 input, parser->first_nonspace))) {
1002 *container = add_child(parser, *container, CMARK_NODE_CODE_BLOCK,
1003 parser->first_nonspace + 1);
1004 (*container)->as.code.fenced = true;
1005 (*container)->as.code.fence_char = peek_at(input, parser->first_nonspace);
1006 (*container)->as.code.fence_length = (matched > 255) ? 255 : matched;
1007 (*container)->as.code.fence_offset =
1008 (int8_t)(parser->first_nonspace - parser->offset);
1009 (*container)->as.code.info = NULL;
1010 S_advance_offset(parser, input,
1011 parser->first_nonspace + matched - parser->offset,
1012 false);
1013
1014 } else if (!indented && ((matched = scan_html_block_start(
1015 input, parser->first_nonspace)) ||
1016 (cont_type != CMARK_NODE_PARAGRAPH &&
1017 !maybe_lazy &&
1018 (matched = scan_html_block_start_7(
1019 input, parser->first_nonspace))))) {
1020 *container = add_child(parser, *container, CMARK_NODE_HTML_BLOCK,
1021 parser->first_nonspace + 1);
1022 (*container)->as.html_block_type = matched;
1023 // note, we don't adjust parser->offset because the tag is part of the
1024 // text
1025 } else if (!indented && cont_type == CMARK_NODE_PARAGRAPH &&
1026 (lev =
1027 scan_setext_heading_line(input, parser->first_nonspace))) {
1028 // finalize paragraph, resolving reference links
1029 has_content = resolve_reference_link_definitions(parser);
1030
1031 if (has_content) {
1032
1033 (*container)->type = (uint16_t)CMARK_NODE_HEADING;
1034 (*container)->as.heading.level = lev;
1035 (*container)->as.heading.setext = true;
1036 S_advance_offset(parser, input, input->len - 1 - parser->offset, false);
1037 }
1038 } else if (!indented &&
1039 !(cont_type == CMARK_NODE_PARAGRAPH && !all_matched) &&
1040 (parser->thematic_break_kill_pos <= parser->first_nonspace) &&
1041 (matched = S_scan_thematic_break(parser, input, parser->first_nonspace))) {
1042 // it's only now that we know the line is not part of a setext heading:
1043 *container = add_child(parser, *container, CMARK_NODE_THEMATIC_BREAK,
1044 parser->first_nonspace + 1);
1045 S_advance_offset(parser, input, input->len - 1 - parser->offset, false);
1046 } else if ((!indented || cont_type == CMARK_NODE_LIST) &&
1047 parser->indent < 4 &&
1048 (matched = parse_list_marker(
1049 parser->mem, input, parser->first_nonspace,
1050 (*container)->type == CMARK_NODE_PARAGRAPH, &data))) {
1051
1052 // Note that we can have new list items starting with >= 4
1053 // spaces indent, as long as the list container is still open.
1054 int i = 0;
1055
1056 // compute padding:
1057 S_advance_offset(parser, input,
1058 parser->first_nonspace + matched - parser->offset,
1059 false);
1060
1061 save_partially_consumed_tab = parser->partially_consumed_tab;
1062 save_offset = parser->offset;
1063 save_column = parser->column;
1064
1065 while (parser->column - save_column <= 5 &&
1066 S_is_space_or_tab(peek_at(input, parser->offset))) {
1067 S_advance_offset(parser, input, 1, true);
1068 }
1069
1070 i = parser->column - save_column;
1071 if (i >= 5 || i < 1 ||
1072 // only spaces after list marker:
1073 S_is_line_end_char(peek_at(input, parser->offset))) {
1074 data->padding = matched + 1;
1075 parser->offset = save_offset;
1076 parser->column = save_column;
1077 parser->partially_consumed_tab = save_partially_consumed_tab;
1078 if (i > 0) {
1079 S_advance_offset(parser, input, 1, true);
1080 }
1081 } else {
1082 data->padding = matched + i;
1083 }
1084
1085 // check container; if it's a list, see if this list item
1086 // can continue the list; otherwise, create a list container.
1087
1088 data->marker_offset = parser->indent;
1089
1090 if (cont_type != CMARK_NODE_LIST ||
1091 !lists_match(&((*container)->as.list), data)) {
1092 *container = add_child(parser, *container, CMARK_NODE_LIST,
1093 parser->first_nonspace + 1);
1094
1095 memcpy(&((*container)->as.list), data, sizeof(*data));
1096 }
1097
1098 // add the list item
1099 *container = add_child(parser, *container, CMARK_NODE_ITEM,
1100 parser->first_nonspace + 1);
1101 /* TODO: static */
1102 memcpy(&((*container)->as.list), data, sizeof(*data));
1103 parser->mem->free(data);
1104 } else if (indented && !maybe_lazy && !parser->blank) {
1105 S_advance_offset(parser, input, CODE_INDENT, true);
1106 *container = add_child(parser, *container, CMARK_NODE_CODE_BLOCK,
1107 parser->offset + 1);
1108 (*container)->as.code.fenced = false;
1109 (*container)->as.code.fence_char = 0;
1110 (*container)->as.code.fence_length = 0;
1111 (*container)->as.code.fence_offset = 0;
1112 (*container)->as.code.info = NULL;
1113
1114 } else {
1115 break;
1116 }
1117
1118 if (accepts_lines(S_type(*container))) {
1119 // if it's a line container, it can't contain other containers
1120 break;
1121 }
1122
1123 cont_type = S_type(*container);
1124 maybe_lazy = false;
1125 }
1126}
1127
1128static void add_text_to_container(cmark_parser *parser, cmark_node *container,
1129 cmark_node *last_matched_container,
1130 cmark_chunk *input) {
1131 cmark_node *tmp;
1132 // what remains at parser->offset is a text line. add the text to the
1133 // appropriate container.
1134
1135 S_find_first_nonspace(parser, input);
1136
1137 if (parser->blank && container->last_child)
1138 S_set_last_line_blank(container->last_child, true);
1139
1140 // block quote lines are never blank as they start with >
1141 // and we don't count blanks in fenced code for purposes of tight/loose
1142 // lists or breaking out of lists. we also don't set last_line_blank
1143 // on an empty list item.
1144 const cmark_node_type ctype = S_type(container);
1145 const bool last_line_blank =
1146 (parser->blank && ctype != CMARK_NODE_BLOCK_QUOTE &&
1147 ctype != CMARK_NODE_HEADING && ctype != CMARK_NODE_THEMATIC_BREAK &&
1148 !(ctype == CMARK_NODE_CODE_BLOCK && container->as.code.fenced) &&
1149 !(ctype == CMARK_NODE_ITEM && container->first_child == NULL &&
1150 container->start_line == parser->line_number));
1151
1152 S_set_last_line_blank(container, last_line_blank);
1153
1154 tmp = container;
1155 while (tmp->parent) {
1156 S_set_last_line_blank(tmp->parent, false);
1157 tmp = tmp->parent;
1158 }
1159
1160 // If the last line processed belonged to a paragraph node,
1161 // and we didn't match all of the line prefixes for the open containers,
1162 // and we didn't start any new containers,
1163 // and the line isn't blank,
1164 // then treat this as a "lazy continuation line" and add it to
1165 // the open paragraph.
1166 if (parser->current != last_matched_container &&
1167 container == last_matched_container && !parser->blank &&
1168 S_type(parser->current) == CMARK_NODE_PARAGRAPH) {
1169 add_line(input, parser);
1170 } else { // not a lazy continuation
1171 // Finalize any blocks that were not matched and set cur to container:
1172 while (parser->current != last_matched_container) {
1173 parser->current = finalize(parser, parser->current);
1174 assert(parser->current != NULL);
1175 }
1176
1177 if (S_type(container) == CMARK_NODE_CODE_BLOCK) {
1178 add_line(input, parser);
1179 } else if (S_type(container) == CMARK_NODE_HTML_BLOCK) {
1180 add_line(input, parser);
1181
1182 int matches_end_condition;
1183 switch (container->as.html_block_type) {
1184 case 1:
1185 // </script>, </style>, </textarea>, </pre>
1186 matches_end_condition =
1187 scan_html_block_end_1(input, parser->first_nonspace);
1188 break;
1189 case 2:
1190 // -->
1191 matches_end_condition =
1192 scan_html_block_end_2(input, parser->first_nonspace);
1193 break;
1194 case 3:
1195 // ?>
1196 matches_end_condition =
1197 scan_html_block_end_3(input, parser->first_nonspace);
1198 break;
1199 case 4:
1200 // >
1201 matches_end_condition =
1202 scan_html_block_end_4(input, parser->first_nonspace);
1203 break;
1204 case 5:
1205 // ]]>
1206 matches_end_condition =
1207 scan_html_block_end_5(input, parser->first_nonspace);
1208 break;
1209 default:
1210 matches_end_condition = 0;
1211 break;
1212 }
1213
1214 if (matches_end_condition) {
1215 container = finalize(parser, container);
1216 assert(parser->current != NULL);
1217 }
1218 } else if (parser->blank) {
1219 // ??? do nothing
1220 } else if (accepts_lines(S_type(container))) {
1221 if (S_type(container) == CMARK_NODE_HEADING &&
1222 container->as.heading.setext == false) {
1223 chop_trailing_hashtags(input);
1224 }
1225 S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
1226 false);
1227 add_line(input, parser);
1228 } else {
1229 // create paragraph container for line
1230 container = add_child(parser, container, CMARK_NODE_PARAGRAPH,
1231 parser->first_nonspace + 1);
1232 S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
1233 false);
1234 add_line(input, parser);
1235 }
1236
1237 parser->current = container;
1238 }
1239}
1240
1241/* See http://spec.commonmark.org/0.24/#phase-1-block-structure */
1242static void S_process_line(cmark_parser *parser, const unsigned char *buffer,
1243 bufsize_t bytes) {
1244 cmark_node *last_matched_container;
1245 bool all_matched = true;
1246 cmark_node *container;
1247 cmark_chunk input;
1248
1249 if (parser->options & CMARK_OPT_VALIDATE_UTF8)
1250 cmark_utf8proc_check(&parser->curline, buffer, bytes);
1251 else
1252 cmark_strbuf_put(&parser->curline, buffer, bytes);
1253
1254 bytes = parser->curline.size;
1255
1256 // ensure line ends with a newline:
1257 if (bytes == 0 || !S_is_line_end_char(parser->curline.ptr[bytes - 1]))
1258 cmark_strbuf_putc(&parser->curline, '\n');
1259
1260 parser->offset = 0;
1261 parser->column = 0;
1262 parser->first_nonspace = 0;
1263 parser->first_nonspace_column = 0;
1264 parser->thematic_break_kill_pos = 0;
1265 parser->indent = 0;
1266 parser->blank = false;
1267 parser->partially_consumed_tab = false;
1268
1269 input.data = parser->curline.ptr;
1270 input.len = parser->curline.size;
1271
1272 parser->line_number++;
1273
1274 last_matched_container = check_open_blocks(parser, &input, &all_matched);
1275
1276 if (!last_matched_container)
1277 goto finished;
1278
1279 container = last_matched_container;
1280
1281 open_new_blocks(parser, &container, &input, all_matched);
1282
1283 add_text_to_container(parser, container, last_matched_container, &input);
1284
1285finished:
1286 parser->last_line_length = input.len;
1287 if (parser->last_line_length &&
1288 input.data[parser->last_line_length - 1] == '\n')
1289 parser->last_line_length -= 1;
1290 if (parser->last_line_length &&
1291 input.data[parser->last_line_length - 1] == '\r')
1292 parser->last_line_length -= 1;
1293
1294 cmark_strbuf_clear(&parser->curline);
1295}
1296
1297cmark_node *cmark_parser_finish(cmark_parser *parser) {
1298 if (parser->linebuf.size) {
1299 S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size);
1300 cmark_strbuf_clear(&parser->linebuf);
1301 }
1302
1303 finalize_document(parser);
1304
1305 cmark_consolidate_text_nodes(parser->root);
1306
1307 cmark_strbuf_free(&parser->curline);
1308
1309#if CMARK_DEBUG_NODES
1310 if (cmark_node_check(parser->root, stderr)) {
1311 abort();
1312 }
1313#endif
1314 return parser->root;
1315}
1316