blocks.c source code [Aseprite/third_party/cmark/src/blocks.c]

/**
 * Block parsing implementation.
 *
 * For a high-level overview of the block parsing process,
 * see http://spec.commonmark.org/0.24/#phase-1-block-structure
 */
7
8	#include <stdlib.h>
9	#include <assert.h>
10	#include <stdio.h>
11	#include <limits.h>
12
13	#include "cmark_ctype.h"
14	#include "config.h"
15	#include "parser.h"
16	#include "cmark.h"
17	#include "node.h"
18	#include "references.h"
19	#include "utf8.h"
20	#include "scanners.h"
21	#include "inlines.h"
22	#include "houdini.h"
23	#include "buffer.h"
24	#include "chunk.h"
25
// Minimum indentation (in columns) that starts an indented code block.
#define CODE_INDENT 4
// Width of a tab stop, in columns.
#define TAB_STOP 4

#ifndef MIN
// Smaller of x and y. Arguments may be evaluated more than once, so do
// not pass expressions with side effects.
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#endif

// Element n of the data array of the object that i points to.
// No bounds checking is performed.
#define peek_at(i, n) ((i)->data[(n)])
34
35	static bool S_last_line_blank(const cmark_node *node) {
36	return (node->flags & CMARK_NODE__LAST_LINE_BLANK) != `0`;
37	}
38
39	static bool S_last_line_checked(const cmark_node *node) {
40	return (node->flags & CMARK_NODE__LAST_LINE_CHECKED) != `0`;
41	}
42
43	static CMARK_INLINE cmark_node_type S_type(const cmark_node *node) {
44	return (cmark_node_type)node->type;
45	}
46
47	static void S_set_last_line_blank(cmark_node *node, bool is_blank) {
48	if (is_blank)
49	node->flags \|= CMARK_NODE__LAST_LINE_BLANK;
50	else
51	node->flags &= ~CMARK_NODE__LAST_LINE_BLANK;
52	}
53
54	static void S_set_last_line_checked(cmark_node *node) {
55	node->flags \|= CMARK_NODE__LAST_LINE_CHECKED;
56	}
57
58	static CMARK_INLINE bool S_is_line_end_char(char c) {
59	return (c == `'\n'` \|\| c == `'\r'`);
60	}
61
62	static CMARK_INLINE bool S_is_space_or_tab(char c) {
63	return (c == `' '` \|\| c == `'\t'`);
64	}
65
66	static void S_parser_feed(cmark_parser parser, const* unsigned char *buffer,
67	size_t len, bool eof);
68
69	static void S_process_line(cmark_parser parser, const* unsigned char *buffer,
70	bufsize_t bytes);
71
72	static cmark_node make_block(cmark_mem mem, cmark_node_type tag,
73	int start_line, int start_column) {
74	cmark_node *e;
75
76	e = (cmark_node )mem->calloc(`1`, sizeof(e));
77	e->mem = mem;
78	e->type = (uint16_t)tag;
79	e->flags = CMARK_NODE__OPEN;
80	e->start_line = start_line;
81	e->start_column = start_column;
82	e->end_line = start_line;
83
84	return e;
85	}
86
87	// Create a root document node.
88	static cmark_node make_document(cmark_mem mem) {
89	cmark_node *e = make_block(mem, CMARK_NODE_DOCUMENT, `1`, `1`);
90	return e;
91	}
92
93	cmark_parser cmark_parser_new_with_mem(int* options, cmark_mem *mem) {
94	cmark_parser parser = (cmark_parser )mem->calloc(`1`, sizeof(cmark_parser));
95	parser->mem = mem;
96
97	cmark_node *document = make_document(mem);
98
99	cmark_strbuf_init(mem, &parser->curline, `256`);
100	cmark_strbuf_init(mem, &parser->linebuf, `0`);
101	cmark_strbuf_init(mem, &parser->content, `0`);
102
103	parser->refmap = cmark_reference_map_new(mem);
104	parser->root = document;
105	parser->current = document;
106	parser->line_number = `0`;
107	parser->offset = `0`;
108	parser->column = `0`;
109	parser->first_nonspace = `0`;
110	parser->first_nonspace_column = `0`;
111	parser->thematic_break_kill_pos = `0`;
112	parser->indent = `0`;
113	parser->blank = false;
114	parser->partially_consumed_tab = false;
115	parser->last_line_length = `0`;
116	parser->options = options;
117	parser->last_buffer_ended_with_cr = false;
118
119	return parser;
120	}
121
122	cmark_parser cmark_parser_new(int* options) {
123	extern cmark_mem DEFAULT_MEM_ALLOCATOR;
124	return cmark_parser_new_with_mem(options, &DEFAULT_MEM_ALLOCATOR);
125	}
126
127	void cmark_parser_free(cmark_parser *parser) {
128	cmark_mem *mem = parser->mem;
129	cmark_strbuf_free(&parser->curline);
130	cmark_strbuf_free(&parser->linebuf);
131	cmark_reference_map_free(parser->refmap);
132	mem->free(parser);
133	}
134
135	static cmark_node finalize(cmark_parser parser, cmark_node *b);
136
137	// Returns true if line has only space characters, else false.
138	static bool is_blank(cmark_strbuf *s, bufsize_t offset) {
139	while (offset < s->size) {
140	switch (s->ptr[offset]) {
141	case `'\r'`:
142	case `'\n'`:
143	return true;
144	case `' '`:
145	offset++;
146	break;
147	case `'\t'`:
148	offset++;
149	break;
150	default:
151	return false;
152	}
153	}
154
155	return true;
156	}
157
158	static CMARK_INLINE bool can_contain(cmark_node_type parent_type,
159	cmark_node_type child_type) {
160	return (parent_type == CMARK_NODE_DOCUMENT \|\|
161	parent_type == CMARK_NODE_BLOCK_QUOTE \|\|
162	parent_type == CMARK_NODE_ITEM \|\|
163	(parent_type == CMARK_NODE_LIST && child_type == CMARK_NODE_ITEM));
164	}
165
166	static CMARK_INLINE bool accepts_lines(cmark_node_type block_type) {
167	return (block_type == CMARK_NODE_PARAGRAPH \|\|
168	block_type == CMARK_NODE_HEADING \|\|
169	block_type == CMARK_NODE_CODE_BLOCK);
170	}
171
172	static CMARK_INLINE bool contains_inlines(cmark_node_type block_type) {
173	return (block_type == CMARK_NODE_PARAGRAPH \|\|
174	block_type == CMARK_NODE_HEADING);
175	}
176
177	static void add_line(cmark_chunk ch, cmark_parser parser) {
178	int chars_to_tab;
179	int i;
180	if (parser->partially_consumed_tab) {
181	parser->offset += `1`; // skip over tab
182	// add space characters:
183	chars_to_tab = TAB_STOP - (parser->column % TAB_STOP);
184	for (i = `0`; i < chars_to_tab; i++) {
185	cmark_strbuf_putc(&parser->content, `' '`);
186	}
187	}
188	cmark_strbuf_put(&parser->content, ch->data + parser->offset,
189	ch->len - parser->offset);
190	}
191
192	static void remove_trailing_blank_lines(cmark_strbuf *ln) {
193	bufsize_t i;
194	unsigned char c;
195
196	for (i = ln->size - `1`; i >= `0`; --i) {
197	c = ln->ptr[i];
198
199	if (c != `' '` && c != `'\t'` && !S_is_line_end_char(c))
200	break;
201	}
202
203	if (i < `0`) {
204	cmark_strbuf_clear(ln);
205	return;
206	}
207
208	for (; i < ln->size; ++i) {
209	c = ln->ptr[i];
210
211	if (!S_is_line_end_char(c))
212	continue;
213
214	cmark_strbuf_truncate(ln, i);
215	break;
216	}
217	}
218
219	// Check to see if a node ends with a blank line, descending
220	// if needed into lists and sublists.
221	static bool S_ends_with_blank_line(cmark_node *node) {
222	if (S_last_line_checked(node)) {
223	return(S_last_line_blank(node));
224	} else if ((S_type(node) == CMARK_NODE_LIST \|\|
225	S_type(node) == CMARK_NODE_ITEM) && node->last_child) {
226	S_set_last_line_checked(node);
227	return(S_ends_with_blank_line(node->last_child));
228	} else {
229	S_set_last_line_checked(node);
230	return (S_last_line_blank(node));
231	}
232	}
233
234	// returns true if content remains after link defs are resolved.
235	static bool resolve_reference_link_definitions(cmark_parser *parser) {
236	bufsize_t pos;
237	cmark_strbuf *node_content = &parser->content;
238	cmark_chunk chunk = {node_content->ptr, node_content->size};
239	while (chunk.len && chunk.data[`0`] == `'['` &&
240	(pos = cmark_parse_reference_inline(parser->mem, &chunk,
241	parser->refmap))) {
242
243	chunk.data += pos;
244	chunk.len -= pos;
245	}
246	cmark_strbuf_drop(node_content, (node_content->size - chunk.len));
247	return !is_blank(node_content, `0`);
248	}
249
250	static cmark_node finalize(cmark_parser parser, cmark_node *b) {
251	bufsize_t pos;
252	cmark_node *item;
253	cmark_node *subitem;
254	cmark_node *parent;
255	bool has_content;
256
257	parent = b->parent;
258	assert(b->flags &
259	CMARK_NODE__OPEN); // shouldn't call finalize on closed blocks
260	b->flags &= ~CMARK_NODE__OPEN;
261
262	if (parser->curline.size == `0`) {
263	// end of input - line number has not been incremented
264	b->end_line = parser->line_number;
265	b->end_column = parser->last_line_length;
266	} else if (S_type(b) == CMARK_NODE_DOCUMENT \|\|
267	(S_type(b) == CMARK_NODE_CODE_BLOCK && b->as.code.fenced) \|\|
268	(S_type(b) == CMARK_NODE_HEADING && b->as.heading.setext)) {
269	b->end_line = parser->line_number;
270	b->end_column = parser->curline.size;
271	if (b->end_column && parser->curline.ptr[b->end_column - `1`] == `'\n'`)
272	b->end_column -= `1`;
273	if (b->end_column && parser->curline.ptr[b->end_column - `1`] == `'\r'`)
274	b->end_column -= `1`;
275	} else {
276	b->end_line = parser->line_number - `1`;
277	b->end_column = parser->last_line_length;
278	}
279
280	cmark_strbuf *node_content = &parser->content;
281
282	switch (S_type(b)) {
283	case CMARK_NODE_PARAGRAPH:
284	{
285	has_content = resolve_reference_link_definitions(parser);
286	if (!has_content) {
287	// remove blank node (former reference def)
288	cmark_node_free(b);
289	} else {
290	b->len = node_content->size;
291	b->data = cmark_strbuf_detach(node_content);
292	}
293	break;
294	}
295
296	case CMARK_NODE_CODE_BLOCK:
297	if (!b->as.code.fenced) { // indented code
298	remove_trailing_blank_lines(node_content);
299	cmark_strbuf_putc(node_content, `'\n'`);
300	} else {
301	// first line of contents becomes info
302	for (pos = `0`; pos < node_content->size; ++pos) {
303	if (S_is_line_end_char(node_content->ptr[pos]))
304	break;
305	}
306	assert(pos < node_content->size);
307
308	if (pos == `0`) {
309	b->as.code.info = NULL;
310	} else {
311	cmark_strbuf tmp = CMARK_BUF_INIT(parser->mem);
312	houdini_unescape_html_f(&tmp, node_content->ptr, pos);
313	cmark_strbuf_trim(&tmp);
314	cmark_strbuf_unescape(&tmp);
315	b->as.code.info = cmark_strbuf_detach(&tmp);
316	}
317
318	if (node_content->ptr[pos] == `'\r'`)
319	pos += `1`;
320	if (node_content->ptr[pos] == `'\n'`)
321	pos += `1`;
322	cmark_strbuf_drop(node_content, pos);
323	}
324	b->len = node_content->size;
325	b->data = cmark_strbuf_detach(node_content);
326	break;
327
328	case CMARK_NODE_HEADING:
329	case CMARK_NODE_HTML_BLOCK:
330	b->len = node_content->size;
331	b->data = cmark_strbuf_detach(node_content);
332	break;
333
334	case CMARK_NODE_LIST: // determine tight/loose status
335	b->as.list.tight = true; // tight by default
336	item = b->first_child;
337
338	while (item) {
339	// check for non-final non-empty list item ending with blank line:
340	if (S_last_line_blank(item) && item->next) {
341	b->as.list.tight = false;
342	break;
343	}
344	// recurse into children of list item, to see if there are
345	// spaces between them:
346	subitem = item->first_child;
347	while (subitem) {
348	if ((item->next \|\| subitem->next) &&
349	S_ends_with_blank_line(subitem)) {
350	b->as.list.tight = false;
351	break;
352	}
353	subitem = subitem->next;
354	}
355	if (!(b->as.list.tight)) {
356	break;
357	}
358	item = item->next;
359	}
360
361	break;
362
363	default:
364	break;
365	}
366
367	return parent;
368	}
369
370	// Add a node as child of another. Return pointer to child.
371	static cmark_node add_child(cmark_parser parser, cmark_node *parent,
372	cmark_node_type block_type, int start_column) {
373	assert(parent);
374
375	// if 'parent' isn't the kind of node that can accept this child,
376	// then back up til we hit a node that can.
377	while (!can_contain(S_type(parent), block_type)) {
378	parent = finalize(parser, parent);
379	}
380
381	cmark_node *child =
382	make_block(parser->mem, block_type, parser->line_number, start_column);
383	child->parent = parent;
384
385	if (parent->last_child) {
386	parent->last_child->next = child;
387	child->prev = parent->last_child;
388	} else {
389	parent->first_child = child;
390	child->prev = NULL;
391	}
392	parent->last_child = child;
393	return child;
394	}
395
396	// Walk through node and all children, recursively, parsing
397	// string content into inline content where appropriate.
398	static void process_inlines(cmark_mem mem, cmark_node root,
399	cmark_reference_map refmap, int* options) {
400	cmark_iter *iter = cmark_iter_new(root);
401	cmark_node *cur;
402	cmark_event_type ev_type;
403
404	while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
405	cur = cmark_iter_get_node(iter);
406	if (ev_type == CMARK_EVENT_ENTER) {
407	if (contains_inlines(S_type(cur))) {
408	cmark_parse_inlines(mem, cur, refmap, options);
409	mem->free(cur->data);
410	cur->data = NULL;
411	cur->len = `0`;
412	}
413	}
414	}
415
416	cmark_iter_free(iter);
417	}
418
419	// Attempts to parse a list item marker (bullet or enumerated).
420	// On success, returns length of the marker, and populates
421	// data with the details. On failure, returns 0.
422	static bufsize_t parse_list_marker(cmark_mem mem, cmark_chunk input,
423	bufsize_t pos, bool interrupts_paragraph,
424	cmark_list **dataptr) {
425	unsigned char c;
426	bufsize_t startpos;
427	cmark_list *data;
428	bufsize_t i;
429
430	startpos = pos;
431	c = peek_at(input, pos);
432
433	if (c == `'*'` \|\| c == `'-'` \|\| c == `'+'`) {
434	pos++;
435	if (!cmark_isspace(peek_at(input, pos))) {
436	return `0`;
437	}
438
439	if (interrupts_paragraph) {
440	i = pos;
441	// require non-blank content after list marker:
442	while (S_is_space_or_tab(peek_at(input, i))) {
443	i++;
444	}
445	if (peek_at(input, i) == `'\n'`) {
446	return `0`;
447	}
448	}
449
450	data = (cmark_list )mem->calloc(`1`, sizeof(data));
451	data->marker_offset = `0`; // will be adjusted later
452	data->list_type = CMARK_BULLET_LIST;
453	data->bullet_char = c;
454	data->start = `0`;
455	data->delimiter = CMARK_NO_DELIM;
456	data->tight = false;
457	} else if (cmark_isdigit(c)) {
458	int start = `0`;
459	int digits = `0`;
460
461	do {
462	start = (`10` * start) + (peek_at(input, pos) - `'0'`);
463	pos++;
464	digits++;
465	// We limit to 9 digits to avoid overflow,
466	// assuming max int is 2^31 - 1
467	// This also seems to be the limit for 'start' in some browsers.
468	} while (digits < `9` && cmark_isdigit(peek_at(input, pos)));
469
470	if (interrupts_paragraph && start != `1`) {
471	return `0`;
472	}
473	c = peek_at(input, pos);
474	if (c == `'.'` \|\| c == `')'`) {
475	pos++;
476	if (!cmark_isspace(peek_at(input, pos))) {
477	return `0`;
478	}
479	if (interrupts_paragraph) {
480	// require non-blank content after list marker:
481	i = pos;
482	while (S_is_space_or_tab(peek_at(input, i))) {
483	i++;
484	}
485	if (S_is_line_end_char(peek_at(input, i))) {
486	return `0`;
487	}
488	}
489
490	data = (cmark_list )mem->calloc(`1`, sizeof(data));
491	data->marker_offset = `0`; // will be adjusted later
492	data->list_type = CMARK_ORDERED_LIST;
493	data->bullet_char = `0`;
494	data->start = start;
495	data->delimiter = (c == `'.'` ? CMARK_PERIOD_DELIM : CMARK_PAREN_DELIM);
496	data->tight = false;
497	} else {
498	return `0`;
499	}
500	} else {
501	return `0`;
502	}
503
504	*dataptr = data;
505	return (pos - startpos);
506	}
507
508	// Return 1 if list item belongs in list, else 0.
509	static int lists_match(cmark_list list_data, cmark_list item_data) {
510	return (list_data->list_type == item_data->list_type &&
511	list_data->delimiter == item_data->delimiter &&
512	// list_data->marker_offset == item_data.marker_offset &&
513	list_data->bullet_char == item_data->bullet_char);
514	}
515
516	static cmark_node finalize_document(cmark_parser parser) {
517	while (parser->current != parser->root) {
518	parser->current = finalize(parser, parser->current);
519	}
520
521	finalize(parser, parser->root);
522
523	// Limit total size of extra content created from reference links to
524	// document size to avoid superlinear growth. Always allow 100KB.
525	if (parser->total_size > `100000`)
526	parser->refmap->max_ref_size = parser->total_size;
527	else
528	parser->refmap->max_ref_size = `100000`;
529
530	process_inlines(parser->mem, parser->root, parser->refmap, parser->options);
531
532	cmark_strbuf_free(&parser->content);
533
534	return parser->root;
535	}
536
537	cmark_node cmark_parse_file(FILE f, int options) {
538	unsigned char buffer[`4096`];
539	cmark_parser *parser = cmark_parser_new(options);
540	size_t bytes;
541	cmark_node *document;
542
543	while ((bytes = fread(buffer, `1`, sizeof(buffer), f)) > `0`) {
544	bool eof = bytes < sizeof(buffer);
545	S_parser_feed(parser, buffer, bytes, eof);
546	if (eof) {
547	break;
548	}
549	}
550
551	document = cmark_parser_finish(parser);
552	cmark_parser_free(parser);
553	return document;
554	}
555
556	cmark_node cmark_parse_document(const* char buffer, size_t len, int* options) {
557	cmark_parser *parser = cmark_parser_new(options);
558	cmark_node *document;
559
560	S_parser_feed(parser, (const unsigned char *)buffer, len, true);
561
562	document = cmark_parser_finish(parser);
563	cmark_parser_free(parser);
564	return document;
565	}
566
567	void cmark_parser_feed(cmark_parser parser, const* char *buffer, size_t len) {
568	S_parser_feed(parser, (const unsigned char *)buffer, len, false);
569	}
570
571	static void S_parser_feed(cmark_parser parser, const* unsigned char *buffer,
572	size_t len, bool eof) {
573	const unsigned char *end = buffer + len;
574	static const uint8_t repl[] = {`239`, `191`, `189`};
575
576	if (len > UINT_MAX - parser->total_size)
577	parser->total_size = UINT_MAX;
578	else
579	parser->total_size += len;
580
581	// Skip UTF-8 BOM if present; see #334
582	if (parser->line_number == `0` && parser->column == `0` && len >= `3` &&
583	buffer == `0xEF` && (buffer + `1`) == `0xBB` &&
584	*(buffer + `2`) == `0xBF`) {
585	buffer += `3`;
586	} else if (parser->last_buffer_ended_with_cr && *buffer == `'\n'`) {
587	// skip NL if last buffer ended with CR ; see #117
588	buffer++;
589	}
590
591	parser->last_buffer_ended_with_cr = false;
592	while (buffer < end) {
593	const unsigned char *eol;
594	bufsize_t chunk_len;
595	bool process = false;
596	for (eol = buffer; eol < end; ++eol) {
597	if (S_is_line_end_char(*eol)) {
598	process = true;
599	break;
600	}
601	if (*eol == `'\0'` && eol < end) {
602	break;
603	}
604	}
605	if (eol >= end && eof) {
606	process = true;
607	}
608
609	chunk_len = (eol - buffer);
610	if (process) {
611	if (parser->linebuf.size > `0`) {
612	cmark_strbuf_put(&parser->linebuf, buffer, chunk_len);
613	S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size);
614	cmark_strbuf_clear(&parser->linebuf);
615	} else {
616	S_process_line(parser, buffer, chunk_len);
617	}
618	} else {
619	if (eol < end && *eol == `'\0'`) {
620	// omit NULL byte
621	cmark_strbuf_put(&parser->linebuf, buffer, chunk_len);
622	// add replacement character
623	cmark_strbuf_put(&parser->linebuf, repl, `3`);
624	} else {
625	cmark_strbuf_put(&parser->linebuf, buffer, chunk_len);
626	}
627	}
628
629	buffer += chunk_len;
630	if (buffer < end) {
631	if (*buffer == `'\0'`) {
632	// skip over NULL
633	buffer++;
634	} else {
635	// skip over line ending characters
636	if (*buffer == `'\r'`) {
637	buffer++;
638	if (buffer == end)
639	parser->last_buffer_ended_with_cr = true;
640	}
641	if (buffer < end && *buffer == `'\n'`)
642	buffer++;
643	}
644	}
645	}
646	}
647
648	static void chop_trailing_hashtags(cmark_chunk *ch) {
649	bufsize_t n, orig_n;
650
651	cmark_chunk_rtrim(ch);
652	orig_n = n = ch->len - `1`;
653
654	// if string ends in space followed by #s, remove these:
655	while (n >= `0` && peek_at(ch, n) == `'#'`)
656	n--;
657
658	// Check for a space before the final #s:
659	if (n != orig_n && n >= `0` && S_is_space_or_tab(peek_at(ch, n))) {
660	ch->len = n;
661	cmark_chunk_rtrim(ch);
662	}
663	}
664
665	// Check for thematic break. On failure, return 0 and update
666	// thematic_break_kill_pos with the index at which the
667	// parse fails. On success, return length of match.
668	// "...three or more hyphens, asterisks,
669	// or underscores on a line by themselves. If you wish, you may use
670	// spaces between the hyphens or asterisks."
671	static int S_scan_thematic_break(cmark_parser parser, cmark_chunk input,
672	bufsize_t offset) {
673	bufsize_t i;
674	char c;
675	char nextc = `'\0'`;
676	int count;
677	i = offset;
678	c = peek_at(input, i);
679	if (!(c == `'*'` \|\| c == `'_'` \|\| c == `'-'`)) {
680	parser->thematic_break_kill_pos = i;
681	return `0`;
682	}
683	count = `1`;
684	while ((nextc = peek_at(input, ++i))) {
685	if (nextc == c) {
686	count++;
687	} else if (nextc != `' '` && nextc != `'\t'`) {
688	break;
689	}
690	}
691	if (count >= `3` && (nextc == `'\r'` \|\| nextc == `'\n'`)) {
692	return (i - offset) + `1`;
693	} else {
694	parser->thematic_break_kill_pos = i;
695	return `0`;
696	}
697	}
698
699	// Find first nonspace character from current offset, setting
700	// parser->first_nonspace, parser->first_nonspace_column,
701	// parser->indent, and parser->blank. Does not advance parser->offset.
702	static void S_find_first_nonspace(cmark_parser parser, cmark_chunk input) {
703	char c;
704	int chars_to_tab = TAB_STOP - (parser->column % TAB_STOP);
705
706	if (parser->first_nonspace <= parser->offset) {
707	parser->first_nonspace = parser->offset;
708	parser->first_nonspace_column = parser->column;
709	while ((c = peek_at(input, parser->first_nonspace))) {
710	if (c == `' '`) {
711	parser->first_nonspace += `1`;
712	parser->first_nonspace_column += `1`;
713	chars_to_tab = chars_to_tab - `1`;
714	if (chars_to_tab == `0`) {
715	chars_to_tab = TAB_STOP;
716	}
717	} else if (c == `'\t'`) {
718	parser->first_nonspace += `1`;
719	parser->first_nonspace_column += chars_to_tab;
720	chars_to_tab = TAB_STOP;
721	} else {
722	break;
723	}
724	}
725	}
726
727	parser->indent = parser->first_nonspace_column - parser->column;
728	parser->blank = S_is_line_end_char(peek_at(input, parser->first_nonspace));
729	}
730
731	// Advance parser->offset and parser->column. parser->offset is the
732	// byte position in input; parser->column is a virtual column number
733	// that takes into account tabs. (Multibyte characters are not taken
734	// into account, because the Markdown line prefixes we are interested in
735	// analyzing are entirely ASCII.) The count parameter indicates
736	// how far to advance the offset. If columns is true, then count
737	// indicates a number of columns; otherwise, a number of bytes.
738	// If advancing a certain number of columns partially consumes
739	// a tab character, parser->partially_consumed_tab is set to true.
740	static void S_advance_offset(cmark_parser parser, cmark_chunk input,
741	bufsize_t count, bool columns) {
742	char c;
743	int chars_to_tab;
744	int chars_to_advance;
745	while (count > `0` && (c = peek_at(input, parser->offset))) {
746	if (c == `'\t'`) {
747	chars_to_tab = TAB_STOP - (parser->column % TAB_STOP);
748	if (columns) {
749	parser->partially_consumed_tab = chars_to_tab > count;
750	chars_to_advance = MIN(count, chars_to_tab);
751	parser->column += chars_to_advance;
752	parser->offset += (parser->partially_consumed_tab ? `0` : `1`);
753	count -= chars_to_advance;
754	} else {
755	parser->partially_consumed_tab = false;
756	parser->column += chars_to_tab;
757	parser->offset += `1`;
758	count -= `1`;
759	}
760	} else {
761	parser->partially_consumed_tab = false;
762	parser->offset += `1`;
763	parser->column += `1`; // assume ascii; block starts are ascii
764	count -= `1`;
765	}
766	}
767	}
768
769	static bool S_last_child_is_open(cmark_node *container) {
770	return container->last_child &&
771	(container->last_child->flags & CMARK_NODE__OPEN);
772	}
773
774	static bool parse_block_quote_prefix(cmark_parser parser, cmark_chunk input) {
775	bool res = false;
776	bufsize_t matched = `0`;
777
778	matched =
779	parser->indent <= `3` && peek_at(input, parser->first_nonspace) == `'>'`;
780	if (matched) {
781
782	S_advance_offset(parser, input, parser->indent + `1`, true);
783
784	if (S_is_space_or_tab(peek_at(input, parser->offset))) {
785	S_advance_offset(parser, input, `1`, true);
786	}
787
788	res = true;
789	}
790	return res;
791	}
792
793	static bool parse_node_item_prefix(cmark_parser parser, cmark_chunk input,
794	cmark_node *container) {
795	bool res = false;
796
797	if (parser->indent >=
798	container->as.list.marker_offset + container->as.list.padding) {
799	S_advance_offset(parser, input, container->as.list.marker_offset +
800	container->as.list.padding,
801	true);
802	res = true;
803	} else if (parser->blank && container->first_child != NULL) {
804	// if container->first_child is NULL, then the opening line
805	// of the list item was blank after the list marker; in this
806	// case, we are done with the list item.
807	S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
808	false);
809	res = true;
810	}
811	return res;
812	}
813
814	static bool parse_code_block_prefix(cmark_parser parser, cmark_chunk input,
815	cmark_node *container,
816	bool *should_continue) {
817	bool res = false;
818
819	if (!container->as.code.fenced) { // indented
820	if (parser->indent >= CODE_INDENT) {
821	S_advance_offset(parser, input, CODE_INDENT, true);
822	res = true;
823	} else if (parser->blank) {
824	S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
825	false);
826	res = true;
827	}
828	} else { // fenced
829	bufsize_t matched = `0`;
830
831	if (parser->indent <= `3` && (peek_at(input, parser->first_nonspace) ==
832	container->as.code.fence_char)) {
833	matched = scan_close_code_fence(input, parser->first_nonspace);
834	}
835
836	if (matched >= container->as.code.fence_length) {
837	// closing fence - and since we're at
838	// the end of a line, we can stop processing it:
839	*should_continue = false;
840	S_advance_offset(parser, input, matched, false);
841	parser->current = finalize(parser, container);
842	} else {
843	// skip opt. spaces of fence parser->offset
844	int i = container->as.code.fence_offset;
845
846	while (i > `0` && S_is_space_or_tab(peek_at(input, parser->offset))) {
847	S_advance_offset(parser, input, `1`, true);
848	i--;
849	}
850	res = true;
851	}
852	}
853
854	return res;
855	}
856
857	static bool parse_html_block_prefix(cmark_parser *parser,
858	cmark_node *container) {
859	bool res = false;
860	int html_block_type = container->as.html_block_type;
861
862	assert(html_block_type >= `1` && html_block_type <= `7`);
863	switch (html_block_type) {
864	case `1`:
865	case `2`:
866	case `3`:
867	case `4`:
868	case `5`:
869	// these types of blocks can accept blanks
870	res = true;
871	break;
872	case `6`:
873	case `7`:
874	res = !parser->blank;
875	break;
876	}
877
878	return res;
879	}
880
881	/**
882	* For each containing node, try to parse the associated line start.
883	*
884	* Will not close unmatched blocks, as we may have a lazy continuation
885	* line -> http://spec.commonmark.org/0.24/#lazy-continuation-line
886	*
887	* Returns: The last matching node, or NULL
888	*/
889	static cmark_node check_open_blocks(cmark_parser parser, cmark_chunk *input,
890	bool *all_matched) {
891	bool should_continue = true;
892	*all_matched = false;
893	cmark_node *container = parser->root;
894	cmark_node_type cont_type;
895
896	while (S_last_child_is_open(container)) {
897	container = container->last_child;
898	cont_type = S_type(container);
899
900	S_find_first_nonspace(parser, input);
901
902	switch (cont_type) {
903	case CMARK_NODE_BLOCK_QUOTE:
904	if (!parse_block_quote_prefix(parser, input))
905	goto done;
906	break;
907	case CMARK_NODE_ITEM:
908	if (!parse_node_item_prefix(parser, input, container))
909	goto done;
910	break;
911	case CMARK_NODE_CODE_BLOCK:
912	if (!parse_code_block_prefix(parser, input, container, &should_continue))
913	goto done;
914	break;
915	case CMARK_NODE_HEADING:
916	// a heading can never contain more than one line
917	goto done;
918	case CMARK_NODE_HTML_BLOCK:
919	if (!parse_html_block_prefix(parser, container))
920	goto done;
921	break;
922	case CMARK_NODE_PARAGRAPH:
923	if (parser->blank)
924	goto done;
925	break;
926	default:
927	break;
928	}
929	}
930
931	*all_matched = true;
932
933	done:
934	if (!*all_matched) {
935	container = container->parent; // back up to last matching node
936	}
937
938	if (!should_continue) {
939	container = NULL;
940	}
941
942	return container;
943	}
944
945	static void open_new_blocks(cmark_parser parser, cmark_node *container,
946	cmark_chunk *input, bool all_matched) {
947	bool indented;
948	cmark_list *data = NULL;
949	bool maybe_lazy = S_type(parser->current) == CMARK_NODE_PARAGRAPH;
950	cmark_node_type cont_type = S_type(*container);
951	bufsize_t matched = `0`;
952	int lev = `0`;
953	bool save_partially_consumed_tab;
954	bool has_content;
955	int save_offset;
956	int save_column;
957
958	while (cont_type != CMARK_NODE_CODE_BLOCK &&
959	cont_type != CMARK_NODE_HTML_BLOCK) {
960
961	S_find_first_nonspace(parser, input);
962	indented = parser->indent >= CODE_INDENT;
963
964	if (!indented && peek_at(input, parser->first_nonspace) == `'>'`) {
965
966	bufsize_t blockquote_startpos = parser->first_nonspace;
967
968	S_advance_offset(parser, input,
969	parser->first_nonspace + `1` - parser->offset, false);
970	// optional following character
971	if (S_is_space_or_tab(peek_at(input, parser->offset))) {
972	S_advance_offset(parser, input, `1`, true);
973	}
974	container = add_child(parser, container, CMARK_NODE_BLOCK_QUOTE,
975	blockquote_startpos + `1`);
976
977	} else if (!indented && (matched = scan_atx_heading_start(
978	input, parser->first_nonspace))) {
979	bufsize_t hashpos;
980	int level = `0`;
981	bufsize_t heading_startpos = parser->first_nonspace;
982
983	S_advance_offset(parser, input,
984	parser->first_nonspace + matched - parser->offset,
985	false);
986	container = add_child(parser, container, CMARK_NODE_HEADING,
987	heading_startpos + `1`);
988
989	hashpos = cmark_chunk_strchr(input, `'#'`, parser->first_nonspace);
990
991	while (peek_at(input, hashpos) == `'#'`) {
992	level++;
993	hashpos++;
994	}
995
996	(*container)->as.heading.level = level;
997	(*container)->as.heading.setext = false;
998	(*container)->internal_offset = matched;
999
1000	} else if (!indented && (matched = scan_open_code_fence(
1001	input, parser->first_nonspace))) {
1002	container = add_child(parser, container, CMARK_NODE_CODE_BLOCK,
1003	parser->first_nonspace + `1`);
1004	(*container)->as.code.fenced = true;
1005	(*container)->as.code.fence_char = peek_at(input, parser->first_nonspace);
1006	(*container)->as.code.fence_length = (matched > `255`) ? `255` : matched;
1007	(*container)->as.code.fence_offset =
1008	(int8_t)(parser->first_nonspace - parser->offset);
1009	(*container)->as.code.info = NULL;
1010	S_advance_offset(parser, input,
1011	parser->first_nonspace + matched - parser->offset,
1012	false);
1013
1014	} else if (!indented && ((matched = scan_html_block_start(
1015	input, parser->first_nonspace)) \|\|
1016	(cont_type != CMARK_NODE_PARAGRAPH &&
1017	!maybe_lazy &&
1018	(matched = scan_html_block_start_7(
1019	input, parser->first_nonspace))))) {
1020	container = add_child(parser, container, CMARK_NODE_HTML_BLOCK,
1021	parser->first_nonspace + `1`);
1022	(*container)->as.html_block_type = matched;
1023	// note, we don't adjust parser->offset because the tag is part of the
1024	// text
1025	} else if (!indented && cont_type == CMARK_NODE_PARAGRAPH &&
1026	(lev =
1027	scan_setext_heading_line(input, parser->first_nonspace))) {
1028	// finalize paragraph, resolving reference links
1029	has_content = resolve_reference_link_definitions(parser);
1030
1031	if (has_content) {
1032
1033	(*container)->type = (uint16_t)CMARK_NODE_HEADING;
1034	(*container)->as.heading.level = lev;
1035	(*container)->as.heading.setext = true;
1036	S_advance_offset(parser, input, input->len - `1` - parser->offset, false);
1037	}
1038	} else if (!indented &&
1039	!(cont_type == CMARK_NODE_PARAGRAPH && !all_matched) &&
1040	(parser->thematic_break_kill_pos <= parser->first_nonspace) &&
1041	(matched = S_scan_thematic_break(parser, input, parser->first_nonspace))) {
1042	// it's only now that we know the line is not part of a setext heading:
1043	container = add_child(parser, container, CMARK_NODE_THEMATIC_BREAK,
1044	parser->first_nonspace + `1`);
1045	S_advance_offset(parser, input, input->len - `1` - parser->offset, false);
1046	} else if ((!indented \|\| cont_type == CMARK_NODE_LIST) &&
1047	parser->indent < `4` &&
1048	(matched = parse_list_marker(
1049	parser->mem, input, parser->first_nonspace,
1050	(*container)->type == CMARK_NODE_PARAGRAPH, &data))) {
1051
1052	// Note that we can have new list items starting with >= 4
1053	// spaces indent, as long as the list container is still open.
1054	int i = `0`;
1055
1056	// compute padding:
1057	S_advance_offset(parser, input,
1058	parser->first_nonspace + matched - parser->offset,
1059	false);
1060
1061	save_partially_consumed_tab = parser->partially_consumed_tab;
1062	save_offset = parser->offset;
1063	save_column = parser->column;
1064
1065	while (parser->column - save_column <= `5` &&
1066	S_is_space_or_tab(peek_at(input, parser->offset))) {
1067	S_advance_offset(parser, input, `1`, true);
1068	}
1069
1070	i = parser->column - save_column;
1071	if (i >= `5` \|\| i < `1` \|\|
1072	// only spaces after list marker:
1073	S_is_line_end_char(peek_at(input, parser->offset))) {
1074	data->padding = matched + `1`;
1075	parser->offset = save_offset;
1076	parser->column = save_column;
1077	parser->partially_consumed_tab = save_partially_consumed_tab;
1078	if (i > `0`) {
1079	S_advance_offset(parser, input, `1`, true);
1080	}
1081	} else {
1082	data->padding = matched + i;
1083	}
1084
1085	// check container; if it's a list, see if this list item
1086	// can continue the list; otherwise, create a list container.
1087
1088	data->marker_offset = parser->indent;
1089
1090	if (cont_type != CMARK_NODE_LIST \|\|
1091	!lists_match(&((*container)->as.list), data)) {
1092	container = add_child(parser, container, CMARK_NODE_LIST,
1093	parser->first_nonspace + `1`);
1094
1095	memcpy(&((container)->as.list), data, sizeof(data));
1096	}
1097
1098	// add the list item
1099	container = add_child(parser, container, CMARK_NODE_ITEM,
1100	parser->first_nonspace + `1`);
1101	/ TODO: static /
1102	memcpy(&((container)->as.list), data, sizeof(data));
1103	parser->mem->free(data);
1104	} else if (indented && !maybe_lazy && !parser->blank) {
1105	S_advance_offset(parser, input, CODE_INDENT, true);
1106	container = add_child(parser, container, CMARK_NODE_CODE_BLOCK,
1107	parser->offset + `1`);
1108	(*container)->as.code.fenced = false;
1109	(*container)->as.code.fence_char = `0`;
1110	(*container)->as.code.fence_length = `0`;
1111	(*container)->as.code.fence_offset = `0`;
1112	(*container)->as.code.info = NULL;
1113
1114	} else {
1115	break;
1116	}
1117
1118	if (accepts_lines(S_type(*container))) {
1119	// if it's a line container, it can't contain other containers
1120	break;
1121	}
1122
1123	cont_type = S_type(*container);
1124	maybe_lazy = false;
1125	}
1126	}
1127
1128	static void add_text_to_container(cmark_parser parser, cmark_node container,
1129	cmark_node *last_matched_container,
1130	cmark_chunk *input) {
1131	cmark_node *tmp;
1132	// what remains at parser->offset is a text line. add the text to the
1133	// appropriate container.
1134
1135	S_find_first_nonspace(parser, input);
1136
1137	if (parser->blank && container->last_child)
1138	S_set_last_line_blank(container->last_child, true);
1139
1140	// block quote lines are never blank as they start with >
1141	// and we don't count blanks in fenced code for purposes of tight/loose
1142	// lists or breaking out of lists. we also don't set last_line_blank
1143	// on an empty list item.
1144	const cmark_node_type ctype = S_type(container);
1145	const bool last_line_blank =
1146	(parser->blank && ctype != CMARK_NODE_BLOCK_QUOTE &&
1147	ctype != CMARK_NODE_HEADING && ctype != CMARK_NODE_THEMATIC_BREAK &&
1148	!(ctype == CMARK_NODE_CODE_BLOCK && container->as.code.fenced) &&
1149	!(ctype == CMARK_NODE_ITEM && container->first_child == NULL &&
1150	container->start_line == parser->line_number));
1151
1152	S_set_last_line_blank(container, last_line_blank);
1153
1154	tmp = container;
1155	while (tmp->parent) {
1156	S_set_last_line_blank(tmp->parent, false);
1157	tmp = tmp->parent;
1158	}
1159
1160	// If the last line processed belonged to a paragraph node,
1161	// and we didn't match all of the line prefixes for the open containers,
1162	// and we didn't start any new containers,
1163	// and the line isn't blank,
1164	// then treat this as a "lazy continuation line" and add it to
1165	// the open paragraph.
1166	if (parser->current != last_matched_container &&
1167	container == last_matched_container && !parser->blank &&
1168	S_type(parser->current) == CMARK_NODE_PARAGRAPH) {
1169	add_line(input, parser);
1170	} else { // not a lazy continuation
1171	// Finalize any blocks that were not matched and set cur to container:
1172	while (parser->current != last_matched_container) {
1173	parser->current = finalize(parser, parser->current);
1174	assert(parser->current != NULL);
1175	}
1176
1177	if (S_type(container) == CMARK_NODE_CODE_BLOCK) {
1178	add_line(input, parser);
1179	} else if (S_type(container) == CMARK_NODE_HTML_BLOCK) {
1180	add_line(input, parser);
1181
1182	int matches_end_condition;
1183	switch (container->as.html_block_type) {
1184	case `1`:
1185	// </script>, </style>, </textarea>, </pre>
1186	matches_end_condition =
1187	scan_html_block_end_1(input, parser->first_nonspace);
1188	break;
1189	case `2`:
1190	// -->
1191	matches_end_condition =
1192	scan_html_block_end_2(input, parser->first_nonspace);
1193	break;
1194	case `3`:
1195	// ?>
1196	matches_end_condition =
1197	scan_html_block_end_3(input, parser->first_nonspace);
1198	break;
1199	case `4`:
1200	// >
1201	matches_end_condition =
1202	scan_html_block_end_4(input, parser->first_nonspace);
1203	break;
1204	case `5`:
1205	// ]]>
1206	matches_end_condition =
1207	scan_html_block_end_5(input, parser->first_nonspace);
1208	break;
1209	default:
1210	matches_end_condition = `0`;
1211	break;
1212	}
1213
1214	if (matches_end_condition) {
1215	container = finalize(parser, container);
1216	assert(parser->current != NULL);
1217	}
1218	} else if (parser->blank) {
1219	// ??? do nothing
1220	} else if (accepts_lines(S_type(container))) {
1221	if (S_type(container) == CMARK_NODE_HEADING &&
1222	container->as.heading.setext == false) {
1223	chop_trailing_hashtags(input);
1224	}
1225	S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
1226	false);
1227	add_line(input, parser);
1228	} else {
1229	// create paragraph container for line
1230	container = add_child(parser, container, CMARK_NODE_PARAGRAPH,
1231	parser->first_nonspace + `1`);
1232	S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
1233	false);
1234	add_line(input, parser);
1235	}
1236
1237	parser->current = container;
1238	}
1239	}
1240
1241	/ See http://spec.commonmark.org/0.24/#phase-1-block-structure /
1242	static void S_process_line(cmark_parser parser, const* unsigned char *buffer,
1243	bufsize_t bytes) {
1244	cmark_node *last_matched_container;
1245	bool all_matched = true;
1246	cmark_node *container;
1247	cmark_chunk input;
1248
1249	if (parser->options & CMARK_OPT_VALIDATE_UTF8)
1250	cmark_utf8proc_check(&parser->curline, buffer, bytes);
1251	else
1252	cmark_strbuf_put(&parser->curline, buffer, bytes);
1253
1254	bytes = parser->curline.size;
1255
1256	// ensure line ends with a newline:
1257	if (bytes == `0` \|\| !S_is_line_end_char(parser->curline.ptr[bytes - `1`]))
1258	cmark_strbuf_putc(&parser->curline, `'\n'`);
1259
1260	parser->offset = `0`;
1261	parser->column = `0`;
1262	parser->first_nonspace = `0`;
1263	parser->first_nonspace_column = `0`;
1264	parser->thematic_break_kill_pos = `0`;
1265	parser->indent = `0`;
1266	parser->blank = false;
1267	parser->partially_consumed_tab = false;
1268
1269	input.data = parser->curline.ptr;
1270	input.len = parser->curline.size;
1271
1272	parser->line_number++;
1273
1274	last_matched_container = check_open_blocks(parser, &input, &all_matched);
1275
1276	if (!last_matched_container)
1277	goto finished;
1278
1279	container = last_matched_container;
1280
1281	open_new_blocks(parser, &container, &input, all_matched);
1282
1283	add_text_to_container(parser, container, last_matched_container, &input);
1284
1285	finished:
1286	parser->last_line_length = input.len;
1287	if (parser->last_line_length &&
1288	input.data[parser->last_line_length - `1`] == `'\n'`)
1289	parser->last_line_length -= `1`;
1290	if (parser->last_line_length &&
1291	input.data[parser->last_line_length - `1`] == `'\r'`)
1292	parser->last_line_length -= `1`;
1293
1294	cmark_strbuf_clear(&parser->curline);
1295	}
1296
1297	cmark_node cmark_parser_finish(cmark_parser parser) {
1298	if (parser->linebuf.size) {
1299	S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size);
1300	cmark_strbuf_clear(&parser->linebuf);
1301	}
1302
1303	finalize_document(parser);
1304
1305	cmark_consolidate_text_nodes(parser->root);
1306
1307	cmark_strbuf_free(&parser->curline);
1308
1309	#if CMARK_DEBUG_NODES
1310	if (cmark_node_check(parser->root, stderr)) {
1311	abort();
1312	}
1313	#endif
1314	return parser->root;
1315	}
1316

Browse the source code of Aseprite/third_party/cmark/src/blocks.c