1 | #include <stdlib.h> |
2 | #include <string.h> |
3 | #include <stdio.h> |
4 | |
5 | #include "cmark_ctype.h" |
6 | #include "config.h" |
7 | #include "node.h" |
8 | #include "parser.h" |
9 | #include "references.h" |
10 | #include "cmark.h" |
11 | #include "houdini.h" |
12 | #include "utf8.h" |
13 | #include "scanners.h" |
14 | #include "inlines.h" |
15 | |
// UTF-8 byte sequences of the typographic characters that stand in for plain
// ASCII punctuation when "smart" punctuation is requested.
static const char *EMDASH = "\xE2\x80\x94";           // U+2014
static const char *ENDASH = "\xE2\x80\x93";           // U+2013
static const char *ELLIPSES = "\xE2\x80\xA6";         // U+2026
static const char *LEFTDOUBLEQUOTE = "\xE2\x80\x9C";  // U+201C
static const char *RIGHTDOUBLEQUOTE = "\xE2\x80\x9D"; // U+201D
static const char *LEFTSINGLEQUOTE = "\xE2\x80\x98";  // U+2018
static const char *RIGHTSINGLEQUOTE = "\xE2\x80\x99"; // U+2019

// Shorthand constructors for inline nodes that carry no literal value; each
// one forwards to make_simple() with the matching node type.
#define make_linebreak(mem) make_simple((mem), CMARK_NODE_LINEBREAK)
#define make_softbreak(mem) make_simple((mem), CMARK_NODE_SOFTBREAK)
#define make_emph(mem) make_simple((mem), CMARK_NODE_EMPH)
#define make_strong(mem) make_simple((mem), CMARK_NODE_STRONG)

// Longest backtick run whose position is remembered in subject.backticks.
// An opening run longer than this is never matched as a code span.
#define MAXBACKTICKS 1000
31 | |
32 | typedef struct delimiter { |
33 | struct delimiter *previous; |
34 | struct delimiter *next; |
35 | cmark_node *inl_text; |
36 | bufsize_t length; |
37 | unsigned char delim_char; |
38 | bool can_open; |
39 | bool can_close; |
40 | } delimiter; |
41 | |
42 | typedef struct bracket { |
43 | struct bracket *previous; |
44 | struct delimiter *previous_delimiter; |
45 | cmark_node *inl_text; |
46 | bufsize_t position; |
47 | bool image; |
48 | bool active; |
49 | bool bracket_after; |
50 | } bracket; |
51 | |
// Bits stored in subject.flags. Each bit is raised after a scan for the
// corresponding raw-HTML construct ran off the end of the input, so that the
// same scan is not attempted again (this avoids quadratic behavior).
#define FLAG_SKIP_HTML_CDATA (1u << 0)
#define FLAG_SKIP_HTML_DECLARATION (1u << 1)
#define FLAG_SKIP_HTML_PI (1u << 2)
55 | |
56 | typedef struct { |
57 | cmark_mem *mem; |
58 | cmark_chunk input; |
59 | unsigned flags; |
60 | int line; |
61 | bufsize_t pos; |
62 | int block_offset; |
63 | int column_offset; |
64 | cmark_reference_map *refmap; |
65 | delimiter *last_delim; |
66 | bracket *last_bracket; |
67 | bufsize_t backticks[MAXBACKTICKS + 1]; |
68 | bool scanned_for_backticks; |
69 | } subject; |
70 | |
71 | static CMARK_INLINE bool S_is_line_end_char(char c) { |
72 | return (c == '\n' || c == '\r'); |
73 | } |
74 | |
75 | static delimiter *S_insert_emph(subject *subj, delimiter *opener, |
76 | delimiter *closer); |
77 | |
78 | static int parse_inline(subject *subj, cmark_node *parent, int options); |
79 | |
80 | static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e, |
81 | cmark_chunk *chunk, cmark_reference_map *refmap); |
82 | static bufsize_t subject_find_special_char(subject *subj, int options); |
83 | |
84 | // Create an inline with a literal string value. |
85 | static CMARK_INLINE cmark_node *make_literal(subject *subj, cmark_node_type t, |
86 | int start_column, int end_column) { |
87 | cmark_node *e = (cmark_node *)subj->mem->calloc(1, sizeof(*e)); |
88 | e->mem = subj->mem; |
89 | e->type = (uint16_t)t; |
90 | e->start_line = e->end_line = subj->line; |
91 | // columns are 1 based. |
92 | e->start_column = start_column + 1 + subj->column_offset + subj->block_offset; |
93 | e->end_column = end_column + 1 + subj->column_offset + subj->block_offset; |
94 | return e; |
95 | } |
96 | |
97 | // Create an inline with no value. |
98 | static CMARK_INLINE cmark_node *make_simple(cmark_mem *mem, cmark_node_type t) { |
99 | cmark_node *e = (cmark_node *)mem->calloc(1, sizeof(*e)); |
100 | e->mem = mem; |
101 | e->type = t; |
102 | return e; |
103 | } |
104 | |
105 | static cmark_node *make_str(subject *subj, int sc, int ec, cmark_chunk s) { |
106 | cmark_node *e = make_literal(subj, CMARK_NODE_TEXT, sc, ec); |
107 | e->data = (unsigned char *)subj->mem->realloc(NULL, s.len + 1); |
108 | if (s.data != NULL) { |
109 | memcpy(e->data, s.data, s.len); |
110 | } |
111 | e->data[s.len] = 0; |
112 | e->len = s.len; |
113 | return e; |
114 | } |
115 | |
116 | static cmark_node *make_str_from_buf(subject *subj, int sc, int ec, |
117 | cmark_strbuf *buf) { |
118 | cmark_node *e = make_literal(subj, CMARK_NODE_TEXT, sc, ec); |
119 | e->len = buf->size; |
120 | e->data = cmark_strbuf_detach(buf); |
121 | return e; |
122 | } |
123 | |
124 | // Like make_str, but parses entities. |
125 | static cmark_node *make_str_with_entities(subject *subj, |
126 | int start_column, int end_column, |
127 | cmark_chunk *content) { |
128 | cmark_strbuf unescaped = CMARK_BUF_INIT(subj->mem); |
129 | |
130 | if (houdini_unescape_html(&unescaped, content->data, content->len)) { |
131 | return make_str_from_buf(subj, start_column, end_column, &unescaped); |
132 | } else { |
133 | return make_str(subj, start_column, end_column, *content); |
134 | } |
135 | } |
136 | |
137 | // Like cmark_node_append_child but without costly sanity checks. |
138 | // Assumes that child was newly created. |
139 | static void append_child(cmark_node *node, cmark_node *child) { |
140 | cmark_node *old_last_child = node->last_child; |
141 | |
142 | child->next = NULL; |
143 | child->prev = old_last_child; |
144 | child->parent = node; |
145 | node->last_child = child; |
146 | |
147 | if (old_last_child) { |
148 | old_last_child->next = child; |
149 | } else { |
150 | // Also set first_child if node previously had no children. |
151 | node->first_child = child; |
152 | } |
153 | } |
154 | |
155 | // Duplicate a chunk by creating a copy of the buffer not by reusing the |
156 | // buffer like cmark_chunk_dup does. |
157 | static unsigned char *cmark_strdup(cmark_mem *mem, unsigned char *src) { |
158 | if (src == NULL) { |
159 | return NULL; |
160 | } |
161 | size_t len = strlen((char *)src); |
162 | unsigned char *data = (unsigned char *)mem->realloc(NULL, len + 1); |
163 | memcpy(data, src, len + 1); |
164 | return data; |
165 | } |
166 | |
167 | static unsigned char *cmark_clean_autolink(cmark_mem *mem, cmark_chunk *url, |
168 | int is_email) { |
169 | cmark_strbuf buf = CMARK_BUF_INIT(mem); |
170 | |
171 | cmark_chunk_trim(url); |
172 | |
173 | if (is_email) |
174 | cmark_strbuf_puts(&buf, "mailto:" ); |
175 | |
176 | houdini_unescape_html_f(&buf, url->data, url->len); |
177 | return cmark_strbuf_detach(&buf); |
178 | } |
179 | |
180 | static CMARK_INLINE cmark_node *make_autolink(subject *subj, |
181 | int start_column, int end_column, |
182 | cmark_chunk url, int is_email) { |
183 | cmark_node *link = make_simple(subj->mem, CMARK_NODE_LINK); |
184 | link->as.link.url = cmark_clean_autolink(subj->mem, &url, is_email); |
185 | link->as.link.title = NULL; |
186 | link->start_line = link->end_line = subj->line; |
187 | link->start_column = start_column + 1; |
188 | link->end_column = end_column + 1; |
189 | append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url)); |
190 | return link; |
191 | } |
192 | |
193 | static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e, |
194 | cmark_chunk *chunk, cmark_reference_map *refmap) { |
195 | int i; |
196 | e->mem = mem; |
197 | e->input = *chunk; |
198 | e->flags = 0; |
199 | e->line = line_number; |
200 | e->pos = 0; |
201 | e->block_offset = block_offset; |
202 | e->column_offset = 0; |
203 | e->refmap = refmap; |
204 | e->last_delim = NULL; |
205 | e->last_bracket = NULL; |
206 | for (i = 0; i <= MAXBACKTICKS; i++) { |
207 | e->backticks[i] = 0; |
208 | } |
209 | e->scanned_for_backticks = false; |
210 | } |
211 | |
212 | static CMARK_INLINE int isbacktick(int c) { return (c == '`'); } |
213 | |
214 | static CMARK_INLINE unsigned char peek_char(subject *subj) { |
215 | // NULL bytes should have been stripped out by now. If they're |
216 | // present, it's a programming error: |
217 | assert(!(subj->pos < subj->input.len && subj->input.data[subj->pos] == 0)); |
218 | return (subj->pos < subj->input.len) ? subj->input.data[subj->pos] : 0; |
219 | } |
220 | |
221 | static CMARK_INLINE unsigned char peek_at(subject *subj, bufsize_t pos) { |
222 | return subj->input.data[pos]; |
223 | } |
224 | |
225 | // Return true if there are more characters in the subject. |
226 | static CMARK_INLINE int is_eof(subject *subj) { |
227 | return (subj->pos >= subj->input.len); |
228 | } |
229 | |
// Advance the subject by one byte. Doesn't check for eof.
// The replacement list is fully parenthesized so the macro acts as a single
// expression wherever it is used; `advance(subj);` as a statement is
// unaffected.
#define advance(subj) ((subj)->pos += 1)
232 | |
233 | static CMARK_INLINE bool skip_spaces(subject *subj) { |
234 | bool skipped = false; |
235 | while (peek_char(subj) == ' ' || peek_char(subj) == '\t') { |
236 | advance(subj); |
237 | skipped = true; |
238 | } |
239 | return skipped; |
240 | } |
241 | |
242 | static CMARK_INLINE bool skip_line_end(subject *subj) { |
243 | bool seen_line_end_char = false; |
244 | if (peek_char(subj) == '\r') { |
245 | advance(subj); |
246 | seen_line_end_char = true; |
247 | } |
248 | if (peek_char(subj) == '\n') { |
249 | advance(subj); |
250 | seen_line_end_char = true; |
251 | } |
252 | return seen_line_end_char || is_eof(subj); |
253 | } |
254 | |
255 | // Take characters while a predicate holds, and return a string. |
256 | static CMARK_INLINE cmark_chunk take_while(subject *subj, int (*f)(int)) { |
257 | unsigned char c; |
258 | bufsize_t startpos = subj->pos; |
259 | bufsize_t len = 0; |
260 | |
261 | while ((c = peek_char(subj)) && (*f)(c)) { |
262 | advance(subj); |
263 | len++; |
264 | } |
265 | |
266 | return cmark_chunk_dup(&subj->input, startpos, len); |
267 | } |
268 | |
269 | // Return the number of newlines in a given span of text in a subject. If |
270 | // the number is greater than zero, also return the number of characters |
271 | // between the last newline and the end of the span in `since_newline`. |
272 | static int count_newlines(subject *subj, bufsize_t from, bufsize_t len, int *since_newline) { |
273 | int nls = 0; |
274 | int since_nl = 0; |
275 | |
276 | while (len--) { |
277 | if (subj->input.data[from++] == '\n') { |
278 | ++nls; |
279 | since_nl = 0; |
280 | } else { |
281 | ++since_nl; |
282 | } |
283 | } |
284 | |
285 | if (!nls) |
286 | return 0; |
287 | |
288 | *since_newline = since_nl; |
289 | return nls; |
290 | } |
291 | |
292 | // Adjust `node`'s `end_line`, `end_column`, and `subj`'s `line` and |
293 | // `column_offset` according to the number of newlines in a just-matched span |
294 | // of text in `subj`. |
295 | static void adjust_subj_node_newlines(subject *subj, cmark_node *node, int matchlen, int , int options) { |
296 | if (!(options & CMARK_OPT_SOURCEPOS)) { |
297 | return; |
298 | } |
299 | |
300 | int since_newline; |
301 | int newlines = count_newlines(subj, subj->pos - matchlen - extra, matchlen, &since_newline); |
302 | if (newlines) { |
303 | subj->line += newlines; |
304 | node->end_line += newlines; |
305 | node->end_column = since_newline; |
306 | subj->column_offset = -subj->pos + since_newline + extra; |
307 | } |
308 | } |
309 | |
310 | // Try to process a backtick code span that began with a |
311 | // span of ticks of length openticklength length (already |
312 | // parsed). Return 0 if you don't find matching closing |
313 | // backticks, otherwise return the position in the subject |
314 | // after the closing backticks. |
315 | static bufsize_t scan_to_closing_backticks(subject *subj, |
316 | bufsize_t openticklength) { |
317 | |
318 | bool found = false; |
319 | if (openticklength > MAXBACKTICKS) { |
320 | // we limit backtick string length because of the array subj->backticks: |
321 | return 0; |
322 | } |
323 | if (subj->scanned_for_backticks && |
324 | subj->backticks[openticklength] <= subj->pos) { |
325 | // return if we already know there's no closer |
326 | return 0; |
327 | } |
328 | while (!found) { |
329 | // read non backticks |
330 | unsigned char c; |
331 | while ((c = peek_char(subj)) && c != '`') { |
332 | advance(subj); |
333 | } |
334 | if (is_eof(subj)) { |
335 | break; |
336 | } |
337 | bufsize_t numticks = 0; |
338 | while (peek_char(subj) == '`') { |
339 | advance(subj); |
340 | numticks++; |
341 | } |
342 | // store position of ender |
343 | if (numticks <= MAXBACKTICKS) { |
344 | subj->backticks[numticks] = subj->pos - numticks; |
345 | } |
346 | if (numticks == openticklength) { |
347 | return (subj->pos); |
348 | } |
349 | } |
350 | // got through whole input without finding closer |
351 | subj->scanned_for_backticks = true; |
352 | return 0; |
353 | } |
354 | |
355 | // Destructively modify string, converting newlines to |
356 | // spaces, then removing a single leading + trailing space, |
357 | // unless the code span consists entirely of space characters. |
358 | static void S_normalize_code(cmark_strbuf *s) { |
359 | bufsize_t r, w; |
360 | bool contains_nonspace = false; |
361 | |
362 | for (r = 0, w = 0; r < s->size; ++r) { |
363 | switch (s->ptr[r]) { |
364 | case '\r': |
365 | if (s->ptr[r + 1] != '\n') { |
366 | s->ptr[w++] = ' '; |
367 | } |
368 | break; |
369 | case '\n': |
370 | s->ptr[w++] = ' '; |
371 | break; |
372 | default: |
373 | s->ptr[w++] = s->ptr[r]; |
374 | } |
375 | if (s->ptr[r] != ' ') { |
376 | contains_nonspace = true; |
377 | } |
378 | } |
379 | |
380 | // begins and ends with space? |
381 | if (contains_nonspace && |
382 | s->ptr[0] == ' ' && s->ptr[w - 1] == ' ') { |
383 | cmark_strbuf_drop(s, 1); |
384 | cmark_strbuf_truncate(s, w - 2); |
385 | } else { |
386 | cmark_strbuf_truncate(s, w); |
387 | } |
388 | |
389 | } |
390 | |
391 | |
392 | // Parse backtick code section or raw backticks, return an inline. |
393 | // Assumes that the subject has a backtick at the current position. |
394 | static cmark_node *handle_backticks(subject *subj, int options) { |
395 | cmark_chunk openticks = take_while(subj, isbacktick); |
396 | bufsize_t startpos = subj->pos; |
397 | bufsize_t endpos = scan_to_closing_backticks(subj, openticks.len); |
398 | |
399 | if (endpos == 0) { // not found |
400 | subj->pos = startpos; // rewind |
401 | return make_str(subj, subj->pos, subj->pos, openticks); |
402 | } else { |
403 | cmark_strbuf buf = CMARK_BUF_INIT(subj->mem); |
404 | |
405 | cmark_strbuf_set(&buf, subj->input.data + startpos, |
406 | endpos - startpos - openticks.len); |
407 | S_normalize_code(&buf); |
408 | |
409 | cmark_node *node = make_literal(subj, CMARK_NODE_CODE, startpos, |
410 | endpos - openticks.len - 1); |
411 | node->len = buf.size; |
412 | node->data = cmark_strbuf_detach(&buf); |
413 | adjust_subj_node_newlines(subj, node, endpos - startpos, openticks.len, options); |
414 | return node; |
415 | } |
416 | } |
417 | |
418 | |
419 | // Scan ***, **, or * and return number scanned, or 0. |
420 | // Advances position. |
421 | static int scan_delims(subject *subj, unsigned char c, bool *can_open, |
422 | bool *can_close) { |
423 | int numdelims = 0; |
424 | bufsize_t before_char_pos; |
425 | int32_t after_char = 0; |
426 | int32_t before_char = 0; |
427 | int len; |
428 | bool left_flanking, right_flanking; |
429 | |
430 | if (subj->pos == 0) { |
431 | before_char = 10; |
432 | } else { |
433 | before_char_pos = subj->pos - 1; |
434 | // walk back to the beginning of the UTF_8 sequence: |
435 | while (peek_at(subj, before_char_pos) >> 6 == 2 && before_char_pos > 0) { |
436 | before_char_pos -= 1; |
437 | } |
438 | len = cmark_utf8proc_iterate(subj->input.data + before_char_pos, |
439 | subj->pos - before_char_pos, &before_char); |
440 | if (len == -1) { |
441 | before_char = 10; |
442 | } |
443 | } |
444 | |
445 | if (c == '\'' || c == '"') { |
446 | numdelims++; |
447 | advance(subj); // limit to 1 delim for quotes |
448 | } else { |
449 | while (peek_char(subj) == c) { |
450 | numdelims++; |
451 | advance(subj); |
452 | } |
453 | } |
454 | |
455 | len = cmark_utf8proc_iterate(subj->input.data + subj->pos, |
456 | subj->input.len - subj->pos, &after_char); |
457 | if (len == -1) { |
458 | after_char = 10; |
459 | } |
460 | left_flanking = numdelims > 0 && !cmark_utf8proc_is_space(after_char) && |
461 | (!cmark_utf8proc_is_punctuation(after_char) || |
462 | cmark_utf8proc_is_space(before_char) || |
463 | cmark_utf8proc_is_punctuation(before_char)); |
464 | right_flanking = numdelims > 0 && !cmark_utf8proc_is_space(before_char) && |
465 | (!cmark_utf8proc_is_punctuation(before_char) || |
466 | cmark_utf8proc_is_space(after_char) || |
467 | cmark_utf8proc_is_punctuation(after_char)); |
468 | if (c == '_') { |
469 | *can_open = left_flanking && |
470 | (!right_flanking || cmark_utf8proc_is_punctuation(before_char)); |
471 | *can_close = right_flanking && |
472 | (!left_flanking || cmark_utf8proc_is_punctuation(after_char)); |
473 | } else if (c == '\'' || c == '"') { |
474 | *can_open = left_flanking && |
475 | (!right_flanking || before_char == '(' || before_char == '[') && |
476 | before_char != ']' && before_char != ')'; |
477 | *can_close = right_flanking; |
478 | } else { |
479 | *can_open = left_flanking; |
480 | *can_close = right_flanking; |
481 | } |
482 | return numdelims; |
483 | } |
484 | |
485 | /* |
486 | static void print_delimiters(subject *subj) |
487 | { |
488 | delimiter *delim; |
489 | delim = subj->last_delim; |
490 | while (delim != NULL) { |
491 | printf("Item at stack pos %p: %d %d %d next(%p) prev(%p)\n", |
492 | (void*)delim, delim->delim_char, |
493 | delim->can_open, delim->can_close, |
494 | (void*)delim->next, (void*)delim->previous); |
495 | delim = delim->previous; |
496 | } |
497 | } |
498 | */ |
499 | |
500 | static void remove_delimiter(subject *subj, delimiter *delim) { |
501 | if (delim == NULL) |
502 | return; |
503 | if (delim->next == NULL) { |
504 | // end of list: |
505 | assert(delim == subj->last_delim); |
506 | subj->last_delim = delim->previous; |
507 | } else { |
508 | delim->next->previous = delim->previous; |
509 | } |
510 | if (delim->previous != NULL) { |
511 | delim->previous->next = delim->next; |
512 | } |
513 | subj->mem->free(delim); |
514 | } |
515 | |
516 | static void pop_bracket(subject *subj) { |
517 | bracket *b; |
518 | if (subj->last_bracket == NULL) |
519 | return; |
520 | b = subj->last_bracket; |
521 | subj->last_bracket = subj->last_bracket->previous; |
522 | subj->mem->free(b); |
523 | } |
524 | |
525 | static void push_delimiter(subject *subj, unsigned char c, bool can_open, |
526 | bool can_close, cmark_node *inl_text) { |
527 | delimiter *delim = (delimiter *)subj->mem->calloc(1, sizeof(delimiter)); |
528 | delim->delim_char = c; |
529 | delim->can_open = can_open; |
530 | delim->can_close = can_close; |
531 | delim->inl_text = inl_text; |
532 | delim->length = inl_text->len; |
533 | delim->previous = subj->last_delim; |
534 | delim->next = NULL; |
535 | if (delim->previous != NULL) { |
536 | delim->previous->next = delim; |
537 | } |
538 | subj->last_delim = delim; |
539 | } |
540 | |
541 | static void push_bracket(subject *subj, bool image, cmark_node *inl_text) { |
542 | bracket *b = (bracket *)subj->mem->calloc(1, sizeof(bracket)); |
543 | if (subj->last_bracket != NULL) { |
544 | subj->last_bracket->bracket_after = true; |
545 | } |
546 | b->image = image; |
547 | b->active = true; |
548 | b->inl_text = inl_text; |
549 | b->previous = subj->last_bracket; |
550 | b->previous_delimiter = subj->last_delim; |
551 | b->position = subj->pos; |
552 | b->bracket_after = false; |
553 | subj->last_bracket = b; |
554 | } |
555 | |
556 | // Assumes the subject has a c at the current position. |
557 | static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) { |
558 | bufsize_t numdelims; |
559 | cmark_node *inl_text; |
560 | bool can_open, can_close; |
561 | cmark_chunk contents; |
562 | |
563 | numdelims = scan_delims(subj, c, &can_open, &can_close); |
564 | |
565 | if (c == '\'' && smart) { |
566 | contents = cmark_chunk_literal(RIGHTSINGLEQUOTE); |
567 | } else if (c == '"' && smart) { |
568 | contents = |
569 | cmark_chunk_literal(can_close ? RIGHTDOUBLEQUOTE : LEFTDOUBLEQUOTE); |
570 | } else { |
571 | contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims); |
572 | } |
573 | |
574 | inl_text = make_str(subj, subj->pos - numdelims, subj->pos - 1, contents); |
575 | |
576 | if ((can_open || can_close) && (!(c == '\'' || c == '"') || smart)) { |
577 | push_delimiter(subj, c, can_open, can_close, inl_text); |
578 | } |
579 | |
580 | return inl_text; |
581 | } |
582 | |
583 | // Assumes we have a hyphen at the current position. |
584 | static cmark_node *handle_hyphen(subject *subj, bool smart) { |
585 | int startpos = subj->pos; |
586 | |
587 | advance(subj); |
588 | |
589 | if (!smart || peek_char(subj) != '-') { |
590 | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("-" )); |
591 | } |
592 | |
593 | while (smart && peek_char(subj) == '-') { |
594 | advance(subj); |
595 | } |
596 | |
597 | int numhyphens = subj->pos - startpos; |
598 | int en_count = 0; |
599 | int em_count = 0; |
600 | int i; |
601 | cmark_strbuf buf = CMARK_BUF_INIT(subj->mem); |
602 | |
603 | if (numhyphens % 3 == 0) { // if divisible by 3, use all em dashes |
604 | em_count = numhyphens / 3; |
605 | } else if (numhyphens % 2 == 0) { // if divisible by 2, use all en dashes |
606 | en_count = numhyphens / 2; |
607 | } else if (numhyphens % 3 == 2) { // use one en dash at end |
608 | en_count = 1; |
609 | em_count = (numhyphens - 2) / 3; |
610 | } else { // use two en dashes at the end |
611 | en_count = 2; |
612 | em_count = (numhyphens - 4) / 3; |
613 | } |
614 | |
615 | for (i = em_count; i > 0; i--) { |
616 | cmark_strbuf_puts(&buf, EMDASH); |
617 | } |
618 | |
619 | for (i = en_count; i > 0; i--) { |
620 | cmark_strbuf_puts(&buf, ENDASH); |
621 | } |
622 | |
623 | return make_str_from_buf(subj, startpos, subj->pos - 1, &buf); |
624 | } |
625 | |
626 | // Assumes we have a period at the current position. |
627 | static cmark_node *handle_period(subject *subj, bool smart) { |
628 | advance(subj); |
629 | if (smart && peek_char(subj) == '.') { |
630 | advance(subj); |
631 | if (peek_char(subj) == '.') { |
632 | advance(subj); |
633 | return make_str(subj, subj->pos - 3, subj->pos - 1, cmark_chunk_literal(ELLIPSES)); |
634 | } else { |
635 | return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal(".." )); |
636 | } |
637 | } else { |
638 | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("." )); |
639 | } |
640 | } |
641 | |
642 | static void process_emphasis(subject *subj, delimiter *stack_bottom) { |
643 | delimiter *closer = subj->last_delim; |
644 | delimiter *opener; |
645 | delimiter *old_closer; |
646 | delimiter *new_stack_bottom; |
647 | bool opener_found; |
648 | |
649 | int openers_bottom_index = 0; |
650 | delimiter *openers_bottom[6] = {stack_bottom, stack_bottom, stack_bottom, |
651 | stack_bottom}; |
652 | |
653 | // move back to first relevant delim. |
654 | while (closer != NULL && closer->previous != stack_bottom) { |
655 | closer = closer->previous; |
656 | } |
657 | |
658 | // now move forward, looking for closers, and handling each |
659 | while (closer != NULL) { |
660 | if (closer->can_close) { |
661 | switch (closer->delim_char) { |
662 | case '"': |
663 | openers_bottom_index = 0; |
664 | break; |
665 | case '\'': |
666 | openers_bottom_index = 1; |
667 | break; |
668 | case '_': |
669 | openers_bottom_index = 2; |
670 | break; |
671 | case '*': |
672 | openers_bottom_index = 3; |
673 | break; |
674 | default: |
675 | assert(false); |
676 | } |
677 | |
678 | // Now look backwards for first matching opener: |
679 | opener = closer->previous; |
680 | opener_found = false; |
681 | new_stack_bottom = closer->previous; |
682 | |
683 | while (opener != NULL && opener != openers_bottom[openers_bottom_index]) { |
684 | if (opener->can_open && opener->delim_char == closer->delim_char) { |
685 | // interior closer of size 2 can't match opener of size 1 |
686 | // or of size 1 can't match 2 |
687 | if (!(closer->can_open || opener->can_close) || |
688 | closer->length % 3 == 0 || |
689 | (opener->length + closer->length) % 3 != 0) { |
690 | opener_found = true; |
691 | break; |
692 | } else { |
693 | // If we failed to match because of the mod-3 rule, |
694 | // then we want to make sure the stack bottom extends |
695 | // back to here at least, since a later closer might |
696 | // match this same opener... (see #383) |
697 | new_stack_bottom = opener->previous; |
698 | } |
699 | } |
700 | opener = opener->previous; |
701 | } |
702 | old_closer = closer; |
703 | if (closer->delim_char == '*' || closer->delim_char == '_') { |
704 | if (opener_found) { |
705 | closer = S_insert_emph(subj, opener, closer); |
706 | } else { |
707 | closer = closer->next; |
708 | } |
709 | } else if (closer->delim_char == '\'') { |
710 | cmark_node_set_literal(closer->inl_text, RIGHTSINGLEQUOTE); |
711 | if (opener_found) { |
712 | cmark_node_set_literal(opener->inl_text, LEFTSINGLEQUOTE); |
713 | } |
714 | closer = closer->next; |
715 | } else if (closer->delim_char == '"') { |
716 | cmark_node_set_literal(closer->inl_text, RIGHTDOUBLEQUOTE); |
717 | if (opener_found) { |
718 | cmark_node_set_literal(opener->inl_text, LEFTDOUBLEQUOTE); |
719 | } |
720 | closer = closer->next; |
721 | } |
722 | if (!opener_found) { |
723 | // set lower bound for future searches for openers (see #383). |
724 | openers_bottom[openers_bottom_index] = new_stack_bottom; |
725 | if (!old_closer->can_open) { |
726 | // we can remove a closer that can't be an |
727 | // opener, once we've seen there's no |
728 | // matching opener: |
729 | remove_delimiter(subj, old_closer); |
730 | } |
731 | } |
732 | } else { |
733 | closer = closer->next; |
734 | } |
735 | } |
736 | // free all delimiters in list until stack_bottom: |
737 | while (subj->last_delim != NULL && subj->last_delim != stack_bottom) { |
738 | remove_delimiter(subj, subj->last_delim); |
739 | } |
740 | } |
741 | |
742 | static delimiter *S_insert_emph(subject *subj, delimiter *opener, |
743 | delimiter *closer) { |
744 | delimiter *delim, *tmp_delim; |
745 | bufsize_t use_delims; |
746 | cmark_node *opener_inl = opener->inl_text; |
747 | cmark_node *closer_inl = closer->inl_text; |
748 | bufsize_t opener_num_chars = opener_inl->len; |
749 | bufsize_t closer_num_chars = closer_inl->len; |
750 | cmark_node *tmp, *tmpnext, *emph; |
751 | |
752 | // calculate the actual number of characters used from this closer |
753 | use_delims = (closer_num_chars >= 2 && opener_num_chars >= 2) ? 2 : 1; |
754 | |
755 | // remove used characters from associated inlines. |
756 | opener_num_chars -= use_delims; |
757 | closer_num_chars -= use_delims; |
758 | opener_inl->len = opener_num_chars; |
759 | opener_inl->data[opener_num_chars] = 0; |
760 | closer_inl->len = closer_num_chars; |
761 | closer_inl->data[closer_num_chars] = 0; |
762 | |
763 | // free delimiters between opener and closer |
764 | delim = closer->previous; |
765 | while (delim != NULL && delim != opener) { |
766 | tmp_delim = delim->previous; |
767 | remove_delimiter(subj, delim); |
768 | delim = tmp_delim; |
769 | } |
770 | |
771 | // create new emph or strong, and splice it in to our inlines |
772 | // between the opener and closer |
773 | emph = use_delims == 1 ? make_emph(subj->mem) : make_strong(subj->mem); |
774 | |
775 | tmp = opener_inl->next; |
776 | while (tmp && tmp != closer_inl) { |
777 | tmpnext = tmp->next; |
778 | cmark_node_unlink(tmp); |
779 | append_child(emph, tmp); |
780 | tmp = tmpnext; |
781 | } |
782 | cmark_node_insert_after(opener_inl, emph); |
783 | |
784 | emph->start_line = opener_inl->start_line; |
785 | emph->end_line = closer_inl->end_line; |
786 | emph->start_column = opener_inl->start_column; |
787 | emph->end_column = closer_inl->end_column; |
788 | |
789 | // if opener has 0 characters, remove it and its associated inline |
790 | if (opener_num_chars == 0) { |
791 | cmark_node_free(opener_inl); |
792 | remove_delimiter(subj, opener); |
793 | } |
794 | |
795 | // if closer has 0 characters, remove it and its associated inline |
796 | if (closer_num_chars == 0) { |
797 | // remove empty closer inline |
798 | cmark_node_free(closer_inl); |
799 | // remove closer from list |
800 | tmp_delim = closer->next; |
801 | remove_delimiter(subj, closer); |
802 | closer = tmp_delim; |
803 | } |
804 | |
805 | return closer; |
806 | } |
807 | |
808 | // Parse backslash-escape or just a backslash, returning an inline. |
809 | static cmark_node *handle_backslash(subject *subj) { |
810 | advance(subj); |
811 | unsigned char nextchar = peek_char(subj); |
812 | if (cmark_ispunct( |
813 | nextchar)) { // only ascii symbols and newline can be escaped |
814 | advance(subj); |
815 | return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_dup(&subj->input, subj->pos - 1, 1)); |
816 | } else if (!is_eof(subj) && skip_line_end(subj)) { |
817 | return make_linebreak(subj->mem); |
818 | } else { |
819 | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("\\" )); |
820 | } |
821 | } |
822 | |
823 | // Parse an entity or a regular "&" string. |
824 | // Assumes the subject has an '&' character at the current position. |
825 | static cmark_node *handle_entity(subject *subj) { |
826 | cmark_strbuf ent = CMARK_BUF_INIT(subj->mem); |
827 | bufsize_t len; |
828 | |
829 | advance(subj); |
830 | |
831 | len = houdini_unescape_ent(&ent, subj->input.data + subj->pos, |
832 | subj->input.len - subj->pos); |
833 | |
834 | if (len <= 0) |
835 | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("&" )); |
836 | |
837 | subj->pos += len; |
838 | return make_str_from_buf(subj, subj->pos - 1 - len, subj->pos - 1, &ent); |
839 | } |
840 | |
841 | // Clean a URL: remove surrounding whitespace, and remove \ that escape |
842 | // punctuation. |
843 | unsigned char *cmark_clean_url(cmark_mem *mem, cmark_chunk *url) { |
844 | cmark_strbuf buf = CMARK_BUF_INIT(mem); |
845 | |
846 | cmark_chunk_trim(url); |
847 | |
848 | houdini_unescape_html_f(&buf, url->data, url->len); |
849 | |
850 | cmark_strbuf_unescape(&buf); |
851 | return cmark_strbuf_detach(&buf); |
852 | } |
853 | |
854 | unsigned char *cmark_clean_title(cmark_mem *mem, cmark_chunk *title) { |
855 | cmark_strbuf buf = CMARK_BUF_INIT(mem); |
856 | unsigned char first, last; |
857 | |
858 | if (title->len == 0) { |
859 | return NULL; |
860 | } |
861 | |
862 | first = title->data[0]; |
863 | last = title->data[title->len - 1]; |
864 | |
865 | // remove surrounding quotes if any: |
866 | if ((first == '\'' && last == '\'') || (first == '(' && last == ')') || |
867 | (first == '"' && last == '"')) { |
868 | houdini_unescape_html_f(&buf, title->data + 1, title->len - 2); |
869 | } else { |
870 | houdini_unescape_html_f(&buf, title->data, title->len); |
871 | } |
872 | |
873 | cmark_strbuf_unescape(&buf); |
874 | return cmark_strbuf_detach(&buf); |
875 | } |
876 | |
877 | // Parse an autolink or HTML tag. |
878 | // Assumes the subject has a '<' character at the current position. |
879 | static cmark_node *handle_pointy_brace(subject *subj, int options) { |
880 | bufsize_t matchlen = 0; |
881 | cmark_chunk contents; |
882 | |
883 | advance(subj); // advance past first < |
884 | |
885 | // first try to match a URL autolink |
886 | matchlen = scan_autolink_uri(&subj->input, subj->pos); |
887 | if (matchlen > 0) { |
888 | contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1); |
889 | subj->pos += matchlen; |
890 | |
891 | return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 0); |
892 | } |
893 | |
894 | // next try to match an email autolink |
895 | matchlen = scan_autolink_email(&subj->input, subj->pos); |
896 | if (matchlen > 0) { |
897 | contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1); |
898 | subj->pos += matchlen; |
899 | |
900 | return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 1); |
901 | } |
902 | |
903 | // finally, try to match an html tag |
904 | if (subj->pos + 2 <= subj->input.len) { |
905 | int c = subj->input.data[subj->pos]; |
906 | if (c == '!') { |
907 | c = subj->input.data[subj->pos+1]; |
908 | if (c == '-') { |
909 | matchlen = scan_html_comment(&subj->input, subj->pos + 2); |
910 | if (matchlen > 0) |
911 | matchlen += 2; // prefix "<-" |
912 | } else if (c == '[') { |
913 | if ((subj->flags & FLAG_SKIP_HTML_CDATA) == 0) { |
914 | matchlen = scan_html_cdata(&subj->input, subj->pos + 2); |
915 | if (matchlen > 0) { |
916 | // The regex doesn't require the final "]]>". But if we're not at |
917 | // the end of input, it must come after the match. Otherwise, |
918 | // disable subsequent scans to avoid quadratic behavior. |
919 | matchlen += 5; // prefix "![", suffix "]]>" |
920 | if (subj->pos + matchlen > subj->input.len) { |
921 | subj->flags |= FLAG_SKIP_HTML_CDATA; |
922 | matchlen = 0; |
923 | } |
924 | } |
925 | } |
926 | } else if ((subj->flags & FLAG_SKIP_HTML_DECLARATION) == 0) { |
927 | matchlen = scan_html_declaration(&subj->input, subj->pos + 1); |
928 | if (matchlen > 0) { |
929 | matchlen += 2; // prefix "!", suffix ">" |
930 | if (subj->pos + matchlen > subj->input.len) { |
931 | subj->flags |= FLAG_SKIP_HTML_DECLARATION; |
932 | matchlen = 0; |
933 | } |
934 | } |
935 | } |
936 | } else if (c == '?') { |
937 | if ((subj->flags & FLAG_SKIP_HTML_PI) == 0) { |
938 | // Note that we allow an empty match. |
939 | matchlen = scan_html_pi(&subj->input, subj->pos + 1); |
940 | matchlen += 3; // prefix "?", suffix "?>" |
941 | if (subj->pos + matchlen > subj->input.len) { |
942 | subj->flags |= FLAG_SKIP_HTML_PI; |
943 | matchlen = 0; |
944 | } |
945 | } |
946 | } else { |
947 | matchlen = scan_html_tag(&subj->input, subj->pos); |
948 | } |
949 | } |
950 | if (matchlen > 0) { |
951 | const unsigned char *src = subj->input.data + subj->pos - 1; |
952 | bufsize_t len = matchlen + 1; |
953 | subj->pos += matchlen; |
954 | cmark_node *node = make_literal(subj, CMARK_NODE_HTML_INLINE, |
955 | subj->pos - matchlen - 1, subj->pos - 1); |
956 | node->data = (unsigned char *)subj->mem->realloc(NULL, len + 1); |
957 | memcpy(node->data, src, len); |
958 | node->data[len] = 0; |
959 | node->len = len; |
960 | adjust_subj_node_newlines(subj, node, matchlen, 1, options); |
961 | return node; |
962 | } |
963 | |
964 | // if nothing matches, just return the opening <: |
965 | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("<" )); |
966 | } |
967 | |
968 | // Parse a link label. Returns 1 if successful. |
969 | // Note: unescaped brackets are not allowed in labels. |
970 | // The label begins with `[` and ends with the first `]` character |
971 | // encountered. Backticks in labels do not start code spans. |
972 | static int link_label(subject *subj, cmark_chunk *raw_label) { |
973 | bufsize_t startpos = subj->pos; |
974 | int length = 0; |
975 | unsigned char c; |
976 | |
977 | // advance past [ |
978 | if (peek_char(subj) == '[') { |
979 | advance(subj); |
980 | } else { |
981 | return 0; |
982 | } |
983 | |
984 | while ((c = peek_char(subj)) && c != '[' && c != ']') { |
985 | if (c == '\\') { |
986 | advance(subj); |
987 | length++; |
988 | if (cmark_ispunct(peek_char(subj))) { |
989 | advance(subj); |
990 | length++; |
991 | } |
992 | } else { |
993 | advance(subj); |
994 | length++; |
995 | } |
996 | if (length > MAX_LINK_LABEL_LENGTH) { |
997 | goto noMatch; |
998 | } |
999 | } |
1000 | |
1001 | if (c == ']') { // match found |
1002 | *raw_label = |
1003 | cmark_chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1)); |
1004 | cmark_chunk_trim(raw_label); |
1005 | advance(subj); // advance past ] |
1006 | return 1; |
1007 | } |
1008 | |
1009 | noMatch: |
1010 | subj->pos = startpos; // rewind |
1011 | return 0; |
1012 | } |
1013 | |
1014 | static bufsize_t manual_scan_link_url_2(cmark_chunk *input, bufsize_t offset, |
1015 | cmark_chunk *output) { |
1016 | bufsize_t i = offset; |
1017 | size_t nb_p = 0; |
1018 | |
1019 | while (i < input->len) { |
1020 | if (input->data[i] == '\\' && |
1021 | i + 1 < input-> len && |
1022 | cmark_ispunct(input->data[i+1])) |
1023 | i += 2; |
1024 | else if (input->data[i] == '(') { |
1025 | ++nb_p; |
1026 | ++i; |
1027 | if (nb_p > 32) |
1028 | return -1; |
1029 | } else if (input->data[i] == ')') { |
1030 | if (nb_p == 0) |
1031 | break; |
1032 | --nb_p; |
1033 | ++i; |
1034 | } else if (cmark_isspace(input->data[i])) { |
1035 | if (i == offset) { |
1036 | return -1; |
1037 | } |
1038 | break; |
1039 | } else { |
1040 | ++i; |
1041 | } |
1042 | } |
1043 | |
1044 | if (i >= input->len || nb_p != 0) |
1045 | return -1; |
1046 | |
1047 | { |
1048 | cmark_chunk result = {input->data + offset, i - offset}; |
1049 | *output = result; |
1050 | } |
1051 | return i - offset; |
1052 | } |
1053 | |
1054 | static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset, |
1055 | cmark_chunk *output) { |
1056 | bufsize_t i = offset; |
1057 | |
1058 | if (i < input->len && input->data[i] == '<') { |
1059 | ++i; |
1060 | while (i < input->len) { |
1061 | if (input->data[i] == '>') { |
1062 | ++i; |
1063 | break; |
1064 | } else if (input->data[i] == '\\') |
1065 | i += 2; |
1066 | else if (input->data[i] == '\n' || input->data[i] == '<') |
1067 | return -1; |
1068 | else |
1069 | ++i; |
1070 | } |
1071 | } else { |
1072 | return manual_scan_link_url_2(input, offset, output); |
1073 | } |
1074 | |
1075 | if (i >= input->len) |
1076 | return -1; |
1077 | |
1078 | { |
1079 | cmark_chunk result = {input->data + offset + 1, i - 2 - offset}; |
1080 | *output = result; |
1081 | } |
1082 | return i - offset; |
1083 | } |
1084 | |
// Handle a ']' at the current position: resolve it against the most recent
// bracket opener on subj->last_bracket.
//
// Returns a text node containing "]" when there is no opener, the opener is
// inactive, or no link/image could be formed. Returns NULL when a link or
// image was formed; in that case the new node has already been inserted into
// the tree in front of the opener's text node, so there is nothing left for
// the caller to append.
//
// Two forms are tried, in this order:
//   1. inline:    ](url "title")
//   2. reference: ][label], ][] or a bare ] whose link text is looked up
//                 as the label in subj->refmap
static cmark_node *handle_close_bracket(subject *subj) {
  bufsize_t initial_pos, after_link_text_pos;
  bufsize_t endurl, starttitle, endtitle, endall;
  bufsize_t sps, n;
  cmark_reference *ref = NULL;
  cmark_chunk url_chunk, title_chunk;
  unsigned char *url, *title;
  bracket *opener;
  cmark_node *inl;
  cmark_chunk raw_label;
  int found_label;
  cmark_node *tmp, *tmpnext;
  bool is_image;

  advance(subj); // advance past ]
  initial_pos = subj->pos;

  // get last [ or ![
  opener = subj->last_bracket;

  if (opener == NULL) {
    // no opener at all: the ] is literal text
    return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]" ));
  }

  if (!opener->active) {
    // take delimiter off stack
    pop_bracket(subj);
    return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]" ));
  }

  // If we got here, we matched a potential link/image text.
  // Now we check to see if it's a link/image.
  is_image = opener->image;

  after_link_text_pos = subj->pos;

  // First, look for an inline link.
  if (peek_char(subj) == '(' &&
      ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) &&
      ((n = manual_scan_link_url(&subj->input, subj->pos + 1 + sps,
                                 &url_chunk)) > -1)) {

    // try to parse an explicit link:
    endurl = subj->pos + 1 + sps + n;
    starttitle = endurl + scan_spacechars(&subj->input, endurl);

    // ensure there are spaces between url and title; without any, no title
    // is scanned and the title range is empty
    endtitle = (starttitle == endurl)
                   ? starttitle
                   : starttitle + scan_link_title(&subj->input, starttitle);

    endall = endtitle + scan_spacechars(&subj->input, endtitle);

    if (peek_at(subj, endall) == ')') {
      subj->pos = endall + 1;

      title_chunk =
          cmark_chunk_dup(&subj->input, starttitle, endtitle - starttitle);
      url = cmark_clean_url(subj->mem, &url_chunk);
      title = cmark_clean_title(subj->mem, &title_chunk);
      cmark_chunk_free(&url_chunk);
      cmark_chunk_free(&title_chunk);
      goto match;

    } else {
      // it could still be a shortcut reference link
      subj->pos = after_link_text_pos;
    }
  }

  // Next, look for a following [link label] that matches in refmap.
  // link_label only succeeds if a '[' follows immediately.
  raw_label = cmark_chunk_literal("" );
  found_label = link_label(subj, &raw_label);
  if (!found_label) {
    // No label follows, so this can only be a shortcut reference link:
    // make sure the position is back at the point right after the ].
    subj->pos = initial_pos;
  }

  // Collapsed ("[text][]") or shortcut ("[text]") reference: use the link
  // text itself as the label, unless the opener is marked bracket_after.
  // The chunk runs from opener->position up to, but not including, the
  // closing ] (presumably opener->position is the first byte of the link
  // text; confirm against push_bracket).
  if ((!found_label || raw_label.len == 0) && !opener->bracket_after) {
    cmark_chunk_free(&raw_label);
    raw_label = cmark_chunk_dup(&subj->input, opener->position,
                                initial_pos - opener->position - 1);
    found_label = true;
  }

  if (found_label) {
    ref = cmark_reference_lookup(subj->refmap, &raw_label);
    cmark_chunk_free(&raw_label);
  }

  if (ref != NULL) { // found
    // the node gets its own copies of the reference's url and title
    url = cmark_strdup(subj->mem, ref->url);
    title = cmark_strdup(subj->mem, ref->title);
    goto match;
  } else {
    goto noMatch;
  }

noMatch:
  // If we fall through to here, it means we didn't match a link:
  pop_bracket(subj); // remove this opener from delimiter list
  subj->pos = initial_pos;
  return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]" ));

match:
  inl = make_simple(subj->mem, is_image ? CMARK_NODE_IMAGE : CMARK_NODE_LINK);
  inl->as.link.url = url;
  inl->as.link.title = title;
  inl->start_line = inl->end_line = subj->line;
  inl->start_column = opener->inl_text->start_column;
  inl->end_column = subj->pos + subj->column_offset + subj->block_offset;
  cmark_node_insert_before(opener->inl_text, inl);
  // Add link text: every sibling after the opener's text node becomes a
  // child of the new link/image node.
  tmp = opener->inl_text->next;
  while (tmp) {
    tmpnext = tmp->next; // saved first: unlinking detaches tmp from its siblings
    cmark_node_unlink(tmp);
    append_child(inl, tmp);
    tmp = tmpnext;
  }

  // Free the bracket [:
  cmark_node_free(opener->inl_text);

  // Resolve emphasis inside the link text, then drop this opener.
  process_emphasis(subj, opener->previous_delimiter);
  pop_bracket(subj);

  // Now, if we have a link, we also want to deactivate earlier link
  // delimiters. (This code can be removed if we decide to allow links
  // inside links.)
  if (!is_image) {
    opener = subj->last_bracket;
    while (opener != NULL) {
      if (!opener->image) {
        if (!opener->active) {
          // an inactive link opener ends the walk
          break;
        } else {
          opener->active = false;
        }
      }
      opener = opener->previous;
    }
  }

  return NULL;
}
1234 | |
1235 | // Parse a hard or soft linebreak, returning an inline. |
1236 | // Assumes the subject has a cr or newline at the current position. |
1237 | static cmark_node *handle_newline(subject *subj) { |
1238 | bufsize_t nlpos = subj->pos; |
1239 | // skip over cr, crlf, or lf: |
1240 | if (peek_at(subj, subj->pos) == '\r') { |
1241 | advance(subj); |
1242 | } |
1243 | if (peek_at(subj, subj->pos) == '\n') { |
1244 | advance(subj); |
1245 | } |
1246 | ++subj->line; |
1247 | subj->column_offset = -subj->pos; |
1248 | // skip spaces at beginning of line |
1249 | skip_spaces(subj); |
1250 | if (nlpos > 1 && peek_at(subj, nlpos - 1) == ' ' && |
1251 | peek_at(subj, nlpos - 2) == ' ') { |
1252 | return make_linebreak(subj->mem); |
1253 | } else { |
1254 | return make_softbreak(subj->mem); |
1255 | } |
1256 | } |
1257 | |
1258 | static bufsize_t subject_find_special_char(subject *subj, int options) { |
1259 | // "\r\n\\`&_*[]<!" |
1260 | static const int8_t SPECIAL_CHARS[256] = { |
1261 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1262 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, |
1263 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1264 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, |
1265 | 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1266 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1267 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1268 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1269 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1270 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1271 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; |
1272 | |
1273 | // " ' . - |
1274 | static const char SMART_PUNCT_CHARS[] = { |
1275 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1276 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, |
1277 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1278 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1279 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1280 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1281 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1282 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1283 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1284 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1285 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1286 | }; |
1287 | |
1288 | bufsize_t n = subj->pos + 1; |
1289 | |
1290 | while (n < subj->input.len) { |
1291 | if (SPECIAL_CHARS[subj->input.data[n]]) |
1292 | return n; |
1293 | if (options & CMARK_OPT_SMART && SMART_PUNCT_CHARS[subj->input.data[n]]) |
1294 | return n; |
1295 | n++; |
1296 | } |
1297 | |
1298 | return subj->input.len; |
1299 | } |
1300 | |
1301 | // Parse an inline, advancing subject, and add it as a child of parent. |
1302 | // Return 0 if no inline can be parsed, 1 otherwise. |
1303 | static int parse_inline(subject *subj, cmark_node *parent, int options) { |
1304 | cmark_node *new_inl = NULL; |
1305 | cmark_chunk contents; |
1306 | unsigned char c; |
1307 | bufsize_t startpos, endpos; |
1308 | c = peek_char(subj); |
1309 | if (c == 0) { |
1310 | return 0; |
1311 | } |
1312 | switch (c) { |
1313 | case '\r': |
1314 | case '\n': |
1315 | new_inl = handle_newline(subj); |
1316 | break; |
1317 | case '`': |
1318 | new_inl = handle_backticks(subj, options); |
1319 | break; |
1320 | case '\\': |
1321 | new_inl = handle_backslash(subj); |
1322 | break; |
1323 | case '&': |
1324 | new_inl = handle_entity(subj); |
1325 | break; |
1326 | case '<': |
1327 | new_inl = handle_pointy_brace(subj, options); |
1328 | break; |
1329 | case '*': |
1330 | case '_': |
1331 | case '\'': |
1332 | case '"': |
1333 | new_inl = handle_delim(subj, c, (options & CMARK_OPT_SMART) != 0); |
1334 | break; |
1335 | case '-': |
1336 | new_inl = handle_hyphen(subj, (options & CMARK_OPT_SMART) != 0); |
1337 | break; |
1338 | case '.': |
1339 | new_inl = handle_period(subj, (options & CMARK_OPT_SMART) != 0); |
1340 | break; |
1341 | case '[': |
1342 | advance(subj); |
1343 | new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("[" )); |
1344 | push_bracket(subj, false, new_inl); |
1345 | break; |
1346 | case ']': |
1347 | new_inl = handle_close_bracket(subj); |
1348 | break; |
1349 | case '!': |
1350 | advance(subj); |
1351 | if (peek_char(subj) == '[') { |
1352 | advance(subj); |
1353 | new_inl = make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal("![" )); |
1354 | push_bracket(subj, true, new_inl); |
1355 | } else { |
1356 | new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("!" )); |
1357 | } |
1358 | break; |
1359 | default: |
1360 | endpos = subject_find_special_char(subj, options); |
1361 | contents = cmark_chunk_dup(&subj->input, subj->pos, endpos - subj->pos); |
1362 | startpos = subj->pos; |
1363 | subj->pos = endpos; |
1364 | |
1365 | // if we're at a newline, strip trailing spaces. |
1366 | if (S_is_line_end_char(peek_char(subj))) { |
1367 | cmark_chunk_rtrim(&contents); |
1368 | } |
1369 | |
1370 | new_inl = make_str(subj, startpos, endpos - 1, contents); |
1371 | } |
1372 | if (new_inl != NULL) { |
1373 | append_child(parent, new_inl); |
1374 | } |
1375 | |
1376 | return 1; |
1377 | } |
1378 | |
1379 | // Parse inlines from parent's string_content, adding as children of parent. |
1380 | void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent, |
1381 | cmark_reference_map *refmap, int options) { |
1382 | subject subj; |
1383 | cmark_chunk content = {parent->data, parent->len}; |
1384 | subject_from_buf(mem, parent->start_line, parent->start_column - 1 + parent->internal_offset, &subj, &content, refmap); |
1385 | cmark_chunk_rtrim(&subj.input); |
1386 | |
1387 | while (!is_eof(&subj) && parse_inline(&subj, parent, options)) |
1388 | ; |
1389 | |
1390 | process_emphasis(&subj, NULL); |
1391 | // free bracket and delim stack |
1392 | while (subj.last_delim) { |
1393 | remove_delimiter(&subj, subj.last_delim); |
1394 | } |
1395 | while (subj.last_bracket) { |
1396 | pop_bracket(&subj); |
1397 | } |
1398 | } |
1399 | |
1400 | // Parse zero or more space characters, including at most one newline. |
1401 | static void spnl(subject *subj) { |
1402 | skip_spaces(subj); |
1403 | if (skip_line_end(subj)) { |
1404 | skip_spaces(subj); |
1405 | } |
1406 | } |
1407 | |
1408 | // Parse reference. Assumes string begins with '[' character. |
1409 | // Modify refmap if a reference is encountered. |
1410 | // Return 0 if no reference found, otherwise position of subject |
1411 | // after reference is parsed. |
1412 | bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_chunk *input, |
1413 | cmark_reference_map *refmap) { |
1414 | subject subj; |
1415 | |
1416 | cmark_chunk lab; |
1417 | cmark_chunk url; |
1418 | cmark_chunk title; |
1419 | |
1420 | bufsize_t matchlen = 0; |
1421 | bufsize_t beforetitle; |
1422 | |
1423 | subject_from_buf(mem, -1, 0, &subj, input, NULL); |
1424 | |
1425 | // parse label: |
1426 | if (!link_label(&subj, &lab) || lab.len == 0) |
1427 | return 0; |
1428 | |
1429 | // colon: |
1430 | if (peek_char(&subj) == ':') { |
1431 | advance(&subj); |
1432 | } else { |
1433 | return 0; |
1434 | } |
1435 | |
1436 | // parse link url: |
1437 | spnl(&subj); |
1438 | if ((matchlen = manual_scan_link_url(&subj.input, subj.pos, &url)) > -1) { |
1439 | subj.pos += matchlen; |
1440 | } else { |
1441 | return 0; |
1442 | } |
1443 | |
1444 | // parse optional link_title |
1445 | beforetitle = subj.pos; |
1446 | spnl(&subj); |
1447 | matchlen = subj.pos == beforetitle ? 0 : scan_link_title(&subj.input, subj.pos); |
1448 | if (matchlen) { |
1449 | title = cmark_chunk_dup(&subj.input, subj.pos, matchlen); |
1450 | subj.pos += matchlen; |
1451 | } else { |
1452 | subj.pos = beforetitle; |
1453 | title = cmark_chunk_literal("" ); |
1454 | } |
1455 | |
1456 | // parse final spaces and newline: |
1457 | skip_spaces(&subj); |
1458 | if (!skip_line_end(&subj)) { |
1459 | if (matchlen) { // try rewinding before title |
1460 | subj.pos = beforetitle; |
1461 | skip_spaces(&subj); |
1462 | if (!skip_line_end(&subj)) { |
1463 | return 0; |
1464 | } |
1465 | } else { |
1466 | return 0; |
1467 | } |
1468 | } |
1469 | // insert reference into refmap |
1470 | cmark_reference_create(refmap, &lab, &url, &title); |
1471 | return subj.pos; |
1472 | } |
1473 | |