| 1 | #include <stdlib.h> |
| 2 | #include <string.h> |
| 3 | #include <stdio.h> |
| 4 | |
| 5 | #include "cmark_ctype.h" |
| 6 | #include "config.h" |
| 7 | #include "node.h" |
| 8 | #include "parser.h" |
| 9 | #include "references.h" |
| 10 | #include "cmark.h" |
| 11 | #include "houdini.h" |
| 12 | #include "utf8.h" |
| 13 | #include "scanners.h" |
| 14 | #include "inlines.h" |
| 15 | |
// UTF-8 byte sequences for the typographic characters that the "smart"
// punctuation handlers in this file substitute for ASCII input.
static const char *EMDASH = "\xE2\x80\x94";           // U+2014 em dash
static const char *ENDASH = "\xE2\x80\x93";           // U+2013 en dash
static const char *ELLIPSES = "\xE2\x80\xA6";         // U+2026 horizontal ellipsis
static const char *LEFTDOUBLEQUOTE = "\xE2\x80\x9C";  // U+201C left double quotation mark
static const char *RIGHTDOUBLEQUOTE = "\xE2\x80\x9D"; // U+201D right double quotation mark
static const char *LEFTSINGLEQUOTE = "\xE2\x80\x98";  // U+2018 left single quotation mark
static const char *RIGHTSINGLEQUOTE = "\xE2\x80\x99"; // U+2019 right single quotation mark
| 23 | |
// Macros for creating various kinds of simple inline nodes (nodes with no literal value).
| 25 | #define make_linebreak(mem) make_simple(mem, CMARK_NODE_LINEBREAK) |
| 26 | #define make_softbreak(mem) make_simple(mem, CMARK_NODE_SOFTBREAK) |
| 27 | #define make_emph(mem) make_simple(mem, CMARK_NODE_EMPH) |
| 28 | #define make_strong(mem) make_simple(mem, CMARK_NODE_STRONG) |
| 29 | |
| 30 | #define MAXBACKTICKS 1000 |
| 31 | |
// One entry of the delimiter list: a run of delimiter characters
// ('*', '_', '\'' or '"') that may later open or close a span.
// Entries are doubly linked; subject.last_delim points at the newest one.
typedef struct delimiter {
  // Entry pushed before this one (NULL for the oldest entry).
  struct delimiter *previous;
  // Entry pushed after this one (NULL for the newest entry).
  struct delimiter *next;
  // Text node that holds the delimiter characters.
  cmark_node *inl_text;
  // inl_text->len at the time the entry was pushed.
  bufsize_t length;
  // The delimiter character this run consists of.
  unsigned char delim_char;
  // Whether the run may open a span (as computed by scan_delims).
  bool can_open;
  // Whether the run may close a span (as computed by scan_delims).
  bool can_close;
} delimiter;
| 41 | |
// One entry of the bracket stack; subject.last_bracket is the top.
typedef struct bracket {
  // Entry below this one on the stack (NULL at the bottom).
  struct bracket *previous;
  // Value of subject.last_delim when this entry was pushed.
  struct delimiter *previous_delimiter;
  // Inline node passed to push_bracket for this bracket.
  cmark_node *inl_text;
  // Value of subject.pos when this entry was pushed.
  bufsize_t position;
  // The `image` argument given to push_bracket.
  bool image;
  // Initialised to true by push_bracket.
  bool active;
  // Set to true on this entry once another bracket is pushed on top of it.
  bool bracket_after;
} bracket;
| 51 | |
| 52 | #define FLAG_SKIP_HTML_CDATA (1u << 0) |
| 53 | #define FLAG_SKIP_HTML_DECLARATION (1u << 1) |
| 54 | #define FLAG_SKIP_HTML_PI (1u << 2) |
| 55 | |
// Parser state for one run of inline content (set up by subject_from_buf).
typedef struct {
  // Allocator used for nodes, delimiters and brackets.
  cmark_mem *mem;
  // The text being parsed.
  cmark_chunk input;
  // Bit flags; cleared by subject_from_buf.
  // NOTE(review): presumably holds the FLAG_SKIP_HTML_* bits - confirm.
  unsigned flags;
  // Line number recorded on newly created nodes.
  int line;
  // Current read offset into `input`.
  bufsize_t pos;
  // Added to every node column computed by make_literal.
  int block_offset;
  // Added to every node column computed by make_literal; rewritten by
  // adjust_subj_node_newlines after a span that contains newlines.
  int column_offset;
  // Reference map handed to subject_from_buf.
  cmark_reference_map *refmap;
  // Newest entry of the delimiter list, or NULL when it is empty.
  delimiter *last_delim;
  // Top of the bracket stack, or NULL when it is empty.
  bracket *last_bracket;
  // backticks[n] is the start offset of the most recently scanned backtick
  // run of length n (see scan_to_closing_backticks).
  bufsize_t backticks[MAXBACKTICKS + 1];
  // True once a backtick scan has run to the end of input without a match.
  bool scanned_for_backticks;
} subject;
| 70 | |
| 71 | static CMARK_INLINE bool S_is_line_end_char(char c) { |
| 72 | return (c == '\n' || c == '\r'); |
| 73 | } |
| 74 | |
| 75 | static delimiter *S_insert_emph(subject *subj, delimiter *opener, |
| 76 | delimiter *closer); |
| 77 | |
| 78 | static int parse_inline(subject *subj, cmark_node *parent, int options); |
| 79 | |
| 80 | static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e, |
| 81 | cmark_chunk *chunk, cmark_reference_map *refmap); |
| 82 | static bufsize_t subject_find_special_char(subject *subj, int options); |
| 83 | |
| 84 | // Create an inline with a literal string value. |
| 85 | static CMARK_INLINE cmark_node *make_literal(subject *subj, cmark_node_type t, |
| 86 | int start_column, int end_column) { |
| 87 | cmark_node *e = (cmark_node *)subj->mem->calloc(1, sizeof(*e)); |
| 88 | e->mem = subj->mem; |
| 89 | e->type = (uint16_t)t; |
| 90 | e->start_line = e->end_line = subj->line; |
| 91 | // columns are 1 based. |
| 92 | e->start_column = start_column + 1 + subj->column_offset + subj->block_offset; |
| 93 | e->end_column = end_column + 1 + subj->column_offset + subj->block_offset; |
| 94 | return e; |
| 95 | } |
| 96 | |
| 97 | // Create an inline with no value. |
| 98 | static CMARK_INLINE cmark_node *make_simple(cmark_mem *mem, cmark_node_type t) { |
| 99 | cmark_node *e = (cmark_node *)mem->calloc(1, sizeof(*e)); |
| 100 | e->mem = mem; |
| 101 | e->type = t; |
| 102 | return e; |
| 103 | } |
| 104 | |
| 105 | static cmark_node *make_str(subject *subj, int sc, int ec, cmark_chunk s) { |
| 106 | cmark_node *e = make_literal(subj, CMARK_NODE_TEXT, sc, ec); |
| 107 | e->data = (unsigned char *)subj->mem->realloc(NULL, s.len + 1); |
| 108 | if (s.data != NULL) { |
| 109 | memcpy(e->data, s.data, s.len); |
| 110 | } |
| 111 | e->data[s.len] = 0; |
| 112 | e->len = s.len; |
| 113 | return e; |
| 114 | } |
| 115 | |
| 116 | static cmark_node *make_str_from_buf(subject *subj, int sc, int ec, |
| 117 | cmark_strbuf *buf) { |
| 118 | cmark_node *e = make_literal(subj, CMARK_NODE_TEXT, sc, ec); |
| 119 | e->len = buf->size; |
| 120 | e->data = cmark_strbuf_detach(buf); |
| 121 | return e; |
| 122 | } |
| 123 | |
| 124 | // Like make_str, but parses entities. |
| 125 | static cmark_node *make_str_with_entities(subject *subj, |
| 126 | int start_column, int end_column, |
| 127 | cmark_chunk *content) { |
| 128 | cmark_strbuf unescaped = CMARK_BUF_INIT(subj->mem); |
| 129 | |
| 130 | if (houdini_unescape_html(&unescaped, content->data, content->len)) { |
| 131 | return make_str_from_buf(subj, start_column, end_column, &unescaped); |
| 132 | } else { |
| 133 | return make_str(subj, start_column, end_column, *content); |
| 134 | } |
| 135 | } |
| 136 | |
| 137 | // Like cmark_node_append_child but without costly sanity checks. |
| 138 | // Assumes that child was newly created. |
| 139 | static void append_child(cmark_node *node, cmark_node *child) { |
| 140 | cmark_node *old_last_child = node->last_child; |
| 141 | |
| 142 | child->next = NULL; |
| 143 | child->prev = old_last_child; |
| 144 | child->parent = node; |
| 145 | node->last_child = child; |
| 146 | |
| 147 | if (old_last_child) { |
| 148 | old_last_child->next = child; |
| 149 | } else { |
| 150 | // Also set first_child if node previously had no children. |
| 151 | node->first_child = child; |
| 152 | } |
| 153 | } |
| 154 | |
| 155 | // Duplicate a chunk by creating a copy of the buffer not by reusing the |
| 156 | // buffer like cmark_chunk_dup does. |
| 157 | static unsigned char *cmark_strdup(cmark_mem *mem, unsigned char *src) { |
| 158 | if (src == NULL) { |
| 159 | return NULL; |
| 160 | } |
| 161 | size_t len = strlen((char *)src); |
| 162 | unsigned char *data = (unsigned char *)mem->realloc(NULL, len + 1); |
| 163 | memcpy(data, src, len + 1); |
| 164 | return data; |
| 165 | } |
| 166 | |
| 167 | static unsigned char *cmark_clean_autolink(cmark_mem *mem, cmark_chunk *url, |
| 168 | int is_email) { |
| 169 | cmark_strbuf buf = CMARK_BUF_INIT(mem); |
| 170 | |
| 171 | cmark_chunk_trim(url); |
| 172 | |
| 173 | if (is_email) |
| 174 | cmark_strbuf_puts(&buf, "mailto:" ); |
| 175 | |
| 176 | houdini_unescape_html_f(&buf, url->data, url->len); |
| 177 | return cmark_strbuf_detach(&buf); |
| 178 | } |
| 179 | |
| 180 | static CMARK_INLINE cmark_node *make_autolink(subject *subj, |
| 181 | int start_column, int end_column, |
| 182 | cmark_chunk url, int is_email) { |
| 183 | cmark_node *link = make_simple(subj->mem, CMARK_NODE_LINK); |
| 184 | link->as.link.url = cmark_clean_autolink(subj->mem, &url, is_email); |
| 185 | link->as.link.title = NULL; |
| 186 | link->start_line = link->end_line = subj->line; |
| 187 | link->start_column = start_column + 1; |
| 188 | link->end_column = end_column + 1; |
| 189 | append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url)); |
| 190 | return link; |
| 191 | } |
| 192 | |
| 193 | static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e, |
| 194 | cmark_chunk *chunk, cmark_reference_map *refmap) { |
| 195 | int i; |
| 196 | e->mem = mem; |
| 197 | e->input = *chunk; |
| 198 | e->flags = 0; |
| 199 | e->line = line_number; |
| 200 | e->pos = 0; |
| 201 | e->block_offset = block_offset; |
| 202 | e->column_offset = 0; |
| 203 | e->refmap = refmap; |
| 204 | e->last_delim = NULL; |
| 205 | e->last_bracket = NULL; |
| 206 | for (i = 0; i <= MAXBACKTICKS; i++) { |
| 207 | e->backticks[i] = 0; |
| 208 | } |
| 209 | e->scanned_for_backticks = false; |
| 210 | } |
| 211 | |
| 212 | static CMARK_INLINE int isbacktick(int c) { return (c == '`'); } |
| 213 | |
| 214 | static CMARK_INLINE unsigned char peek_char(subject *subj) { |
| 215 | // NULL bytes should have been stripped out by now. If they're |
| 216 | // present, it's a programming error: |
| 217 | assert(!(subj->pos < subj->input.len && subj->input.data[subj->pos] == 0)); |
| 218 | return (subj->pos < subj->input.len) ? subj->input.data[subj->pos] : 0; |
| 219 | } |
| 220 | |
| 221 | static CMARK_INLINE unsigned char peek_at(subject *subj, bufsize_t pos) { |
| 222 | return subj->input.data[pos]; |
| 223 | } |
| 224 | |
| 225 | // Return true if there are more characters in the subject. |
| 226 | static CMARK_INLINE int is_eof(subject *subj) { |
| 227 | return (subj->pos >= subj->input.len); |
| 228 | } |
| 229 | |
// Advance the subject by one byte. Doesn't check for eof.
// The expansion is wrapped in parentheses so the macro is a single
// expression wherever it is used, not only as a stand-alone statement.
#define advance(subj) ((subj)->pos += 1)
| 232 | |
| 233 | static CMARK_INLINE bool skip_spaces(subject *subj) { |
| 234 | bool skipped = false; |
| 235 | while (peek_char(subj) == ' ' || peek_char(subj) == '\t') { |
| 236 | advance(subj); |
| 237 | skipped = true; |
| 238 | } |
| 239 | return skipped; |
| 240 | } |
| 241 | |
| 242 | static CMARK_INLINE bool skip_line_end(subject *subj) { |
| 243 | bool seen_line_end_char = false; |
| 244 | if (peek_char(subj) == '\r') { |
| 245 | advance(subj); |
| 246 | seen_line_end_char = true; |
| 247 | } |
| 248 | if (peek_char(subj) == '\n') { |
| 249 | advance(subj); |
| 250 | seen_line_end_char = true; |
| 251 | } |
| 252 | return seen_line_end_char || is_eof(subj); |
| 253 | } |
| 254 | |
| 255 | // Take characters while a predicate holds, and return a string. |
| 256 | static CMARK_INLINE cmark_chunk take_while(subject *subj, int (*f)(int)) { |
| 257 | unsigned char c; |
| 258 | bufsize_t startpos = subj->pos; |
| 259 | bufsize_t len = 0; |
| 260 | |
| 261 | while ((c = peek_char(subj)) && (*f)(c)) { |
| 262 | advance(subj); |
| 263 | len++; |
| 264 | } |
| 265 | |
| 266 | return cmark_chunk_dup(&subj->input, startpos, len); |
| 267 | } |
| 268 | |
| 269 | // Return the number of newlines in a given span of text in a subject. If |
| 270 | // the number is greater than zero, also return the number of characters |
| 271 | // between the last newline and the end of the span in `since_newline`. |
| 272 | static int count_newlines(subject *subj, bufsize_t from, bufsize_t len, int *since_newline) { |
| 273 | int nls = 0; |
| 274 | int since_nl = 0; |
| 275 | |
| 276 | while (len--) { |
| 277 | if (subj->input.data[from++] == '\n') { |
| 278 | ++nls; |
| 279 | since_nl = 0; |
| 280 | } else { |
| 281 | ++since_nl; |
| 282 | } |
| 283 | } |
| 284 | |
| 285 | if (!nls) |
| 286 | return 0; |
| 287 | |
| 288 | *since_newline = since_nl; |
| 289 | return nls; |
| 290 | } |
| 291 | |
| 292 | // Adjust `node`'s `end_line`, `end_column`, and `subj`'s `line` and |
| 293 | // `column_offset` according to the number of newlines in a just-matched span |
| 294 | // of text in `subj`. |
| 295 | static void adjust_subj_node_newlines(subject *subj, cmark_node *node, int matchlen, int , int options) { |
| 296 | if (!(options & CMARK_OPT_SOURCEPOS)) { |
| 297 | return; |
| 298 | } |
| 299 | |
| 300 | int since_newline; |
| 301 | int newlines = count_newlines(subj, subj->pos - matchlen - extra, matchlen, &since_newline); |
| 302 | if (newlines) { |
| 303 | subj->line += newlines; |
| 304 | node->end_line += newlines; |
| 305 | node->end_column = since_newline; |
| 306 | subj->column_offset = -subj->pos + since_newline + extra; |
| 307 | } |
| 308 | } |
| 309 | |
| 310 | // Try to process a backtick code span that began with a |
| 311 | // span of ticks of length openticklength length (already |
| 312 | // parsed). Return 0 if you don't find matching closing |
| 313 | // backticks, otherwise return the position in the subject |
| 314 | // after the closing backticks. |
| 315 | static bufsize_t scan_to_closing_backticks(subject *subj, |
| 316 | bufsize_t openticklength) { |
| 317 | |
| 318 | bool found = false; |
| 319 | if (openticklength > MAXBACKTICKS) { |
| 320 | // we limit backtick string length because of the array subj->backticks: |
| 321 | return 0; |
| 322 | } |
| 323 | if (subj->scanned_for_backticks && |
| 324 | subj->backticks[openticklength] <= subj->pos) { |
| 325 | // return if we already know there's no closer |
| 326 | return 0; |
| 327 | } |
| 328 | while (!found) { |
| 329 | // read non backticks |
| 330 | unsigned char c; |
| 331 | while ((c = peek_char(subj)) && c != '`') { |
| 332 | advance(subj); |
| 333 | } |
| 334 | if (is_eof(subj)) { |
| 335 | break; |
| 336 | } |
| 337 | bufsize_t numticks = 0; |
| 338 | while (peek_char(subj) == '`') { |
| 339 | advance(subj); |
| 340 | numticks++; |
| 341 | } |
| 342 | // store position of ender |
| 343 | if (numticks <= MAXBACKTICKS) { |
| 344 | subj->backticks[numticks] = subj->pos - numticks; |
| 345 | } |
| 346 | if (numticks == openticklength) { |
| 347 | return (subj->pos); |
| 348 | } |
| 349 | } |
| 350 | // got through whole input without finding closer |
| 351 | subj->scanned_for_backticks = true; |
| 352 | return 0; |
| 353 | } |
| 354 | |
| 355 | // Destructively modify string, converting newlines to |
| 356 | // spaces, then removing a single leading + trailing space, |
| 357 | // unless the code span consists entirely of space characters. |
| 358 | static void S_normalize_code(cmark_strbuf *s) { |
| 359 | bufsize_t r, w; |
| 360 | bool contains_nonspace = false; |
| 361 | |
| 362 | for (r = 0, w = 0; r < s->size; ++r) { |
| 363 | switch (s->ptr[r]) { |
| 364 | case '\r': |
| 365 | if (s->ptr[r + 1] != '\n') { |
| 366 | s->ptr[w++] = ' '; |
| 367 | } |
| 368 | break; |
| 369 | case '\n': |
| 370 | s->ptr[w++] = ' '; |
| 371 | break; |
| 372 | default: |
| 373 | s->ptr[w++] = s->ptr[r]; |
| 374 | } |
| 375 | if (s->ptr[r] != ' ') { |
| 376 | contains_nonspace = true; |
| 377 | } |
| 378 | } |
| 379 | |
| 380 | // begins and ends with space? |
| 381 | if (contains_nonspace && |
| 382 | s->ptr[0] == ' ' && s->ptr[w - 1] == ' ') { |
| 383 | cmark_strbuf_drop(s, 1); |
| 384 | cmark_strbuf_truncate(s, w - 2); |
| 385 | } else { |
| 386 | cmark_strbuf_truncate(s, w); |
| 387 | } |
| 388 | |
| 389 | } |
| 390 | |
| 391 | |
| 392 | // Parse backtick code section or raw backticks, return an inline. |
| 393 | // Assumes that the subject has a backtick at the current position. |
| 394 | static cmark_node *handle_backticks(subject *subj, int options) { |
| 395 | cmark_chunk openticks = take_while(subj, isbacktick); |
| 396 | bufsize_t startpos = subj->pos; |
| 397 | bufsize_t endpos = scan_to_closing_backticks(subj, openticks.len); |
| 398 | |
| 399 | if (endpos == 0) { // not found |
| 400 | subj->pos = startpos; // rewind |
| 401 | return make_str(subj, subj->pos, subj->pos, openticks); |
| 402 | } else { |
| 403 | cmark_strbuf buf = CMARK_BUF_INIT(subj->mem); |
| 404 | |
| 405 | cmark_strbuf_set(&buf, subj->input.data + startpos, |
| 406 | endpos - startpos - openticks.len); |
| 407 | S_normalize_code(&buf); |
| 408 | |
| 409 | cmark_node *node = make_literal(subj, CMARK_NODE_CODE, startpos, |
| 410 | endpos - openticks.len - 1); |
| 411 | node->len = buf.size; |
| 412 | node->data = cmark_strbuf_detach(&buf); |
| 413 | adjust_subj_node_newlines(subj, node, endpos - startpos, openticks.len, options); |
| 414 | return node; |
| 415 | } |
| 416 | } |
| 417 | |
| 418 | |
| 419 | // Scan ***, **, or * and return number scanned, or 0. |
| 420 | // Advances position. |
| 421 | static int scan_delims(subject *subj, unsigned char c, bool *can_open, |
| 422 | bool *can_close) { |
| 423 | int numdelims = 0; |
| 424 | bufsize_t before_char_pos; |
| 425 | int32_t after_char = 0; |
| 426 | int32_t before_char = 0; |
| 427 | int len; |
| 428 | bool left_flanking, right_flanking; |
| 429 | |
| 430 | if (subj->pos == 0) { |
| 431 | before_char = 10; |
| 432 | } else { |
| 433 | before_char_pos = subj->pos - 1; |
| 434 | // walk back to the beginning of the UTF_8 sequence: |
| 435 | while (peek_at(subj, before_char_pos) >> 6 == 2 && before_char_pos > 0) { |
| 436 | before_char_pos -= 1; |
| 437 | } |
| 438 | len = cmark_utf8proc_iterate(subj->input.data + before_char_pos, |
| 439 | subj->pos - before_char_pos, &before_char); |
| 440 | if (len == -1) { |
| 441 | before_char = 10; |
| 442 | } |
| 443 | } |
| 444 | |
| 445 | if (c == '\'' || c == '"') { |
| 446 | numdelims++; |
| 447 | advance(subj); // limit to 1 delim for quotes |
| 448 | } else { |
| 449 | while (peek_char(subj) == c) { |
| 450 | numdelims++; |
| 451 | advance(subj); |
| 452 | } |
| 453 | } |
| 454 | |
| 455 | len = cmark_utf8proc_iterate(subj->input.data + subj->pos, |
| 456 | subj->input.len - subj->pos, &after_char); |
| 457 | if (len == -1) { |
| 458 | after_char = 10; |
| 459 | } |
| 460 | left_flanking = numdelims > 0 && !cmark_utf8proc_is_space(after_char) && |
| 461 | (!cmark_utf8proc_is_punctuation(after_char) || |
| 462 | cmark_utf8proc_is_space(before_char) || |
| 463 | cmark_utf8proc_is_punctuation(before_char)); |
| 464 | right_flanking = numdelims > 0 && !cmark_utf8proc_is_space(before_char) && |
| 465 | (!cmark_utf8proc_is_punctuation(before_char) || |
| 466 | cmark_utf8proc_is_space(after_char) || |
| 467 | cmark_utf8proc_is_punctuation(after_char)); |
| 468 | if (c == '_') { |
| 469 | *can_open = left_flanking && |
| 470 | (!right_flanking || cmark_utf8proc_is_punctuation(before_char)); |
| 471 | *can_close = right_flanking && |
| 472 | (!left_flanking || cmark_utf8proc_is_punctuation(after_char)); |
| 473 | } else if (c == '\'' || c == '"') { |
| 474 | *can_open = left_flanking && |
| 475 | (!right_flanking || before_char == '(' || before_char == '[') && |
| 476 | before_char != ']' && before_char != ')'; |
| 477 | *can_close = right_flanking; |
| 478 | } else { |
| 479 | *can_open = left_flanking; |
| 480 | *can_close = right_flanking; |
| 481 | } |
| 482 | return numdelims; |
| 483 | } |
| 484 | |
| 485 | /* |
| 486 | static void print_delimiters(subject *subj) |
| 487 | { |
| 488 | delimiter *delim; |
| 489 | delim = subj->last_delim; |
| 490 | while (delim != NULL) { |
| 491 | printf("Item at stack pos %p: %d %d %d next(%p) prev(%p)\n", |
| 492 | (void*)delim, delim->delim_char, |
| 493 | delim->can_open, delim->can_close, |
| 494 | (void*)delim->next, (void*)delim->previous); |
| 495 | delim = delim->previous; |
| 496 | } |
| 497 | } |
| 498 | */ |
| 499 | |
| 500 | static void remove_delimiter(subject *subj, delimiter *delim) { |
| 501 | if (delim == NULL) |
| 502 | return; |
| 503 | if (delim->next == NULL) { |
| 504 | // end of list: |
| 505 | assert(delim == subj->last_delim); |
| 506 | subj->last_delim = delim->previous; |
| 507 | } else { |
| 508 | delim->next->previous = delim->previous; |
| 509 | } |
| 510 | if (delim->previous != NULL) { |
| 511 | delim->previous->next = delim->next; |
| 512 | } |
| 513 | subj->mem->free(delim); |
| 514 | } |
| 515 | |
| 516 | static void pop_bracket(subject *subj) { |
| 517 | bracket *b; |
| 518 | if (subj->last_bracket == NULL) |
| 519 | return; |
| 520 | b = subj->last_bracket; |
| 521 | subj->last_bracket = subj->last_bracket->previous; |
| 522 | subj->mem->free(b); |
| 523 | } |
| 524 | |
| 525 | static void push_delimiter(subject *subj, unsigned char c, bool can_open, |
| 526 | bool can_close, cmark_node *inl_text) { |
| 527 | delimiter *delim = (delimiter *)subj->mem->calloc(1, sizeof(delimiter)); |
| 528 | delim->delim_char = c; |
| 529 | delim->can_open = can_open; |
| 530 | delim->can_close = can_close; |
| 531 | delim->inl_text = inl_text; |
| 532 | delim->length = inl_text->len; |
| 533 | delim->previous = subj->last_delim; |
| 534 | delim->next = NULL; |
| 535 | if (delim->previous != NULL) { |
| 536 | delim->previous->next = delim; |
| 537 | } |
| 538 | subj->last_delim = delim; |
| 539 | } |
| 540 | |
| 541 | static void push_bracket(subject *subj, bool image, cmark_node *inl_text) { |
| 542 | bracket *b = (bracket *)subj->mem->calloc(1, sizeof(bracket)); |
| 543 | if (subj->last_bracket != NULL) { |
| 544 | subj->last_bracket->bracket_after = true; |
| 545 | } |
| 546 | b->image = image; |
| 547 | b->active = true; |
| 548 | b->inl_text = inl_text; |
| 549 | b->previous = subj->last_bracket; |
| 550 | b->previous_delimiter = subj->last_delim; |
| 551 | b->position = subj->pos; |
| 552 | b->bracket_after = false; |
| 553 | subj->last_bracket = b; |
| 554 | } |
| 555 | |
| 556 | // Assumes the subject has a c at the current position. |
| 557 | static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) { |
| 558 | bufsize_t numdelims; |
| 559 | cmark_node *inl_text; |
| 560 | bool can_open, can_close; |
| 561 | cmark_chunk contents; |
| 562 | |
| 563 | numdelims = scan_delims(subj, c, &can_open, &can_close); |
| 564 | |
| 565 | if (c == '\'' && smart) { |
| 566 | contents = cmark_chunk_literal(RIGHTSINGLEQUOTE); |
| 567 | } else if (c == '"' && smart) { |
| 568 | contents = |
| 569 | cmark_chunk_literal(can_close ? RIGHTDOUBLEQUOTE : LEFTDOUBLEQUOTE); |
| 570 | } else { |
| 571 | contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims); |
| 572 | } |
| 573 | |
| 574 | inl_text = make_str(subj, subj->pos - numdelims, subj->pos - 1, contents); |
| 575 | |
| 576 | if ((can_open || can_close) && (!(c == '\'' || c == '"') || smart)) { |
| 577 | push_delimiter(subj, c, can_open, can_close, inl_text); |
| 578 | } |
| 579 | |
| 580 | return inl_text; |
| 581 | } |
| 582 | |
| 583 | // Assumes we have a hyphen at the current position. |
| 584 | static cmark_node *handle_hyphen(subject *subj, bool smart) { |
| 585 | int startpos = subj->pos; |
| 586 | |
| 587 | advance(subj); |
| 588 | |
| 589 | if (!smart || peek_char(subj) != '-') { |
| 590 | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("-" )); |
| 591 | } |
| 592 | |
| 593 | while (smart && peek_char(subj) == '-') { |
| 594 | advance(subj); |
| 595 | } |
| 596 | |
| 597 | int numhyphens = subj->pos - startpos; |
| 598 | int en_count = 0; |
| 599 | int em_count = 0; |
| 600 | int i; |
| 601 | cmark_strbuf buf = CMARK_BUF_INIT(subj->mem); |
| 602 | |
| 603 | if (numhyphens % 3 == 0) { // if divisible by 3, use all em dashes |
| 604 | em_count = numhyphens / 3; |
| 605 | } else if (numhyphens % 2 == 0) { // if divisible by 2, use all en dashes |
| 606 | en_count = numhyphens / 2; |
| 607 | } else if (numhyphens % 3 == 2) { // use one en dash at end |
| 608 | en_count = 1; |
| 609 | em_count = (numhyphens - 2) / 3; |
| 610 | } else { // use two en dashes at the end |
| 611 | en_count = 2; |
| 612 | em_count = (numhyphens - 4) / 3; |
| 613 | } |
| 614 | |
| 615 | for (i = em_count; i > 0; i--) { |
| 616 | cmark_strbuf_puts(&buf, EMDASH); |
| 617 | } |
| 618 | |
| 619 | for (i = en_count; i > 0; i--) { |
| 620 | cmark_strbuf_puts(&buf, ENDASH); |
| 621 | } |
| 622 | |
| 623 | return make_str_from_buf(subj, startpos, subj->pos - 1, &buf); |
| 624 | } |
| 625 | |
| 626 | // Assumes we have a period at the current position. |
| 627 | static cmark_node *handle_period(subject *subj, bool smart) { |
| 628 | advance(subj); |
| 629 | if (smart && peek_char(subj) == '.') { |
| 630 | advance(subj); |
| 631 | if (peek_char(subj) == '.') { |
| 632 | advance(subj); |
| 633 | return make_str(subj, subj->pos - 3, subj->pos - 1, cmark_chunk_literal(ELLIPSES)); |
| 634 | } else { |
| 635 | return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal(".." )); |
| 636 | } |
| 637 | } else { |
| 638 | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("." )); |
| 639 | } |
| 640 | } |
| 641 | |
// Resolve the delimiters that sit above `stack_bottom` in the subject's
// delimiter list. The list is walked from the oldest such entry to the
// newest; for every entry that can close, the entries before it are searched
// backwards for a matching opener. '*' and '_' pairs are turned into
// emphasis by S_insert_emph, while quote delimiters have their text nodes
// rewritten to curly quotes. On return every delimiter above `stack_bottom`
// has been removed from the list.
static void process_emphasis(subject *subj, delimiter *stack_bottom) {
  delimiter *closer = subj->last_delim;
  delimiter *opener;
  delimiter *old_closer;
  delimiter *new_stack_bottom;
  bool opener_found;

  // Per-delimiter-kind lower bound for opener searches. Only indices 0..3
  // are selected by the switch below; the array has two further slots that
  // are zero-initialised and not used in this function.
  int openers_bottom_index = 0;
  delimiter *openers_bottom[6] = {stack_bottom, stack_bottom, stack_bottom,
                                  stack_bottom};

  // move back to first relevant delim.
  while (closer != NULL && closer->previous != stack_bottom) {
    closer = closer->previous;
  }

  // now move forward, looking for closers, and handling each
  while (closer != NULL) {
    if (closer->can_close) {
      // Pick the openers_bottom slot that belongs to this delimiter kind.
      switch (closer->delim_char) {
      case '"':
        openers_bottom_index = 0;
        break;
      case '\'':
        openers_bottom_index = 1;
        break;
      case '_':
        openers_bottom_index = 2;
        break;
      case '*':
        openers_bottom_index = 3;
        break;
      default:
        assert(false);
      }

      // Now look backwards for first matching opener:
      opener = closer->previous;
      opener_found = false;
      new_stack_bottom = closer->previous;

      while (opener != NULL && opener != openers_bottom[openers_bottom_index]) {
        if (opener->can_open && opener->delim_char == closer->delim_char) {
          // interior closer of size 2 can't match opener of size 1
          // or of size 1 can't match 2
          if (!(closer->can_open || opener->can_close) ||
              closer->length % 3 == 0 ||
              (opener->length + closer->length) % 3 != 0) {
            opener_found = true;
            break;
          } else {
            // If we failed to match because of the mod-3 rule,
            // then we want to make sure the stack bottom extends
            // back to here at least, since a later closer might
            // match this same opener... (see #383)
            new_stack_bottom = opener->previous;
          }
        }
        opener = opener->previous;
      }
      // Keep the current closer: `closer` is advanced below, but the
      // no-opener clean-up still needs the entry that was just examined.
      old_closer = closer;
      if (closer->delim_char == '*' || closer->delim_char == '_') {
        if (opener_found) {
          // S_insert_emph returns the delimiter to continue from.
          closer = S_insert_emph(subj, opener, closer);
        } else {
          closer = closer->next;
        }
      } else if (closer->delim_char == '\'') {
        // A closing single quote always becomes a right quote; its opener,
        // when one was found, becomes a left quote.
        cmark_node_set_literal(closer->inl_text, RIGHTSINGLEQUOTE);
        if (opener_found) {
          cmark_node_set_literal(opener->inl_text, LEFTSINGLEQUOTE);
        }
        closer = closer->next;
      } else if (closer->delim_char == '"') {
        // Same treatment for double quotes.
        cmark_node_set_literal(closer->inl_text, RIGHTDOUBLEQUOTE);
        if (opener_found) {
          cmark_node_set_literal(opener->inl_text, LEFTDOUBLEQUOTE);
        }
        closer = closer->next;
      }
      if (!opener_found) {
        // set lower bound for future searches for openers (see #383).
        openers_bottom[openers_bottom_index] = new_stack_bottom;
        if (!old_closer->can_open) {
          // we can remove a closer that can't be an
          // opener, once we've seen there's no
          // matching opener:
          remove_delimiter(subj, old_closer);
        }
      }
    } else {
      closer = closer->next;
    }
  }
  // free all delimiters in list until stack_bottom:
  while (subj->last_delim != NULL && subj->last_delim != stack_bottom) {
    remove_delimiter(subj, subj->last_delim);
  }
}
| 741 | |
// Turn the inlines between `opener` and `closer` into the children of a new
// emphasis node. Two delimiter characters are consumed from each side (giving
// a strong node) when both text nodes hold at least two characters;
// otherwise one character is consumed from each side (giving an emph node).
// Delimiters lying between the two are dropped from the list, and an opener
// or closer whose text node becomes empty is removed together with that node.
//
// Returns the delimiter at which the caller should continue: `closer` itself
// when it still has characters left, otherwise the entry that followed it.
static delimiter *S_insert_emph(subject *subj, delimiter *opener,
                                delimiter *closer) {
  delimiter *delim, *tmp_delim;
  bufsize_t use_delims;
  cmark_node *opener_inl = opener->inl_text;
  cmark_node *closer_inl = closer->inl_text;
  bufsize_t opener_num_chars = opener_inl->len;
  bufsize_t closer_num_chars = closer_inl->len;
  cmark_node *tmp, *tmpnext, *emph;

  // calculate the actual number of characters used from this closer
  use_delims = (closer_num_chars >= 2 && opener_num_chars >= 2) ? 2 : 1;

  // remove used characters from associated inlines.
  // (Both text nodes are shortened in place and re-terminated.)
  opener_num_chars -= use_delims;
  closer_num_chars -= use_delims;
  opener_inl->len = opener_num_chars;
  opener_inl->data[opener_num_chars] = 0;
  closer_inl->len = closer_num_chars;
  closer_inl->data[closer_num_chars] = 0;

  // free delimiters between opener and closer
  delim = closer->previous;
  while (delim != NULL && delim != opener) {
    // Read the predecessor first: remove_delimiter frees `delim`.
    tmp_delim = delim->previous;
    remove_delimiter(subj, delim);
    delim = tmp_delim;
  }

  // create new emph or strong, and splice it in to our inlines
  // between the opener and closer
  emph = use_delims == 1 ? make_emph(subj->mem) : make_strong(subj->mem);

  // Move every sibling between the two text nodes under the new node.
  tmp = opener_inl->next;
  while (tmp && tmp != closer_inl) {
    tmpnext = tmp->next;
    cmark_node_unlink(tmp);
    append_child(emph, tmp);
    tmp = tmpnext;
  }
  cmark_node_insert_after(opener_inl, emph);

  // The new node's source range runs from the opener's start to the
  // closer's end.
  emph->start_line = opener_inl->start_line;
  emph->end_line = closer_inl->end_line;
  emph->start_column = opener_inl->start_column;
  emph->end_column = closer_inl->end_column;

  // if opener has 0 characters, remove it and its associated inline
  if (opener_num_chars == 0) {
    cmark_node_free(opener_inl);
    remove_delimiter(subj, opener);
  }

  // if closer has 0 characters, remove it and its associated inline
  if (closer_num_chars == 0) {
    // remove empty closer inline
    cmark_node_free(closer_inl);
    // remove closer from list
    // (its successor is saved first because remove_delimiter frees it)
    tmp_delim = closer->next;
    remove_delimiter(subj, closer);
    closer = tmp_delim;
  }

  return closer;
}
| 807 | |
| 808 | // Parse backslash-escape or just a backslash, returning an inline. |
| 809 | static cmark_node *handle_backslash(subject *subj) { |
| 810 | advance(subj); |
| 811 | unsigned char nextchar = peek_char(subj); |
| 812 | if (cmark_ispunct( |
| 813 | nextchar)) { // only ascii symbols and newline can be escaped |
| 814 | advance(subj); |
| 815 | return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_dup(&subj->input, subj->pos - 1, 1)); |
| 816 | } else if (!is_eof(subj) && skip_line_end(subj)) { |
| 817 | return make_linebreak(subj->mem); |
| 818 | } else { |
| 819 | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("\\" )); |
| 820 | } |
| 821 | } |
| 822 | |
| 823 | // Parse an entity or a regular "&" string. |
| 824 | // Assumes the subject has an '&' character at the current position. |
| 825 | static cmark_node *handle_entity(subject *subj) { |
| 826 | cmark_strbuf ent = CMARK_BUF_INIT(subj->mem); |
| 827 | bufsize_t len; |
| 828 | |
| 829 | advance(subj); |
| 830 | |
| 831 | len = houdini_unescape_ent(&ent, subj->input.data + subj->pos, |
| 832 | subj->input.len - subj->pos); |
| 833 | |
| 834 | if (len <= 0) |
| 835 | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("&" )); |
| 836 | |
| 837 | subj->pos += len; |
| 838 | return make_str_from_buf(subj, subj->pos - 1 - len, subj->pos - 1, &ent); |
| 839 | } |
| 840 | |
| 841 | // Clean a URL: remove surrounding whitespace, and remove \ that escape |
| 842 | // punctuation. |
| 843 | unsigned char *cmark_clean_url(cmark_mem *mem, cmark_chunk *url) { |
| 844 | cmark_strbuf buf = CMARK_BUF_INIT(mem); |
| 845 | |
| 846 | cmark_chunk_trim(url); |
| 847 | |
| 848 | houdini_unescape_html_f(&buf, url->data, url->len); |
| 849 | |
| 850 | cmark_strbuf_unescape(&buf); |
| 851 | return cmark_strbuf_detach(&buf); |
| 852 | } |
| 853 | |
| 854 | unsigned char *cmark_clean_title(cmark_mem *mem, cmark_chunk *title) { |
| 855 | cmark_strbuf buf = CMARK_BUF_INIT(mem); |
| 856 | unsigned char first, last; |
| 857 | |
| 858 | if (title->len == 0) { |
| 859 | return NULL; |
| 860 | } |
| 861 | |
| 862 | first = title->data[0]; |
| 863 | last = title->data[title->len - 1]; |
| 864 | |
| 865 | // remove surrounding quotes if any: |
| 866 | if ((first == '\'' && last == '\'') || (first == '(' && last == ')') || |
| 867 | (first == '"' && last == '"')) { |
| 868 | houdini_unescape_html_f(&buf, title->data + 1, title->len - 2); |
| 869 | } else { |
| 870 | houdini_unescape_html_f(&buf, title->data, title->len); |
| 871 | } |
| 872 | |
| 873 | cmark_strbuf_unescape(&buf); |
| 874 | return cmark_strbuf_detach(&buf); |
| 875 | } |
| 876 | |
| 877 | // Parse an autolink or HTML tag. |
| 878 | // Assumes the subject has a '<' character at the current position. |
| 879 | static cmark_node *handle_pointy_brace(subject *subj, int options) { |
| 880 | bufsize_t matchlen = 0; |
| 881 | cmark_chunk contents; |
| 882 | |
| 883 | advance(subj); // advance past first < |
| 884 | |
| 885 | // first try to match a URL autolink |
| 886 | matchlen = scan_autolink_uri(&subj->input, subj->pos); |
| 887 | if (matchlen > 0) { |
| 888 | contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1); |
| 889 | subj->pos += matchlen; |
| 890 | |
| 891 | return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 0); |
| 892 | } |
| 893 | |
| 894 | // next try to match an email autolink |
| 895 | matchlen = scan_autolink_email(&subj->input, subj->pos); |
| 896 | if (matchlen > 0) { |
| 897 | contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1); |
| 898 | subj->pos += matchlen; |
| 899 | |
| 900 | return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 1); |
| 901 | } |
| 902 | |
| 903 | // finally, try to match an html tag |
| 904 | if (subj->pos + 2 <= subj->input.len) { |
| 905 | int c = subj->input.data[subj->pos]; |
| 906 | if (c == '!') { |
| 907 | c = subj->input.data[subj->pos+1]; |
| 908 | if (c == '-') { |
| 909 | matchlen = scan_html_comment(&subj->input, subj->pos + 2); |
| 910 | if (matchlen > 0) |
| 911 | matchlen += 2; // prefix "<-" |
| 912 | } else if (c == '[') { |
| 913 | if ((subj->flags & FLAG_SKIP_HTML_CDATA) == 0) { |
| 914 | matchlen = scan_html_cdata(&subj->input, subj->pos + 2); |
| 915 | if (matchlen > 0) { |
| 916 | // The regex doesn't require the final "]]>". But if we're not at |
| 917 | // the end of input, it must come after the match. Otherwise, |
| 918 | // disable subsequent scans to avoid quadratic behavior. |
| 919 | matchlen += 5; // prefix "![", suffix "]]>" |
| 920 | if (subj->pos + matchlen > subj->input.len) { |
| 921 | subj->flags |= FLAG_SKIP_HTML_CDATA; |
| 922 | matchlen = 0; |
| 923 | } |
| 924 | } |
| 925 | } |
| 926 | } else if ((subj->flags & FLAG_SKIP_HTML_DECLARATION) == 0) { |
| 927 | matchlen = scan_html_declaration(&subj->input, subj->pos + 1); |
| 928 | if (matchlen > 0) { |
| 929 | matchlen += 2; // prefix "!", suffix ">" |
| 930 | if (subj->pos + matchlen > subj->input.len) { |
| 931 | subj->flags |= FLAG_SKIP_HTML_DECLARATION; |
| 932 | matchlen = 0; |
| 933 | } |
| 934 | } |
| 935 | } |
| 936 | } else if (c == '?') { |
| 937 | if ((subj->flags & FLAG_SKIP_HTML_PI) == 0) { |
| 938 | // Note that we allow an empty match. |
| 939 | matchlen = scan_html_pi(&subj->input, subj->pos + 1); |
| 940 | matchlen += 3; // prefix "?", suffix "?>" |
| 941 | if (subj->pos + matchlen > subj->input.len) { |
| 942 | subj->flags |= FLAG_SKIP_HTML_PI; |
| 943 | matchlen = 0; |
| 944 | } |
| 945 | } |
| 946 | } else { |
| 947 | matchlen = scan_html_tag(&subj->input, subj->pos); |
| 948 | } |
| 949 | } |
| 950 | if (matchlen > 0) { |
| 951 | const unsigned char *src = subj->input.data + subj->pos - 1; |
| 952 | bufsize_t len = matchlen + 1; |
| 953 | subj->pos += matchlen; |
| 954 | cmark_node *node = make_literal(subj, CMARK_NODE_HTML_INLINE, |
| 955 | subj->pos - matchlen - 1, subj->pos - 1); |
| 956 | node->data = (unsigned char *)subj->mem->realloc(NULL, len + 1); |
| 957 | memcpy(node->data, src, len); |
| 958 | node->data[len] = 0; |
| 959 | node->len = len; |
| 960 | adjust_subj_node_newlines(subj, node, matchlen, 1, options); |
| 961 | return node; |
| 962 | } |
| 963 | |
| 964 | // if nothing matches, just return the opening <: |
| 965 | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("<" )); |
| 966 | } |
| 967 | |
| 968 | // Parse a link label. Returns 1 if successful. |
| 969 | // Note: unescaped brackets are not allowed in labels. |
| 970 | // The label begins with `[` and ends with the first `]` character |
| 971 | // encountered. Backticks in labels do not start code spans. |
| 972 | static int link_label(subject *subj, cmark_chunk *raw_label) { |
| 973 | bufsize_t startpos = subj->pos; |
| 974 | int length = 0; |
| 975 | unsigned char c; |
| 976 | |
| 977 | // advance past [ |
| 978 | if (peek_char(subj) == '[') { |
| 979 | advance(subj); |
| 980 | } else { |
| 981 | return 0; |
| 982 | } |
| 983 | |
| 984 | while ((c = peek_char(subj)) && c != '[' && c != ']') { |
| 985 | if (c == '\\') { |
| 986 | advance(subj); |
| 987 | length++; |
| 988 | if (cmark_ispunct(peek_char(subj))) { |
| 989 | advance(subj); |
| 990 | length++; |
| 991 | } |
| 992 | } else { |
| 993 | advance(subj); |
| 994 | length++; |
| 995 | } |
| 996 | if (length > MAX_LINK_LABEL_LENGTH) { |
| 997 | goto noMatch; |
| 998 | } |
| 999 | } |
| 1000 | |
| 1001 | if (c == ']') { // match found |
| 1002 | *raw_label = |
| 1003 | cmark_chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1)); |
| 1004 | cmark_chunk_trim(raw_label); |
| 1005 | advance(subj); // advance past ] |
| 1006 | return 1; |
| 1007 | } |
| 1008 | |
| 1009 | noMatch: |
| 1010 | subj->pos = startpos; // rewind |
| 1011 | return 0; |
| 1012 | } |
| 1013 | |
| 1014 | static bufsize_t manual_scan_link_url_2(cmark_chunk *input, bufsize_t offset, |
| 1015 | cmark_chunk *output) { |
| 1016 | bufsize_t i = offset; |
| 1017 | size_t nb_p = 0; |
| 1018 | |
| 1019 | while (i < input->len) { |
| 1020 | if (input->data[i] == '\\' && |
| 1021 | i + 1 < input-> len && |
| 1022 | cmark_ispunct(input->data[i+1])) |
| 1023 | i += 2; |
| 1024 | else if (input->data[i] == '(') { |
| 1025 | ++nb_p; |
| 1026 | ++i; |
| 1027 | if (nb_p > 32) |
| 1028 | return -1; |
| 1029 | } else if (input->data[i] == ')') { |
| 1030 | if (nb_p == 0) |
| 1031 | break; |
| 1032 | --nb_p; |
| 1033 | ++i; |
| 1034 | } else if (cmark_isspace(input->data[i])) { |
| 1035 | if (i == offset) { |
| 1036 | return -1; |
| 1037 | } |
| 1038 | break; |
| 1039 | } else { |
| 1040 | ++i; |
| 1041 | } |
| 1042 | } |
| 1043 | |
| 1044 | if (i >= input->len || nb_p != 0) |
| 1045 | return -1; |
| 1046 | |
| 1047 | { |
| 1048 | cmark_chunk result = {input->data + offset, i - offset}; |
| 1049 | *output = result; |
| 1050 | } |
| 1051 | return i - offset; |
| 1052 | } |
| 1053 | |
| 1054 | static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset, |
| 1055 | cmark_chunk *output) { |
| 1056 | bufsize_t i = offset; |
| 1057 | |
| 1058 | if (i < input->len && input->data[i] == '<') { |
| 1059 | ++i; |
| 1060 | while (i < input->len) { |
| 1061 | if (input->data[i] == '>') { |
| 1062 | ++i; |
| 1063 | break; |
| 1064 | } else if (input->data[i] == '\\') |
| 1065 | i += 2; |
| 1066 | else if (input->data[i] == '\n' || input->data[i] == '<') |
| 1067 | return -1; |
| 1068 | else |
| 1069 | ++i; |
| 1070 | } |
| 1071 | } else { |
| 1072 | return manual_scan_link_url_2(input, offset, output); |
| 1073 | } |
| 1074 | |
| 1075 | if (i >= input->len) |
| 1076 | return -1; |
| 1077 | |
| 1078 | { |
| 1079 | cmark_chunk result = {input->data + offset + 1, i - 2 - offset}; |
| 1080 | *output = result; |
| 1081 | } |
| 1082 | return i - offset; |
| 1083 | } |
| 1084 | |
// Handle a ']' at the current position: try to close the most recent '[' or
// '![' opener (subj->last_bracket) as a link or an image.
//
// Three forms are tried in order:
//   1. an inline link:          [text](url "title")
//   2. a reference with label:  [text][label]
//   3. a reference whose label is the bracketed text itself, attempted only
//      when the opener is not flagged `bracket_after`.
//
// Returns a text node holding a literal "]" when no link or image is formed.
// On a match, the new link/image node is inserted into the tree before the
// opener's text node (which is then freed), the nodes that followed the
// opener become its children, and NULL is returned.
static cmark_node *handle_close_bracket(subject *subj) {
  bufsize_t initial_pos, after_link_text_pos;
  bufsize_t endurl, starttitle, endtitle, endall;
  bufsize_t sps, n;
  cmark_reference *ref = NULL;
  cmark_chunk url_chunk, title_chunk;
  unsigned char *url, *title;
  bracket *opener;
  cmark_node *inl;
  cmark_chunk raw_label;
  int found_label;
  cmark_node *tmp, *tmpnext;
  bool is_image;

  advance(subj); // advance past ]
  initial_pos = subj->pos;

  // get last [ or ![
  opener = subj->last_bracket;

  // No opener at all: the ']' is plain text.
  if (opener == NULL) {
    return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
  }

  // The opener was deactivated (see the loop at the end of this function):
  // discard it and treat the ']' as plain text.
  if (!opener->active) {
    // take delimiter off stack
    pop_bracket(subj);
    return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
  }

  // If we got here, we matched a potential link/image text.
  // Now we check to see if it's a link/image.
  is_image = opener->image;

  after_link_text_pos = subj->pos;

  // First, look for an inline link.
  // `sps` is the run of space characters after '(' and `n` is the length of
  // the destination that follows them.
  if (peek_char(subj) == '(' &&
      ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) &&
      ((n = manual_scan_link_url(&subj->input, subj->pos + 1 + sps,
                                 &url_chunk)) > -1)) {

    // try to parse an explicit link:
    endurl = subj->pos + 1 + sps + n;
    starttitle = endurl + scan_spacechars(&subj->input, endurl);

    // ensure there are spaces btw url and title
    // (with no separating space, the title is treated as empty)
    endtitle = (starttitle == endurl)
                   ? starttitle
                   : starttitle + scan_link_title(&subj->input, starttitle);

    endall = endtitle + scan_spacechars(&subj->input, endtitle);

    if (peek_at(subj, endall) == ')') {
      subj->pos = endall + 1;

      title_chunk =
          cmark_chunk_dup(&subj->input, starttitle, endtitle - starttitle);
      url = cmark_clean_url(subj->mem, &url_chunk);
      title = cmark_clean_title(subj->mem, &title_chunk);
      cmark_chunk_free(&url_chunk);
      cmark_chunk_free(&title_chunk);
      goto match;

    } else {
      // it could still be a shortcut reference link
      subj->pos = after_link_text_pos;
    }
  }

  // Next, look for a following [link label] that matches in refmap.
  raw_label = cmark_chunk_literal("");
  found_label = link_label(subj, &raw_label);
  if (!found_label) {
    // If we have a shortcut reference link, restore the position
    // to just after the ']'.
    subj->pos = initial_pos;
  }

  // No label, or an empty one ("[]"): fall back to using the bracketed text
  // itself as the label, unless the opener is flagged bracket_after.
  if ((!found_label || raw_label.len == 0) && !opener->bracket_after) {
    cmark_chunk_free(&raw_label);
    raw_label = cmark_chunk_dup(&subj->input, opener->position,
                                initial_pos - opener->position - 1);
    found_label = true;
  }

  if (found_label) {
    ref = cmark_reference_lookup(subj->refmap, &raw_label);
    cmark_chunk_free(&raw_label);
  }

  if (ref != NULL) { // found
    // The node gets its own copies of the reference's url and title.
    url = cmark_strdup(subj->mem, ref->url);
    title = cmark_strdup(subj->mem, ref->title);
    goto match;
  } else {
    goto noMatch;
  }

noMatch:
  // If we fall through to here, it means we didn't match a link:
  pop_bracket(subj); // remove this opener from delimiter list
  subj->pos = initial_pos;
  return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));

match:
  inl = make_simple(subj->mem, is_image ? CMARK_NODE_IMAGE : CMARK_NODE_LINK);
  inl->as.link.url = url;
  inl->as.link.title = title;
  inl->start_line = inl->end_line = subj->line;
  inl->start_column = opener->inl_text->start_column;
  inl->end_column = subj->pos + subj->column_offset + subj->block_offset;
  cmark_node_insert_before(opener->inl_text, inl);
  // Add link text: every sibling after the opener's text node is moved
  // under the new link/image node. `tmpnext` is saved first because
  // unlinking `tmp` detaches it from its siblings.
  tmp = opener->inl_text->next;
  while (tmp) {
    tmpnext = tmp->next;
    cmark_node_unlink(tmp);
    append_child(inl, tmp);
    tmp = tmpnext;
  }

  // Free the bracket [:
  cmark_node_free(opener->inl_text);

  // `opener` is read here and must therefore not be popped before this call.
  process_emphasis(subj, opener->previous_delimiter);
  pop_bracket(subj);

  // Now, if we have a link, we also want to deactivate earlier link
  // delimiters. (This code can be removed if we decide to allow links
  // inside links.) Image openers are left untouched, and the walk stops at
  // the first link opener that is already inactive.
  if (!is_image) {
    opener = subj->last_bracket;
    while (opener != NULL) {
      if (!opener->image) {
        if (!opener->active) {
          break;
        } else {
          opener->active = false;
        }
      }
      opener = opener->previous;
    }
  }

  return NULL;
}
| 1234 | |
| 1235 | // Parse a hard or soft linebreak, returning an inline. |
| 1236 | // Assumes the subject has a cr or newline at the current position. |
| 1237 | static cmark_node *handle_newline(subject *subj) { |
| 1238 | bufsize_t nlpos = subj->pos; |
| 1239 | // skip over cr, crlf, or lf: |
| 1240 | if (peek_at(subj, subj->pos) == '\r') { |
| 1241 | advance(subj); |
| 1242 | } |
| 1243 | if (peek_at(subj, subj->pos) == '\n') { |
| 1244 | advance(subj); |
| 1245 | } |
| 1246 | ++subj->line; |
| 1247 | subj->column_offset = -subj->pos; |
| 1248 | // skip spaces at beginning of line |
| 1249 | skip_spaces(subj); |
| 1250 | if (nlpos > 1 && peek_at(subj, nlpos - 1) == ' ' && |
| 1251 | peek_at(subj, nlpos - 2) == ' ') { |
| 1252 | return make_linebreak(subj->mem); |
| 1253 | } else { |
| 1254 | return make_softbreak(subj->mem); |
| 1255 | } |
| 1256 | } |
| 1257 | |
| 1258 | static bufsize_t subject_find_special_char(subject *subj, int options) { |
| 1259 | // "\r\n\\`&_*[]<!" |
| 1260 | static const int8_t SPECIAL_CHARS[256] = { |
| 1261 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1262 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, |
| 1263 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1264 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, |
| 1265 | 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1266 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1267 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1268 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1269 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1270 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1271 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; |
| 1272 | |
| 1273 | // " ' . - |
| 1274 | static const char SMART_PUNCT_CHARS[] = { |
| 1275 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1276 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, |
| 1277 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1278 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1279 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1280 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1281 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1282 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1283 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1284 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1285 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1286 | }; |
| 1287 | |
| 1288 | bufsize_t n = subj->pos + 1; |
| 1289 | |
| 1290 | while (n < subj->input.len) { |
| 1291 | if (SPECIAL_CHARS[subj->input.data[n]]) |
| 1292 | return n; |
| 1293 | if (options & CMARK_OPT_SMART && SMART_PUNCT_CHARS[subj->input.data[n]]) |
| 1294 | return n; |
| 1295 | n++; |
| 1296 | } |
| 1297 | |
| 1298 | return subj->input.len; |
| 1299 | } |
| 1300 | |
| 1301 | // Parse an inline, advancing subject, and add it as a child of parent. |
| 1302 | // Return 0 if no inline can be parsed, 1 otherwise. |
| 1303 | static int parse_inline(subject *subj, cmark_node *parent, int options) { |
| 1304 | cmark_node *new_inl = NULL; |
| 1305 | cmark_chunk contents; |
| 1306 | unsigned char c; |
| 1307 | bufsize_t startpos, endpos; |
| 1308 | c = peek_char(subj); |
| 1309 | if (c == 0) { |
| 1310 | return 0; |
| 1311 | } |
| 1312 | switch (c) { |
| 1313 | case '\r': |
| 1314 | case '\n': |
| 1315 | new_inl = handle_newline(subj); |
| 1316 | break; |
| 1317 | case '`': |
| 1318 | new_inl = handle_backticks(subj, options); |
| 1319 | break; |
| 1320 | case '\\': |
| 1321 | new_inl = handle_backslash(subj); |
| 1322 | break; |
| 1323 | case '&': |
| 1324 | new_inl = handle_entity(subj); |
| 1325 | break; |
| 1326 | case '<': |
| 1327 | new_inl = handle_pointy_brace(subj, options); |
| 1328 | break; |
| 1329 | case '*': |
| 1330 | case '_': |
| 1331 | case '\'': |
| 1332 | case '"': |
| 1333 | new_inl = handle_delim(subj, c, (options & CMARK_OPT_SMART) != 0); |
| 1334 | break; |
| 1335 | case '-': |
| 1336 | new_inl = handle_hyphen(subj, (options & CMARK_OPT_SMART) != 0); |
| 1337 | break; |
| 1338 | case '.': |
| 1339 | new_inl = handle_period(subj, (options & CMARK_OPT_SMART) != 0); |
| 1340 | break; |
| 1341 | case '[': |
| 1342 | advance(subj); |
| 1343 | new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("[" )); |
| 1344 | push_bracket(subj, false, new_inl); |
| 1345 | break; |
| 1346 | case ']': |
| 1347 | new_inl = handle_close_bracket(subj); |
| 1348 | break; |
| 1349 | case '!': |
| 1350 | advance(subj); |
| 1351 | if (peek_char(subj) == '[') { |
| 1352 | advance(subj); |
| 1353 | new_inl = make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal("![" )); |
| 1354 | push_bracket(subj, true, new_inl); |
| 1355 | } else { |
| 1356 | new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("!" )); |
| 1357 | } |
| 1358 | break; |
| 1359 | default: |
| 1360 | endpos = subject_find_special_char(subj, options); |
| 1361 | contents = cmark_chunk_dup(&subj->input, subj->pos, endpos - subj->pos); |
| 1362 | startpos = subj->pos; |
| 1363 | subj->pos = endpos; |
| 1364 | |
| 1365 | // if we're at a newline, strip trailing spaces. |
| 1366 | if (S_is_line_end_char(peek_char(subj))) { |
| 1367 | cmark_chunk_rtrim(&contents); |
| 1368 | } |
| 1369 | |
| 1370 | new_inl = make_str(subj, startpos, endpos - 1, contents); |
| 1371 | } |
| 1372 | if (new_inl != NULL) { |
| 1373 | append_child(parent, new_inl); |
| 1374 | } |
| 1375 | |
| 1376 | return 1; |
| 1377 | } |
| 1378 | |
| 1379 | // Parse inlines from parent's string_content, adding as children of parent. |
| 1380 | void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent, |
| 1381 | cmark_reference_map *refmap, int options) { |
| 1382 | subject subj; |
| 1383 | cmark_chunk content = {parent->data, parent->len}; |
| 1384 | subject_from_buf(mem, parent->start_line, parent->start_column - 1 + parent->internal_offset, &subj, &content, refmap); |
| 1385 | cmark_chunk_rtrim(&subj.input); |
| 1386 | |
| 1387 | while (!is_eof(&subj) && parse_inline(&subj, parent, options)) |
| 1388 | ; |
| 1389 | |
| 1390 | process_emphasis(&subj, NULL); |
| 1391 | // free bracket and delim stack |
| 1392 | while (subj.last_delim) { |
| 1393 | remove_delimiter(&subj, subj.last_delim); |
| 1394 | } |
| 1395 | while (subj.last_bracket) { |
| 1396 | pop_bracket(&subj); |
| 1397 | } |
| 1398 | } |
| 1399 | |
| 1400 | // Parse zero or more space characters, including at most one newline. |
| 1401 | static void spnl(subject *subj) { |
| 1402 | skip_spaces(subj); |
| 1403 | if (skip_line_end(subj)) { |
| 1404 | skip_spaces(subj); |
| 1405 | } |
| 1406 | } |
| 1407 | |
| 1408 | // Parse reference. Assumes string begins with '[' character. |
| 1409 | // Modify refmap if a reference is encountered. |
| 1410 | // Return 0 if no reference found, otherwise position of subject |
| 1411 | // after reference is parsed. |
| 1412 | bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_chunk *input, |
| 1413 | cmark_reference_map *refmap) { |
| 1414 | subject subj; |
| 1415 | |
| 1416 | cmark_chunk lab; |
| 1417 | cmark_chunk url; |
| 1418 | cmark_chunk title; |
| 1419 | |
| 1420 | bufsize_t matchlen = 0; |
| 1421 | bufsize_t beforetitle; |
| 1422 | |
| 1423 | subject_from_buf(mem, -1, 0, &subj, input, NULL); |
| 1424 | |
| 1425 | // parse label: |
| 1426 | if (!link_label(&subj, &lab) || lab.len == 0) |
| 1427 | return 0; |
| 1428 | |
| 1429 | // colon: |
| 1430 | if (peek_char(&subj) == ':') { |
| 1431 | advance(&subj); |
| 1432 | } else { |
| 1433 | return 0; |
| 1434 | } |
| 1435 | |
| 1436 | // parse link url: |
| 1437 | spnl(&subj); |
| 1438 | if ((matchlen = manual_scan_link_url(&subj.input, subj.pos, &url)) > -1) { |
| 1439 | subj.pos += matchlen; |
| 1440 | } else { |
| 1441 | return 0; |
| 1442 | } |
| 1443 | |
| 1444 | // parse optional link_title |
| 1445 | beforetitle = subj.pos; |
| 1446 | spnl(&subj); |
| 1447 | matchlen = subj.pos == beforetitle ? 0 : scan_link_title(&subj.input, subj.pos); |
| 1448 | if (matchlen) { |
| 1449 | title = cmark_chunk_dup(&subj.input, subj.pos, matchlen); |
| 1450 | subj.pos += matchlen; |
| 1451 | } else { |
| 1452 | subj.pos = beforetitle; |
| 1453 | title = cmark_chunk_literal("" ); |
| 1454 | } |
| 1455 | |
| 1456 | // parse final spaces and newline: |
| 1457 | skip_spaces(&subj); |
| 1458 | if (!skip_line_end(&subj)) { |
| 1459 | if (matchlen) { // try rewinding before title |
| 1460 | subj.pos = beforetitle; |
| 1461 | skip_spaces(&subj); |
| 1462 | if (!skip_line_end(&subj)) { |
| 1463 | return 0; |
| 1464 | } |
| 1465 | } else { |
| 1466 | return 0; |
| 1467 | } |
| 1468 | } |
| 1469 | // insert reference into refmap |
| 1470 | cmark_reference_create(refmap, &lab, &url, &title); |
| 1471 | return subj.pos; |
| 1472 | } |
| 1473 | |