1/*
2 * MD4C: Markdown parser for C
3 * (http://github.com/mity/md4c)
4 *
5 * Copyright (c) 2016-2020 Martin Mitas
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 */
25
26#include "md4c.h"
27
28#include <limits.h>
29#include <stdio.h>
30#include <stdlib.h>
31#include <string.h>
32
33
34/*****************************
35 *** Miscellaneous Stuff ***
36 *****************************/
37
#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L
    /* "inline" became a keyword only in C99 (__STDC_VERSION__ == 199901L).
     * C89/C90 and C94 (199409L) compilers, or old compilers in general, may
     * not understand it, so map it to a vendor-specific spelling or drop it. */
    #if defined __GNUC__
        #define inline __inline__
    #elif defined _MSC_VER
        #define inline __inline
    #else
        #define inline
    #endif
#endif
48
/* UTF-8 is the default encoding unless the build explicitly selects one. */
#if !(defined MD4C_USE_ASCII || defined MD4C_USE_UTF8 || defined MD4C_USE_UTF16)
    #define MD4C_USE_UTF8
#endif

/* _T(x) produces a character or string literal matching the CHAR type in
 * use: a wide literal with MD4C_USE_UTF16, a plain one otherwise. Any
 * definition of _T inherited from elsewhere is dropped first. */
#undef _T
#if defined MD4C_USE_UTF16
    #define _T(x)           L##x
#else
    #define _T(x)           x
#endif
63
/* Misc. macros. */

/* Count of elements of a real array (not of a pointer). The argument is
 * parenthesized so that any array-typed expression may be passed. */
#define SIZEOF_ARRAY(a)     (sizeof(a) / sizeof((a)[0]))

/* Turn the macro-expanded argument into a string literal. */
#define STRINGIZE_(x)       #x
#define STRINGIZE(x)        STRINGIZE_(x)

#ifndef TRUE
    #define TRUE            1
    #define FALSE           0
#endif
74
75
76/************************
77 *** Internal Types ***
78 ************************/
79
/* These are omnipresent, so let's save some typing. */
#define CHAR    MD_CHAR
#define SZ      MD_SIZE
#define OFF     MD_OFFSET

/* Forward declarations of structures used throughout this file. */
typedef struct MD_BLOCK_tag MD_BLOCK;
typedef struct MD_CONTAINER_tag MD_CONTAINER;
typedef struct MD_MARK_tag MD_MARK;
typedef struct MD_REF_DEF_tag MD_REF_DEF;
89
90
/* While analyzing inline marks we have to manage several "mark chains" of
 * (so far unresolved) openers. This structure records both ends of one such
 * chain; the links between the marks themselves are realized through
 * MD_MARK::prev and ::next.
 */
typedef struct MD_MARKCHAIN_tag {
    int head;   /* Index of the first mark in the chain, or -1 if empty. */
    int tail;   /* Index of the last mark in the chain, or -1 if empty. */
} MD_MARKCHAIN;
100
101/* Context propagated through all the parsing. */
102typedef struct MD_CTX_tag MD_CTX;
103struct MD_CTX_tag {
104 /* Immutable stuff (parameters of md_parse()). */
105 const CHAR* text;
106 SZ size;
107 MD_PARSER parser;
108 void* userdata;
109
110 /* When this is true, it allows some optimizations. */
111 int doc_ends_with_newline;
112
113 /* Helper temporary growing buffer. */
114 CHAR* buffer;
115 unsigned alloc_buffer;
116
117 /* Reference definitions. */
118 MD_REF_DEF* ref_defs;
119 int n_ref_defs;
120 int alloc_ref_defs;
121 void** ref_def_hashtable;
122 int ref_def_hashtable_size;
123
124 /* Stack of inline/span markers.
125 * This is only used for parsing a single block contents but by storing it
126 * here we may reuse the stack for subsequent blocks; i.e. we have fewer
127 * (re)allocations. */
128 MD_MARK* marks;
129 int n_marks;
130 int alloc_marks;
131
132#if defined MD4C_USE_UTF16
133 char mark_char_map[128];
134#else
135 char mark_char_map[256];
136#endif
137
138 /* For resolving of inline spans. */
139 MD_MARKCHAIN mark_chains[13];
140#define PTR_CHAIN ctx->mark_chains[0]
141#define TABLECELLBOUNDARIES ctx->mark_chains[1]
142#define ASTERISK_OPENERS_extraword_mod3_0 ctx->mark_chains[2]
143#define ASTERISK_OPENERS_extraword_mod3_1 ctx->mark_chains[3]
144#define ASTERISK_OPENERS_extraword_mod3_2 ctx->mark_chains[4]
145#define ASTERISK_OPENERS_intraword_mod3_0 ctx->mark_chains[5]
146#define ASTERISK_OPENERS_intraword_mod3_1 ctx->mark_chains[6]
147#define ASTERISK_OPENERS_intraword_mod3_2 ctx->mark_chains[7]
148#define UNDERSCORE_OPENERS ctx->mark_chains[8]
149#define TILDE_OPENERS_1 ctx->mark_chains[9]
150#define TILDE_OPENERS_2 ctx->mark_chains[10]
151#define BRACKET_OPENERS ctx->mark_chains[11]
152#define DOLLAR_OPENERS ctx->mark_chains[12]
153#define OPENERS_CHAIN_FIRST 2
154#define OPENERS_CHAIN_LAST 12
155
156 int n_table_cell_boundaries;
157
158 /* For resolving links. */
159 int unresolved_link_head;
160 int unresolved_link_tail;
161
162 /* For resolving raw HTML. */
163 OFF html_comment_horizon;
164 OFF html_proc_instr_horizon;
165 OFF html_decl_horizon;
166 OFF html_cdata_horizon;
167
168 /* For block analysis.
169 * Notes:
170 * -- It holds MD_BLOCK as well as MD_LINE structures. After each
171 * MD_BLOCK, its (multiple) MD_LINE(s) follow.
172 * -- For MD_BLOCK_HTML and MD_BLOCK_CODE, MD_VERBATIMLINE(s) are used
173 * instead of MD_LINE(s).
174 */
175 void* block_bytes;
176 MD_BLOCK* current_block;
177 int n_block_bytes;
178 int alloc_block_bytes;
179
180 /* For container block analysis. */
181 MD_CONTAINER* containers;
182 int n_containers;
183 int alloc_containers;
184
185 /* Minimal indentation to call the block "indented code block". */
186 unsigned code_indent_offset;
187
188 /* Contextual info for line analysis. */
189 SZ code_fence_length; /* For checking closing fence length. */
190 int html_block_type; /* For checking closing raw HTML condition. */
191 int last_line_has_list_loosening_effect;
192 int last_list_item_starts_with_two_blank_lines;
193};
194
/* Kinds of lines distinguished by the line analysis. */
typedef enum MD_LINETYPE_tag {
    MD_LINE_BLANK,
    MD_LINE_HR,
    MD_LINE_ATXHEADER,
    MD_LINE_SETEXTHEADER,
    MD_LINE_SETEXTUNDERLINE,
    MD_LINE_INDENTEDCODE,
    MD_LINE_FENCEDCODE,
    MD_LINE_HTML,
    MD_LINE_TEXT,
    MD_LINE_TABLE,
    MD_LINE_TABLEUNDERLINE
} MD_LINETYPE;
209
210typedef struct MD_LINE_ANALYSIS_tag MD_LINE_ANALYSIS;
211struct MD_LINE_ANALYSIS_tag {
212 MD_LINETYPE type : 16;
213 unsigned data : 16;
214 OFF beg;
215 OFF end;
216 unsigned indent; /* Indentation level. */
217};
218
219typedef struct MD_LINE_tag MD_LINE;
220struct MD_LINE_tag {
221 OFF beg;
222 OFF end;
223};
224
225typedef struct MD_VERBATIMLINE_tag MD_VERBATIMLINE;
226struct MD_VERBATIMLINE_tag {
227 OFF beg;
228 OFF end;
229 OFF indent;
230};
231
232
233/*******************
234 *** Debugging ***
235 *******************/
236
/* Pass the message to the debug_log() callback of the parser, if one is set.
 * Expects a variable named 'ctx' to be in scope. */
#define MD_LOG(msg)                                                     \
    do {                                                                \
        if(ctx->parser.debug_log != NULL)                               \
            ctx->parser.debug_log((msg), ctx->userdata);                \
    } while(0)

#ifdef DEBUG
    /* Debug build: a failed assertion is logged and the process exits. */
    #define MD_ASSERT(cond)                                             \
        do {                                                            \
            if(!(cond)) {                                               \
                MD_LOG(__FILE__ ":" STRINGIZE(__LINE__) ": "            \
                       "Assertion '" STRINGIZE(cond) "' failed.");      \
                exit(1);                                                \
            }                                                           \
        } while(0)

    #define MD_UNREACHABLE()        MD_ASSERT(1 == 0)
#else
    /* Non-debug build: where the compiler supports it, the condition is
     * turned into an optimization hint; otherwise the macros do nothing. */
    #ifdef __GNUC__
        #define MD_ASSERT(cond)     do { if(!(cond)) __builtin_unreachable(); } while(0)
        #define MD_UNREACHABLE()    do { __builtin_unreachable(); } while(0)
    #elif defined _MSC_VER  &&  _MSC_VER > 120
        #define MD_ASSERT(cond)     do { __assume(cond); } while(0)
        #define MD_UNREACHABLE()    do { __assume(0); } while(0)
    #else
        #define MD_ASSERT(cond)     do {} while(0)
        #define MD_UNREACHABLE()    do {} while(0)
    #endif
#endif
266
267
268/*****************
269 *** Helpers ***
270 *****************/
271
/* Character accessors; they expect a variable named 'ctx' to be in scope. */
#define CH(off)                 (ctx->text[(off)])
#define STR(off)                (ctx->text + (off))

/* Check whether the pointer points into ctx->text. */
#define IS_INPUT_STR(ptr)       (ctx->text <= (ptr)  &&  (ptr) < (ctx->text + ctx->size))

/* strchr() flavor matching the CHAR type in use. */
#if defined MD4C_USE_UTF16
    #define md_strchr wcschr
#else
    #define md_strchr strchr
#endif

/* Character classification of a character value.
 * Note we assume ASCII compatibility of code points < 128 here.
 * Beware that most of these macros evaluate 'ch' more than once. */
#define ISIN_(ch, ch_min, ch_max)       ((ch_min) <= (unsigned)(ch) && (unsigned)(ch) <= (ch_max))
#define ISANYOF_(ch, palette)           (md_strchr((palette), (ch)) != NULL)
#define ISANYOF2_(ch, ch1, ch2)         ((ch) == (ch1) || (ch) == (ch2))
#define ISANYOF3_(ch, ch1, ch2, ch3)    ((ch) == (ch1) || (ch) == (ch2) || (ch) == (ch3))
#define ISASCII_(ch)                    ((unsigned)(ch) <= 127)
#define ISBLANK_(ch)                    (ISANYOF2_((ch), _T(' '), _T('\t')))
#define ISNEWLINE_(ch)                  (ISANYOF2_((ch), _T('\r'), _T('\n')))
#define ISWHITESPACE_(ch)               (ISBLANK_(ch) || ISANYOF2_((ch), _T('\v'), _T('\f')))
#define ISCNTRL_(ch)                    ((unsigned)(ch) <= 31 || (unsigned)(ch) == 127)
#define ISPUNCT_(ch)                    (ISIN_(ch, 33, 47) || ISIN_(ch, 58, 64) || ISIN_(ch, 91, 96) || ISIN_(ch, 123, 126))
#define ISUPPER_(ch)                    (ISIN_(ch, _T('A'), _T('Z')))
#define ISLOWER_(ch)                    (ISIN_(ch, _T('a'), _T('z')))
#define ISALPHA_(ch)                    (ISUPPER_(ch) || ISLOWER_(ch))
#define ISDIGIT_(ch)                    (ISIN_(ch, _T('0'), _T('9')))
#define ISXDIGIT_(ch)                   (ISDIGIT_(ch) || ISIN_(ch, _T('A'), _T('F')) || ISIN_(ch, _T('a'), _T('f')))
#define ISALNUM_(ch)                    (ISALPHA_(ch) || ISDIGIT_(ch))

/* The same classification, applied to the character at the given offset of
 * the input text. */
#define ISANYOF(off, palette)           ISANYOF_(CH(off), (palette))
#define ISANYOF2(off, ch1, ch2)         ISANYOF2_(CH(off), (ch1), (ch2))
#define ISANYOF3(off, ch1, ch2, ch3)    ISANYOF3_(CH(off), (ch1), (ch2), (ch3))
#define ISASCII(off)                    ISASCII_(CH(off))
#define ISBLANK(off)                    ISBLANK_(CH(off))
#define ISNEWLINE(off)                  ISNEWLINE_(CH(off))
#define ISWHITESPACE(off)               ISWHITESPACE_(CH(off))
#define ISCNTRL(off)                    ISCNTRL_(CH(off))
#define ISPUNCT(off)                    ISPUNCT_(CH(off))
#define ISUPPER(off)                    ISUPPER_(CH(off))
#define ISLOWER(off)                    ISLOWER_(CH(off))
#define ISALPHA(off)                    ISALPHA_(CH(off))
#define ISDIGIT(off)                    ISDIGIT_(CH(off))
#define ISXDIGIT(off)                   ISXDIGIT_(CH(off))
#define ISALNUM(off)                    ISALNUM_(CH(off))
320
321
322/* Case insensitive check of string equality. */
323static inline int
324md_ascii_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
325{
326 OFF i;
327 for(i = 0; i < n; i++) {
328 CHAR ch1 = s1[i];
329 CHAR ch2 = s2[i];
330
331 if(ISLOWER_(ch1))
332 ch1 += ('A'-'a');
333 if(ISLOWER_(ch2))
334 ch2 += ('A'-'a');
335 if(ch1 != ch2)
336 return FALSE;
337 }
338 return TRUE;
339}
340
341static inline int
342md_ascii_eq(const CHAR* s1, const CHAR* s2, SZ n)
343{
344 return memcmp(s1, s2, n * sizeof(CHAR)) == 0;
345}
346
347static int
348md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size)
349{
350 OFF off = 0;
351 int ret = 0;
352
353 while(1) {
354 while(off < size && str[off] != _T('\0'))
355 off++;
356
357 if(off > 0) {
358 ret = ctx->parser.text(type, str, off, ctx->userdata);
359 if(ret != 0)
360 return ret;
361
362 str += off;
363 size -= off;
364 off = 0;
365 }
366
367 if(off >= size)
368 return 0;
369
370 ret = ctx->parser.text(MD_TEXT_NULLCHAR, _T(""), 1, ctx->userdata);
371 if(ret != 0)
372 return ret;
373 off++;
374 }
375}
376
377
/* Store the value of the expression in 'ret' and jump to the 'abort' label
 * if it is negative. The enclosing function has to provide both. */
#define MD_CHECK(func)                                                  \
    do {                                                                \
        if((ret = (func)) < 0)                                          \
            goto abort;                                                 \
    } while(0)
384
385
/* Make sure ctx->buffer can hold at least 'sz' bytes (the size is passed to
 * realloc() as is). The buffer is only ever grown; the new size adds some
 * slack to the request and is rounded down to a multiple of 128.
 *
 * On allocation failure the old buffer stays valid, 'ret' is set to -1 and
 * control jumps to the 'abort' label. The enclosing function has to provide
 * 'ctx', 'ret' and 'abort'. Note that 'sz' is evaluated more than once. */
#define MD_TEMP_BUFFER(sz)                                              \
    do {                                                                \
        if((sz) > ctx->alloc_buffer) {                                  \
            CHAR* new_buffer;                                           \
            SZ new_size = ((sz) + (sz) / 2 + 128) & ~127;               \
                                                                        \
            new_buffer = realloc(ctx->buffer, new_size);                \
            if(new_buffer == NULL) {                                    \
                MD_LOG("realloc() failed.");                            \
                ret = -1;                                               \
                goto abort;                                             \
            }                                                           \
                                                                        \
            ctx->buffer = new_buffer;                                   \
            ctx->alloc_buffer = new_size;                               \
        }                                                               \
    } while(0)
403
404
/* Wrappers of the block/span callbacks of the parser. Each of them stores the
 * callback's return value in 'ret'; if it is non-zero, a message is logged
 * and control jumps to the 'abort' label. The enclosing function has to
 * provide 'ctx', 'ret' and 'abort'. */

#define MD_ENTER_BLOCK(type, arg)                                       \
    do {                                                                \
        ret = ctx->parser.enter_block((type), (arg), ctx->userdata);    \
        if(ret != 0) {                                                  \
            MD_LOG("Aborted from enter_block() callback.");             \
            goto abort;                                                 \
        }                                                               \
    } while(0)

#define MD_LEAVE_BLOCK(type, arg)                                       \
    do {                                                                \
        ret = ctx->parser.leave_block((type), (arg), ctx->userdata);    \
        if(ret != 0) {                                                  \
            MD_LOG("Aborted from leave_block() callback.");             \
            goto abort;                                                 \
        }                                                               \
    } while(0)

#define MD_ENTER_SPAN(type, arg)                                        \
    do {                                                                \
        ret = ctx->parser.enter_span((type), (arg), ctx->userdata);     \
        if(ret != 0) {                                                  \
            MD_LOG("Aborted from enter_span() callback.");              \
            goto abort;                                                 \
        }                                                               \
    } while(0)

#define MD_LEAVE_SPAN(type, arg)                                        \
    do {                                                                \
        ret = ctx->parser.leave_span((type), (arg), ctx->userdata);     \
        if(ret != 0) {                                                  \
            MD_LOG("Aborted from leave_span() callback.");              \
            goto abort;                                                 \
        }                                                               \
    } while(0)
440
/* Pass a non-empty text to the text() callback. If the callback returns
 * non-zero, the value is left in 'ret', a message is logged and control jumps
 * to the 'abort' label. The enclosing function has to provide 'ctx', 'ret'
 * and 'abort'. Note that 'size' is evaluated more than once. */
#define MD_TEXT(type, str, size)                                        \
    do {                                                                \
        if((size) > 0) {                                                \
            ret = ctx->parser.text((type), (str), (size), ctx->userdata); \
            if(ret != 0) {                                              \
                MD_LOG("Aborted from text() callback.");                \
                goto abort;                                             \
            }                                                           \
        }                                                               \
    } while(0)

/* The same as MD_TEXT, but for text which may contain NUL characters: it is
 * routed through md_text_with_null_replacement(). */
#define MD_TEXT_INSECURE(type, str, size)                               \
    do {                                                                \
        if((size) > 0) {                                                \
            ret = md_text_with_null_replacement(ctx, (type), (str), (size)); \
            if(ret != 0) {                                              \
                MD_LOG("Aborted from text() callback.");                \
                goto abort;                                             \
            }                                                           \
        }                                                               \
    } while(0)
462
463
464
465/*************************
466 *** Unicode Support ***
467 *************************/
468
/* Result of case folding of a single codepoint: a sequence of one to three
 * codepoints. */
typedef struct MD_UNICODE_FOLD_INFO_tag {
    unsigned codepoints[3];     /* The folded sequence. */
    int n_codepoints;           /* Count of valid items in codepoints[]. */
} MD_UNICODE_FOLD_INFO;
474
475
476#if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8
/* Binary search in a sorted "map" of codepoints.
 *
 * A map record is either a single codepoint, or a range of consecutive
 * codepoints encoded as two adjacent items: (MIN_CODEPOINT | 0x40000000)
 * followed by (MAX_CODEPOINT | 0x80000000).
 *
 * Returns the index of the matching record in the map (for a range, the
 * index of its MIN item), or -1 if the codepoint is not covered by the map. */
static int
md_unicode_bsearch__(unsigned codepoint, const unsigned* map, size_t map_size)
{
    int lo = 0;
    int hi = (int) map_size - 1;

    while(lo <= hi) {
        const int mid = (lo + hi) / 2;
        const unsigned item = map[mid];
        int first = mid;
        int last = mid;

        /* The pivot may be one half of a range; widen it to the whole one. */
        if(item & 0x40000000)
            last = mid + 1;
        if(item & 0x80000000)
            first = mid - 1;

        if(codepoint < (map[first] & 0x00ffffff)) {
            hi = first - 1;
            continue;
        }
        if(codepoint > (map[last] & 0x00ffffff)) {
            lo = last + 1;
            continue;
        }

        return first;
    }

    return -1;
}
509
/* Check whether the codepoint is a whitespace.
 *
 * Codepoints up to 0x7f are classified with ISWHITESPACE_(); the other ones
 * are looked up in the table of the Unicode "Zs" category.
 * Returns non-zero for whitespace, zero otherwise. */
static int
md_is_unicode_whitespace__(unsigned codepoint)
{
#define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
#define S(cp)               (cp)
    /* Unicode "Zs" category.
     * (generated by scripts/build_whitespace_map.py) */
    static const unsigned WHITESPACE_MAP[] = {
        S(0x0020), S(0x00a0), S(0x1680), R(0x2000,0x200a), S(0x202f), S(0x205f), S(0x3000)
    };
#undef R
#undef S

    if(codepoint > 0x7f)
        return (md_unicode_bsearch__(codepoint, WHITESPACE_MAP, SIZEOF_ARRAY(WHITESPACE_MAP)) >= 0);

    /* The ASCII ones are the most frequently used ones, also CommonMark
     * specification requests few more in this range. */
    return ISWHITESPACE_(codepoint);
}
530
/* Check whether the codepoint is a punctuation character.
 *
 * Codepoints up to 0x7f are classified with ISPUNCT_(); the other ones are
 * looked up in the table of the Unicode punctuation categories.
 * Returns non-zero for punctuation, zero otherwise. */
static int
md_is_unicode_punct__(unsigned codepoint)
{
#define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
#define S(cp)               (cp)
    /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories.
     * (generated by scripts/build_punct_map.py) */
    static const unsigned PUNCT_MAP[] = {
        R(0x0021,0x0023), R(0x0025,0x002a), R(0x002c,0x002f), R(0x003a,0x003b), R(0x003f,0x0040),
        R(0x005b,0x005d), S(0x005f), S(0x007b), S(0x007d), S(0x00a1), S(0x00a7), S(0x00ab), R(0x00b6,0x00b7),
        S(0x00bb), S(0x00bf), S(0x037e), S(0x0387), R(0x055a,0x055f), R(0x0589,0x058a), S(0x05be), S(0x05c0),
        S(0x05c3), S(0x05c6), R(0x05f3,0x05f4), R(0x0609,0x060a), R(0x060c,0x060d), S(0x061b), R(0x061e,0x061f),
        R(0x066a,0x066d), S(0x06d4), R(0x0700,0x070d), R(0x07f7,0x07f9), R(0x0830,0x083e), S(0x085e),
        R(0x0964,0x0965), S(0x0970), S(0x09fd), S(0x0a76), S(0x0af0), S(0x0c77), S(0x0c84), S(0x0df4), S(0x0e4f),
        R(0x0e5a,0x0e5b), R(0x0f04,0x0f12), S(0x0f14), R(0x0f3a,0x0f3d), S(0x0f85), R(0x0fd0,0x0fd4),
        R(0x0fd9,0x0fda), R(0x104a,0x104f), S(0x10fb), R(0x1360,0x1368), S(0x1400), S(0x166e), R(0x169b,0x169c),
        R(0x16eb,0x16ed), R(0x1735,0x1736), R(0x17d4,0x17d6), R(0x17d8,0x17da), R(0x1800,0x180a),
        R(0x1944,0x1945), R(0x1a1e,0x1a1f), R(0x1aa0,0x1aa6), R(0x1aa8,0x1aad), R(0x1b5a,0x1b60),
        R(0x1bfc,0x1bff), R(0x1c3b,0x1c3f), R(0x1c7e,0x1c7f), R(0x1cc0,0x1cc7), S(0x1cd3), R(0x2010,0x2027),
        R(0x2030,0x2043), R(0x2045,0x2051), R(0x2053,0x205e), R(0x207d,0x207e), R(0x208d,0x208e),
        R(0x2308,0x230b), R(0x2329,0x232a), R(0x2768,0x2775), R(0x27c5,0x27c6), R(0x27e6,0x27ef),
        R(0x2983,0x2998), R(0x29d8,0x29db), R(0x29fc,0x29fd), R(0x2cf9,0x2cfc), R(0x2cfe,0x2cff), S(0x2d70),
        R(0x2e00,0x2e2e), R(0x2e30,0x2e4f), R(0x3001,0x3003), R(0x3008,0x3011), R(0x3014,0x301f), S(0x3030),
        S(0x303d), S(0x30a0), S(0x30fb), R(0xa4fe,0xa4ff), R(0xa60d,0xa60f), S(0xa673), S(0xa67e),
        R(0xa6f2,0xa6f7), R(0xa874,0xa877), R(0xa8ce,0xa8cf), R(0xa8f8,0xa8fa), S(0xa8fc), R(0xa92e,0xa92f),
        S(0xa95f), R(0xa9c1,0xa9cd), R(0xa9de,0xa9df), R(0xaa5c,0xaa5f), R(0xaade,0xaadf), R(0xaaf0,0xaaf1),
        S(0xabeb), R(0xfd3e,0xfd3f), R(0xfe10,0xfe19), R(0xfe30,0xfe52), R(0xfe54,0xfe61), S(0xfe63), S(0xfe68),
        R(0xfe6a,0xfe6b), R(0xff01,0xff03), R(0xff05,0xff0a), R(0xff0c,0xff0f), R(0xff1a,0xff1b),
        R(0xff1f,0xff20), R(0xff3b,0xff3d), S(0xff3f), S(0xff5b), S(0xff5d), R(0xff5f,0xff65), R(0x10100,0x10102),
        S(0x1039f), S(0x103d0), S(0x1056f), S(0x10857), S(0x1091f), S(0x1093f), R(0x10a50,0x10a58), S(0x10a7f),
        R(0x10af0,0x10af6), R(0x10b39,0x10b3f), R(0x10b99,0x10b9c), R(0x10f55,0x10f59), R(0x11047,0x1104d),
        R(0x110bb,0x110bc), R(0x110be,0x110c1), R(0x11140,0x11143), R(0x11174,0x11175), R(0x111c5,0x111c8),
        S(0x111cd), S(0x111db), R(0x111dd,0x111df), R(0x11238,0x1123d), S(0x112a9), R(0x1144b,0x1144f),
        S(0x1145b), S(0x1145d), S(0x114c6), R(0x115c1,0x115d7), R(0x11641,0x11643), R(0x11660,0x1166c),
        R(0x1173c,0x1173e), S(0x1183b), S(0x119e2), R(0x11a3f,0x11a46), R(0x11a9a,0x11a9c), R(0x11a9e,0x11aa2),
        R(0x11c41,0x11c45), R(0x11c70,0x11c71), R(0x11ef7,0x11ef8), S(0x11fff), R(0x12470,0x12474),
        R(0x16a6e,0x16a6f), S(0x16af5), R(0x16b37,0x16b3b), S(0x16b44), R(0x16e97,0x16e9a), S(0x16fe2),
        S(0x1bc9f), R(0x1da87,0x1da8b), R(0x1e95e,0x1e95f)
    };
#undef R
#undef S

    if(codepoint > 0x7f)
        return (md_unicode_bsearch__(codepoint, PUNCT_MAP, SIZEOF_ARRAY(PUNCT_MAP)) >= 0);

    /* The ASCII ones are the most frequently used ones, also CommonMark
     * specification requests few more in this range. */
    return ISPUNCT_(codepoint);
}
580
581 static void
582 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
583 {
584#define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
585#define S(cp) (cp)
586 /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories.
587 * (generated by scripts/build_punct_map.py) */
588 static const unsigned FOLD_MAP_1[] = {
589 R(0x0041,0x005a), S(0x00b5), R(0x00c0,0x00d6), R(0x00d8,0x00de), R(0x0100,0x012e), R(0x0132,0x0136),
590 R(0x0139,0x0147), R(0x014a,0x0176), S(0x0178), R(0x0179,0x017d), S(0x017f), S(0x0181), S(0x0182),
591 S(0x0186), S(0x0187), S(0x0189), S(0x018b), S(0x018e), S(0x018f), S(0x0190), S(0x0191), S(0x0193),
592 S(0x0194), S(0x0196), S(0x0197), S(0x0198), S(0x019c), S(0x019d), S(0x019f), R(0x01a0,0x01a4), S(0x01a6),
593 S(0x01a7), S(0x01a9), S(0x01ac), S(0x01ae), S(0x01af), S(0x01b1), S(0x01b3), S(0x01b7), S(0x01b8),
594 S(0x01bc), S(0x01c4), S(0x01c5), S(0x01c7), S(0x01c8), S(0x01ca), R(0x01cb,0x01db), R(0x01de,0x01ee),
595 S(0x01f1), S(0x01f2), S(0x01f6), S(0x01f7), R(0x01f8,0x021e), S(0x0220), R(0x0222,0x0232), S(0x023a),
596 S(0x023b), S(0x023d), S(0x023e), S(0x0241), S(0x0243), S(0x0244), S(0x0245), R(0x0246,0x024e), S(0x0345),
597 S(0x0370), S(0x0376), S(0x037f), S(0x0386), R(0x0388,0x038a), S(0x038c), S(0x038e), R(0x0391,0x03a1),
598 R(0x03a3,0x03ab), S(0x03c2), S(0x03cf), S(0x03d0), S(0x03d1), S(0x03d5), S(0x03d6), R(0x03d8,0x03ee),
599 S(0x03f0), S(0x03f1), S(0x03f4), S(0x03f5), S(0x03f7), S(0x03f9), S(0x03fa), R(0x03fd,0x03ff),
600 R(0x0400,0x040f), R(0x0410,0x042f), R(0x0460,0x0480), R(0x048a,0x04be), S(0x04c0), R(0x04c1,0x04cd),
601 R(0x04d0,0x052e), R(0x0531,0x0556), R(0x10a0,0x10c5), S(0x10c7), S(0x10cd), R(0x13f8,0x13fd), S(0x1c80),
602 S(0x1c81), S(0x1c82), S(0x1c83), S(0x1c85), S(0x1c86), S(0x1c87), S(0x1c88), R(0x1c90,0x1cba),
603 R(0x1cbd,0x1cbf), R(0x1e00,0x1e94), S(0x1e9b), R(0x1ea0,0x1efe), R(0x1f08,0x1f0f), R(0x1f18,0x1f1d),
604 R(0x1f28,0x1f2f), R(0x1f38,0x1f3f), R(0x1f48,0x1f4d), S(0x1f59), S(0x1f5b), S(0x1f5d), S(0x1f5f),
605 R(0x1f68,0x1f6f), S(0x1fb8), S(0x1fba), S(0x1fbe), R(0x1fc8,0x1fcb), S(0x1fd8), S(0x1fda), S(0x1fe8),
606 S(0x1fea), S(0x1fec), S(0x1ff8), S(0x1ffa), S(0x2126), S(0x212a), S(0x212b), S(0x2132), R(0x2160,0x216f),
607 S(0x2183), R(0x24b6,0x24cf), R(0x2c00,0x2c2e), S(0x2c60), S(0x2c62), S(0x2c63), S(0x2c64),
608 R(0x2c67,0x2c6b), S(0x2c6d), S(0x2c6e), S(0x2c6f), S(0x2c70), S(0x2c72), S(0x2c75), S(0x2c7e),
609 R(0x2c80,0x2ce2), S(0x2ceb), S(0x2cf2), R(0xa640,0xa66c), R(0xa680,0xa69a), R(0xa722,0xa72e),
610 R(0xa732,0xa76e), S(0xa779), S(0xa77d), R(0xa77e,0xa786), S(0xa78b), S(0xa78d), S(0xa790),
611 R(0xa796,0xa7a8), S(0xa7aa), S(0xa7ab), S(0xa7ac), S(0xa7ad), S(0xa7ae), S(0xa7b0), S(0xa7b1), S(0xa7b2),
612 S(0xa7b3), R(0xa7b4,0xa7be), S(0xa7c2), S(0xa7c4), S(0xa7c5), S(0xa7c6), R(0xab70,0xabbf),
613 R(0xff21,0xff3a), R(0x10400,0x10427), R(0x104b0,0x104d3), R(0x10c80,0x10cb2), R(0x118a0,0x118bf),
614 R(0x16e40,0x16e5f), R(0x1e900,0x1e921)
615 };
616 static const unsigned FOLD_MAP_1_DATA[] = {
617 0x0061, 0x007a, 0x03bc, 0x00e0, 0x00f6, 0x00f8, 0x00fe, 0x0101, 0x012f, 0x0133, 0x0137, 0x013a, 0x0148,
618 0x014b, 0x0177, 0x00ff, 0x017a, 0x017e, 0x0073, 0x0253, 0x0183, 0x0254, 0x0188, 0x0256, 0x018c, 0x01dd,
619 0x0259, 0x025b, 0x0192, 0x0260, 0x0263, 0x0269, 0x0268, 0x0199, 0x026f, 0x0272, 0x0275, 0x01a1, 0x01a5,
620 0x0280, 0x01a8, 0x0283, 0x01ad, 0x0288, 0x01b0, 0x028a, 0x01b4, 0x0292, 0x01b9, 0x01bd, 0x01c6, 0x01c6,
621 0x01c9, 0x01c9, 0x01cc, 0x01cc, 0x01dc, 0x01df, 0x01ef, 0x01f3, 0x01f3, 0x0195, 0x01bf, 0x01f9, 0x021f,
622 0x019e, 0x0223, 0x0233, 0x2c65, 0x023c, 0x019a, 0x2c66, 0x0242, 0x0180, 0x0289, 0x028c, 0x0247, 0x024f,
623 0x03b9, 0x0371, 0x0377, 0x03f3, 0x03ac, 0x03ad, 0x03af, 0x03cc, 0x03cd, 0x03b1, 0x03c1, 0x03c3, 0x03cb,
624 0x03c3, 0x03d7, 0x03b2, 0x03b8, 0x03c6, 0x03c0, 0x03d9, 0x03ef, 0x03ba, 0x03c1, 0x03b8, 0x03b5, 0x03f8,
625 0x03f2, 0x03fb, 0x037b, 0x037d, 0x0450, 0x045f, 0x0430, 0x044f, 0x0461, 0x0481, 0x048b, 0x04bf, 0x04cf,
626 0x04c2, 0x04ce, 0x04d1, 0x052f, 0x0561, 0x0586, 0x2d00, 0x2d25, 0x2d27, 0x2d2d, 0x13f0, 0x13f5, 0x0432,
627 0x0434, 0x043e, 0x0441, 0x0442, 0x044a, 0x0463, 0xa64b, 0x10d0, 0x10fa, 0x10fd, 0x10ff, 0x1e01, 0x1e95,
628 0x1e61, 0x1ea1, 0x1eff, 0x1f00, 0x1f07, 0x1f10, 0x1f15, 0x1f20, 0x1f27, 0x1f30, 0x1f37, 0x1f40, 0x1f45,
629 0x1f51, 0x1f53, 0x1f55, 0x1f57, 0x1f60, 0x1f67, 0x1fb0, 0x1f70, 0x03b9, 0x1f72, 0x1f75, 0x1fd0, 0x1f76,
630 0x1fe0, 0x1f7a, 0x1fe5, 0x1f78, 0x1f7c, 0x03c9, 0x006b, 0x00e5, 0x214e, 0x2170, 0x217f, 0x2184, 0x24d0,
631 0x24e9, 0x2c30, 0x2c5e, 0x2c61, 0x026b, 0x1d7d, 0x027d, 0x2c68, 0x2c6c, 0x0251, 0x0271, 0x0250, 0x0252,
632 0x2c73, 0x2c76, 0x023f, 0x2c81, 0x2ce3, 0x2cec, 0x2cf3, 0xa641, 0xa66d, 0xa681, 0xa69b, 0xa723, 0xa72f,
633 0xa733, 0xa76f, 0xa77a, 0x1d79, 0xa77f, 0xa787, 0xa78c, 0x0265, 0xa791, 0xa797, 0xa7a9, 0x0266, 0x025c,
634 0x0261, 0x026c, 0x026a, 0x029e, 0x0287, 0x029d, 0xab53, 0xa7b5, 0xa7bf, 0xa7c3, 0xa794, 0x0282, 0x1d8e,
635 0x13a0, 0x13ef, 0xff41, 0xff5a, 0x10428, 0x1044f, 0x104d8, 0x104fb, 0x10cc0, 0x10cf2, 0x118c0, 0x118df,
636 0x16e60, 0x16e7f, 0x1e922, 0x1e943
637 };
638 static const unsigned FOLD_MAP_2[] = {
639 S(0x00df), S(0x0130), S(0x0149), S(0x01f0), S(0x0587), S(0x1e96), S(0x1e97), S(0x1e98), S(0x1e99),
640 S(0x1e9a), S(0x1e9e), S(0x1f50), R(0x1f80,0x1f87), R(0x1f88,0x1f8f), R(0x1f90,0x1f97), R(0x1f98,0x1f9f),
641 R(0x1fa0,0x1fa7), R(0x1fa8,0x1faf), S(0x1fb2), S(0x1fb3), S(0x1fb4), S(0x1fb6), S(0x1fbc), S(0x1fc2),
642 S(0x1fc3), S(0x1fc4), S(0x1fc6), S(0x1fcc), S(0x1fd6), S(0x1fe4), S(0x1fe6), S(0x1ff2), S(0x1ff3),
643 S(0x1ff4), S(0x1ff6), S(0x1ffc), S(0xfb00), S(0xfb01), S(0xfb02), S(0xfb05), S(0xfb06), S(0xfb13),
644 S(0xfb14), S(0xfb15), S(0xfb16), S(0xfb17)
645 };
646 static const unsigned FOLD_MAP_2_DATA[] = {
647 0x0073,0x0073, 0x0069,0x0307, 0x02bc,0x006e, 0x006a,0x030c, 0x0565,0x0582, 0x0068,0x0331, 0x0074,0x0308,
648 0x0077,0x030a, 0x0079,0x030a, 0x0061,0x02be, 0x0073,0x0073, 0x03c5,0x0313, 0x1f00,0x03b9, 0x1f07,0x03b9,
649 0x1f00,0x03b9, 0x1f07,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f60,0x03b9,
650 0x1f67,0x03b9, 0x1f60,0x03b9, 0x1f67,0x03b9, 0x1f70,0x03b9, 0x03b1,0x03b9, 0x03ac,0x03b9, 0x03b1,0x0342,
651 0x03b1,0x03b9, 0x1f74,0x03b9, 0x03b7,0x03b9, 0x03ae,0x03b9, 0x03b7,0x0342, 0x03b7,0x03b9, 0x03b9,0x0342,
652 0x03c1,0x0313, 0x03c5,0x0342, 0x1f7c,0x03b9, 0x03c9,0x03b9, 0x03ce,0x03b9, 0x03c9,0x0342, 0x03c9,0x03b9,
653 0x0066,0x0066, 0x0066,0x0069, 0x0066,0x006c, 0x0073,0x0074, 0x0073,0x0074, 0x0574,0x0576, 0x0574,0x0565,
654 0x0574,0x056b, 0x057e,0x0576, 0x0574,0x056d
655 };
656 static const unsigned FOLD_MAP_3[] = {
657 S(0x0390), S(0x03b0), S(0x1f52), S(0x1f54), S(0x1f56), S(0x1fb7), S(0x1fc7), S(0x1fd2), S(0x1fd3),
658 S(0x1fd7), S(0x1fe2), S(0x1fe3), S(0x1fe7), S(0x1ff7), S(0xfb03), S(0xfb04)
659 };
660 static const unsigned FOLD_MAP_3_DATA[] = {
661 0x03b9,0x0308,0x0301, 0x03c5,0x0308,0x0301, 0x03c5,0x0313,0x0300, 0x03c5,0x0313,0x0301,
662 0x03c5,0x0313,0x0342, 0x03b1,0x0342,0x03b9, 0x03b7,0x0342,0x03b9, 0x03b9,0x0308,0x0300,
663 0x03b9,0x0308,0x0301, 0x03b9,0x0308,0x0342, 0x03c5,0x0308,0x0300, 0x03c5,0x0308,0x0301,
664 0x03c5,0x0308,0x0342, 0x03c9,0x0342,0x03b9, 0x0066,0x0066,0x0069, 0x0066,0x0066,0x006c
665 };
666#undef R
667#undef S
668 static const struct {
669 const unsigned* map;
670 const unsigned* data;
671 size_t map_size;
672 int n_codepoints;
673 } FOLD_MAP_LIST[] = {
674 { FOLD_MAP_1, FOLD_MAP_1_DATA, SIZEOF_ARRAY(FOLD_MAP_1), 1 },
675 { FOLD_MAP_2, FOLD_MAP_2_DATA, SIZEOF_ARRAY(FOLD_MAP_2), 2 },
676 { FOLD_MAP_3, FOLD_MAP_3_DATA, SIZEOF_ARRAY(FOLD_MAP_3), 3 }
677 };
678
679 int i;
680
681 /* Fast path for ASCII characters. */
682 if(codepoint <= 0x7f) {
683 info->codepoints[0] = codepoint;
684 if(ISUPPER_(codepoint))
685 info->codepoints[0] += 'a' - 'A';
686 info->n_codepoints = 1;
687 return;
688 }
689
690 /* Try to locate the codepoint in any of the maps. */
691 for(i = 0; i < (int) SIZEOF_ARRAY(FOLD_MAP_LIST); i++) {
692 int index;
693
694 index = md_unicode_bsearch__(codepoint, FOLD_MAP_LIST[i].map, FOLD_MAP_LIST[i].map_size);
695 if(index >= 0) {
696 /* Found the mapping. */
697 int n_codepoints = FOLD_MAP_LIST[i].n_codepoints;
698 const unsigned* map = FOLD_MAP_LIST[i].map;
699 const unsigned* codepoints = FOLD_MAP_LIST[i].data + (index * n_codepoints);
700
701 memcpy(info->codepoints, codepoints, sizeof(unsigned) * n_codepoints);
702 info->n_codepoints = n_codepoints;
703
704 if(FOLD_MAP_LIST[i].map[index] != codepoint) {
705 /* The found mapping maps whole range of codepoints,
706 * i.e. we have to offset info->codepoints[0] accordingly. */
707 if((map[index] & 0x00ffffff)+1 == codepoints[0]) {
708 /* Alternating type of the range. */
709 info->codepoints[0] = codepoint + ((codepoint & 0x1) == (map[index] & 0x1) ? 1 : 0);
710 } else {
711 /* Range to range kind of mapping. */
712 info->codepoints[0] += (codepoint - (map[index] & 0x00ffffff));
713 }
714 }
715
716 return;
717 }
718 }
719
720 /* No mapping found. Map the codepoint to itself. */
721 info->codepoints[0] = codepoint;
722 info->n_codepoints = 1;
723 }
724#endif
725
726
727#if defined MD4C_USE_UTF16
728 #define IS_UTF16_SURROGATE_HI(word) (((WORD)(word) & 0xfc00) == 0xd800)
729 #define IS_UTF16_SURROGATE_LO(word) (((WORD)(word) & 0xfc00) == 0xdc00)
730 #define UTF16_DECODE_SURROGATE(hi, lo) (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)))
731
732 static unsigned
733 md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size)
734 {
735 if(IS_UTF16_SURROGATE_HI(str[0])) {
736 if(1 < str_size && IS_UTF16_SURROGATE_LO(str[1])) {
737 if(p_size != NULL)
738 *p_size = 2;
739 return UTF16_DECODE_SURROGATE(str[0], str[1]);
740 }
741 }
742
743 if(p_size != NULL)
744 *p_size = 1;
745 return str[0];
746 }
747
748 static unsigned
749 md_decode_utf16le_before__(MD_CTX* ctx, OFF off)
750 {
751 if(off > 2 && IS_UTF16_SURROGATE_HI(CH(off-2)) && IS_UTF16_SURROGATE_LO(CH(off-1)))
752 return UTF16_DECODE_SURROGATE(CH(off-2), CH(off-1));
753
754 return CH(off);
755 }
756
757 /* No whitespace uses surrogates, so no decoding needed here. */
758 #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
759 #define ISUNICODEWHITESPACE(off) md_is_unicode_whitespace__(CH(off))
760 #define ISUNICODEWHITESPACEBEFORE(off) md_is_unicode_whitespace__(CH((off)-1))
761
762 #define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf16le__(STR(off), ctx->size - (off), NULL))
763 #define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf16le_before__(ctx, off))
764
765 static inline int
766 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
767 {
768 return md_decode_utf16le__(str+off, str_size-off, p_char_size);
769 }
770#elif defined MD4C_USE_UTF8
771 #define IS_UTF8_LEAD1(byte) ((unsigned char)(byte) <= 0x7f)
772 #define IS_UTF8_LEAD2(byte) (((unsigned char)(byte) & 0xe0) == 0xc0)
773 #define IS_UTF8_LEAD3(byte) (((unsigned char)(byte) & 0xf0) == 0xe0)
774 #define IS_UTF8_LEAD4(byte) (((unsigned char)(byte) & 0xf8) == 0xf0)
775 #define IS_UTF8_TAIL(byte) (((unsigned char)(byte) & 0xc0) == 0x80)
776
777 static unsigned
778 md_decode_utf8__(const CHAR* str, SZ str_size, SZ* p_size)
779 {
780 if(!IS_UTF8_LEAD1(str[0])) {
781 if(IS_UTF8_LEAD2(str[0])) {
782 if(1 < str_size && IS_UTF8_TAIL(str[1])) {
783 if(p_size != NULL)
784 *p_size = 2;
785
786 return (((unsigned int)str[0] & 0x1f) << 6) |
787 (((unsigned int)str[1] & 0x3f) << 0);
788 }
789 } else if(IS_UTF8_LEAD3(str[0])) {
790 if(2 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2])) {
791 if(p_size != NULL)
792 *p_size = 3;
793
794 return (((unsigned int)str[0] & 0x0f) << 12) |
795 (((unsigned int)str[1] & 0x3f) << 6) |
796 (((unsigned int)str[2] & 0x3f) << 0);
797 }
798 } else if(IS_UTF8_LEAD4(str[0])) {
799 if(3 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2]) && IS_UTF8_TAIL(str[3])) {
800 if(p_size != NULL)
801 *p_size = 4;
802
803 return (((unsigned int)str[0] & 0x07) << 18) |
804 (((unsigned int)str[1] & 0x3f) << 12) |
805 (((unsigned int)str[2] & 0x3f) << 6) |
806 (((unsigned int)str[3] & 0x3f) << 0);
807 }
808 }
809 }
810
811 if(p_size != NULL)
812 *p_size = 1;
813 return (unsigned) str[0];
814 }
815
816 static unsigned
817 md_decode_utf8_before__(MD_CTX* ctx, OFF off)
818 {
819 if(!IS_UTF8_LEAD1(CH(off-1))) {
820 if(off > 1 && IS_UTF8_LEAD2(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
821 return (((unsigned int)CH(off-2) & 0x1f) << 6) |
822 (((unsigned int)CH(off-1) & 0x3f) << 0);
823
824 if(off > 2 && IS_UTF8_LEAD3(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
825 return (((unsigned int)CH(off-3) & 0x0f) << 12) |
826 (((unsigned int)CH(off-2) & 0x3f) << 6) |
827 (((unsigned int)CH(off-1) & 0x3f) << 0);
828
829 if(off > 3 && IS_UTF8_LEAD4(CH(off-4)) && IS_UTF8_TAIL(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
830 return (((unsigned int)CH(off-4) & 0x07) << 18) |
831 (((unsigned int)CH(off-3) & 0x3f) << 12) |
832 (((unsigned int)CH(off-2) & 0x3f) << 6) |
833 (((unsigned int)CH(off-1) & 0x3f) << 0);
834 }
835
836 return (unsigned) CH(off-1);
837 }
838
    /* Unicode-aware character class predicates (UTF-8 variant).
     *  -- The variant with trailing underscore takes an already decoded
     *     codepoint.
     *  -- The (off) variants decode the character starting at the offset.
     *  -- The BEFORE(off) variants decode the character ending just before
     *     the offset.
     * All the offset-based variants expect a variable 'ctx' in scope. */
    #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
    #define ISUNICODEWHITESPACE(off)        md_is_unicode_whitespace__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
    #define ISUNICODEWHITESPACEBEFORE(off)  md_is_unicode_whitespace__(md_decode_utf8_before__(ctx, off))

    #define ISUNICODEPUNCT(off)             md_is_unicode_punct__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
    #define ISUNICODEPUNCTBEFORE(off)       md_is_unicode_punct__(md_decode_utf8_before__(ctx, off))
845
846 static inline unsigned
847 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
848 {
849 return md_decode_utf8__(str+off, str_size-off, p_char_size);
850 }
851#else
    /* In this branch there is no Unicode decoding: the Unicode-aware
     * predicates are mapped to the plain ones, and the BEFORE(off) variants
     * simply inspect the character at the offset (off)-1. */
    #define ISUNICODEWHITESPACE_(codepoint) ISWHITESPACE_(codepoint)
    #define ISUNICODEWHITESPACE(off)        ISWHITESPACE(off)
    #define ISUNICODEWHITESPACEBEFORE(off)  ISWHITESPACE((off)-1)

    #define ISUNICODEPUNCT(off)             ISPUNCT(off)
    #define ISUNICODEPUNCTBEFORE(off)       ISPUNCT((off)-1)
858
859 static inline void
860 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
861 {
862 info->codepoints[0] = codepoint;
863 if(ISUPPER_(codepoint))
864 info->codepoints[0] += 'a' - 'A';
865 info->n_codepoints = 1;
866 }
867
868 static inline unsigned
869 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_size)
870 {
871 *p_size = 1;
872 return (unsigned) str[off];
873 }
874#endif
875
876
877/*************************************
878 *** Helper string manipulations ***
879 *************************************/
880
881/* Fill buffer with copy of the string between 'beg' and 'end' but replace any
882 * line breaks with given replacement character.
883 *
884 * NOTE: Caller is responsible to make sure the buffer is large enough.
885 * (Given the output is always shorter then input, (end - beg) is good idea
886 * what the caller should allocate.)
887 */
888static void
889md_merge_lines(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
890 CHAR line_break_replacement_char, CHAR* buffer, SZ* p_size)
891{
892 CHAR* ptr = buffer;
893 int line_index = 0;
894 OFF off = beg;
895
896 while(1) {
897 const MD_LINE* line = &lines[line_index];
898 OFF line_end = line->end;
899 if(end < line_end)
900 line_end = end;
901
902 while(off < line_end) {
903 *ptr = CH(off);
904 ptr++;
905 off++;
906 }
907
908 if(off >= end) {
909 *p_size = ptr - buffer;
910 return;
911 }
912
913 *ptr = line_break_replacement_char;
914 ptr++;
915
916 line_index++;
917 off = lines[line_index].beg;
918 }
919}
920
921/* Wrapper of md_merge_lines() which allocates new buffer for the output string.
922 */
923static int
924md_merge_lines_alloc(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
925 CHAR line_break_replacement_char, CHAR** p_str, SZ* p_size)
926{
927 CHAR* buffer;
928
929 buffer = (CHAR*) malloc(sizeof(CHAR) * (end - beg));
930 if(buffer == NULL) {
931 MD_LOG("malloc() failed.");
932 return -1;
933 }
934
935 md_merge_lines(ctx, beg, end, lines, n_lines,
936 line_break_replacement_char, buffer, p_size);
937
938 *p_str = buffer;
939 return 0;
940}
941
942static OFF
943md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size)
944{
945 SZ char_size;
946 unsigned codepoint;
947
948 while(off < size) {
949 codepoint = md_decode_unicode(label, off, size, &char_size);
950 if(!ISUNICODEWHITESPACE_(codepoint) && !ISNEWLINE_(label[off]))
951 break;
952 off += char_size;
953 }
954
955 return off;
956}
957
958
959/******************************
960 *** Recognizing raw HTML ***
961 ******************************/
962
/* md_is_html_tag() may be called when processing inlines (inline raw HTML)
 * or when breaking document to blocks (checking for start of HTML block type 7).
 *
 * When breaking document to blocks, we do not yet know line boundaries, but
 * in that case the whole tag has to live on a single line. We distinguish this
 * by n_lines == 0.
 *
 * The tag has to start at the offset 'beg' (where '<' is expected) and its
 * final '>' has to be located before 'max_end'.
 *
 * Returns TRUE and sets *p_end to the offset just after the final '>' if
 * an opening or closing tag is recognized. Returns FALSE otherwise; *p_end
 * is not written in that case.
 */
static int
md_is_html_tag(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
{
    int attr_state;
    OFF off = beg;
    /* Without line info (n_lines == 0) the scan is bounded only by ctx->size;
     * the ISNEWLINE() test in the loop below then stops it at the line end. */
    OFF line_end = (n_lines > 0) ? lines[0].end : ctx->size;
    int i = 0;      /* Index (into lines[]) of the line being scanned. */

    MD_ASSERT(CH(beg) == _T('<'));

    /* At least one more character has to follow the '<' on the line. */
    if(off + 1 >= line_end)
        return FALSE;
    off++;

    /* For parsing attributes, we need a little state automaton below.
     * State -1: no attributes are allowed.
     * State 0: attribute could follow after some whitespace.
     * State 1: after a whitespace (attribute name may follow).
     * State 2: after attribute name ('=' MAY follow).
     * State 3: after '=' (value specification MUST follow).
     * State 41: in middle of unquoted attribute value.
     * State 42: in middle of single-quoted attribute value.
     * State 43: in middle of double-quoted attribute value.
     */
    attr_state = 0;

    if(CH(off) == _T('/')) {
        /* Closer tag "</ ... >". No attributes may be present. */
        attr_state = -1;
        off++;
    }

    /* Tag name */
    if(off >= line_end  ||  !ISALPHA(off))
        return FALSE;
    off++;
    while(off < line_end  &&  (ISALNUM(off)  ||  CH(off) == _T('-')))
        off++;

    /* (Optional) attributes (if not closer), (optional) '/' (if not closer)
     * and final '>'. */
    while(1) {
        while(off < line_end  &&  !ISNEWLINE(off)) {
            if(attr_state > 40) {
                /* Inside of an attribute value: everything is consumed until
                 * the terminator appropriate for the kind of the value. */
                if(attr_state == 41 && (ISBLANK(off) || ISANYOF(off, _T("\"'=<>`")))) {
                    attr_state = 0;
                    off--;  /* Put the char back for re-inspection in the new state. */
                } else if(attr_state == 42 && CH(off) == _T('\'')) {
                    attr_state = 0;
                } else if(attr_state == 43 && CH(off) == _T('"')) {
                    attr_state = 0;
                }
                off++;
            } else if(ISWHITESPACE(off)) {
                /* Whitespace enables an attribute name only from the state 0;
                 * all the other states are kept as they are. */
                if(attr_state == 0)
                    attr_state = 1;
                off++;
            } else if(attr_state <= 2 && CH(off) == _T('>')) {
                /* End. */
                goto done;
            } else if(attr_state <= 2 && CH(off) == _T('/') && off+1 < line_end && CH(off+1) == _T('>')) {
                /* End with digraph '/>' */
                off++;
                goto done;
            } else if((attr_state == 1 || attr_state == 2) && (ISALPHA(off) || CH(off) == _T('_') || CH(off) == _T(':'))) {
                off++;
                /* Attribute name */
                while(off < line_end && (ISALNUM(off) || ISANYOF(off, _T("_.:-"))))
                    off++;
                attr_state = 2;
            } else if(attr_state == 2 && CH(off) == _T('=')) {
                /* Attribute assignment sign */
                off++;
                attr_state = 3;
            } else if(attr_state == 3) {
                /* Expecting start of attribute value. */
                if(CH(off) == _T('"'))
                    attr_state = 43;
                else if(CH(off) == _T('\''))
                    attr_state = 42;
                else if(!ISANYOF(off, _T("\"'=<>`"))  &&  !ISNEWLINE(off))
                    attr_state = 41;
                else
                    return FALSE;
                off++;
            } else {
                /* Anything unexpected. */
                return FALSE;
            }
        }

        /* We have to be on a single line. See definition of start condition
         * of HTML block, type 7. */
        if(n_lines == 0)
            return FALSE;

        /* The tag is not complete yet: continue on the next line, if any. */
        i++;
        if(i >= n_lines)
            return FALSE;

        off = lines[i].beg;
        line_end = lines[i].end;

        /* The line break works as a whitespace; it also terminates any
         * unquoted attribute value. */
        if(attr_state == 0  ||  attr_state == 41)
            attr_state = 1;

        if(off >= max_end)
            return FALSE;
    }

done:
    /* Here 'off' points to the final '>'. */
    if(off >= max_end)
        return FALSE;

    *p_end = off+1;
    return TRUE;
}
1087
1088static int
1089md_scan_for_html_closer(MD_CTX* ctx, const MD_CHAR* str, MD_SIZE len,
1090 const MD_LINE* lines, int n_lines,
1091 OFF beg, OFF max_end, OFF* p_end,
1092 OFF* p_scan_horizon)
1093{
1094 OFF off = beg;
1095 int i = 0;
1096
1097 if(off < *p_scan_horizon && *p_scan_horizon >= max_end - len) {
1098 /* We have already scanned the range up to the max_end so we know
1099 * there is nothing to see. */
1100 return FALSE;
1101 }
1102
1103 while(TRUE) {
1104 while(off + len <= lines[i].end && off + len <= max_end) {
1105 if(md_ascii_eq(STR(off), str, len)) {
1106 /* Success. */
1107 *p_end = off + len;
1108 return TRUE;
1109 }
1110 off++;
1111 }
1112
1113 i++;
1114 if(off >= max_end || i >= n_lines) {
1115 /* Failure. */
1116 *p_scan_horizon = off;
1117 return FALSE;
1118 }
1119
1120 off = lines[i].beg;
1121 }
1122}
1123
1124static int
1125md_is_html_comment(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1126{
1127 OFF off = beg;
1128
1129 MD_ASSERT(CH(beg) == _T('<'));
1130
1131 if(off + 4 >= lines[0].end)
1132 return FALSE;
1133 if(CH(off+1) != _T('!') || CH(off+2) != _T('-') || CH(off+3) != _T('-'))
1134 return FALSE;
1135 off += 4;
1136
1137 /* ">" and "->" must not follow the opening. */
1138 if(off < lines[0].end && CH(off) == _T('>'))
1139 return FALSE;
1140 if(off+1 < lines[0].end && CH(off) == _T('-') && CH(off+1) == _T('>'))
1141 return FALSE;
1142
1143 /* HTML comment must not contain "--", so we scan just for "--" instead
1144 * of "-->" and verify manually that '>' follows. */
1145 if(md_scan_for_html_closer(ctx, _T("--"), 2,
1146 lines, n_lines, off, max_end, p_end, &ctx->html_comment_horizon))
1147 {
1148 if(*p_end < max_end && CH(*p_end) == _T('>')) {
1149 *p_end = *p_end + 1;
1150 return TRUE;
1151 }
1152 }
1153
1154 return FALSE;
1155}
1156
1157static int
1158md_is_html_processing_instruction(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1159{
1160 OFF off = beg;
1161
1162 if(off + 2 >= lines[0].end)
1163 return FALSE;
1164 if(CH(off+1) != _T('?'))
1165 return FALSE;
1166 off += 2;
1167
1168 return md_scan_for_html_closer(ctx, _T("?>"), 2,
1169 lines, n_lines, off, max_end, p_end, &ctx->html_proc_instr_horizon);
1170}
1171
1172static int
1173md_is_html_declaration(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1174{
1175 OFF off = beg;
1176
1177 if(off + 2 >= lines[0].end)
1178 return FALSE;
1179 if(CH(off+1) != _T('!'))
1180 return FALSE;
1181 off += 2;
1182
1183 /* Declaration name. */
1184 if(off >= lines[0].end || !ISALPHA(off))
1185 return FALSE;
1186 off++;
1187 while(off < lines[0].end && ISALPHA(off))
1188 off++;
1189 if(off < lines[0].end && !ISWHITESPACE(off))
1190 return FALSE;
1191
1192 return md_scan_for_html_closer(ctx, _T(">"), 1,
1193 lines, n_lines, off, max_end, p_end, &ctx->html_decl_horizon);
1194}
1195
1196static int
1197md_is_html_cdata(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1198{
1199 static const CHAR open_str[] = _T("<![CDATA[");
1200 static const SZ open_size = SIZEOF_ARRAY(open_str) - 1;
1201
1202 OFF off = beg;
1203
1204 if(off + open_size >= lines[0].end)
1205 return FALSE;
1206 if(memcmp(STR(off), open_str, open_size) != 0)
1207 return FALSE;
1208 off += open_size;
1209
1210 if(lines[n_lines-1].end < max_end)
1211 max_end = lines[n_lines-1].end - 2;
1212
1213 return md_scan_for_html_closer(ctx, _T("]]>"), 3,
1214 lines, n_lines, off, max_end, p_end, &ctx->html_cdata_horizon);
1215}
1216
1217static int
1218md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1219{
1220 MD_ASSERT(CH(beg) == _T('<'));
1221 return (md_is_html_tag(ctx, lines, n_lines, beg, max_end, p_end) ||
1222 md_is_html_comment(ctx, lines, n_lines, beg, max_end, p_end) ||
1223 md_is_html_processing_instruction(ctx, lines, n_lines, beg, max_end, p_end) ||
1224 md_is_html_declaration(ctx, lines, n_lines, beg, max_end, p_end) ||
1225 md_is_html_cdata(ctx, lines, n_lines, beg, max_end, p_end));
1226}
1227
1228
1229/****************************
1230 *** Recognizing Entity ***
1231 ****************************/
1232
1233static int
1234md_is_hex_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1235{
1236 OFF off = beg;
1237
1238 while(off < max_end && ISXDIGIT_(text[off]) && off - beg <= 8)
1239 off++;
1240
1241 if(1 <= off - beg && off - beg <= 6) {
1242 *p_end = off;
1243 return TRUE;
1244 } else {
1245 return FALSE;
1246 }
1247}
1248
1249static int
1250md_is_dec_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1251{
1252 OFF off = beg;
1253
1254 while(off < max_end && ISDIGIT_(text[off]) && off - beg <= 8)
1255 off++;
1256
1257 if(1 <= off - beg && off - beg <= 7) {
1258 *p_end = off;
1259 return TRUE;
1260 } else {
1261 return FALSE;
1262 }
1263}
1264
1265static int
1266md_is_named_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1267{
1268 OFF off = beg;
1269
1270 if(off < max_end && ISALPHA_(text[off]))
1271 off++;
1272 else
1273 return FALSE;
1274
1275 while(off < max_end && ISALNUM_(text[off]) && off - beg <= 48)
1276 off++;
1277
1278 if(2 <= off - beg && off - beg <= 48) {
1279 *p_end = off;
1280 return TRUE;
1281 } else {
1282 return FALSE;
1283 }
1284}
1285
1286static int
1287md_is_entity_str(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1288{
1289 int is_contents;
1290 OFF off = beg;
1291
1292 MD_ASSERT(text[off] == _T('&'));
1293 off++;
1294
1295 if(off+2 < max_end && text[off] == _T('#') && (text[off+1] == _T('x') || text[off+1] == _T('X')))
1296 is_contents = md_is_hex_entity_contents(ctx, text, off+2, max_end, &off);
1297 else if(off+1 < max_end && text[off] == _T('#'))
1298 is_contents = md_is_dec_entity_contents(ctx, text, off+1, max_end, &off);
1299 else
1300 is_contents = md_is_named_entity_contents(ctx, text, off, max_end, &off);
1301
1302 if(is_contents && off < max_end && text[off] == _T(';')) {
1303 *p_end = off+1;
1304 return TRUE;
1305 } else {
1306 return FALSE;
1307 }
1308}
1309
1310static inline int
1311md_is_entity(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
1312{
1313 return md_is_entity_str(ctx, ctx->text, beg, max_end, p_end);
1314}
1315
1316
1317/******************************
1318 *** Attribute Management ***
1319 ******************************/
1320
/* Helper structure holding the storage behind an MD_ATTRIBUTE built by
 * md_build_attribute(). It has to be released with md_free_attribute(). */
typedef struct MD_ATTRIBUTE_BUILD_tag MD_ATTRIBUTE_BUILD;
struct MD_ATTRIBUTE_BUILD_tag {
    /* Text of the attribute. In the trivial case it refers directly to the
     * raw input (or it is NULL for empty input); otherwise it is a buffer
     * allocated with malloc(). */
    CHAR* text;
    /* Types of the substrings, substr_count items. */
    MD_TEXTTYPE* substr_types;
    /* Offsets of the substrings in 'text'. There is one extra item after the
     * last substring which holds the final offset. */
    OFF* substr_offsets;
    /* Count of the substrings recorded so far. */
    int substr_count;
    /* Count of the substrings the arrays have been sized for. Zero means no
     * heap memory is owned by the structure (the trivial case). */
    int substr_alloc;
    /* In-place storage for the trivial case (one MD_TEXT_NORMAL substring),
     * so that no allocation is needed. */
    MD_TEXTTYPE trivial_types[1];
    OFF trivial_offsets[2];
};
1331
1332
/* Flag for md_build_attribute(): Do not treat a backslash followed by
 * a punctuation or new line character as an escape; copy it verbatim. */
#define MD_BUILD_ATTR_NO_ESCAPES    0x0001
1334
1335static int
1336md_build_attr_append_substr(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build,
1337 MD_TEXTTYPE type, OFF off)
1338{
1339 if(build->substr_count >= build->substr_alloc) {
1340 MD_TEXTTYPE* new_substr_types;
1341 OFF* new_substr_offsets;
1342
1343 build->substr_alloc = (build->substr_alloc > 0
1344 ? build->substr_alloc + build->substr_alloc / 2
1345 : 8);
1346 new_substr_types = (MD_TEXTTYPE*) realloc(build->substr_types,
1347 build->substr_alloc * sizeof(MD_TEXTTYPE));
1348 if(new_substr_types == NULL) {
1349 MD_LOG("realloc() failed.");
1350 return -1;
1351 }
1352 /* Note +1 to reserve space for final offset (== raw_size). */
1353 new_substr_offsets = (OFF*) realloc(build->substr_offsets,
1354 (build->substr_alloc+1) * sizeof(OFF));
1355 if(new_substr_offsets == NULL) {
1356 MD_LOG("realloc() failed.");
1357 free(new_substr_types);
1358 return -1;
1359 }
1360
1361 build->substr_types = new_substr_types;
1362 build->substr_offsets = new_substr_offsets;
1363 }
1364
1365 build->substr_types[build->substr_count] = type;
1366 build->substr_offsets[build->substr_count] = off;
1367 build->substr_count++;
1368 return 0;
1369}
1370
1371static void
1372md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build)
1373{
1374 if(build->substr_alloc > 0) {
1375 free(build->text);
1376 free(build->substr_types);
1377 free(build->substr_offsets);
1378 }
1379}
1380
1381static int
1382md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size,
1383 unsigned flags, MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build)
1384{
1385 OFF raw_off, off;
1386 int is_trivial;
1387 int ret = 0;
1388
1389 memset(build, 0, sizeof(MD_ATTRIBUTE_BUILD));
1390
1391 /* If there is no backslash and no ampersand, build trivial attribute
1392 * without any malloc(). */
1393 is_trivial = TRUE;
1394 for(raw_off = 0; raw_off < raw_size; raw_off++) {
1395 if(ISANYOF3_(raw_text[raw_off], _T('\\'), _T('&'), _T('\0'))) {
1396 is_trivial = FALSE;
1397 break;
1398 }
1399 }
1400
1401 if(is_trivial) {
1402 build->text = (CHAR*) (raw_size ? raw_text : NULL);
1403 build->substr_types = build->trivial_types;
1404 build->substr_offsets = build->trivial_offsets;
1405 build->substr_count = 1;
1406 build->substr_alloc = 0;
1407 build->trivial_types[0] = MD_TEXT_NORMAL;
1408 build->trivial_offsets[0] = 0;
1409 build->trivial_offsets[1] = raw_size;
1410 off = raw_size;
1411 } else {
1412 build->text = (CHAR*) malloc(raw_size * sizeof(CHAR));
1413 if(build->text == NULL) {
1414 MD_LOG("malloc() failed.");
1415 goto abort;
1416 }
1417
1418 raw_off = 0;
1419 off = 0;
1420
1421 while(raw_off < raw_size) {
1422 if(raw_text[raw_off] == _T('\0')) {
1423 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NULLCHAR, off));
1424 memcpy(build->text + off, raw_text + raw_off, 1);
1425 off++;
1426 raw_off++;
1427 continue;
1428 }
1429
1430 if(raw_text[raw_off] == _T('&')) {
1431 OFF ent_end;
1432
1433 if(md_is_entity_str(ctx, raw_text, raw_off, raw_size, &ent_end)) {
1434 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_ENTITY, off));
1435 memcpy(build->text + off, raw_text + raw_off, ent_end - raw_off);
1436 off += ent_end - raw_off;
1437 raw_off = ent_end;
1438 continue;
1439 }
1440 }
1441
1442 if(build->substr_count == 0 || build->substr_types[build->substr_count-1] != MD_TEXT_NORMAL)
1443 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NORMAL, off));
1444
1445 if(!(flags & MD_BUILD_ATTR_NO_ESCAPES) &&
1446 raw_text[raw_off] == _T('\\') && raw_off+1 < raw_size &&
1447 (ISPUNCT_(raw_text[raw_off+1]) || ISNEWLINE_(raw_text[raw_off+1])))
1448 raw_off++;
1449
1450 build->text[off++] = raw_text[raw_off++];
1451 }
1452 build->substr_offsets[build->substr_count] = off;
1453 }
1454
1455 attr->text = build->text;
1456 attr->size = off;
1457 attr->substr_offsets = build->substr_offsets;
1458 attr->substr_types = build->substr_types;
1459 return 0;
1460
1461abort:
1462 md_free_attribute(ctx, build);
1463 return -1;
1464}
1465
1466
1467/*********************************************
1468 *** Dictionary of Reference Definitions ***
1469 *********************************************/
1470
/* Initial value and multiplier of the 32-bit FNV-1a hash. */
#define MD_FNV1A_BASE       2166136261U
#define MD_FNV1A_PRIME      16777619U

/* Compute FNV-1a hash of 'n' bytes at 'data', starting from the hash value
 * 'base'. To hash data consisting of more chunks, use MD_FNV1A_BASE for the
 * first chunk and the previous result for each subsequent one. */
static inline unsigned
md_fnv1a(unsigned base, const void* data, size_t n)
{
    const unsigned char* byte = (const unsigned char*) data;
    unsigned hash = base;

    for(; n > 0; n--, byte++)
        hash = (hash ^ *byte) * MD_FNV1A_PRIME;

    return hash;
}
1488
1489
/* Record of one link reference definition. */
struct MD_REF_DEF_tag {
    CHAR* label;        /* Label of the definition (label_size characters). */
    CHAR* title;        /* Title (title_size characters). NOTE(review): ownership of label/title is not visible here -- confirm. */
    unsigned hash;      /* Hash of the label as computed by md_link_label_hash(). */
    SZ label_size;
    SZ title_size;
    OFF dest_beg;       /* Presumably the range of the link destination in the document; verify against the code filling it. */
    OFF dest_end;
};
1499
1500/* Label equivalence is quite complicated with regards to whitespace and case
1501 * folding. This complicates computing a hash of it as well as direct comparison
1502 * of two labels. */
1503
1504static unsigned
1505md_link_label_hash(const CHAR* label, SZ size)
1506{
1507 unsigned hash = MD_FNV1A_BASE;
1508 OFF off;
1509 unsigned codepoint;
1510 int is_whitespace = FALSE;
1511
1512 off = md_skip_unicode_whitespace(label, 0, size);
1513 while(off < size) {
1514 SZ char_size;
1515
1516 codepoint = md_decode_unicode(label, off, size, &char_size);
1517 is_whitespace = ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE_(label[off]);
1518
1519 if(is_whitespace) {
1520 codepoint = ' ';
1521 hash = md_fnv1a(hash, &codepoint, sizeof(unsigned));
1522 off = md_skip_unicode_whitespace(label, off, size);
1523 } else {
1524 MD_UNICODE_FOLD_INFO fold_info;
1525
1526 md_get_unicode_fold_info(codepoint, &fold_info);
1527 hash = md_fnv1a(hash, fold_info.codepoints, fold_info.n_codepoints * sizeof(unsigned));
1528 off += char_size;
1529 }
1530 }
1531
1532 return hash;
1533}
1534
1535static OFF
1536md_link_label_cmp_load_fold_info(const CHAR* label, OFF off, SZ size,
1537 MD_UNICODE_FOLD_INFO* fold_info)
1538{
1539 unsigned codepoint;
1540 SZ char_size;
1541
1542 if(off >= size) {
1543 /* Treat end of a link label as a whitespace. */
1544 goto whitespace;
1545 }
1546
1547 if(ISNEWLINE_(label[off])) {
1548 /* Treat new lines as a whitespace. */
1549 off++;
1550 goto whitespace;
1551 }
1552
1553 codepoint = md_decode_unicode(label, off, size, &char_size);
1554 off += char_size;
1555 if(ISUNICODEWHITESPACE_(codepoint)) {
1556 /* Treat all whitespace as equivalent */
1557 goto whitespace;
1558 }
1559
1560 /* Get real folding info. */
1561 md_get_unicode_fold_info(codepoint, fold_info);
1562 return off;
1563
1564whitespace:
1565 fold_info->codepoints[0] = _T(' ');
1566 fold_info->n_codepoints = 1;
1567 return md_skip_unicode_whitespace(label, off, size);
1568}
1569
1570static int
1571md_link_label_cmp(const CHAR* a_label, SZ a_size, const CHAR* b_label, SZ b_size)
1572{
1573 OFF a_off;
1574 OFF b_off;
1575 int a_reached_end = FALSE;
1576 int b_reached_end = FALSE;
1577 MD_UNICODE_FOLD_INFO a_fi = { 0 };
1578 MD_UNICODE_FOLD_INFO b_fi = { 0 };
1579 OFF a_fi_off = 0;
1580 OFF b_fi_off = 0;
1581 int cmp;
1582
1583 a_off = md_skip_unicode_whitespace(a_label, 0, a_size);
1584 b_off = md_skip_unicode_whitespace(b_label, 0, b_size);
1585 while(!a_reached_end || !b_reached_end) {
1586 /* If needed, load fold info for next char. */
1587 if(a_fi_off >= a_fi.n_codepoints) {
1588 a_fi_off = 0;
1589 a_off = md_link_label_cmp_load_fold_info(a_label, a_off, a_size, &a_fi);
1590 a_reached_end = (a_off >= a_size);
1591 }
1592 if(b_fi_off >= b_fi.n_codepoints) {
1593 b_fi_off = 0;
1594 b_off = md_link_label_cmp_load_fold_info(b_label, b_off, b_size, &b_fi);
1595 b_reached_end = (b_off >= b_size);
1596 }
1597
1598 cmp = b_fi.codepoints[b_fi_off] - a_fi.codepoints[a_fi_off];
1599 if(cmp != 0)
1600 return cmp;
1601
1602 a_fi_off++;
1603 b_fi_off++;
1604 }
1605
1606 return 0;
1607}
1608
/* Contents of a "complex" bucket of the ref. def. hashtable, i.e. of
 * a bucket shared by more reference definitions. It is allocated as one
 * block together with the trailing array. */
typedef struct MD_REF_DEF_LIST_tag MD_REF_DEF_LIST;
struct MD_REF_DEF_LIST_tag {
    int n_ref_defs;             /* Count of the used items in ref_defs[]. */
    int alloc_ref_defs;         /* Count of the items ref_defs[] has room for. */
    MD_REF_DEF* ref_defs[];     /* Valid items always point into ctx->ref_defs[] */
};
1615
1616static int
1617md_ref_def_cmp(const void* a, const void* b)
1618{
1619 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1620 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1621
1622 if(a_ref->hash < b_ref->hash)
1623 return -1;
1624 else if(a_ref->hash > b_ref->hash)
1625 return +1;
1626 else
1627 return md_link_label_cmp(a_ref->label, a_ref->label_size, b_ref->label, b_ref->label_size);
1628}
1629
1630static int
1631md_ref_def_cmp_for_sort(const void* a, const void* b)
1632{
1633 int cmp;
1634
1635 cmp = md_ref_def_cmp(a, b);
1636
1637 /* Ensure stability of the sorting. */
1638 if(cmp == 0) {
1639 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1640 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1641
1642 if(a_ref < b_ref)
1643 cmp = -1;
1644 else if(a_ref > b_ref)
1645 cmp = +1;
1646 else
1647 cmp = 0;
1648 }
1649
1650 return cmp;
1651}
1652
1653static int
1654md_build_ref_def_hashtable(MD_CTX* ctx)
1655{
1656 int i, j;
1657
1658 if(ctx->n_ref_defs == 0)
1659 return 0;
1660
1661 ctx->ref_def_hashtable_size = (ctx->n_ref_defs * 5) / 4;
1662 ctx->ref_def_hashtable = malloc(ctx->ref_def_hashtable_size * sizeof(void*));
1663 if(ctx->ref_def_hashtable == NULL) {
1664 MD_LOG("malloc() failed.");
1665 goto abort;
1666 }
1667 memset(ctx->ref_def_hashtable, 0, ctx->ref_def_hashtable_size * sizeof(void*));
1668
1669 /* Each member of ctx->ref_def_hashtable[] can be:
1670 * -- NULL,
1671 * -- pointer to the MD_REF_DEF in ctx->ref_defs[], or
1672 * -- pointer to a MD_REF_DEF_LIST, which holds multiple pointers to
1673 * such MD_REF_DEFs.
1674 */
1675 for(i = 0; i < ctx->n_ref_defs; i++) {
1676 MD_REF_DEF* def = &ctx->ref_defs[i];
1677 void* bucket;
1678 MD_REF_DEF_LIST* list;
1679
1680 def->hash = md_link_label_hash(def->label, def->label_size);
1681 bucket = ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size];
1682
1683 if(bucket == NULL) {
1684 /* The bucket is empty. Make it just point to the def. */
1685 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = def;
1686 continue;
1687 }
1688
1689 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1690 /* The bucket already contains one ref. def. Lets see whether it
1691 * is the same label (ref. def. duplicate) or different one
1692 * (hash conflict). */
1693 MD_REF_DEF* old_def = (MD_REF_DEF*) bucket;
1694
1695 if(md_link_label_cmp(def->label, def->label_size, old_def->label, old_def->label_size) == 0) {
1696 /* Duplicate label: Ignore this ref. def. */
1697 continue;
1698 }
1699
1700 /* Make the bucket complex, i.e. able to hold more ref. defs. */
1701 list = (MD_REF_DEF_LIST*) malloc(sizeof(MD_REF_DEF_LIST) + 2 * sizeof(MD_REF_DEF*));
1702 if(list == NULL) {
1703 MD_LOG("malloc() failed.");
1704 goto abort;
1705 }
1706 list->ref_defs[0] = old_def;
1707 list->ref_defs[1] = def;
1708 list->n_ref_defs = 2;
1709 list->alloc_ref_defs = 2;
1710 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
1711 continue;
1712 }
1713
1714 /* Append the def to the complex bucket list.
1715 *
1716 * Note in this case we ignore potential duplicates to avoid expensive
1717 * iterating over the complex bucket. Below, we revisit all the complex
1718 * buckets and handle it more cheaply after the complex bucket contents
1719 * is sorted. */
1720 list = (MD_REF_DEF_LIST*) bucket;
1721 if(list->n_ref_defs >= list->alloc_ref_defs) {
1722 int alloc_ref_defs = list->alloc_ref_defs + list->alloc_ref_defs / 2;
1723 MD_REF_DEF_LIST* list_tmp = (MD_REF_DEF_LIST*) realloc(list,
1724 sizeof(MD_REF_DEF_LIST) + alloc_ref_defs * sizeof(MD_REF_DEF*));
1725 if(list_tmp == NULL) {
1726 MD_LOG("realloc() failed.");
1727 goto abort;
1728 }
1729 list = list_tmp;
1730 list->alloc_ref_defs = alloc_ref_defs;
1731 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
1732 }
1733
1734 list->ref_defs[list->n_ref_defs] = def;
1735 list->n_ref_defs++;
1736 }
1737
1738 /* Sort the complex buckets so we can use bsearch() with them. */
1739 for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1740 void* bucket = ctx->ref_def_hashtable[i];
1741 MD_REF_DEF_LIST* list;
1742
1743 if(bucket == NULL)
1744 continue;
1745 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1746 continue;
1747
1748 list = (MD_REF_DEF_LIST*) bucket;
1749 qsort(list->ref_defs, list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp_for_sort);
1750
1751 /* Disable all duplicates in the complex bucket by forcing all such
1752 * records to point to the 1st such ref. def. I.e. no matter which
1753 * record is found during the lookup, it will always point to the right
1754 * ref. def. in ctx->ref_defs[]. */
1755 for(j = 1; j < list->n_ref_defs; j++) {
1756 if(md_ref_def_cmp(&list->ref_defs[j-1], &list->ref_defs[j]) == 0)
1757 list->ref_defs[j] = list->ref_defs[j-1];
1758 }
1759 }
1760
1761 return 0;
1762
1763abort:
1764 return -1;
1765}
1766
1767static void
1768md_free_ref_def_hashtable(MD_CTX* ctx)
1769{
1770 if(ctx->ref_def_hashtable != NULL) {
1771 int i;
1772
1773 for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1774 void* bucket = ctx->ref_def_hashtable[i];
1775 if(bucket == NULL)
1776 continue;
1777 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1778 continue;
1779 free(bucket);
1780 }
1781
1782 free(ctx->ref_def_hashtable);
1783 }
1784}
1785
1786static const MD_REF_DEF*
1787md_lookup_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size)
1788{
1789 unsigned hash;
1790 void* bucket;
1791
1792 if(ctx->ref_def_hashtable_size == 0)
1793 return NULL;
1794
1795 hash = md_link_label_hash(label, label_size);
1796 bucket = ctx->ref_def_hashtable[hash % ctx->ref_def_hashtable_size];
1797
1798 if(bucket == NULL) {
1799 return NULL;
1800 } else if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1801 const MD_REF_DEF* def = (MD_REF_DEF*) bucket;
1802
1803 if(md_link_label_cmp(def->label, def->label_size, label, label_size) == 0)
1804 return def;
1805 else
1806 return NULL;
1807 } else {
1808 MD_REF_DEF_LIST* list = (MD_REF_DEF_LIST*) bucket;
1809 MD_REF_DEF key_buf;
1810 const MD_REF_DEF* key = &key_buf;
1811 const MD_REF_DEF** ret;
1812
1813 key_buf.label = (CHAR*) label;
1814 key_buf.label_size = label_size;
1815 key_buf.hash = md_link_label_hash(key_buf.label, key_buf.label_size);
1816
1817 ret = (const MD_REF_DEF**) bsearch(&key, list->ref_defs,
1818 list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp);
1819 if(ret != NULL)
1820 return *ret;
1821 else
1822 return NULL;
1823 }
1824}
1825
1826
1827/***************************
1828 *** Recognizing Links ***
1829 ***************************/
1830
1831/* Note this code is partially shared between processing inlines and blocks
1832 * as reference definitions and links share some helper parser functions.
1833 */
1834
/* Attributes of a link. NOTE(review): the code filling and consuming this
 * structure is not in this part of the file; the member notes below are
 * derived from the names only -- confirm against the users. */
typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR;
struct MD_LINK_ATTR_tag {
    OFF dest_beg;           /* Presumably the range of the link destination. */
    OFF dest_end;

    CHAR* title;            /* Presumably the title, title_size characters long. */
    SZ title_size;
    int title_needs_free;   /* Presumably tells whether 'title' has to be freed by the user of the structure. */
};
1844
1845
1846static int
1847md_is_link_label(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
1848 OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
1849 OFF* p_contents_beg, OFF* p_contents_end)
1850{
1851 OFF off = beg;
1852 OFF contents_beg = 0;
1853 OFF contents_end = 0;
1854 int line_index = 0;
1855 int len = 0;
1856
1857 if(CH(off) != _T('['))
1858 return FALSE;
1859 off++;
1860
1861 while(1) {
1862 OFF line_end = lines[line_index].end;
1863
1864 while(off < line_end) {
1865 if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
1866 if(contents_end == 0) {
1867 contents_beg = off;
1868 *p_beg_line_index = line_index;
1869 }
1870 contents_end = off + 2;
1871 off += 2;
1872 } else if(CH(off) == _T('[')) {
1873 return FALSE;
1874 } else if(CH(off) == _T(']')) {
1875 if(contents_beg < contents_end) {
1876 /* Success. */
1877 *p_contents_beg = contents_beg;
1878 *p_contents_end = contents_end;
1879 *p_end = off+1;
1880 *p_end_line_index = line_index;
1881 return TRUE;
1882 } else {
1883 /* Link label must have some non-whitespace contents. */
1884 return FALSE;
1885 }
1886 } else {
1887 unsigned codepoint;
1888 SZ char_size;
1889
1890 codepoint = md_decode_unicode(ctx->text, off, ctx->size, &char_size);
1891 if(!ISUNICODEWHITESPACE_(codepoint)) {
1892 if(contents_end == 0) {
1893 contents_beg = off;
1894 *p_beg_line_index = line_index;
1895 }
1896 contents_end = off + char_size;
1897 }
1898
1899 off += char_size;
1900 }
1901
1902 len++;
1903 if(len > 999)
1904 return FALSE;
1905 }
1906
1907 line_index++;
1908 len++;
1909 if(line_index < n_lines)
1910 off = lines[line_index].beg;
1911 else
1912 break;
1913 }
1914
1915 return FALSE;
1916}
1917
1918static int
1919md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1920 OFF* p_contents_beg, OFF* p_contents_end)
1921{
1922 OFF off = beg;
1923
1924 if(off >= max_end || CH(off) != _T('<'))
1925 return FALSE;
1926 off++;
1927
1928 while(off < max_end) {
1929 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) {
1930 off += 2;
1931 continue;
1932 }
1933
1934 if(ISNEWLINE(off) || CH(off) == _T('<'))
1935 return FALSE;
1936
1937 if(CH(off) == _T('>')) {
1938 /* Success. */
1939 *p_contents_beg = beg+1;
1940 *p_contents_end = off;
1941 *p_end = off+1;
1942 return TRUE;
1943 }
1944
1945 off++;
1946 }
1947
1948 return FALSE;
1949}
1950
1951static int
1952md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1953 OFF* p_contents_beg, OFF* p_contents_end)
1954{
1955 OFF off = beg;
1956 int parenthesis_level = 0;
1957
1958 while(off < max_end) {
1959 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) {
1960 off += 2;
1961 continue;
1962 }
1963
1964 if(ISWHITESPACE(off) || ISCNTRL(off))
1965 break;
1966
1967 /* Link destination may include balanced pairs of unescaped '(' ')'.
1968 * Note we limit the maximal nesting level by 32 to protect us from
1969 * https://github.com/jgm/cmark/issues/214 */
1970 if(CH(off) == _T('(')) {
1971 parenthesis_level++;
1972 if(parenthesis_level > 32)
1973 return FALSE;
1974 } else if(CH(off) == _T(')')) {
1975 if(parenthesis_level == 0)
1976 break;
1977 parenthesis_level--;
1978 }
1979
1980 off++;
1981 }
1982
1983 if(parenthesis_level != 0 || off == beg)
1984 return FALSE;
1985
1986 /* Success. */
1987 *p_contents_beg = beg;
1988 *p_contents_end = off;
1989 *p_end = off;
1990 return TRUE;
1991}
1992
1993static inline int
1994md_is_link_destination(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1995 OFF* p_contents_beg, OFF* p_contents_end)
1996{
1997 if(CH(beg) == _T('<'))
1998 return md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
1999 else
2000 return md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2001}
2002
2003static int
2004md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
2005 OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
2006 OFF* p_contents_beg, OFF* p_contents_end)
2007{
2008 OFF off = beg;
2009 CHAR closer_char;
2010 int line_index = 0;
2011
2012 /* White space with up to one line break. */
2013 while(off < lines[line_index].end && ISWHITESPACE(off))
2014 off++;
2015 if(off >= lines[line_index].end) {
2016 line_index++;
2017 if(line_index >= n_lines)
2018 return FALSE;
2019 off = lines[line_index].beg;
2020 }
2021 if(off == beg)
2022 return FALSE;
2023
2024 *p_beg_line_index = line_index;
2025
2026 /* First char determines how to detect end of it. */
2027 switch(CH(off)) {
2028 case _T('"'): closer_char = _T('"'); break;
2029 case _T('\''): closer_char = _T('\''); break;
2030 case _T('('): closer_char = _T(')'); break;
2031 default: return FALSE;
2032 }
2033 off++;
2034
2035 *p_contents_beg = off;
2036
2037 while(line_index < n_lines) {
2038 OFF line_end = lines[line_index].end;
2039
2040 while(off < line_end) {
2041 if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
2042 off++;
2043 } else if(CH(off) == closer_char) {
2044 /* Success. */
2045 *p_contents_end = off;
2046 *p_end = off+1;
2047 *p_end_line_index = line_index;
2048 return TRUE;
2049 } else if(closer_char == _T(')') && CH(off) == _T('(')) {
2050 /* ()-style title cannot contain (unescaped '(')) */
2051 return FALSE;
2052 }
2053
2054 off++;
2055 }
2056
2057 line_index++;
2058 }
2059
2060 return FALSE;
2061}
2062
2063/* Returns 0 if it is not a reference definition.
2064 *
2065 * Returns N > 0 if it is a reference definition. N then corresponds to the
2066 * number of lines forming it). In this case the definition is stored for
2067 * resolving any links referring to it.
2068 *
2069 * Returns -1 in case of an error (out of memory).
2070 */
2071static int
2072md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
2073{
2074 OFF label_contents_beg;
2075 OFF label_contents_end;
2076 int label_contents_line_index = -1;
2077 int label_is_multiline;
2078 CHAR* label = NULL;
2079 SZ label_size;
2080 OFF dest_contents_beg;
2081 OFF dest_contents_end;
2082 OFF title_contents_beg;
2083 OFF title_contents_end;
2084 int title_contents_line_index;
2085 int title_is_multiline;
2086 OFF off;
2087 int line_index = 0;
2088 int tmp_line_index;
2089 MD_REF_DEF* def;
2090 int ret;
2091
2092 /* Link label. */
2093 if(!md_is_link_label(ctx, lines, n_lines, lines[0].beg,
2094 &off, &label_contents_line_index, &line_index,
2095 &label_contents_beg, &label_contents_end))
2096 return FALSE;
2097 label_is_multiline = (label_contents_line_index != line_index);
2098
2099 /* Colon. */
2100 if(off >= lines[line_index].end || CH(off) != _T(':'))
2101 return FALSE;
2102 off++;
2103
2104 /* Optional white space with up to one line break. */
2105 while(off < lines[line_index].end && ISWHITESPACE(off))
2106 off++;
2107 if(off >= lines[line_index].end) {
2108 line_index++;
2109 if(line_index >= n_lines)
2110 return FALSE;
2111 off = lines[line_index].beg;
2112 }
2113
2114 /* Link destination. */
2115 if(!md_is_link_destination(ctx, off, lines[line_index].end,
2116 &off, &dest_contents_beg, &dest_contents_end))
2117 return FALSE;
2118
2119 /* (Optional) title. Note we interpret it as an title only if nothing
2120 * more follows on its last line. */
2121 if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off,
2122 &off, &title_contents_line_index, &tmp_line_index,
2123 &title_contents_beg, &title_contents_end)
2124 && off >= lines[line_index + tmp_line_index].end)
2125 {
2126 title_is_multiline = (tmp_line_index != title_contents_line_index);
2127 title_contents_line_index += line_index;
2128 line_index += tmp_line_index;
2129 } else {
2130 /* Not a title. */
2131 title_is_multiline = FALSE;
2132 title_contents_beg = off;
2133 title_contents_end = off;
2134 title_contents_line_index = 0;
2135 }
2136
2137 /* Nothing more can follow on the last line. */
2138 if(off < lines[line_index].end)
2139 return FALSE;
2140
2141 /* Construct label. */
2142 if(!label_is_multiline) {
2143 label = (CHAR*) STR(label_contents_beg);
2144 label_size = label_contents_end - label_contents_beg;
2145 } else {
2146 MD_CHECK(md_merge_lines_alloc(ctx, label_contents_beg, label_contents_end,
2147 lines + label_contents_line_index, n_lines - label_contents_line_index,
2148 _T(' '), &label, &label_size));
2149 }
2150
2151 /* Store the reference definition. */
2152 if(ctx->n_ref_defs >= ctx->alloc_ref_defs) {
2153 MD_REF_DEF* new_defs;
2154
2155 ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0
2156 ? ctx->alloc_ref_defs + ctx->alloc_ref_defs / 2
2157 : 16);
2158 new_defs = (MD_REF_DEF*) realloc(ctx->ref_defs, ctx->alloc_ref_defs * sizeof(MD_REF_DEF));
2159 if(new_defs == NULL) {
2160 MD_LOG("realloc() failed.");
2161 goto abort;
2162 }
2163
2164 ctx->ref_defs = new_defs;
2165 }
2166
2167 def = &ctx->ref_defs[ctx->n_ref_defs];
2168 memset(def, 0, sizeof(MD_REF_DEF));
2169
2170 def->label = label;
2171 def->label_size = label_size;
2172
2173 def->dest_beg = dest_contents_beg;
2174 def->dest_end = dest_contents_end;
2175
2176 if(title_contents_beg >= title_contents_end) {
2177 def->title = NULL;
2178 def->title_size = 0;
2179 } else if(!title_is_multiline) {
2180 def->title = (CHAR*) STR(title_contents_beg);
2181 def->title_size = title_contents_end - title_contents_beg;
2182 } else {
2183 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2184 lines + title_contents_line_index, n_lines - title_contents_line_index,
2185 _T('\n'), &def->title, &def->title_size));
2186 }
2187
2188 /* Success. */
2189 ctx->n_ref_defs++;
2190 return line_index + 1;
2191
2192abort:
2193 /* Failure. */
2194 if(!IS_INPUT_STR(label))
2195 free(label);
2196 return ret;
2197}
2198
2199static int
2200md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
2201 OFF beg, OFF end, MD_LINK_ATTR* attr)
2202{
2203 const MD_REF_DEF* def;
2204 const MD_LINE* beg_line;
2205 const MD_LINE* end_line;
2206 CHAR* label;
2207 SZ label_size;
2208 int ret;
2209
2210 MD_ASSERT(CH(beg) == _T('[') || CH(beg) == _T('!'));
2211 MD_ASSERT(CH(end-1) == _T(']'));
2212
2213 beg += (CH(beg) == _T('!') ? 2 : 1);
2214 end--;
2215
2216 /* Find lines corresponding to the beg and end positions. */
2217 MD_ASSERT(lines[0].beg <= beg);
2218 beg_line = lines;
2219 while(beg >= beg_line->end)
2220 beg_line++;
2221
2222 MD_ASSERT(end <= lines[n_lines-1].end);
2223 end_line = beg_line;
2224 while(end >= end_line->end)
2225 end_line++;
2226
2227 if(beg_line != end_line) {
2228 MD_CHECK(md_merge_lines_alloc(ctx, beg, end, beg_line,
2229 n_lines - (beg_line - lines), _T(' '), &label, &label_size));
2230 } else {
2231 label = (CHAR*) STR(beg);
2232 label_size = end - beg;
2233 }
2234
2235 def = md_lookup_ref_def(ctx, label, label_size);
2236 if(def != NULL) {
2237 attr->dest_beg = def->dest_beg;
2238 attr->dest_end = def->dest_end;
2239 attr->title = def->title;
2240 attr->title_size = def->title_size;
2241 attr->title_needs_free = FALSE;
2242 }
2243
2244 if(!IS_INPUT_STR(label))
2245 free(label);
2246
2247 ret = (def != NULL);
2248
2249abort:
2250 return ret;
2251}
2252
2253static int
2254md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
2255 OFF beg, OFF* p_end, MD_LINK_ATTR* attr)
2256{
2257 int line_index = 0;
2258 int tmp_line_index;
2259 OFF title_contents_beg;
2260 OFF title_contents_end;
2261 int title_contents_line_index;
2262 int title_is_multiline;
2263 OFF off = beg;
2264 int ret = FALSE;
2265
2266 while(off >= lines[line_index].end)
2267 line_index++;
2268
2269 MD_ASSERT(CH(off) == _T('('));
2270 off++;
2271
2272 /* Optional white space with up to one line break. */
2273 while(off < lines[line_index].end && ISWHITESPACE(off))
2274 off++;
2275 if(off >= lines[line_index].end && ISNEWLINE(off)) {
2276 line_index++;
2277 if(line_index >= n_lines)
2278 return FALSE;
2279 off = lines[line_index].beg;
2280 }
2281
2282 /* Link destination may be omitted, but only when not also having a title. */
2283 if(off < ctx->size && CH(off) == _T(')')) {
2284 attr->dest_beg = off;
2285 attr->dest_end = off;
2286 attr->title = NULL;
2287 attr->title_size = 0;
2288 attr->title_needs_free = FALSE;
2289 off++;
2290 *p_end = off;
2291 return TRUE;
2292 }
2293
2294 /* Link destination. */
2295 if(!md_is_link_destination(ctx, off, lines[line_index].end,
2296 &off, &attr->dest_beg, &attr->dest_end))
2297 return FALSE;
2298
2299 /* (Optional) title. */
2300 if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off,
2301 &off, &title_contents_line_index, &tmp_line_index,
2302 &title_contents_beg, &title_contents_end))
2303 {
2304 title_is_multiline = (tmp_line_index != title_contents_line_index);
2305 title_contents_line_index += line_index;
2306 line_index += tmp_line_index;
2307 } else {
2308 /* Not a title. */
2309 title_is_multiline = FALSE;
2310 title_contents_beg = off;
2311 title_contents_end = off;
2312 title_contents_line_index = 0;
2313 }
2314
2315 /* Optional whitespace followed with final ')'. */
2316 while(off < lines[line_index].end && ISWHITESPACE(off))
2317 off++;
2318 if(off >= lines[line_index].end && ISNEWLINE(off)) {
2319 line_index++;
2320 if(line_index >= n_lines)
2321 return FALSE;
2322 off = lines[line_index].beg;
2323 }
2324 if(CH(off) != _T(')'))
2325 goto abort;
2326 off++;
2327
2328 if(title_contents_beg >= title_contents_end) {
2329 attr->title = NULL;
2330 attr->title_size = 0;
2331 attr->title_needs_free = FALSE;
2332 } else if(!title_is_multiline) {
2333 attr->title = (CHAR*) STR(title_contents_beg);
2334 attr->title_size = title_contents_end - title_contents_beg;
2335 attr->title_needs_free = FALSE;
2336 } else {
2337 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2338 lines + title_contents_line_index, n_lines - title_contents_line_index,
2339 _T('\n'), &attr->title, &attr->title_size));
2340 attr->title_needs_free = TRUE;
2341 }
2342
2343 *p_end = off;
2344 ret = TRUE;
2345
2346abort:
2347 return ret;
2348}
2349
2350static void
2351md_free_ref_defs(MD_CTX* ctx)
2352{
2353 int i;
2354
2355 for(i = 0; i < ctx->n_ref_defs; i++) {
2356 MD_REF_DEF* def = &ctx->ref_defs[i];
2357
2358 if(!IS_INPUT_STR(def->label))
2359 free(def->label);
2360 if(!IS_INPUT_STR(def->title))
2361 free(def->title);
2362 }
2363
2364 free(ctx->ref_defs);
2365}
2366
2367
2368/******************************************
2369 *** Processing Inlines (a.k.a Spans) ***
2370 ******************************************/
2371
/* We process inlines in a few phases:
2373 *
2374 * (1) We go through the block text and collect all significant characters
2375 * which may start/end a span or some other significant position into
2376 * ctx->marks[]. Core of this is what md_collect_marks() does.
2377 *
2378 * We also do some very brief preliminary context-less analysis, whether
2379 * it might be opener or closer (e.g. of an emphasis span).
2380 *
2381 * This speeds the other steps as we do not need to re-iterate over all
2382 * characters anymore.
2383 *
 * (2) We analyze each potential mark type, in order of precedence.
2385 *
2386 * In each md_analyze_XXX() function, we re-iterate list of the marks,
2387 * skipping already resolved regions (in preceding precedences) and try to
2388 * resolve them.
2389 *
2390 * (2.1) For trivial marks, which are single (e.g. HTML entity), we just mark
2391 * them as resolved.
2392 *
2393 * (2.2) For range-type marks, we analyze whether the mark could be closer
2394 * and, if yes, whether there is some preceding opener it could satisfy.
2395 *
 * If not, we check whether it could really be an opener and, if yes, we
2397 * remember it so subsequent closers may resolve it.
2398 *
2399 * (3) Finally, when all marks were analyzed, we render the block contents
2400 * by calling MD_RENDERER::text() callback, interrupting by ::enter_span()
2401 * or ::close_span() whenever we reach a resolved mark.
2402 */
2403
2404
2405/* The mark structure.
2406 *
2407 * '\\': Maybe escape sequence.
2408 * '\0': NULL char.
2409 * '*': Maybe (strong) emphasis start/end.
2410 * '_': Maybe (strong) emphasis start/end.
2411 * '~': Maybe strikethrough start/end (needs MD_FLAG_STRIKETHROUGH).
2412 * '`': Maybe code span start/end.
2413 * '&': Maybe start of entity.
2414 * ';': Maybe end of entity.
2415 * '<': Maybe start of raw HTML or autolink.
2416 * '>': Maybe end of raw HTML or autolink.
2417 * '[': Maybe start of link label or link text.
2418 * '!': Equivalent of '[' for image.
2419 * ']': Maybe end of link label or link text.
2420 * '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS).
2421 * ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS).
2422 * '.': Maybe permissive WWW auto-link (needs MD_FLAG_PERMISSIVEWWWAUTOLINKS).
2423 * 'D': Dummy mark, it reserves a space for splitting a previous mark
2424 * (e.g. emphasis) or to make more space for storing some special data
2425 * related to the preceding mark (e.g. link).
2426 *
2427 * Note that not all instances of these chars in the text imply creation of the
2428 * structure. Only those which have (or may have, after we see more context)
2429 * the special meaning.
2430 *
2431 * (Keep this struct as small as possible to fit as much of them into CPU
2432 * cache line.)
2433 */
2434struct MD_MARK_tag {
2435 OFF beg;
2436 OFF end;
2437
2438 /* For unresolved openers, 'prev' and 'next' form the chain of open openers
2439 * of given type 'ch'.
2440 *
2441 * During resolving, we disconnect from the chain and point to the
2442 * corresponding counterpart so opener points to its closer and vice versa.
2443 */
2444 int prev;
2445 int next;
2446 CHAR ch;
2447 unsigned char flags;
2448};
2449
/* Generic mark flags; their meaning is the same for ALL mark types. */
#define MD_MARK_POTENTIAL_OPENER            0x01  /* Maybe opener. */
#define MD_MARK_POTENTIAL_CLOSER            0x02  /* Maybe closer. */
#define MD_MARK_OPENER                      0x04  /* Definitely opener. */
#define MD_MARK_CLOSER                      0x08  /* Definitely closer. */
#define MD_MARK_RESOLVED                    0x10  /* Resolved in any definite way. */

/* Flags whose meaning depends on the mark type. Different mark types reuse
 * the same bits. */

/* Emphasis marks: Helpers for the "rule of 3". The two top bits encode the
 * remainder, i.e. exactly one of MD_MARK_EMPH_MOD3_x is stored under
 * MD_MARK_EMPH_MOD3_MASK. */
#define MD_MARK_EMPH_INTRAWORD              0x20
#define MD_MARK_EMPH_MOD3_0                 0x40
#define MD_MARK_EMPH_MOD3_1                 0x80
#define MD_MARK_EMPH_MOD3_2                 (MD_MARK_EMPH_MOD3_0 | MD_MARK_EMPH_MOD3_1)
#define MD_MARK_EMPH_MOD3_MASK              (MD_MARK_EMPH_MOD3_0 | MD_MARK_EMPH_MOD3_1)

/* Marks '<' and '>': Distinguisher of an autolink. */
#define MD_MARK_AUTOLINK                    0x20

/* Marks of permissive autolinks. */
#define MD_MARK_VALIDPERMISSIVEAUTOLINK     0x20
2465
2466static MD_MARKCHAIN*
2467md_asterisk_chain(MD_CTX* ctx, unsigned flags)
2468{
2469 switch(flags & (MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_MASK)) {
2470 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_intraword_mod3_0;
2471 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_intraword_mod3_1;
2472 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_intraword_mod3_2;
2473 case MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_extraword_mod3_0;
2474 case MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_extraword_mod3_1;
2475 case MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_extraword_mod3_2;
2476 default: MD_UNREACHABLE();
2477 }
2478 return NULL;
2479}
2480
2481static MD_MARKCHAIN*
2482md_mark_chain(MD_CTX* ctx, int mark_index)
2483{
2484 MD_MARK* mark = &ctx->marks[mark_index];
2485
2486 switch(mark->ch) {
2487 case _T('*'): return md_asterisk_chain(ctx, mark->flags);
2488 case _T('_'): return &UNDERSCORE_OPENERS;
2489 case _T('~'): return (mark->end - mark->beg == 1) ? &TILDE_OPENERS_1 : &TILDE_OPENERS_2;
2490 case _T('['): return &BRACKET_OPENERS;
2491 case _T('|'): return &TABLECELLBOUNDARIES;
2492 default: return NULL;
2493 }
2494}
2495
2496static MD_MARK*
2497md_push_mark(MD_CTX* ctx)
2498{
2499 if(ctx->n_marks >= ctx->alloc_marks) {
2500 MD_MARK* new_marks;
2501
2502 ctx->alloc_marks = (ctx->alloc_marks > 0
2503 ? ctx->alloc_marks + ctx->alloc_marks / 2
2504 : 64);
2505 new_marks = realloc(ctx->marks, ctx->alloc_marks * sizeof(MD_MARK));
2506 if(new_marks == NULL) {
2507 MD_LOG("realloc() failed.");
2508 return NULL;
2509 }
2510
2511 ctx->marks = new_marks;
2512 }
2513
2514 return &ctx->marks[ctx->n_marks++];
2515}
2516
/* Append a new mark and store its address into the variable 'mark'. If it
 * fails, set 'ret' to -1 and jump to the label 'abort'. (Hence the macro
 * expects 'ctx', 'mark', 'ret' and 'abort' to exist in the calling function.) */
#define PUSH_MARK_()                                                    \
    do {                                                                \
        if((mark = md_push_mark(ctx)) == NULL) {                        \
            ret = -1;                                                   \
            goto abort;                                                 \
        }                                                               \
    } while(0)

/* Same as PUSH_MARK_(), but also initialize the new mark: it is given the
 * specified character, range and flags, and it is not linked to any other
 * mark. */
#define PUSH_MARK(ch_, beg_, end_, flags_)                              \
    do {                                                                \
        PUSH_MARK_();                                                   \
        mark->prev = -1;                                                \
        mark->next = -1;                                                \
        mark->beg = (beg_);                                             \
        mark->end = (end_);                                             \
        mark->ch = (char)(ch_);                                         \
        mark->flags = (flags_);                                         \
    } while(0)
2536
2537
2538static void
2539md_mark_chain_append(MD_CTX* ctx, MD_MARKCHAIN* chain, int mark_index)
2540{
2541 if(chain->tail >= 0)
2542 ctx->marks[chain->tail].next = mark_index;
2543 else
2544 chain->head = mark_index;
2545
2546 ctx->marks[mark_index].prev = chain->tail;
2547 ctx->marks[mark_index].next = -1;
2548 chain->tail = mark_index;
2549}
2550
2551/* Sometimes, we need to store a pointer into the mark. It is quite rare
2552 * so we do not bother to make MD_MARK use union, and it can only happen
2553 * for dummy marks. */
2554static inline void
2555md_mark_store_ptr(MD_CTX* ctx, int mark_index, void* ptr)
2556{
2557 MD_MARK* mark = &ctx->marks[mark_index];
2558 MD_ASSERT(mark->ch == 'D');
2559
2560 /* Check only members beg and end are misused for this. */
2561 MD_ASSERT(sizeof(void*) <= 2 * sizeof(OFF));
2562 memcpy(mark, &ptr, sizeof(void*));
2563}
2564
2565static inline void*
2566md_mark_get_ptr(MD_CTX* ctx, int mark_index)
2567{
2568 void* ptr;
2569 MD_MARK* mark = &ctx->marks[mark_index];
2570 MD_ASSERT(mark->ch == 'D');
2571 memcpy(&ptr, mark, sizeof(void*));
2572 return ptr;
2573}
2574
2575static void
2576md_resolve_range(MD_CTX* ctx, MD_MARKCHAIN* chain, int opener_index, int closer_index)
2577{
2578 MD_MARK* opener = &ctx->marks[opener_index];
2579 MD_MARK* closer = &ctx->marks[closer_index];
2580
2581 /* Remove opener from the list of openers. */
2582 if(chain != NULL) {
2583 if(opener->prev >= 0)
2584 ctx->marks[opener->prev].next = opener->next;
2585 else
2586 chain->head = opener->next;
2587
2588 if(opener->next >= 0)
2589 ctx->marks[opener->next].prev = opener->prev;
2590 else
2591 chain->tail = opener->prev;
2592 }
2593
2594 /* Interconnect opener and closer and mark both as resolved. */
2595 opener->next = closer_index;
2596 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
2597 closer->prev = opener_index;
2598 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
2599}
2600
2601
2602#define MD_ROLLBACK_ALL 0
2603#define MD_ROLLBACK_CROSSING 1
2604
2605/* In the range ctx->marks[opener_index] ... [closer_index], undo some or all
2606 * resolvings accordingly to these rules:
2607 *
2608 * (1) All openers BEFORE the range corresponding to any closer inside the
2609 * range are un-resolved and they are re-added to their respective chains
2610 * of unresolved openers. This ensures we can reuse the opener for closers
2611 * AFTER the range.
2612 *
2613 * (2) If 'how' is MD_ROLLBACK_ALL, then ALL resolved marks inside the range
2614 * are discarded.
2615 *
2616 * (3) If 'how' is MD_ROLLBACK_CROSSING, only closers with openers handled
2617 * in (1) are discarded. I.e. pairs of openers and closers which are both
2618 * inside the range are retained as well as any unpaired marks.
2619 */
2620static void
2621md_rollback(MD_CTX* ctx, int opener_index, int closer_index, int how)
2622{
2623 int i;
2624 int mark_index;
2625
2626 /* Cut all unresolved openers at the mark index. */
2627 for(i = OPENERS_CHAIN_FIRST; i < OPENERS_CHAIN_LAST+1; i++) {
2628 MD_MARKCHAIN* chain = &ctx->mark_chains[i];
2629
2630 while(chain->tail >= opener_index)
2631 chain->tail = ctx->marks[chain->tail].prev;
2632
2633 if(chain->tail >= 0)
2634 ctx->marks[chain->tail].next = -1;
2635 else
2636 chain->head = -1;
2637 }
2638
2639 /* Go backwards so that unresolved openers are re-added into their
2640 * respective chains, in the right order. */
2641 mark_index = closer_index - 1;
2642 while(mark_index > opener_index) {
2643 MD_MARK* mark = &ctx->marks[mark_index];
2644 int mark_flags = mark->flags;
2645 int discard_flag = (how == MD_ROLLBACK_ALL);
2646
2647 if(mark->flags & MD_MARK_CLOSER) {
2648 int mark_opener_index = mark->prev;
2649
2650 /* Undo opener BEFORE the range. */
2651 if(mark_opener_index < opener_index) {
2652 MD_MARK* mark_opener = &ctx->marks[mark_opener_index];
2653 MD_MARKCHAIN* chain;
2654
2655 mark_opener->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
2656 chain = md_mark_chain(ctx, opener_index);
2657 if(chain != NULL) {
2658 md_mark_chain_append(ctx, chain, mark_opener_index);
2659 discard_flag = 1;
2660 }
2661 }
2662 }
2663
2664 /* And reset our flags. */
2665 if(discard_flag)
2666 mark->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
2667
2668 /* Jump as far as we can over unresolved or non-interesting marks. */
2669 switch(how) {
2670 case MD_ROLLBACK_CROSSING:
2671 if((mark_flags & MD_MARK_CLOSER) && mark->prev > opener_index) {
2672 /* If we are closer with opener INSIDE the range, there may
2673 * not be any other crosser inside the subrange. */
2674 mark_index = mark->prev;
2675 break;
2676 }
2677 /* Pass through. */
2678 default:
2679 mark_index--;
2680 break;
2681 }
2682 }
2683}
2684
2685static void
2686md_build_mark_char_map(MD_CTX* ctx)
2687{
2688 memset(ctx->mark_char_map, 0, sizeof(ctx->mark_char_map));
2689
2690 ctx->mark_char_map['\\'] = 1;
2691 ctx->mark_char_map['*'] = 1;
2692 ctx->mark_char_map['_'] = 1;
2693 ctx->mark_char_map['`'] = 1;
2694 ctx->mark_char_map['&'] = 1;
2695 ctx->mark_char_map[';'] = 1;
2696 ctx->mark_char_map['<'] = 1;
2697 ctx->mark_char_map['>'] = 1;
2698 ctx->mark_char_map['['] = 1;
2699 ctx->mark_char_map['!'] = 1;
2700 ctx->mark_char_map[']'] = 1;
2701 ctx->mark_char_map['\0'] = 1;
2702
2703 if(ctx->parser.flags & MD_FLAG_STRIKETHROUGH)
2704 ctx->mark_char_map['~'] = 1;
2705
2706 if(ctx->parser.flags & MD_FLAG_LATEXMATHSPANS)
2707 ctx->mark_char_map['$'] = 1;
2708
2709 if(ctx->parser.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
2710 ctx->mark_char_map['@'] = 1;
2711
2712 if(ctx->parser.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS)
2713 ctx->mark_char_map[':'] = 1;
2714
2715 if(ctx->parser.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS)
2716 ctx->mark_char_map['.'] = 1;
2717
2718 if((ctx->parser.flags & MD_FLAG_TABLES) || (ctx->parser.flags & MD_FLAG_WIKILINKS))
2719 ctx->mark_char_map['|'] = 1;
2720
2721 if(ctx->parser.flags & MD_FLAG_COLLAPSEWHITESPACE) {
2722 int i;
2723
2724 for(i = 0; i < (int) sizeof(ctx->mark_char_map); i++) {
2725 if(ISWHITESPACE_(i))
2726 ctx->mark_char_map[i] = 1;
2727 }
2728 }
2729}
2730
2731/* We limit code span marks to lower then 32 backticks. This solves the
2732 * pathologic case of too many openers, each of different length: Their
2733 * resolving would be then O(n^2). */
2734#define CODESPAN_MARK_MAXLEN 32
2735
2736static int
2737md_is_code_span(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
2738 OFF* p_opener_beg, OFF* p_opener_end,
2739 OFF* p_closer_beg, OFF* p_closer_end,
2740 OFF last_potential_closers[CODESPAN_MARK_MAXLEN],
2741 int* p_reached_paragraph_end)
2742{
2743 OFF opener_beg = beg;
2744 OFF opener_end;
2745 OFF closer_beg;
2746 OFF closer_end;
2747 SZ mark_len;
2748 OFF line_end;
2749 int has_space_after_opener = FALSE;
2750 int has_eol_after_opener = FALSE;
2751 int has_space_before_closer = FALSE;
2752 int has_eol_before_closer = FALSE;
2753 int has_only_space = TRUE;
2754 int line_index = 0;
2755
2756 line_end = lines[0].end;
2757 opener_end = opener_beg;
2758 while(opener_end < line_end && CH(opener_end) == _T('`'))
2759 opener_end++;
2760 has_space_after_opener = (opener_end < line_end && CH(opener_end) == _T(' '));
2761 has_eol_after_opener = (opener_end == line_end);
2762
2763 /* The caller needs to know end of the opening mark even if we fail. */
2764 *p_opener_end = opener_end;
2765
2766 mark_len = opener_end - opener_beg;
2767 if(mark_len > CODESPAN_MARK_MAXLEN)
2768 return FALSE;
2769
2770 /* Check whether we already know there is no closer of this length.
2771 * If so, re-scan does no sense. This fixes issue #59. */
2772 if(last_potential_closers[mark_len-1] >= lines[n_lines-1].end ||
2773 (*p_reached_paragraph_end && last_potential_closers[mark_len-1] < opener_end))
2774 return FALSE;
2775
2776 closer_beg = opener_end;
2777 closer_end = opener_end;
2778
2779 /* Find closer mark. */
2780 while(TRUE) {
2781 while(closer_beg < line_end && CH(closer_beg) != _T('`')) {
2782 if(CH(closer_beg) != _T(' '))
2783 has_only_space = FALSE;
2784 closer_beg++;
2785 }
2786 closer_end = closer_beg;
2787 while(closer_end < line_end && CH(closer_end) == _T('`'))
2788 closer_end++;
2789
2790 if(closer_end - closer_beg == mark_len) {
2791 /* Success. */
2792 has_space_before_closer = (closer_beg > lines[line_index].beg && CH(closer_beg-1) == _T(' '));
2793 has_eol_before_closer = (closer_beg == lines[line_index].beg);
2794 break;
2795 }
2796
2797 if(closer_end - closer_beg > 0) {
2798 /* We have found a back-tick which is not part of the closer. */
2799 has_only_space = FALSE;
2800
2801 /* But if we eventually fail, remember it as a potential closer
2802 * of its own length for future attempts. This mitigates needs for
2803 * rescans. */
2804 if(closer_end - closer_beg < CODESPAN_MARK_MAXLEN) {
2805 if(closer_beg > last_potential_closers[closer_end - closer_beg - 1])
2806 last_potential_closers[closer_end - closer_beg - 1] = closer_beg;
2807 }
2808 }
2809
2810 if(closer_end >= line_end) {
2811 line_index++;
2812 if(line_index >= n_lines) {
2813 /* Reached end of the paragraph and still nothing. */
2814 *p_reached_paragraph_end = TRUE;
2815 return FALSE;
2816 }
2817 /* Try on the next line. */
2818 line_end = lines[line_index].end;
2819 closer_beg = lines[line_index].beg;
2820 } else {
2821 closer_beg = closer_end;
2822 }
2823 }
2824
2825 /* If there is a space or a new line both after and before the opener
2826 * (and if the code span is not made of spaces only), consume one initial
2827 * and one trailing space as part of the marks. */
2828 if(!has_only_space &&
2829 (has_space_after_opener || has_eol_after_opener) &&
2830 (has_space_before_closer || has_eol_before_closer))
2831 {
2832 if(has_space_after_opener)
2833 opener_end++;
2834 else
2835 opener_end = lines[1].beg;
2836
2837 if(has_space_before_closer)
2838 closer_beg--;
2839 else {
2840 closer_beg = lines[line_index-1].end;
2841 /* We need to eat the preceding "\r\n" but not any line trailing
2842 * spaces. */
2843 while(closer_beg < ctx->size && ISBLANK(closer_beg))
2844 closer_beg++;
2845 }
2846 }
2847
2848 *p_opener_beg = opener_beg;
2849 *p_opener_end = opener_end;
2850 *p_closer_beg = closer_beg;
2851 *p_closer_end = closer_end;
2852 return TRUE;
2853}
2854
2855static int
2856md_is_autolink_uri(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2857{
2858 OFF off = beg+1;
2859
2860 MD_ASSERT(CH(beg) == _T('<'));
2861
2862 /* Check for scheme. */
2863 if(off >= max_end || !ISASCII(off))
2864 return FALSE;
2865 off++;
2866 while(1) {
2867 if(off >= max_end)
2868 return FALSE;
2869 if(off - beg > 32)
2870 return FALSE;
2871 if(CH(off) == _T(':') && off - beg >= 3)
2872 break;
2873 if(!ISALNUM(off) && CH(off) != _T('+') && CH(off) != _T('-') && CH(off) != _T('.'))
2874 return FALSE;
2875 off++;
2876 }
2877
2878 /* Check the path after the scheme. */
2879 while(off < max_end && CH(off) != _T('>')) {
2880 if(ISWHITESPACE(off) || ISCNTRL(off) || CH(off) == _T('<'))
2881 return FALSE;
2882 off++;
2883 }
2884
2885 if(off >= max_end)
2886 return FALSE;
2887
2888 MD_ASSERT(CH(off) == _T('>'));
2889 *p_end = off+1;
2890 return TRUE;
2891}
2892
2893static int
2894md_is_autolink_email(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2895{
2896 OFF off = beg + 1;
2897 int label_len;
2898
2899 MD_ASSERT(CH(beg) == _T('<'));
2900
2901 /* The code should correspond to this regexp:
2902 /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+
2903 @[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
2904 (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/
2905 */
2906
2907 /* Username (before '@'). */
2908 while(off < max_end && (ISALNUM(off) || ISANYOF(off, _T(".!#$%&'*+/=?^_`{|}~-"))))
2909 off++;
2910 if(off <= beg+1)
2911 return FALSE;
2912
2913 /* '@' */
2914 if(off >= max_end || CH(off) != _T('@'))
2915 return FALSE;
2916 off++;
2917
2918 /* Labels delimited with '.'; each label is sequence of 1 - 63 alnum
2919 * characters or '-', but '-' is not allowed as first or last char. */
2920 label_len = 0;
2921 while(off < max_end) {
2922 if(ISALNUM(off))
2923 label_len++;
2924 else if(CH(off) == _T('-') && label_len > 0)
2925 label_len++;
2926 else if(CH(off) == _T('.') && label_len > 0 && CH(off-1) != _T('-'))
2927 label_len = 0;
2928 else
2929 break;
2930
2931 if(label_len > 63)
2932 return FALSE;
2933
2934 off++;
2935 }
2936
2937 if(label_len <= 0 || off >= max_end || CH(off) != _T('>') || CH(off-1) == _T('-'))
2938 return FALSE;
2939
2940 *p_end = off+1;
2941 return TRUE;
2942}
2943
2944static int
2945md_is_autolink(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, int* p_missing_mailto)
2946{
2947 if(md_is_autolink_uri(ctx, beg, max_end, p_end)) {
2948 *p_missing_mailto = FALSE;
2949 return TRUE;
2950 }
2951
2952 if(md_is_autolink_email(ctx, beg, max_end, p_end)) {
2953 *p_missing_mailto = TRUE;
2954 return TRUE;
2955 }
2956
2957 return FALSE;
2958}
2959
/* Scan the text of the given lines and push a mark (via PUSH_MARK) for every
 * character sequence which may have a special meaning during the inline
 * analysis: escapes, emphasis runs, code spans, entities, raw HTML,
 * autolinks, brackets, permissive autolink seeds, table pipes, tildes,
 * dollars, non-trivial whitespace and NUL characters.
 *
 * Code spans, raw HTML and "<...>" autolinks are recognized right here and
 * pushed as already resolved opener/closer pairs. Most other marks are only
 * pushed as potential openers/closers and get resolved by later passes.
 *
 * Some marks are followed by dummy 'D' marks which reserve slots in the mark
 * array for data or for splitting the mark later.
 *
 * If table_mode is non-zero (or wiki-links are enabled), '|' is collected as
 * a mark as well.
 *
 * Returns ret, which is initialized to 0.
 * NOTE(review): PUSH_MARK presumably sets ret and jumps to the abort label on
 * failure, and presumably uses the local variable mark -- confirm against the
 * macro definition.
 */
static int
md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
{
    int i;
    int ret = 0;
    MD_MARK* mark;
    /* State shared among all md_is_code_span() calls made for this block. */
    OFF codespan_last_potential_closers[CODESPAN_MARK_MAXLEN] = { 0 };
    int codespan_scanned_till_paragraph_end = FALSE;

    for(i = 0; i < n_lines; i++) {
        const MD_LINE* line = &lines[i];
        OFF off = line->beg;
        OFF line_end = line->end;

        while(TRUE) {
            CHAR ch;

#ifdef MD4C_USE_UTF16
    /* For UTF-16, mark_char_map[] covers only ASCII. */
    #define IS_MARK_CHAR(off) ((CH(off) < SIZEOF_ARRAY(ctx->mark_char_map)) && \
                               (ctx->mark_char_map[(unsigned char) CH(off)]))
#else
    /* For 8-bit encodings, mark_char_map[] covers all 256 elements. */
    #define IS_MARK_CHAR(off) (ctx->mark_char_map[(unsigned char) CH(off)])
#endif

            /* Optimization: Use some loop unrolling. */
            while(off + 3 < line_end && !IS_MARK_CHAR(off+0) && !IS_MARK_CHAR(off+1)
                                     && !IS_MARK_CHAR(off+2) && !IS_MARK_CHAR(off+3))
                off += 4;
            while(off < line_end && !IS_MARK_CHAR(off+0))
                off++;

            /* No more mark characters on this line. */
            if(off >= line_end)
                break;

            ch = CH(off);

            /* A backslash escape.
             * It can go beyond line->end as it may involve escaped new
             * line to form a hard break. */
            if(ch == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
                /* Hard-break cannot be on the last line of the block. */
                if(!ISNEWLINE(off+1) || i+1 < n_lines)
                    PUSH_MARK(ch, off, off+2, MD_MARK_RESOLVED);
                off += 2;
                continue;
            }

            /* A potential (string) emphasis start/end. */
            if(ch == _T('*') || ch == _T('_')) {
                OFF tmp = off+1;
                int left_level; /* What precedes: 0 = whitespace; 1 = punctuation; 2 = other char. */
                int right_level; /* What follows: 0 = whitespace; 1 = punctuation; 2 = other char. */

                /* Find the end of the run of the same character. */
                while(tmp < line_end && CH(tmp) == ch)
                    tmp++;

                /* Line boundaries count as whitespace on both sides. */
                if(off == line->beg || ISUNICODEWHITESPACEBEFORE(off))
                    left_level = 0;
                else if(ISUNICODEPUNCTBEFORE(off))
                    left_level = 1;
                else
                    left_level = 2;

                if(tmp == line_end || ISUNICODEWHITESPACE(tmp))
                    right_level = 0;
                else if(ISUNICODEPUNCT(tmp))
                    right_level = 1;
                else
                    right_level = 2;

                /* Intra-word underscore doesn't have special meaning. */
                if(ch == _T('_') && left_level == 2 && right_level == 2) {
                    left_level = 0;
                    right_level = 0;
                }

                /* A run surrounded by whitespace on both sides is no mark. */
                if(left_level != 0 || right_level != 0) {
                    unsigned flags = 0;

                    if(left_level > 0 && left_level >= right_level)
                        flags |= MD_MARK_POTENTIAL_CLOSER;
                    if(right_level > 0 && right_level >= left_level)
                        flags |= MD_MARK_POTENTIAL_OPENER;
                    if(left_level == 2 && right_level == 2)
                        flags |= MD_MARK_EMPH_INTRAWORD;

                    /* For "the rule of three" we need to remember the original
                     * size of the mark (modulo three), before we potentially
                     * split the mark when being later resolved partially by some
                     * shorter closer. */
                    switch((tmp - off) % 3) {
                        case 0: flags |= MD_MARK_EMPH_MOD3_0; break;
                        case 1: flags |= MD_MARK_EMPH_MOD3_1; break;
                        case 2: flags |= MD_MARK_EMPH_MOD3_2; break;
                    }

                    PUSH_MARK(ch, off, tmp, flags);

                    /* During resolving, multiple asterisks may have to be
                     * split into independent span start/ends. Consider e.g.
                     * "**foo* bar*". Therefore we push also some empty dummy
                     * marks to have enough space for that. */
                    off++;
                    while(off < tmp) {
                        PUSH_MARK('D', off, off, 0);
                        off++;
                    }
                    continue;
                }

                off = tmp;
                continue;
            }

            /* A potential code span start/end. */
            if(ch == _T('`')) {
                OFF opener_beg, opener_end;
                OFF closer_beg, closer_end;
                int is_code_span;

                is_code_span = md_is_code_span(ctx, lines + i, n_lines - i, off,
                                    &opener_beg, &opener_end, &closer_beg, &closer_end,
                                    codespan_last_potential_closers,
                                    &codespan_scanned_till_paragraph_end);
                if(is_code_span) {
                    /* Push the pair as resolved and interconnect the two
                     * marks through next/prev. */
                    PUSH_MARK(_T('`'), opener_beg, opener_end, MD_MARK_OPENER | MD_MARK_RESOLVED);
                    PUSH_MARK(_T('`'), closer_beg, closer_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
                    ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
                    ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;

                    off = closer_end;

                    /* Advance the current line accordingly. */
                    while(off > line_end) {
                        i++;
                        line++;
                        line_end = line->end;
                    }
                    continue;
                }

                /* Not a code span: skip the whole backtick run. */
                off = opener_end;
                continue;
            }

            /* A potential entity start. */
            if(ch == _T('&')) {
                PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
                off++;
                continue;
            }

            /* A potential entity end. */
            if(ch == _T(';')) {
                /* We surely cannot be entity unless the previous mark is '&'. */
                if(ctx->n_marks > 0 && ctx->marks[ctx->n_marks-1].ch == _T('&'))
                    PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);

                off++;
                continue;
            }

            /* A potential autolink or raw HTML start/end. */
            if(ch == _T('<')) {
                int is_autolink;
                OFF autolink_end;
                int missing_mailto;

                if(!(ctx->parser.flags & MD_FLAG_NOHTMLSPANS)) {
                    int is_html;
                    OFF html_end;

                    /* Given the nature of the raw HTML, we have to recognize
                     * it here. Doing so later in md_analyze_lt_gt() could
                     * open can of worms of quadratic complexity. */
                    is_html = md_is_html_any(ctx, lines + i, n_lines - i, off,
                                    lines[n_lines-1].end, &html_end);
                    if(is_html) {
                        /* Both marks of the pair are zero-length. */
                        PUSH_MARK(_T('<'), off, off, MD_MARK_OPENER | MD_MARK_RESOLVED);
                        PUSH_MARK(_T('>'), html_end, html_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
                        ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
                        ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
                        off = html_end;

                        /* Advance the current line accordingly. */
                        while(off > line_end) {
                            i++;
                            line++;
                            line_end = line->end;
                        }
                        continue;
                    }
                }

                is_autolink = md_is_autolink(ctx, off, lines[n_lines-1].end,
                                    &autolink_end, &missing_mailto);
                if(is_autolink) {
                    /* The opener character '@' (instead of '<') records that
                     * the autolink is an e-mail lacking the "mailto:". */
                    PUSH_MARK((missing_mailto ? _T('@') : _T('<')), off, off+1,
                                MD_MARK_OPENER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
                    PUSH_MARK(_T('>'), autolink_end-1, autolink_end,
                                MD_MARK_CLOSER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
                    ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
                    ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
                    off = autolink_end;
                    continue;
                }

                off++;
                continue;
            }

            /* A potential link or its part. */
            if(ch == _T('[') || (ch == _T('!') && off+1 < line_end && CH(off+1) == _T('['))) {
                /* The mark covers "[" or "![". */
                OFF tmp = (ch == _T('[') ? off+1 : off+2);
                PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER);
                off = tmp;
                /* Two dummies to make enough place for data we need if it is
                 * a link. */
                PUSH_MARK('D', off, off, 0);
                PUSH_MARK('D', off, off, 0);
                continue;
            }
            if(ch == _T(']')) {
                PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
                off++;
                continue;
            }

            /* A potential permissive e-mail autolink. */
            if(ch == _T('@')) {
                /* Require an alphanumeric character on both sides of '@' and
                 * some more characters following on the line. */
                if(line->beg + 1 <= off && ISALNUM(off-1) &&
                    off + 3 < line->end && ISALNUM(off+1))
                {
                    PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
                    /* Push a dummy as a reserve for a closer. */
                    PUSH_MARK('D', off, off, 0);
                }

                off++;
                continue;
            }

            /* A potential permissive URL autolink. */
            if(ch == _T(':')) {
                static struct {
                    const CHAR* scheme;
                    SZ scheme_size;
                    const CHAR* suffix;
                    SZ suffix_size;
                } scheme_map[] = {
                    /* In the order from the most frequently used, arguably. */
                    { _T("http"), 4, _T("//"), 2 },
                    { _T("https"), 5, _T("//"), 2 },
                    { _T("ftp"), 3, _T("//"), 2 }
                };
                int scheme_index;

                for(scheme_index = 0; scheme_index < (int) SIZEOF_ARRAY(scheme_map); scheme_index++) {
                    const CHAR* scheme = scheme_map[scheme_index].scheme;
                    const SZ scheme_size = scheme_map[scheme_index].scheme_size;
                    const CHAR* suffix = scheme_map[scheme_index].suffix;
                    const SZ suffix_size = scheme_map[scheme_index].suffix_size;

                    /* The scheme must directly precede the ':' and start at
                     * the line beginning or after whitespace or one of
                     * "*_~(["; the suffix must follow the ':' and must not
                     * be the last thing on the line. */
                    if(line->beg + scheme_size <= off && md_ascii_eq(STR(off-scheme_size), scheme, scheme_size) &&
                        (line->beg + scheme_size == off || ISWHITESPACE(off-scheme_size-1) || ISANYOF(off-scheme_size-1, _T("*_~(["))) &&
                        off + 1 + suffix_size < line->end && md_ascii_eq(STR(off+1), suffix, suffix_size))
                    {
                        /* The mark covers e.g. the whole "http://". */
                        PUSH_MARK(ch, off-scheme_size, off+1+suffix_size, MD_MARK_POTENTIAL_OPENER);
                        /* Push a dummy as a reserve for a closer. */
                        PUSH_MARK('D', off, off, 0);
                        off += 1 + suffix_size;
                        continue;
                    }
                }

                off++;
                continue;
            }

            /* A potential permissive WWW autolink. */
            if(ch == _T('.')) {
                /* Same placement rules as for the URL schemes above, applied
                 * to the "www" preceding the '.'. */
                if(line->beg + 3 <= off && md_ascii_eq(STR(off-3), _T("www"), 3) &&
                    (line->beg + 3 == off || ISWHITESPACE(off-4) || ISANYOF(off-4, _T("*_~(["))) &&
                    off + 1 < line_end)
                {
                    /* The mark covers the whole "www.". */
                    PUSH_MARK(ch, off-3, off+1, MD_MARK_POTENTIAL_OPENER);
                    /* Push a dummy as a reserve for a closer. */
                    PUSH_MARK('D', off, off, 0);
                    off++;
                    continue;
                }

                off++;
                continue;
            }

            /* A potential table cell boundary or wiki link label delimiter. */
            if((table_mode || ctx->parser.flags & MD_FLAG_WIKILINKS) && ch == _T('|')) {
                PUSH_MARK(ch, off, off+1, 0);
                off++;
                continue;
            }

            /* A potential strikethrough start/end. */
            if(ch == _T('~')) {
                OFF tmp = off+1;

                while(tmp < line_end && CH(tmp) == _T('~'))
                    tmp++;

                /* Only runs of one or two tildes can form a mark. */
                if(tmp - off < 3) {
                    unsigned flags = 0;

                    if(tmp < line_end && !ISUNICODEWHITESPACE(tmp))
                        flags |= MD_MARK_POTENTIAL_OPENER;
                    if(off > line->beg && !ISUNICODEWHITESPACEBEFORE(off))
                        flags |= MD_MARK_POTENTIAL_CLOSER;
                    if(flags != 0)
                        PUSH_MARK(ch, off, tmp, flags);
                }

                off = tmp;
                continue;
            }

            /* A potential equation start/end */
            if(ch == _T('$')) {
                /* We can have at most two consecutive $ signs,
                 * where two dollar signs signify a display equation. */
                OFF tmp = off+1;

                while(tmp < line_end && CH(tmp) == _T('$'))
                    tmp++;

                if (tmp - off <= 2)
                    PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER);
                off = tmp;
                continue;
            }

            /* Turn non-trivial whitespace into single space. */
            if(ISWHITESPACE_(ch)) {
                OFF tmp = off+1;

                while(tmp < line_end && ISWHITESPACE(tmp))
                    tmp++;

                /* A lone ordinary space needs no mark. */
                if(tmp - off > 1 || ch != _T(' '))
                    PUSH_MARK(ch, off, tmp, MD_MARK_RESOLVED);

                off = tmp;
                continue;
            }

            /* NULL character. */
            if(ch == _T('\0')) {
                PUSH_MARK(ch, off, off+1, MD_MARK_RESOLVED);
                off++;
                continue;
            }

            /* A mark character which is handled by none of the cases above. */
            off++;
        }
    }

    /* Add a dummy mark at the end of the mark vector to simplify
     * process_inlines(). */
    PUSH_MARK(127, ctx->size, ctx->size, MD_MARK_RESOLVED);

abort:
    return ret;
}
3334
3335static void
3336md_analyze_bracket(MD_CTX* ctx, int mark_index)
3337{
3338 /* We cannot really resolve links here as for that we would need
3339 * more context. E.g. a following pair of brackets (reference link),
3340 * or enclosing pair of brackets (if the inner is the link, the outer
3341 * one cannot be.)
3342 *
3343 * Therefore we here only construct a list of resolved '[' ']' pairs
3344 * ordered by position of the closer. This allows ur to analyze what is
3345 * or is not link in the right order, from inside to outside in case
3346 * of nested brackets.
3347 *
3348 * The resolving itself is deferred into md_resolve_links().
3349 */
3350
3351 MD_MARK* mark = &ctx->marks[mark_index];
3352
3353 if(mark->flags & MD_MARK_POTENTIAL_OPENER) {
3354 md_mark_chain_append(ctx, &BRACKET_OPENERS, mark_index);
3355 return;
3356 }
3357
3358 if(BRACKET_OPENERS.tail >= 0) {
3359 /* Pop the opener from the chain. */
3360 int opener_index = BRACKET_OPENERS.tail;
3361 MD_MARK* opener = &ctx->marks[opener_index];
3362 if(opener->prev >= 0)
3363 ctx->marks[opener->prev].next = -1;
3364 else
3365 BRACKET_OPENERS.head = -1;
3366 BRACKET_OPENERS.tail = opener->prev;
3367
3368 /* Interconnect the opener and closer. */
3369 opener->next = mark_index;
3370 mark->prev = opener_index;
3371
3372 /* Add the pair into chain of potential links for md_resolve_links().
3373 * Note we misuse opener->prev for this as opener->next points to its
3374 * closer. */
3375 if(ctx->unresolved_link_tail >= 0)
3376 ctx->marks[ctx->unresolved_link_tail].prev = opener_index;
3377 else
3378 ctx->unresolved_link_head = opener_index;
3379 ctx->unresolved_link_tail = opener_index;
3380 opener->prev = -1;
3381 }
3382}
3383
3384/* Forward declaration. */
3385static void md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
3386 int mark_beg, int mark_end);
3387
3388static int
3389md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
3390{
3391 int opener_index = ctx->unresolved_link_head;
3392 OFF last_link_beg = 0;
3393 OFF last_link_end = 0;
3394 OFF last_img_beg = 0;
3395 OFF last_img_end = 0;
3396
3397 while(opener_index >= 0) {
3398 MD_MARK* opener = &ctx->marks[opener_index];
3399 int closer_index = opener->next;
3400 MD_MARK* closer = &ctx->marks[closer_index];
3401 int next_index = opener->prev;
3402 MD_MARK* next_opener;
3403 MD_MARK* next_closer;
3404 MD_LINK_ATTR attr;
3405 int is_link = FALSE;
3406
3407 if(next_index >= 0) {
3408 next_opener = &ctx->marks[next_index];
3409 next_closer = &ctx->marks[next_opener->next];
3410 } else {
3411 next_opener = NULL;
3412 next_closer = NULL;
3413 }
3414
3415 /* If nested ("[ [ ] ]"), we need to make sure that:
3416 * - The outer does not end inside of (...) belonging to the inner.
3417 * - The outer cannot be link if the inner is link (i.e. not image).
3418 *
3419 * (Note we here analyze from inner to outer as the marks are ordered
3420 * by closer->beg.)
3421 */
3422 if((opener->beg < last_link_beg && closer->end < last_link_end) ||
3423 (opener->beg < last_img_beg && closer->end < last_img_end) ||
3424 (opener->beg < last_link_end && opener->ch == '['))
3425 {
3426 opener_index = next_index;
3427 continue;
3428 }
3429
3430 /* Recognize and resolve wiki links.
3431 * Wiki-links maybe '[[destination]]' or '[[destination|label]]'.
3432 */
3433 if ((ctx->parser.flags & MD_FLAG_WIKILINKS) &&
3434 (opener->end - opener->beg == 1) && /* not image */
3435 next_opener != NULL && /* double '[' opener */
3436 next_opener->ch == '[' &&
3437 (next_opener->beg == opener->beg - 1) &&
3438 (next_opener->end - next_opener->beg == 1) &&
3439 next_closer != NULL && /* double ']' closer */
3440 next_closer->ch == ']' &&
3441 (next_closer->beg == closer->beg + 1) &&
3442 (next_closer->end - next_closer->beg == 1))
3443 {
3444 MD_MARK* delim = NULL;
3445 int delim_index;
3446 OFF dest_beg, dest_end;
3447
3448 is_link = TRUE;
3449
3450 /* We don't allow destination to be longer then 100 characters.
3451 * Lets scan to see whether there is '|'. (If not then the whole
3452 * wiki-link has to be below the 100 characters.) */
3453 delim_index = opener_index + 1;
3454 while(delim_index < closer_index) {
3455 MD_MARK* m = &ctx->marks[delim_index];
3456 if(m->ch == '|') {
3457 delim = m;
3458 break;
3459 }
3460 if(m->ch != 'D' && m->beg - opener->end > 100)
3461 break;
3462 delim_index++;
3463 }
3464 dest_beg = opener->end;
3465 dest_end = (delim != NULL) ? delim->beg : closer->beg;
3466 if(dest_end - dest_beg == 0 || dest_end - dest_beg > 100)
3467 is_link = FALSE;
3468
3469 /* There may not be any new line in the destination. */
3470 if(is_link) {
3471 OFF off;
3472 for(off = dest_beg; off < dest_end; off++) {
3473 if(ISNEWLINE(off)) {
3474 is_link = FALSE;
3475 break;
3476 }
3477 }
3478 }
3479
3480 if(is_link) {
3481 if(delim != NULL) {
3482 if(delim->end < closer->beg) {
3483 opener->end = delim->beg;
3484 } else {
3485 /* The pipe is just before the closer: [[foo|]] */
3486 closer->beg = delim->beg;
3487 delim = NULL;
3488 }
3489 }
3490
3491 opener->beg = next_opener->beg;
3492 opener->next = closer_index;
3493 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
3494
3495 closer->end = next_closer->end;
3496 closer->prev = opener_index;
3497 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
3498
3499 last_link_beg = opener->beg;
3500 last_link_end = closer->end;
3501
3502 if(delim != NULL) {
3503 delim->flags |= MD_MARK_RESOLVED;
3504 md_rollback(ctx, opener_index, delim_index, MD_ROLLBACK_ALL);
3505 md_analyze_link_contents(ctx, lines, n_lines, opener_index+1, closer_index);
3506 } else {
3507 md_rollback(ctx, opener_index, closer_index, MD_ROLLBACK_ALL);
3508 }
3509
3510 opener_index = next_opener->prev;
3511 continue;
3512 }
3513 }
3514
3515 if(next_opener != NULL && next_opener->beg == closer->end) {
3516 if(next_closer->beg > closer->end + 1) {
3517 /* Might be full reference link. */
3518 is_link = md_is_link_reference(ctx, lines, n_lines, next_opener->beg, next_closer->end, &attr);
3519 } else {
3520 /* Might be shortcut reference link. */
3521 is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
3522 }
3523
3524 if(is_link < 0)
3525 return -1;
3526
3527 if(is_link) {
3528 /* Eat the 2nd "[...]". */
3529 closer->end = next_closer->end;
3530 }
3531 } else {
3532 if(closer->end < ctx->size && CH(closer->end) == _T('(')) {
3533 /* Might be inline link. */
3534 OFF inline_link_end = UINT_MAX;
3535
3536 is_link = md_is_inline_link_spec(ctx, lines, n_lines, closer->end, &inline_link_end, &attr);
3537 if(is_link < 0)
3538 return -1;
3539
3540 /* Check the closing ')' is not inside an already resolved range
3541 * (i.e. a range with a higher priority), e.g. a code span. */
3542 if(is_link) {
3543 int i = closer_index + 1;
3544
3545 while(i < ctx->n_marks) {
3546 MD_MARK* mark = &ctx->marks[i];
3547
3548 if(mark->beg >= inline_link_end)
3549 break;
3550 if((mark->flags & (MD_MARK_OPENER | MD_MARK_RESOLVED)) == (MD_MARK_OPENER | MD_MARK_RESOLVED)) {
3551 if(ctx->marks[mark->next].beg >= inline_link_end) {
3552 /* Cancel the link status. */
3553 if(!IS_INPUT_STR(attr.title))
3554 free(attr.title);
3555 is_link = FALSE;
3556 break;
3557 }
3558
3559 i = mark->next + 1;
3560 } else {
3561 i++;
3562 }
3563 }
3564 }
3565
3566 if(is_link) {
3567 /* Eat the "(...)" */
3568 closer->end = inline_link_end;
3569 }
3570 }
3571
3572 if(!is_link) {
3573 /* Might be collapsed reference link. */
3574 is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
3575 if(is_link < 0)
3576 return -1;
3577 }
3578 }
3579
3580 if(is_link) {
3581 /* Resolve the brackets as a link. */
3582 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
3583 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
3584
3585 /* If it is a link, we store the destination and title in the two
3586 * dummy marks after the opener. */
3587 MD_ASSERT(ctx->marks[opener_index+1].ch == 'D');
3588 ctx->marks[opener_index+1].beg = attr.dest_beg;
3589 ctx->marks[opener_index+1].end = attr.dest_end;
3590
3591 MD_ASSERT(ctx->marks[opener_index+2].ch == 'D');
3592 md_mark_store_ptr(ctx, opener_index+2, attr.title);
3593 /* The title might or might not have been allocated for us. */
3594 if(attr.title_needs_free)
3595 md_mark_chain_append(ctx, &PTR_CHAIN, opener_index+2);
3596 ctx->marks[opener_index+2].prev = attr.title_size;
3597
3598 if(opener->ch == '[') {
3599 last_link_beg = opener->beg;
3600 last_link_end = closer->end;
3601 } else {
3602 last_img_beg = opener->beg;
3603 last_img_end = closer->end;
3604 }
3605
3606 md_analyze_link_contents(ctx, lines, n_lines, opener_index+1, closer_index);
3607 }
3608
3609 opener_index = next_index;
3610 }
3611
3612 return 0;
3613}
3614
3615/* Analyze whether the mark '&' starts a HTML entity.
3616 * If so, update its flags as well as flags of corresponding closer ';'. */
3617static void
3618md_analyze_entity(MD_CTX* ctx, int mark_index)
3619{
3620 MD_MARK* opener = &ctx->marks[mark_index];
3621 MD_MARK* closer;
3622 OFF off;
3623
3624 /* Cannot be entity if there is no closer as the next mark.
3625 * (Any other mark between would mean strange character which cannot be
3626 * part of the entity.
3627 *
3628 * So we can do all the work on '&' and do not call this later for the
3629 * closing mark ';'.
3630 */
3631 if(mark_index + 1 >= ctx->n_marks)
3632 return;
3633 closer = &ctx->marks[mark_index+1];
3634 if(closer->ch != ';')
3635 return;
3636
3637 if(md_is_entity(ctx, opener->beg, closer->end, &off)) {
3638 MD_ASSERT(off == closer->end);
3639
3640 md_resolve_range(ctx, NULL, mark_index, mark_index+1);
3641 opener->end = closer->end;
3642 }
3643}
3644
3645static void
3646md_analyze_table_cell_boundary(MD_CTX* ctx, int mark_index)
3647{
3648 MD_MARK* mark = &ctx->marks[mark_index];
3649 mark->flags |= MD_MARK_RESOLVED;
3650
3651 md_mark_chain_append(ctx, &TABLECELLBOUNDARIES, mark_index);
3652 ctx->n_table_cell_boundaries++;
3653}
3654
3655/* Split a longer mark into two. The new mark takes the given count of
3656 * characters. May only be called if an adequate number of dummy 'D' marks
3657 * follows.
3658 */
3659static int
3660md_split_emph_mark(MD_CTX* ctx, int mark_index, SZ n)
3661{
3662 MD_MARK* mark = &ctx->marks[mark_index];
3663 int new_mark_index = mark_index + (mark->end - mark->beg - n);
3664 MD_MARK* dummy = &ctx->marks[new_mark_index];
3665
3666 MD_ASSERT(mark->end - mark->beg > n);
3667 MD_ASSERT(dummy->ch == 'D');
3668
3669 memcpy(dummy, mark, sizeof(MD_MARK));
3670 mark->end -= n;
3671 dummy->beg = mark->end;
3672
3673 return new_mark_index;
3674}
3675
3676static void
3677md_analyze_emph(MD_CTX* ctx, int mark_index)
3678{
3679 MD_MARK* mark = &ctx->marks[mark_index];
3680 MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
3681
3682 /* If we can be a closer, try to resolve with the preceding opener. */
3683 if(mark->flags & MD_MARK_POTENTIAL_CLOSER) {
3684 MD_MARK* opener = NULL;
3685 int opener_index;
3686
3687 if(mark->ch == _T('*')) {
3688 MD_MARKCHAIN* opener_chains[6];
3689 int i, n_opener_chains;
3690 unsigned flags = mark->flags;
3691
3692 /* Apply the "rule of three". */
3693 n_opener_chains = 0;
3694 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_0;
3695 if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3696 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_1;
3697 if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3698 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_2;
3699 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_0;
3700 if(!(flags & MD_MARK_EMPH_INTRAWORD) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3701 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_1;
3702 if(!(flags & MD_MARK_EMPH_INTRAWORD) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3703 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_2;
3704
3705 /* Opener is the most recent mark from the allowed chains. */
3706 for(i = 0; i < n_opener_chains; i++) {
3707 if(opener_chains[i]->tail >= 0) {
3708 int tmp_index = opener_chains[i]->tail;
3709 MD_MARK* tmp_mark = &ctx->marks[tmp_index];
3710 if(opener == NULL || tmp_mark->end > opener->end) {
3711 opener_index = tmp_index;
3712 opener = tmp_mark;
3713 }
3714 }
3715 }
3716 } else {
3717 /* Simple emph. mark */
3718 if(chain->tail >= 0) {
3719 opener_index = chain->tail;
3720 opener = &ctx->marks[opener_index];
3721 }
3722 }
3723
3724 /* Resolve, if we have found matching opener. */
3725 if(opener != NULL) {
3726 SZ opener_size = opener->end - opener->beg;
3727 SZ closer_size = mark->end - mark->beg;
3728 MD_MARKCHAIN* opener_chain = md_mark_chain(ctx, opener_index);
3729
3730 if(opener_size > closer_size) {
3731 opener_index = md_split_emph_mark(ctx, opener_index, closer_size);
3732 md_mark_chain_append(ctx, opener_chain, opener_index);
3733 } else if(opener_size < closer_size) {
3734 md_split_emph_mark(ctx, mark_index, closer_size - opener_size);
3735 }
3736
3737 md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
3738 md_resolve_range(ctx, opener_chain, opener_index, mark_index);
3739 return;
3740 }
3741 }
3742
3743 /* If we could not resolve as closer, we may be yet be an opener. */
3744 if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3745 md_mark_chain_append(ctx, chain, mark_index);
3746}
3747
3748static void
3749md_analyze_tilde(MD_CTX* ctx, int mark_index)
3750{
3751 MD_MARK* mark = &ctx->marks[mark_index];
3752 MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
3753
3754 /* We attempt to be Github Flavored Markdown compatible here. GFM accepts
3755 * only tildes sequences of length 1 and 2, and the length of the opener
3756 * and closer has to match. */
3757
3758 if((mark->flags & MD_MARK_POTENTIAL_CLOSER) && chain->head >= 0) {
3759 int opener_index = chain->head;
3760
3761 md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
3762 md_resolve_range(ctx, chain, opener_index, mark_index);
3763 return;
3764 }
3765
3766 if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3767 md_mark_chain_append(ctx, chain, mark_index);
3768}
3769
3770static void
3771md_analyze_dollar(MD_CTX* ctx, int mark_index)
3772{
3773 /* This should mimic the way inline equations work in LaTeX, so there
3774 * can only ever be one item in the chain (i.e. the dollars can't be
3775 * nested). This is basically the same as the md_analyze_tilde function,
3776 * except that we require matching openers and closers to be of the same
3777 * length.
3778 *
3779 * E.g.: $abc$$def$$ => abc (display equation) def (end equation) */
3780 if(DOLLAR_OPENERS.head >= 0) {
3781 /* If the potential closer has a non-matching number of $, discard */
3782 MD_MARK* open = &ctx->marks[DOLLAR_OPENERS.head];
3783 MD_MARK* close = &ctx->marks[mark_index];
3784
3785 int opener_index = DOLLAR_OPENERS.head;
3786 md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_ALL);
3787 if (open->end - open->beg == close->end - close->beg) {
3788 /* We are the matching closer */
3789 md_resolve_range(ctx, &DOLLAR_OPENERS, opener_index, mark_index);
3790 } else {
3791 /* We don't match the opener, so discard old opener and insert as opener */
3792 md_mark_chain_append(ctx, &DOLLAR_OPENERS, mark_index);
3793 }
3794 } else {
3795 /* No unmatched openers, so we are opener */
3796 md_mark_chain_append(ctx, &DOLLAR_OPENERS, mark_index);
3797 }
3798}
3799
3800static void
3801md_analyze_permissive_url_autolink(MD_CTX* ctx, int mark_index)
3802{
3803 MD_MARK* opener = &ctx->marks[mark_index];
3804 int closer_index = mark_index + 1;
3805 MD_MARK* closer = &ctx->marks[closer_index];
3806 MD_MARK* next_resolved_mark;
3807 OFF off = opener->end;
3808 int n_dots = FALSE;
3809 int has_underscore_in_last_seg = FALSE;
3810 int has_underscore_in_next_to_last_seg = FALSE;
3811 int n_opened_parenthesis = 0;
3812
3813 /* Check for domain. */
3814 while(off < ctx->size) {
3815 if(ISALNUM(off) || CH(off) == _T('-')) {
3816 off++;
3817 } else if(CH(off) == _T('.')) {
3818 /* We must see at least one period. */
3819 n_dots++;
3820 has_underscore_in_next_to_last_seg = has_underscore_in_last_seg;
3821 has_underscore_in_last_seg = FALSE;
3822 off++;
3823 } else if(CH(off) == _T('_')) {
3824 /* No underscore may be present in the last two domain segments. */
3825 has_underscore_in_last_seg = TRUE;
3826 off++;
3827 } else {
3828 break;
3829 }
3830 }
3831 if(off > opener->end && CH(off-1) == _T('.')) {
3832 off--;
3833 n_dots--;
3834 }
3835 if(off <= opener->end || n_dots == 0 || has_underscore_in_next_to_last_seg || has_underscore_in_last_seg)
3836 return;
3837
3838 /* Check for path. */
3839 next_resolved_mark = closer + 1;
3840 while(next_resolved_mark->ch == 'D' || !(next_resolved_mark->flags & MD_MARK_RESOLVED))
3841 next_resolved_mark++;
3842 while(off < next_resolved_mark->beg && CH(off) != _T('<') && !ISWHITESPACE(off) && !ISNEWLINE(off)) {
3843 /* Parenthesis must be balanced. */
3844 if(CH(off) == _T('(')) {
3845 n_opened_parenthesis++;
3846 } else if(CH(off) == _T(')')) {
3847 if(n_opened_parenthesis > 0)
3848 n_opened_parenthesis--;
3849 else
3850 break;
3851 }
3852
3853 off++;
3854 }
3855 /* These cannot be last char In such case they are more likely normal
3856 * punctuation. */
3857 if(ISANYOF(off-1, _T("?!.,:*_~")))
3858 off--;
3859
3860 /* Ok. Lets call it auto-link. Adapt opener and create closer to zero
3861 * length so all the contents becomes the link text. */
3862 MD_ASSERT(closer->ch == 'D');
3863 opener->end = opener->beg;
3864 closer->ch = opener->ch;
3865 closer->beg = off;
3866 closer->end = off;
3867 md_resolve_range(ctx, NULL, mark_index, closer_index);
3868}
3869
3870/* The permissive autolinks do not have to be enclosed in '<' '>' but we
3871 * instead impose stricter rules what is understood as an e-mail address
3872 * here. Actually any non-alphanumeric characters with exception of '.'
3873 * are prohibited both in username and after '@'. */
3874static void
3875md_analyze_permissive_email_autolink(MD_CTX* ctx, int mark_index)
3876{
3877 MD_MARK* opener = &ctx->marks[mark_index];
3878 int closer_index;
3879 MD_MARK* closer;
3880 OFF beg = opener->beg;
3881 OFF end = opener->end;
3882 int dot_count = 0;
3883
3884 MD_ASSERT(CH(beg) == _T('@'));
3885
3886 /* Scan for name before '@'. */
3887 while(beg > 0 && (ISALNUM(beg-1) || ISANYOF(beg-1, _T(".-_+"))))
3888 beg--;
3889
3890 /* Scan for domain after '@'. */
3891 while(end < ctx->size && (ISALNUM(end) || ISANYOF(end, _T(".-_")))) {
3892 if(CH(end) == _T('.'))
3893 dot_count++;
3894 end++;
3895 }
3896 if(CH(end-1) == _T('.')) { /* Final '.' not part of it. */
3897 dot_count--;
3898 end--;
3899 }
3900 else if(ISANYOF2(end-1, _T('-'), _T('_'))) /* These are forbidden at the end. */
3901 return;
3902 if(CH(end-1) == _T('@') || dot_count == 0)
3903 return;
3904
3905 /* Ok. Lets call it auto-link. Adapt opener and create closer to zero
3906 * length so all the contents becomes the link text. */
3907 closer_index = mark_index + 1;
3908 closer = &ctx->marks[closer_index];
3909 MD_ASSERT(closer->ch == 'D');
3910
3911 opener->beg = beg;
3912 opener->end = beg;
3913 closer->ch = opener->ch;
3914 closer->beg = end;
3915 closer->end = end;
3916 md_resolve_range(ctx, NULL, mark_index, closer_index);
3917}
3918
3919static inline void
3920md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
3921 int mark_beg, int mark_end, const CHAR* mark_chars)
3922{
3923 int i = mark_beg;
3924
3925 while(i < mark_end) {
3926 MD_MARK* mark = &ctx->marks[i];
3927
3928 /* Skip resolved spans. */
3929 if(mark->flags & MD_MARK_RESOLVED) {
3930 if(mark->flags & MD_MARK_OPENER) {
3931 MD_ASSERT(i < mark->next);
3932 i = mark->next + 1;
3933 } else {
3934 i++;
3935 }
3936 continue;
3937 }
3938
3939 /* Skip marks we do not want to deal with. */
3940 if(!ISANYOF_(mark->ch, mark_chars)) {
3941 i++;
3942 continue;
3943 }
3944
3945 /* Analyze the mark. */
3946 switch(mark->ch) {
3947 case '[': /* Pass through. */
3948 case '!': /* Pass through. */
3949 case ']': md_analyze_bracket(ctx, i); break;
3950 case '&': md_analyze_entity(ctx, i); break;
3951 case '|': md_analyze_table_cell_boundary(ctx, i); break;
3952 case '_': /* Pass through. */
3953 case '*': md_analyze_emph(ctx, i); break;
3954 case '~': md_analyze_tilde(ctx, i); break;
3955 case '$': md_analyze_dollar(ctx, i); break;
3956 case '.': /* Pass through. */
3957 case ':': md_analyze_permissive_url_autolink(ctx, i); break;
3958 case '@': md_analyze_permissive_email_autolink(ctx, i); break;
3959 }
3960
3961 i++;
3962 }
3963}
3964
3965/* Analyze marks (build ctx->marks). */
3966static int
3967md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
3968{
3969 int ret;
3970
3971 /* Reset the previously collected stack of marks. */
3972 ctx->n_marks = 0;
3973
3974 /* Collect all marks. */
3975 MD_CHECK(md_collect_marks(ctx, lines, n_lines, table_mode));
3976
3977 /* We analyze marks in few groups to handle their precedence. */
3978 /* (1) Entities; code spans; autolinks; raw HTML. */
3979 md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("&"));
3980
3981 /* (2) Links. */
3982 md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("[]!"));
3983 MD_CHECK(md_resolve_links(ctx, lines, n_lines));
3984 BRACKET_OPENERS.head = -1;
3985 BRACKET_OPENERS.tail = -1;
3986 ctx->unresolved_link_head = -1;
3987 ctx->unresolved_link_tail = -1;
3988
3989 if(table_mode) {
3990 /* (3) Analyze table cell boundaries.
3991 * Note we reset TABLECELLBOUNDARIES chain prior to the call md_analyze_marks(),
3992 * not after, because caller may need it. */
3993 MD_ASSERT(n_lines == 1);
3994 TABLECELLBOUNDARIES.head = -1;
3995 TABLECELLBOUNDARIES.tail = -1;
3996 ctx->n_table_cell_boundaries = 0;
3997 md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("|"));
3998 return ret;
3999 }
4000
4001 /* (4) Emphasis and strong emphasis; permissive autolinks. */
4002 md_analyze_link_contents(ctx, lines, n_lines, 0, ctx->n_marks);
4003
4004abort:
4005 return ret;
4006}
4007
4008static void
4009md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
4010 int mark_beg, int mark_end)
4011{
4012 int i;
4013
4014 md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("*_~$@:."));
4015
4016 for(i = OPENERS_CHAIN_FIRST; i <= OPENERS_CHAIN_LAST; i++) {
4017 ctx->mark_chains[i].head = -1;
4018 ctx->mark_chains[i].tail = -1;
4019 }
4020}
4021
4022static int
4023md_enter_leave_span_a(MD_CTX* ctx, int enter, MD_SPANTYPE type,
4024 const CHAR* dest, SZ dest_size, int prohibit_escapes_in_dest,
4025 const CHAR* title, SZ title_size)
4026{
4027 MD_ATTRIBUTE_BUILD href_build = { 0 };
4028 MD_ATTRIBUTE_BUILD title_build = { 0 };
4029 MD_SPAN_A_DETAIL det;
4030 int ret = 0;
4031
4032 /* Note we here rely on fact that MD_SPAN_A_DETAIL and
4033 * MD_SPAN_IMG_DETAIL are binary-compatible. */
4034 memset(&det, 0, sizeof(MD_SPAN_A_DETAIL));
4035 MD_CHECK(md_build_attribute(ctx, dest, dest_size,
4036 (prohibit_escapes_in_dest ? MD_BUILD_ATTR_NO_ESCAPES : 0),
4037 &det.href, &href_build));
4038 MD_CHECK(md_build_attribute(ctx, title, title_size, 0, &det.title, &title_build));
4039
4040 if(enter)
4041 MD_ENTER_SPAN(type, &det);
4042 else
4043 MD_LEAVE_SPAN(type, &det);
4044
4045abort:
4046 md_free_attribute(ctx, &href_build);
4047 md_free_attribute(ctx, &title_build);
4048 return ret;
4049}
4050
4051static int
4052md_enter_leave_span_wikilink(MD_CTX* ctx, int enter, const CHAR* target, SZ target_size)
4053{
4054 MD_ATTRIBUTE_BUILD target_build = { 0 };
4055 MD_SPAN_WIKILINK_DETAIL det;
4056 int ret = 0;
4057
4058 memset(&det, 0, sizeof(MD_SPAN_WIKILINK_DETAIL));
4059 MD_CHECK(md_build_attribute(ctx, target, target_size, 0, &det.target, &target_build));
4060
4061 if (enter)
4062 MD_ENTER_SPAN(MD_SPAN_WIKILINK, &det);
4063 else
4064 MD_LEAVE_SPAN(MD_SPAN_WIKILINK, &det);
4065
4066abort:
4067 md_free_attribute(ctx, &target_build);
4068 return ret;
4069}
4070
4071
/* Render the output according to the analyzed ctx->marks.
 *
 * The function walks the text from lines[0].beg to lines[n_lines-1].end.
 * Text between resolved marks is passed to MD_TEXT() with the current text
 * type; each resolved mark is dispatched on its character (mark->ch) and
 * typically results in entering/leaving a span, in switching the text type,
 * or in emitting a special piece of text. Unresolved marks are skipped.
 *
 * Returns ret. MD_TEXT, MD_ENTER_SPAN, MD_LEAVE_SPAN, MD_CHECK and
 * MD_TEMP_BUFFER are macros defined elsewhere; presumably they store an error
 * code into ret and jump to the abort label on failure.
 */
static int
md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
{
    MD_TEXTTYPE text_type;
    const MD_LINE* line = lines;
    MD_MARK* prev_mark = NULL;
    MD_MARK* mark;
    OFF off = lines[0].beg;
    OFF end = lines[n_lines-1].end;
    int enforce_hardbreak = 0;
    int ret = 0;

    /* Find first resolved mark. Note there is always at least one resolved
     * mark, the dummy last one after the end of the latest line we actually
     * never really reach. This saves us of a lot of special checks and cases
     * in this function. */
    mark = ctx->marks;
    while(!(mark->flags & MD_MARK_RESOLVED))
        mark++;

    text_type = MD_TEXT_NORMAL;

    while(1) {
        /* Process the text up to the next mark or end-of-line. */
        OFF tmp = (line->end < mark->beg ? line->end : mark->beg);
        if(tmp > off) {
            MD_TEXT(text_type, STR(off), tmp - off);
            off = tmp;
        }

        /* If reached the mark, process it and move to next one. */
        if(off >= mark->beg) {
            switch(mark->ch) {
                case '\\':      /* Backslash escape. */
                    /* A backslash right before a new line requests a hard
                     * break; otherwise the escaped character is emitted. */
                    if(ISNEWLINE(mark->beg+1))
                        enforce_hardbreak = 1;
                    else
                        MD_TEXT(text_type, STR(mark->beg+1), 1);
                    break;

                case ' ':       /* Non-trivial space. */
                    MD_TEXT(text_type, _T(" "), 1);
                    break;

                case '`':       /* Code span. */
                    if(mark->flags & MD_MARK_OPENER) {
                        MD_ENTER_SPAN(MD_SPAN_CODE, NULL);
                        text_type = MD_TEXT_CODE;
                    } else {
                        MD_LEAVE_SPAN(MD_SPAN_CODE, NULL);
                        text_type = MD_TEXT_NORMAL;
                    }
                    break;

                case '_':       /* Underline (or emphasis if we fall through). */
                    if(ctx->parser.flags & MD_FLAG_UNDERLINE) {
                        /* One MD_SPAN_U is entered/left per character of the
                         * mark. */
                        if(mark->flags & MD_MARK_OPENER) {
                            while(off < mark->end) {
                                MD_ENTER_SPAN(MD_SPAN_U, NULL);
                                off++;
                            }
                        } else {
                            while(off < mark->end) {
                                MD_LEAVE_SPAN(MD_SPAN_U, NULL);
                                off++;
                            }
                        }
                        break;
                    }
                    /* Fall through. */

                case '*':       /* Emphasis, strong emphasis. */
                    /* An odd mark length yields one MD_SPAN_EM; every
                     * remaining pair of characters yields one MD_SPAN_STRONG.
                     * The closer leaves the spans in the reverse order of how
                     * the opener enters them. */
                    if(mark->flags & MD_MARK_OPENER) {
                        if((mark->end - off) % 2) {
                            MD_ENTER_SPAN(MD_SPAN_EM, NULL);
                            off++;
                        }
                        while(off + 1 < mark->end) {
                            MD_ENTER_SPAN(MD_SPAN_STRONG, NULL);
                            off += 2;
                        }
                    } else {
                        while(off + 1 < mark->end) {
                            MD_LEAVE_SPAN(MD_SPAN_STRONG, NULL);
                            off += 2;
                        }
                        if((mark->end - off) % 2) {
                            MD_LEAVE_SPAN(MD_SPAN_EM, NULL);
                            off++;
                        }
                    }
                    break;

                case '~':       /* Strikethrough. */
                    if(mark->flags & MD_MARK_OPENER)
                        MD_ENTER_SPAN(MD_SPAN_DEL, NULL);
                    else
                        MD_LEAVE_SPAN(MD_SPAN_DEL, NULL);
                    break;

                case '$':       /* LaTeX math. */
                    /* An odd mark length selects the inline math span, an
                     * even length the display math span. */
                    if(mark->flags & MD_MARK_OPENER) {
                        MD_ENTER_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
                        text_type = MD_TEXT_LATEXMATH;
                    } else {
                        MD_LEAVE_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
                        text_type = MD_TEXT_NORMAL;
                    }
                    break;

                case '[':       /* Link, wiki link, image. */
                case '!':
                case ']':
                {
                    const MD_MARK* opener = (mark->ch != ']' ? mark : &ctx->marks[mark->prev]);
                    const MD_MARK* closer = &ctx->marks[opener->next];
                    const MD_MARK* dest_mark;
                    const MD_MARK* title_mark;

                    /* Opener and closer marks which are both at least two
                     * characters long are treated as a wiki link. */
                    if ((opener->ch == '[' && closer->ch == ']') &&
                        opener->end - opener->beg >= 2 &&
                        closer->end - closer->beg >= 2)
                    {
                        /* An opener longer than two characters also holds the
                         * target (it starts two characters after the opener
                         * beginning); otherwise the text between the opener
                         * and the closer is the target. */
                        int has_label = (opener->end - opener->beg > 2);
                        SZ target_sz;

                        if(has_label)
                            target_sz = opener->end - (opener->beg+2);
                        else
                            target_sz = closer->beg - opener->end;

                        MD_CHECK(md_enter_leave_span_wikilink(ctx, (mark->ch != ']'),
                                 has_label ? STR(opener->beg+2) : STR(opener->end),
                                 target_sz));

                        break;
                    }

                    /* For ordinary links and images, the two marks following
                     * the opener are dummy ('D') marks which describe the
                     * link destination and the link title. */
                    dest_mark = opener+1;
                    MD_ASSERT(dest_mark->ch == 'D');
                    title_mark = opener+2;
                    MD_ASSERT(title_mark->ch == 'D');

                    /* The title text is taken from the pointer stored in the
                     * title mark; its size is taken from title_mark->prev. */
                    MD_CHECK(md_enter_leave_span_a(ctx, (mark->ch != ']'),
                                (opener->ch == '!' ? MD_SPAN_IMG : MD_SPAN_A),
                                STR(dest_mark->beg), dest_mark->end - dest_mark->beg, FALSE,
                                md_mark_get_ptr(ctx, title_mark - ctx->marks), title_mark->prev));

                    /* link/image closer may span multiple lines. */
                    if(mark->ch == ']') {
                        while(mark->end > line->end)
                            line++;
                    }

                    break;
                }

                case '<':
                case '>':       /* Autolink or raw HTML. */
                    if(!(mark->flags & MD_MARK_AUTOLINK)) {
                        /* Raw HTML. */
                        if(mark->flags & MD_MARK_OPENER)
                            text_type = MD_TEXT_HTML;
                        else
                            text_type = MD_TEXT_NORMAL;
                        break;
                    }
                    /* Pass through, if auto-link. */

                case '@':       /* Permissive e-mail autolink. */
                case ':':       /* Permissive URL autolink. */
                case '.':       /* Permissive WWW autolink. */
                {
                    MD_MARK* opener = ((mark->flags & MD_MARK_OPENER) ? mark : &ctx->marks[mark->prev]);
                    MD_MARK* closer = &ctx->marks[opener->next];
                    const CHAR* dest = STR(opener->end);
                    SZ dest_size = closer->beg - opener->end;

                    /* For permissive auto-links we do not know closer mark
                     * position at the time of md_collect_marks(), therefore
                     * it can be out-of-order in ctx->marks[].
                     *
                     * With this flag, we make sure that we output the closer
                     * only if we processed the opener. */
                    if(mark->flags & MD_MARK_OPENER)
                        closer->flags |= MD_MARK_VALIDPERMISSIVEAUTOLINK;

                    /* E-mail and WWW autolinks get a 7-character scheme
                     * prefix ("mailto:" or "http://") prepended in the
                     * temporary buffer ctx->buffer. */
                    if(opener->ch == '@' || opener->ch == '.') {
                        dest_size += 7;
                        MD_TEMP_BUFFER(dest_size * sizeof(CHAR));
                        memcpy(ctx->buffer,
                                (opener->ch == '@' ? _T("mailto:") : _T("http://")),
                                7 * sizeof(CHAR));
                        memcpy(ctx->buffer + 7, dest, (dest_size-7) * sizeof(CHAR));
                        dest = ctx->buffer;
                    }

                    if(closer->flags & MD_MARK_VALIDPERMISSIVEAUTOLINK)
                        MD_CHECK(md_enter_leave_span_a(ctx, (mark->flags & MD_MARK_OPENER),
                                    MD_SPAN_A, dest, dest_size, TRUE, NULL, 0));
                    break;
                }

                case '&':       /* Entity. */
                    MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg);
                    break;

                case '\0':      /* NULL character in the input. */
                    MD_TEXT(MD_TEXT_NULLCHAR, _T(""), 1);
                    break;

                case 127:
                    /* NOTE(review): presumably the dummy terminating mark
                     * mentioned at the top of the function; confirm against
                     * md_collect_marks(). */
                    goto abort;
            }

            off = mark->end;

            /* Move to next resolved mark. (Marks beginning before the current
             * offset are skipped too.) */
            prev_mark = mark;
            mark++;
            while(!(mark->flags & MD_MARK_RESOLVED)  ||  mark->beg < off)
                mark++;
        }

        /* If reached end of line, move to next one. */
        if(off >= line->end) {
            /* If it is the last line, we are done. */
            if(off >= end)
                break;

            if(text_type == MD_TEXT_CODE || text_type == MD_TEXT_LATEXMATH) {
                OFF tmp;

                MD_ASSERT(prev_mark != NULL);
                MD_ASSERT(ISANYOF2_(prev_mark->ch, '`', '$')  &&  (prev_mark->flags & MD_MARK_OPENER));
                MD_ASSERT(ISANYOF2_(mark->ch, '`', '$')  &&  (mark->flags & MD_MARK_CLOSER));

                /* Inside a code span, trailing line whitespace has to be
                 * outputted. */
                tmp = off;
                while(off < ctx->size  &&  ISBLANK(off))
                    off++;
                if(off > tmp)
                    MD_TEXT(text_type, STR(tmp), off-tmp);

                /* and new lines are transformed into single spaces. */
                if(prev_mark->end < off  &&  off < mark->beg)
                    MD_TEXT(text_type, _T(" "), 1);
            } else if(text_type == MD_TEXT_HTML) {
                /* Inside raw HTML, we output the new line verbatim, including
                 * any trailing spaces. */
                OFF tmp = off;

                while(tmp < end  &&  ISBLANK(tmp))
                    tmp++;
                if(tmp > off)
                    MD_TEXT(MD_TEXT_HTML, STR(off), tmp - off);
                MD_TEXT(MD_TEXT_HTML, _T("\n"), 1);
            } else {
                /* Output soft or hard line break. A hard break is used in
                 * normal text if it was requested by a backslash or if the
                 * line end is followed by two spaces. */
                MD_TEXTTYPE break_type = MD_TEXT_SOFTBR;

                if(text_type == MD_TEXT_NORMAL) {
                    if(enforce_hardbreak)
                        break_type = MD_TEXT_BR;
                    else if((CH(line->end) == _T(' ') && CH(line->end+1) == _T(' ')))
                        break_type = MD_TEXT_BR;
                }

                MD_TEXT(break_type, _T("\n"), 1);
            }

            /* Move to the next line. */
            line++;
            off = line->beg;

            enforce_hardbreak = 0;
        }
    }

abort:
    return ret;
}
4356
4357
4358/***************************
4359 *** Processing Tables ***
4360 ***************************/
4361
4362static void
4363md_analyze_table_alignment(MD_CTX* ctx, OFF beg, OFF end, MD_ALIGN* align, int n_align)
4364{
4365 static const MD_ALIGN align_map[] = { MD_ALIGN_DEFAULT, MD_ALIGN_LEFT, MD_ALIGN_RIGHT, MD_ALIGN_CENTER };
4366 OFF off = beg;
4367
4368 while(n_align > 0) {
4369 int index = 0; /* index into align_map[] */
4370
4371 while(CH(off) != _T('-'))
4372 off++;
4373 if(off > beg && CH(off-1) == _T(':'))
4374 index |= 1;
4375 while(off < end && CH(off) == _T('-'))
4376 off++;
4377 if(off < end && CH(off) == _T(':'))
4378 index |= 2;
4379
4380 *align = align_map[index];
4381 align++;
4382 n_align--;
4383 }
4384
4385}
4386
4387/* Forward declaration. */
4388static int md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines);
4389
4390static int
4391md_process_table_cell(MD_CTX* ctx, MD_BLOCKTYPE cell_type, MD_ALIGN align, OFF beg, OFF end)
4392{
4393 MD_LINE line;
4394 MD_BLOCK_TD_DETAIL det;
4395 int ret = 0;
4396
4397 while(beg < end && ISWHITESPACE(beg))
4398 beg++;
4399 while(end > beg && ISWHITESPACE(end-1))
4400 end--;
4401
4402 det.align = align;
4403 line.beg = beg;
4404 line.end = end;
4405
4406 MD_ENTER_BLOCK(cell_type, &det);
4407 MD_CHECK(md_process_normal_block_contents(ctx, &line, 1));
4408 MD_LEAVE_BLOCK(cell_type, &det);
4409
4410abort:
4411 return ret;
4412}
4413
4414static int
4415md_process_table_row(MD_CTX* ctx, MD_BLOCKTYPE cell_type, OFF beg, OFF end,
4416 const MD_ALIGN* align, int col_count)
4417{
4418 MD_LINE line;
4419 OFF* pipe_offs = NULL;
4420 int i, j, k, n;
4421 int ret = 0;
4422
4423 line.beg = beg;
4424 line.end = end;
4425
4426 /* Break the line into table cells by identifying pipe characters who
4427 * form the cell boundary. */
4428 MD_CHECK(md_analyze_inlines(ctx, &line, 1, TRUE));
4429
4430 /* We have to remember the cell boundaries in local buffer because
4431 * ctx->marks[] shall be reused during cell contents processing. */
4432 n = ctx->n_table_cell_boundaries + 2;
4433 pipe_offs = (OFF*) malloc(n * sizeof(OFF));
4434 if(pipe_offs == NULL) {
4435 MD_LOG("malloc() failed.");
4436 ret = -1;
4437 goto abort;
4438 }
4439 j = 0;
4440 pipe_offs[j++] = beg;
4441 for(i = TABLECELLBOUNDARIES.head; i >= 0; i = ctx->marks[i].next) {
4442 MD_MARK* mark = &ctx->marks[i];
4443 pipe_offs[j++] = mark->end;
4444 }
4445 pipe_offs[j++] = end+1;
4446
4447 /* Process cells. */
4448 MD_ENTER_BLOCK(MD_BLOCK_TR, NULL);
4449 k = 0;
4450 for(i = 0; i < j-1 && k < col_count; i++) {
4451 if(pipe_offs[i] < pipe_offs[i+1]-1)
4452 MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], pipe_offs[i], pipe_offs[i+1]-1));
4453 }
4454 /* Make sure we call enough table cells even if the current table contains
4455 * too few of them. */
4456 while(k < col_count)
4457 MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], 0, 0));
4458 MD_LEAVE_BLOCK(MD_BLOCK_TR, NULL);
4459
4460abort:
4461 free(pipe_offs);
4462
4463 /* Free any temporary memory blocks stored within some dummy marks. */
4464 for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
4465 free(md_mark_get_ptr(ctx, i));
4466 PTR_CHAIN.head = -1;
4467 PTR_CHAIN.tail = -1;
4468
4469 return ret;
4470}
4471
4472static int
4473md_process_table_block_contents(MD_CTX* ctx, int col_count, const MD_LINE* lines, int n_lines)
4474{
4475 MD_ALIGN* align;
4476 int i;
4477 int ret = 0;
4478
4479 /* At least two lines have to be present: The column headers and the line
4480 * with the underlines. */
4481 MD_ASSERT(n_lines >= 2);
4482
4483 align = malloc(col_count * sizeof(MD_ALIGN));
4484 if(align == NULL) {
4485 MD_LOG("malloc() failed.");
4486 ret = -1;
4487 goto abort;
4488 }
4489
4490 md_analyze_table_alignment(ctx, lines[1].beg, lines[1].end, align, col_count);
4491
4492 MD_ENTER_BLOCK(MD_BLOCK_THEAD, NULL);
4493 MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TH,
4494 lines[0].beg, lines[0].end, align, col_count));
4495 MD_LEAVE_BLOCK(MD_BLOCK_THEAD, NULL);
4496
4497 MD_ENTER_BLOCK(MD_BLOCK_TBODY, NULL);
4498 for(i = 2; i < n_lines; i++) {
4499 MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TD,
4500 lines[i].beg, lines[i].end, align, col_count));
4501 }
4502 MD_LEAVE_BLOCK(MD_BLOCK_TBODY, NULL);
4503
4504abort:
4505 free(align);
4506 return ret;
4507}
4508
4509
4510/**************************
4511 *** Processing Block ***
4512 **************************/
4513
4514#define MD_BLOCK_CONTAINER_OPENER 0x01
4515#define MD_BLOCK_CONTAINER_CLOSER 0x02
4516#define MD_BLOCK_CONTAINER (MD_BLOCK_CONTAINER_OPENER | MD_BLOCK_CONTAINER_CLOSER)
4517#define MD_BLOCK_LOOSE_LIST 0x04
4518#define MD_BLOCK_SETEXT_HEADER 0x08
4519
/* Header of a block record stored in ctx->block_bytes.
 *
 * For leaf blocks, the header is immediately followed by n_lines line
 * records: MD_VERBATIMLINE for MD_BLOCK_CODE and MD_BLOCK_HTML, MD_LINE
 * otherwise.
 */
struct MD_BLOCK_tag {
    /* Type of the block. */
    MD_BLOCKTYPE type  :  8;
    /* Combination of the MD_BLOCK_xxxx flags defined above. */
    unsigned flags     :  8;

    /* MD_BLOCK_H:      Header level (1 - 6)
     * MD_BLOCK_CODE:   Non-zero if fenced, zero if indented.
     * MD_BLOCK_LI:     Task mark character (0 if not task list item, 'x', 'X' or ' ').
     * MD_BLOCK_TABLE:  Column count (as determined by the table underline).
     * MD_BLOCK_UL:     List mark character.
     * MD_BLOCK_OL:     List mark delimiter character.
     */
    unsigned data      : 16;

    /* Leaf blocks:     Count of lines (MD_LINE or MD_VERBATIMLINE) on the block.
     * MD_BLOCK_LI:     Task mark offset in the input doc.
     * MD_BLOCK_OL:     Start item number.
     */
    unsigned n_lines;
};
4537
/* Description of a container block.
 *
 * NOTE(review): The meaning of most members is established by the line
 * analysis code elsewhere in the file; the notes marked as "presumably"
 * below are inferred from the member names only and should be confirmed.
 */
struct MD_CONTAINER_tag {
    CHAR ch;                    /* Presumably the character identifying the container mark. */
    unsigned is_loose    : 8;   /* Non-zero for a loose list. md_process_all_blocks()
                                 * also reuses the flag for tracking tight/loose state
                                 * during rendering and sets it for block quotes. */
    unsigned is_task     : 8;   /* Presumably non-zero for a task list item. */
    unsigned start;             /* Presumably the start number of an ordered list. */
    unsigned mark_indent;       /* Presumably the indentation of the container mark. */
    unsigned contents_indent;   /* Presumably the indentation of the container contents. */
    OFF block_byte_off;         /* Presumably an offset into ctx->block_bytes. */
    OFF task_mark_off;          /* Presumably the offset of the task mark in the input. */
};
4548
4549
4550static int
4551md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
4552{
4553 int i;
4554 int ret;
4555
4556 MD_CHECK(md_analyze_inlines(ctx, lines, n_lines, FALSE));
4557 MD_CHECK(md_process_inlines(ctx, lines, n_lines));
4558
4559abort:
4560 /* Free any temporary memory blocks stored within some dummy marks. */
4561 for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
4562 free(md_mark_get_ptr(ctx, i));
4563 PTR_CHAIN.head = -1;
4564 PTR_CHAIN.tail = -1;
4565
4566 return ret;
4567}
4568
4569static int
4570md_process_verbatim_block_contents(MD_CTX* ctx, MD_TEXTTYPE text_type, const MD_VERBATIMLINE* lines, int n_lines)
4571{
4572 static const CHAR indent_chunk_str[] = _T(" ");
4573 static const SZ indent_chunk_size = SIZEOF_ARRAY(indent_chunk_str) - 1;
4574
4575 int i;
4576 int ret = 0;
4577
4578 for(i = 0; i < n_lines; i++) {
4579 const MD_VERBATIMLINE* line = &lines[i];
4580 int indent = line->indent;
4581
4582 MD_ASSERT(indent >= 0);
4583
4584 /* Output code indentation. */
4585 while(indent > (int) SIZEOF_ARRAY(indent_chunk_str)) {
4586 MD_TEXT(text_type, indent_chunk_str, indent_chunk_size);
4587 indent -= SIZEOF_ARRAY(indent_chunk_str);
4588 }
4589 if(indent > 0)
4590 MD_TEXT(text_type, indent_chunk_str, indent);
4591
4592 /* Output the code line itself. */
4593 MD_TEXT_INSECURE(text_type, STR(line->beg), line->end - line->beg);
4594
4595 /* Enforce end-of-line. */
4596 MD_TEXT(text_type, _T("\n"), 1);
4597 }
4598
4599abort:
4600 return ret;
4601}
4602
4603static int
4604md_process_code_block_contents(MD_CTX* ctx, int is_fenced, const MD_VERBATIMLINE* lines, int n_lines)
4605{
4606 if(is_fenced) {
4607 /* Skip the first line in case of fenced code: It is the fence.
4608 * (Only the starting fence is present due to logic in md_analyze_line().) */
4609 lines++;
4610 n_lines--;
4611 } else {
4612 /* Ignore blank lines at start/end of indented code block. */
4613 while(n_lines > 0 && lines[0].beg == lines[0].end) {
4614 lines++;
4615 n_lines--;
4616 }
4617 while(n_lines > 0 && lines[n_lines-1].beg == lines[n_lines-1].end) {
4618 n_lines--;
4619 }
4620 }
4621
4622 if(n_lines == 0)
4623 return 0;
4624
4625 return md_process_verbatim_block_contents(ctx, MD_TEXT_CODE, lines, n_lines);
4626}
4627
4628static int
4629md_setup_fenced_code_detail(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_CODE_DETAIL* det,
4630 MD_ATTRIBUTE_BUILD* info_build, MD_ATTRIBUTE_BUILD* lang_build)
4631{
4632 const MD_VERBATIMLINE* fence_line = (const MD_VERBATIMLINE*)(block + 1);
4633 OFF beg = fence_line->beg;
4634 OFF end = fence_line->end;
4635 OFF lang_end;
4636 CHAR fence_ch = CH(fence_line->beg);
4637 int ret = 0;
4638
4639 /* Skip the fence itself. */
4640 while(beg < ctx->size && CH(beg) == fence_ch)
4641 beg++;
4642 /* Trim initial spaces. */
4643 while(beg < ctx->size && CH(beg) == _T(' '))
4644 beg++;
4645
4646 /* Trim trailing spaces. */
4647 while(end > beg && CH(end-1) == _T(' '))
4648 end--;
4649
4650 /* Build info string attribute. */
4651 MD_CHECK(md_build_attribute(ctx, STR(beg), end - beg, 0, &det->info, info_build));
4652
4653 /* Build info string attribute. */
4654 lang_end = beg;
4655 while(lang_end < end && !ISWHITESPACE(lang_end))
4656 lang_end++;
4657 MD_CHECK(md_build_attribute(ctx, STR(beg), lang_end - beg, 0, &det->lang, lang_build));
4658
4659 det->fence_char = fence_ch;
4660
4661abort:
4662 return ret;
4663}
4664
4665static int
4666md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block)
4667{
4668 union {
4669 MD_BLOCK_H_DETAIL header;
4670 MD_BLOCK_CODE_DETAIL code;
4671 } det;
4672 MD_ATTRIBUTE_BUILD info_build;
4673 MD_ATTRIBUTE_BUILD lang_build;
4674 int is_in_tight_list;
4675 int clean_fence_code_detail = FALSE;
4676 int ret = 0;
4677
4678 memset(&det, 0, sizeof(det));
4679
4680 if(ctx->n_containers == 0)
4681 is_in_tight_list = FALSE;
4682 else
4683 is_in_tight_list = !ctx->containers[ctx->n_containers-1].is_loose;
4684
4685 switch(block->type) {
4686 case MD_BLOCK_H:
4687 det.header.level = block->data;
4688 break;
4689
4690 case MD_BLOCK_CODE:
4691 /* For fenced code block, we may need to set the info string. */
4692 if(block->data != 0) {
4693 memset(&det.code, 0, sizeof(MD_BLOCK_CODE_DETAIL));
4694 clean_fence_code_detail = TRUE;
4695 MD_CHECK(md_setup_fenced_code_detail(ctx, block, &det.code, &info_build, &lang_build));
4696 }
4697 break;
4698
4699 default:
4700 /* Noop. */
4701 break;
4702 }
4703
4704 if(!is_in_tight_list || block->type != MD_BLOCK_P)
4705 MD_ENTER_BLOCK(block->type, (void*) &det);
4706
4707 /* Process the block contents accordingly to is type. */
4708 switch(block->type) {
4709 case MD_BLOCK_HR:
4710 /* noop */
4711 break;
4712
4713 case MD_BLOCK_CODE:
4714 MD_CHECK(md_process_code_block_contents(ctx, (block->data != 0),
4715 (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4716 break;
4717
4718 case MD_BLOCK_HTML:
4719 MD_CHECK(md_process_verbatim_block_contents(ctx, MD_TEXT_HTML,
4720 (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4721 break;
4722
4723 case MD_BLOCK_TABLE:
4724 MD_CHECK(md_process_table_block_contents(ctx, block->data,
4725 (const MD_LINE*)(block + 1), block->n_lines));
4726 break;
4727
4728 default:
4729 MD_CHECK(md_process_normal_block_contents(ctx,
4730 (const MD_LINE*)(block + 1), block->n_lines));
4731 break;
4732 }
4733
4734 if(!is_in_tight_list || block->type != MD_BLOCK_P)
4735 MD_LEAVE_BLOCK(block->type, (void*) &det);
4736
4737abort:
4738 if(clean_fence_code_detail) {
4739 md_free_attribute(ctx, &info_build);
4740 md_free_attribute(ctx, &lang_build);
4741 }
4742 return ret;
4743}
4744
/* Process all the block records accumulated in ctx->block_bytes, in the
 * order they were stored, and reset the buffer (ctx->n_block_bytes) to empty
 * afterwards.
 *
 * Records with any of the MD_BLOCK_CONTAINER flags result only in the
 * respective leave/enter block notifications. All the other records are leaf
 * blocks: those are followed by their line records and they are processed
 * by md_process_leaf_block().
 *
 * Returns ret: zero, unless one of the macros below stored something else in
 * it before jumping to the abort label. (Note that in such a case
 * ctx->n_block_bytes is not reset.)
 */
static int
md_process_all_blocks(MD_CTX* ctx)
{
    int byte_off = 0;
    int ret = 0;

    /* ctx->containers now is not needed for detection of lists and list items
     * so we reuse it for tracking what lists are loose or tight. We rely
     * on the fact the vector is large enough to hold the deepest nesting
     * level of lists. */
    ctx->n_containers = 0;

    while(byte_off < ctx->n_block_bytes) {
        MD_BLOCK* block = (MD_BLOCK*)((char*)ctx->block_bytes + byte_off);
        union {
            MD_BLOCK_UL_DETAIL ul;
            MD_BLOCK_OL_DETAIL ol;
            MD_BLOCK_LI_DETAIL li;
        } det;

        /* Set up the detail structure for lists and list items. For any
         * other block type, det is left uninitialized. */
        switch(block->type) {
            case MD_BLOCK_UL:
                det.ul.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
                det.ul.mark = (CHAR) block->data;
                break;

            case MD_BLOCK_OL:
                /* For ordered lists, n_lines holds the start item number. */
                det.ol.start = block->n_lines;
                det.ol.is_tight =  (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
                det.ol.mark_delimiter = (CHAR) block->data;
                break;

            case MD_BLOCK_LI:
                /* For list items, data holds the task mark character and
                 * n_lines holds the task mark offset. */
                det.li.is_task = (block->data != 0);
                det.li.task_mark = (CHAR) block->data;
                det.li.task_mark_offset = (OFF) block->n_lines;
                break;

            default:
                /* noop */
                break;
        }

        if(block->flags & MD_BLOCK_CONTAINER) {
            /* A record may have both the flags set; the closer is then
             * handled prior to the opener. */
            if(block->flags & MD_BLOCK_CONTAINER_CLOSER) {
                MD_LEAVE_BLOCK(block->type, &det);

                /* Pop the tight/loose tracking record pushed by the
                 * respective opener. */
                if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL || block->type == MD_BLOCK_QUOTE)
                    ctx->n_containers--;
            }

            if(block->flags & MD_BLOCK_CONTAINER_OPENER) {
                MD_ENTER_BLOCK(block->type, &det);

                if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL) {
                    ctx->containers[ctx->n_containers].is_loose = (block->flags & MD_BLOCK_LOOSE_LIST);
                    ctx->n_containers++;
                } else if(block->type == MD_BLOCK_QUOTE) {
                    /* This causes that any text in a block quote, even if
                     * nested inside a tight list item, is wrapped with
                     * <p>...</p>. */
                    ctx->containers[ctx->n_containers].is_loose = TRUE;
                    ctx->n_containers++;
                }
            }
        } else {
            MD_CHECK(md_process_leaf_block(ctx, block));

            /* Skip the line records following the header of the leaf block.
             * Their size depends on the block type. */
            if(block->type == MD_BLOCK_CODE || block->type == MD_BLOCK_HTML)
                byte_off += block->n_lines * sizeof(MD_VERBATIMLINE);
            else
                byte_off += block->n_lines * sizeof(MD_LINE);
        }

        /* Skip the block header itself. */
        byte_off += sizeof(MD_BLOCK);
    }

    ctx->n_block_bytes = 0;

abort:
    return ret;
}
4827
4828
4829/************************************
4830 *** Grouping Lines into Blocks ***
4831 ************************************/
4832
4833static void*
4834md_push_block_bytes(MD_CTX* ctx, int n_bytes)
4835{
4836 void* ptr;
4837
4838 if(ctx->n_block_bytes + n_bytes > ctx->alloc_block_bytes) {
4839 void* new_block_bytes;
4840
4841 ctx->alloc_block_bytes = (ctx->alloc_block_bytes > 0
4842 ? ctx->alloc_block_bytes + ctx->alloc_block_bytes / 2
4843 : 512);
4844 new_block_bytes = realloc(ctx->block_bytes, ctx->alloc_block_bytes);
4845 if(new_block_bytes == NULL) {
4846 MD_LOG("realloc() failed.");
4847 return NULL;
4848 }
4849
4850 /* Fix the ->current_block after the reallocation. */
4851 if(ctx->current_block != NULL) {
4852 OFF off_current_block = (char*) ctx->current_block - (char*) ctx->block_bytes;
4853 ctx->current_block = (MD_BLOCK*) ((char*) new_block_bytes + off_current_block);
4854 }
4855
4856 ctx->block_bytes = new_block_bytes;
4857 }
4858
4859 ptr = (char*)ctx->block_bytes + ctx->n_block_bytes;
4860 ctx->n_block_bytes += n_bytes;
4861 return ptr;
4862}
4863
4864static int
4865md_start_new_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* line)
4866{
4867 MD_BLOCK* block;
4868
4869 MD_ASSERT(ctx->current_block == NULL);
4870
4871 block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK));
4872 if(block == NULL)
4873 return -1;
4874
4875 switch(line->type) {
4876 case MD_LINE_HR:
4877 block->type = MD_BLOCK_HR;
4878 break;
4879
4880 case MD_LINE_ATXHEADER:
4881 case MD_LINE_SETEXTHEADER:
4882 block->type = MD_BLOCK_H;
4883 break;
4884
4885 case MD_LINE_FENCEDCODE:
4886 case MD_LINE_INDENTEDCODE:
4887 block->type = MD_BLOCK_CODE;
4888 break;
4889
4890 case MD_LINE_TEXT:
4891 block->type = MD_BLOCK_P;
4892 break;
4893
4894 case MD_LINE_HTML:
4895 block->type = MD_BLOCK_HTML;
4896 break;
4897
4898 case MD_LINE_BLANK:
4899 case MD_LINE_SETEXTUNDERLINE:
4900 case MD_LINE_TABLEUNDERLINE:
4901 default:
4902 MD_UNREACHABLE();
4903 break;
4904 }
4905
4906 block->flags = 0;
4907 block->data = line->data;
4908 block->n_lines = 0;
4909
4910 ctx->current_block = block;
4911 return 0;
4912}
4913
4914/* Eat from start of current (textual) block any reference definitions and
4915 * remember them so we can resolve any links referring to them.
4916 *
4917 * (Reference definitions can only be at start of it as they cannot break
4918 * a paragraph.)
4919 */
4920static int
4921md_consume_link_reference_definitions(MD_CTX* ctx)
4922{
4923 MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
4924 int n_lines = ctx->current_block->n_lines;
4925 int n = 0;
4926
4927 /* Compute how many lines at the start of the block form one or more
4928 * reference definitions. */
4929 while(n < n_lines) {
4930 int n_link_ref_lines;
4931
4932 n_link_ref_lines = md_is_link_reference_definition(ctx,
4933 lines + n, n_lines - n);
4934 /* Not a reference definition? */
4935 if(n_link_ref_lines == 0)
4936 break;
4937
4938 /* We fail if it is the ref. def. but it could not be stored due
4939 * a memory allocation error. */
4940 if(n_link_ref_lines < 0)
4941 return -1;
4942
4943 n += n_link_ref_lines;
4944 }
4945
4946 /* If there was at least one reference definition, we need to remove
4947 * its lines from the block, or perhaps even the whole block. */
4948 if(n > 0) {
4949 if(n == n_lines) {
4950 /* Remove complete block. */
4951 ctx->n_block_bytes -= n * sizeof(MD_LINE);
4952 ctx->n_block_bytes -= sizeof(MD_BLOCK);
4953 ctx->current_block = NULL;
4954 } else {
4955 /* Remove just some initial lines from the block. */
4956 memmove(lines, lines + n, (n_lines - n) * sizeof(MD_LINE));
4957 ctx->current_block->n_lines -= n;
4958 ctx->n_block_bytes -= n * sizeof(MD_LINE);
4959 }
4960 }
4961
4962 return 0;
4963}
4964
/* Finish the block being currently built, if there is any.
 *
 * For paragraphs and setext headers starting with '[', any leading link
 * reference definitions are consumed first; the block may completely
 * disappear by that. For a setext header, the underline line is then
 * removed from the block.
 *
 * On return, ctx->current_block is NULL, with one exception: a setext
 * header left with just a single line (see below) is converted into a
 * paragraph and it is kept as the current block.
 *
 * Returns ret: zero, unless MD_CHECK stored something else in it before
 * jumping to the abort label.
 */
static int
md_end_current_block(MD_CTX* ctx)
{
    int ret = 0;

    /* Nothing to do if no block is being built. */
    if(ctx->current_block == NULL)
        return ret;

    /* Check whether there is a reference definition. (We do this here instead
     * of in md_analyze_line() because reference definition can take multiple
     * lines.) */
    if(ctx->current_block->type == MD_BLOCK_P  ||
       (ctx->current_block->type == MD_BLOCK_H  &&  (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)))
    {
        MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
        if(CH(lines[0].beg) == _T('[')) {
            MD_CHECK(md_consume_link_reference_definitions(ctx));
            /* The whole block might have been made of the reference
             * definitions; then there is nothing left to finish. */
            if(ctx->current_block == NULL)
                return ret;
        }
    }

    if(ctx->current_block->type == MD_BLOCK_H  &&  (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)) {
        int n_lines = ctx->current_block->n_lines;

        if(n_lines > 1) {
            /* Get rid of the underline. */
            ctx->current_block->n_lines--;
            ctx->n_block_bytes -= sizeof(MD_LINE);
        } else {
            /* Only the underline is left after eating the ref. defs.
             * Keep the line as beginning of a new ordinary paragraph.
             * (Note the block is intentionally kept as the current one.) */
            ctx->current_block->type = MD_BLOCK_P;
            return 0;
        }
    }

    /* Mark we are not building any block anymore. */
    ctx->current_block = NULL;

abort:
    return ret;
}
5008
5009static int
5010md_add_line_into_current_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* analysis)
5011{
5012 MD_ASSERT(ctx->current_block != NULL);
5013
5014 if(ctx->current_block->type == MD_BLOCK_CODE || ctx->current_block->type == MD_BLOCK_HTML) {
5015 MD_VERBATIMLINE* line;
5016
5017 line = (MD_VERBATIMLINE*) md_push_block_bytes(ctx, sizeof(MD_VERBATIMLINE));
5018 if(line == NULL)
5019 return -1;
5020
5021 line->indent = analysis->indent;
5022 line->beg = analysis->beg;
5023 line->end = analysis->end;
5024 } else {
5025 MD_LINE* line;
5026
5027 line = (MD_LINE*) md_push_block_bytes(ctx, sizeof(MD_LINE));
5028 if(line == NULL)
5029 return -1;
5030
5031 line->beg = analysis->beg;
5032 line->end = analysis->end;
5033 }
5034 ctx->current_block->n_lines++;
5035
5036 return 0;
5037}
5038
5039static int
5040md_push_container_bytes(MD_CTX* ctx, MD_BLOCKTYPE type, unsigned start,
5041 unsigned data, unsigned flags)
5042{
5043 MD_BLOCK* block;
5044 int ret = 0;
5045
5046 MD_CHECK(md_end_current_block(ctx));
5047
5048 block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK));
5049 if(block == NULL)
5050 return -1;
5051
5052 block->type = type;
5053 block->flags = flags;
5054 block->data = data;
5055 block->n_lines = start;
5056
5057abort:
5058 return ret;
5059}
5060
5061
5062
5063/***********************
5064 *** Line Analysis ***
5065 ***********************/
5066
5067static int
5068md_is_hr_line(MD_CTX* ctx, OFF beg, OFF* p_end, OFF* p_killer)
5069{
5070 OFF off = beg + 1;
5071 int n = 1;
5072
5073 while(off < ctx->size && (CH(off) == CH(beg) || CH(off) == _T(' ') || CH(off) == _T('\t'))) {
5074 if(CH(off) == CH(beg))
5075 n++;
5076 off++;
5077 }
5078
5079 if(n < 3) {
5080 *p_killer = off;
5081 return FALSE;
5082 }
5083
5084 /* Nothing else can be present on the line. */
5085 if(off < ctx->size && !ISNEWLINE(off)) {
5086 *p_killer = off;
5087 return FALSE;
5088 }
5089
5090 *p_end = off;
5091 return TRUE;
5092}
5093
5094static int
5095md_is_atxheader_line(MD_CTX* ctx, OFF beg, OFF* p_beg, OFF* p_end, unsigned* p_level)
5096{
5097 int n;
5098 OFF off = beg + 1;
5099
5100 while(off < ctx->size && CH(off) == _T('#') && off - beg < 7)
5101 off++;
5102 n = off - beg;
5103
5104 if(n > 6)
5105 return FALSE;
5106 *p_level = n;
5107
5108 if(!(ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS) && off < ctx->size &&
5109 CH(off) != _T(' ') && CH(off) != _T('\t') && !ISNEWLINE(off))
5110 return FALSE;
5111
5112 while(off < ctx->size && CH(off) == _T(' '))
5113 off++;
5114 *p_beg = off;
5115 *p_end = off;
5116 return TRUE;
5117}
5118
5119static int
5120md_is_setext_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_level)
5121{
5122 OFF off = beg + 1;
5123
5124 while(off < ctx->size && CH(off) == CH(beg))
5125 off++;
5126
5127 /* Optionally, space(s) can follow. */
5128 while(off < ctx->size && CH(off) == _T(' '))
5129 off++;
5130
5131 /* But nothing more is allowed on the line. */
5132 if(off < ctx->size && !ISNEWLINE(off))
5133 return FALSE;
5134
5135 *p_level = (CH(beg) == _T('=') ? 1 : 2);
5136 *p_end = off;
5137 return TRUE;
5138}
5139
5140static int
5141md_is_table_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_col_count)
5142{
5143 OFF off = beg;
5144 int found_pipe = FALSE;
5145 unsigned col_count = 0;
5146
5147 if(off < ctx->size && CH(off) == _T('|')) {
5148 found_pipe = TRUE;
5149 off++;
5150 while(off < ctx->size && ISWHITESPACE(off))
5151 off++;
5152 }
5153
5154 while(1) {
5155 OFF cell_beg;
5156 int delimited = FALSE;
5157
5158 /* Cell underline ("-----", ":----", "----:" or ":----:") */
5159 cell_beg = off;
5160 if(off < ctx->size && CH(off) == _T(':'))
5161 off++;
5162 while(off < ctx->size && CH(off) == _T('-'))
5163 off++;
5164 if(off < ctx->size && CH(off) == _T(':'))
5165 off++;
5166 if(off - cell_beg < 3)
5167 return FALSE;
5168
5169 col_count++;
5170
5171 /* Pipe delimiter (optional at the end of line). */
5172 while(off < ctx->size && ISWHITESPACE(off))
5173 off++;
5174 if(off < ctx->size && CH(off) == _T('|')) {
5175 delimited = TRUE;
5176 found_pipe = TRUE;
5177 off++;
5178 while(off < ctx->size && ISWHITESPACE(off))
5179 off++;
5180 }
5181
5182 /* Success, if we reach end of line. */
5183 if(off >= ctx->size || ISNEWLINE(off))
5184 break;
5185
5186 if(!delimited)
5187 return FALSE;
5188 }
5189
5190 if(!found_pipe)
5191 return FALSE;
5192
5193 *p_end = off;
5194 *p_col_count = col_count;
5195 return TRUE;
5196}
5197
5198static int
5199md_is_opening_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end)
5200{
5201 OFF off = beg;
5202
5203 while(off < ctx->size && CH(off) == CH(beg))
5204 off++;
5205
5206 /* Fence must have at least three characters. */
5207 if(off - beg < 3)
5208 return FALSE;
5209
5210 ctx->code_fence_length = off - beg;
5211
5212 /* Optionally, space(s) can follow. */
5213 while(off < ctx->size && CH(off) == _T(' '))
5214 off++;
5215
5216 /* Optionally, an info string can follow. */
5217 while(off < ctx->size && !ISNEWLINE(off)) {
5218 /* Backtick-based fence must not contain '`' in the info string. */
5219 if(CH(beg) == _T('`') && CH(off) == _T('`'))
5220 return FALSE;
5221 off++;
5222 }
5223
5224 *p_end = off;
5225 return TRUE;
5226}
5227
5228static int
5229md_is_closing_code_fence(MD_CTX* ctx, CHAR ch, OFF beg, OFF* p_end)
5230{
5231 OFF off = beg;
5232 int ret = FALSE;
5233
5234 /* Closing fence must have at least the same length and use same char as
5235 * opening one. */
5236 while(off < ctx->size && CH(off) == ch)
5237 off++;
5238 if(off - beg < ctx->code_fence_length)
5239 goto out;
5240
5241 /* Optionally, space(s) can follow */
5242 while(off < ctx->size && CH(off) == _T(' '))
5243 off++;
5244
5245 /* But nothing more is allowed on the line. */
5246 if(off < ctx->size && !ISNEWLINE(off))
5247 goto out;
5248
5249 ret = TRUE;
5250
5251out:
5252 /* Note we set *p_end even on failure: If we are not closing fence, caller
5253 * would eat the line anyway without any parsing. */
5254 *p_end = off;
5255 return ret;
5256}
5257
5258/* Returns type of the raw HTML block, or FALSE if it is not HTML block.
5259 * (Refer to CommonMark specification for details about the types.)
5260 */
5261static int
5262md_is_html_block_start_condition(MD_CTX* ctx, OFF beg)
5263{
5264 typedef struct TAG_tag TAG;
5265 struct TAG_tag {
5266 const CHAR* name;
5267 unsigned len : 8;
5268 };
5269
5270 /* Type 6 is started by a long list of allowed tags. We use two-level
5271 * tree to speed-up the search. */
5272#ifdef X
5273 #undef X
5274#endif
5275#define X(name) { _T(name), (sizeof(name)-1) / sizeof(CHAR) }
5276#define Xend { NULL, 0 }
5277 static const TAG t1[] = { X("script"), X("pre"), X("style"), Xend };
5278
5279 static const TAG a6[] = { X("address"), X("article"), X("aside"), Xend };
5280 static const TAG b6[] = { X("base"), X("basefont"), X("blockquote"), X("body"), Xend };
5281 static const TAG c6[] = { X("caption"), X("center"), X("col"), X("colgroup"), Xend };
5282 static const TAG d6[] = { X("dd"), X("details"), X("dialog"), X("dir"),
5283 X("div"), X("dl"), X("dt"), Xend };
5284 static const TAG f6[] = { X("fieldset"), X("figcaption"), X("figure"), X("footer"),
5285 X("form"), X("frame"), X("frameset"), Xend };
5286 static const TAG h6[] = { X("h1"), X("head"), X("header"), X("hr"), X("html"), Xend };
5287 static const TAG i6[] = { X("iframe"), Xend };
5288 static const TAG l6[] = { X("legend"), X("li"), X("link"), Xend };
5289 static const TAG m6[] = { X("main"), X("menu"), X("menuitem"), Xend };
5290 static const TAG n6[] = { X("nav"), X("noframes"), Xend };
5291 static const TAG o6[] = { X("ol"), X("optgroup"), X("option"), Xend };
5292 static const TAG p6[] = { X("p"), X("param"), Xend };
5293 static const TAG s6[] = { X("section"), X("source"), X("summary"), Xend };
5294 static const TAG t6[] = { X("table"), X("tbody"), X("td"), X("tfoot"), X("th"),
5295 X("thead"), X("title"), X("tr"), X("track"), Xend };
5296 static const TAG u6[] = { X("ul"), Xend };
5297 static const TAG xx[] = { Xend };
5298#undef X
5299
5300 static const TAG* map6[26] = {
5301 a6, b6, c6, d6, xx, f6, xx, h6, i6, xx, xx, l6, m6,
5302 n6, o6, p6, xx, xx, s6, t6, u6, xx, xx, xx, xx, xx
5303 };
5304 OFF off = beg + 1;
5305 int i;
5306
5307 /* Check for type 1: <script, <pre, or <style */
5308 for(i = 0; t1[i].name != NULL; i++) {
5309 if(off + t1[i].len <= ctx->size) {
5310 if(md_ascii_case_eq(STR(off), t1[i].name, t1[i].len))
5311 return 1;
5312 }
5313 }
5314
5315 /* Check for type 2: <!-- */
5316 if(off + 3 < ctx->size && CH(off) == _T('!') && CH(off+1) == _T('-') && CH(off+2) == _T('-'))
5317 return 2;
5318
5319 /* Check for type 3: <? */
5320 if(off < ctx->size && CH(off) == _T('?'))
5321 return 3;
5322
5323 /* Check for type 4 or 5: <! */
5324 if(off < ctx->size && CH(off) == _T('!')) {
5325 /* Check for type 4: <! followed by uppercase letter. */
5326 if(off + 1 < ctx->size && ISUPPER(off+1))
5327 return 4;
5328
5329 /* Check for type 5: <![CDATA[ */
5330 if(off + 8 < ctx->size) {
5331 if(md_ascii_eq(STR(off), _T("![CDATA["), 8))
5332 return 5;
5333 }
5334 }
5335
5336 /* Check for type 6: Many possible starting tags listed above. */
5337 if(off + 1 < ctx->size && (ISALPHA(off) || (CH(off) == _T('/') && ISALPHA(off+1)))) {
5338 int slot;
5339 const TAG* tags;
5340
5341 if(CH(off) == _T('/'))
5342 off++;
5343
5344 slot = (ISUPPER(off) ? CH(off) - 'A' : CH(off) - 'a');
5345 tags = map6[slot];
5346
5347 for(i = 0; tags[i].name != NULL; i++) {
5348 if(off + tags[i].len <= ctx->size) {
5349 if(md_ascii_case_eq(STR(off), tags[i].name, tags[i].len)) {
5350 OFF tmp = off + tags[i].len;
5351 if(tmp >= ctx->size)
5352 return 6;
5353 if(ISBLANK(tmp) || ISNEWLINE(tmp) || CH(tmp) == _T('>'))
5354 return 6;
5355 if(tmp+1 < ctx->size && CH(tmp) == _T('/') && CH(tmp+1) == _T('>'))
5356 return 6;
5357 break;
5358 }
5359 }
5360 }
5361 }
5362
5363 /* Check for type 7: any COMPLETE other opening or closing tag. */
5364 if(off + 1 < ctx->size) {
5365 OFF end;
5366
5367 if(md_is_html_tag(ctx, NULL, 0, beg, ctx->size, &end)) {
5368 /* Only optional whitespace and new line may follow. */
5369 while(end < ctx->size && ISWHITESPACE(end))
5370 end++;
5371 if(end >= ctx->size || ISNEWLINE(end))
5372 return 7;
5373 }
5374 }
5375
5376 return FALSE;
5377}
5378
5379/* Case sensitive check whether there is a substring 'what' between 'beg'
5380 * and end of line. */
5381static int
5382md_line_contains(MD_CTX* ctx, OFF beg, const CHAR* what, SZ what_len, OFF* p_end)
5383{
5384 OFF i;
5385 for(i = beg; i + what_len < ctx->size; i++) {
5386 if(ISNEWLINE(i))
5387 break;
5388 if(memcmp(STR(i), what, what_len * sizeof(CHAR)) == 0) {
5389 *p_end = i + what_len;
5390 return TRUE;
5391 }
5392 }
5393
5394 *p_end = i;
5395 return FALSE;
5396}
5397
5398/* Returns type of HTML block end condition or FALSE if not an end condition.
5399 *
5400 * Note it fills p_end even when it is not end condition as the caller
5401 * does not need to analyze contents of a raw HTML block.
5402 */
5403static int
5404md_is_html_block_end_condition(MD_CTX* ctx, OFF beg, OFF* p_end)
5405{
5406 switch(ctx->html_block_type) {
5407 case 1:
5408 {
5409 OFF off = beg;
5410
5411 while(off < ctx->size && !ISNEWLINE(off)) {
5412 if(CH(off) == _T('<')) {
5413 if(md_ascii_case_eq(STR(off), _T("</script>"), 9)) {
5414 *p_end = off + 9;
5415 return TRUE;
5416 }
5417
5418 if(md_ascii_case_eq(STR(off), _T("</style>"), 8)) {
5419 *p_end = off + 8;
5420 return TRUE;
5421 }
5422
5423 if(md_ascii_case_eq(STR(off), _T("</pre>"), 6)) {
5424 *p_end = off + 6;
5425 return TRUE;
5426 }
5427 }
5428
5429 off++;
5430 }
5431 *p_end = off;
5432 return FALSE;
5433 }
5434
5435 case 2:
5436 return (md_line_contains(ctx, beg, _T("-->"), 3, p_end) ? 2 : FALSE);
5437
5438 case 3:
5439 return (md_line_contains(ctx, beg, _T("?>"), 2, p_end) ? 3 : FALSE);
5440
5441 case 4:
5442 return (md_line_contains(ctx, beg, _T(">"), 1, p_end) ? 4 : FALSE);
5443
5444 case 5:
5445 return (md_line_contains(ctx, beg, _T("]]>"), 3, p_end) ? 5 : FALSE);
5446
5447 case 6: /* Pass through */
5448 case 7:
5449 *p_end = beg;
5450 return (ISNEWLINE(beg) ? ctx->html_block_type : FALSE);
5451
5452 default:
5453 MD_UNREACHABLE();
5454 }
5455 return FALSE;
5456}
5457
5458
5459static int
5460md_is_container_compatible(const MD_CONTAINER* pivot, const MD_CONTAINER* container)
5461{
5462 /* Block quote has no "items" like lists. */
5463 if(container->ch == _T('>'))
5464 return FALSE;
5465
5466 if(container->ch != pivot->ch)
5467 return FALSE;
5468 if(container->mark_indent > pivot->contents_indent)
5469 return FALSE;
5470
5471 return TRUE;
5472}
5473
5474static int
5475md_push_container(MD_CTX* ctx, const MD_CONTAINER* container)
5476{
5477 if(ctx->n_containers >= ctx->alloc_containers) {
5478 MD_CONTAINER* new_containers;
5479
5480 ctx->alloc_containers = (ctx->alloc_containers > 0
5481 ? ctx->alloc_containers + ctx->alloc_containers / 2
5482 : 16);
5483 new_containers = realloc(ctx->containers, ctx->alloc_containers * sizeof(MD_CONTAINER));
5484 if(new_containers == NULL) {
5485 MD_LOG("realloc() failed.");
5486 return -1;
5487 }
5488
5489 ctx->containers = new_containers;
5490 }
5491
5492 memcpy(&ctx->containers[ctx->n_containers++], container, sizeof(MD_CONTAINER));
5493 return 0;
5494}
5495
5496static int
5497md_enter_child_containers(MD_CTX* ctx, int n_children, unsigned data)
5498{
5499 int i;
5500 int ret = 0;
5501
5502 for(i = ctx->n_containers - n_children; i < ctx->n_containers; i++) {
5503 MD_CONTAINER* c = &ctx->containers[i];
5504 int is_ordered_list = FALSE;
5505
5506 switch(c->ch) {
5507 case _T(')'):
5508 case _T('.'):
5509 is_ordered_list = TRUE;
5510 /* Pass through */
5511
5512 case _T('-'):
5513 case _T('+'):
5514 case _T('*'):
5515 /* Remember offset in ctx->block_bytes so we can revisit the
5516 * block if we detect it is a loose list. */
5517 md_end_current_block(ctx);
5518 c->block_byte_off = ctx->n_block_bytes;
5519
5520 MD_CHECK(md_push_container_bytes(ctx,
5521 (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL),
5522 c->start, data, MD_BLOCK_CONTAINER_OPENER));
5523 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5524 c->task_mark_off,
5525 (c->is_task ? CH(c->task_mark_off) : 0),
5526 MD_BLOCK_CONTAINER_OPENER));
5527 break;
5528
5529 case _T('>'):
5530 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0, 0, MD_BLOCK_CONTAINER_OPENER));
5531 break;
5532
5533 default:
5534 MD_UNREACHABLE();
5535 break;
5536 }
5537 }
5538
5539abort:
5540 return ret;
5541}
5542
5543static int
5544md_leave_child_containers(MD_CTX* ctx, int n_keep)
5545{
5546 int ret = 0;
5547
5548 while(ctx->n_containers > n_keep) {
5549 MD_CONTAINER* c = &ctx->containers[ctx->n_containers-1];
5550 int is_ordered_list = FALSE;
5551
5552 switch(c->ch) {
5553 case _T(')'):
5554 case _T('.'):
5555 is_ordered_list = TRUE;
5556 /* Pass through */
5557
5558 case _T('-'):
5559 case _T('+'):
5560 case _T('*'):
5561 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5562 c->task_mark_off, (c->is_task ? CH(c->task_mark_off) : 0),
5563 MD_BLOCK_CONTAINER_CLOSER));
5564 MD_CHECK(md_push_container_bytes(ctx,
5565 (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL), 0,
5566 c->ch, MD_BLOCK_CONTAINER_CLOSER));
5567 break;
5568
5569 case _T('>'):
5570 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0,
5571 0, MD_BLOCK_CONTAINER_CLOSER));
5572 break;
5573
5574 default:
5575 MD_UNREACHABLE();
5576 break;
5577 }
5578
5579 ctx->n_containers--;
5580 }
5581
5582abort:
5583 return ret;
5584}
5585
5586static int
5587md_is_container_mark(MD_CTX* ctx, unsigned indent, OFF beg, OFF* p_end, MD_CONTAINER* p_container)
5588{
5589 OFF off = beg;
5590 OFF max_end;
5591
5592 if(indent >= ctx->code_indent_offset)
5593 return FALSE;
5594
5595 /* Check for block quote mark. */
5596 if(off < ctx->size && CH(off) == _T('>')) {
5597 off++;
5598 p_container->ch = _T('>');
5599 p_container->is_loose = FALSE;
5600 p_container->is_task = FALSE;
5601 p_container->mark_indent = indent;
5602 p_container->contents_indent = indent + 1;
5603 *p_end = off;
5604 return TRUE;
5605 }
5606
5607 /* Check for list item bullet mark. */
5608 if(off+1 < ctx->size && ISANYOF(off, _T("-+*")) && (ISBLANK(off+1) || ISNEWLINE(off+1))) {
5609 p_container->ch = CH(off);
5610 p_container->is_loose = FALSE;
5611 p_container->is_task = FALSE;
5612 p_container->mark_indent = indent;
5613 p_container->contents_indent = indent + 1;
5614 *p_end = off + 1;
5615 return TRUE;
5616 }
5617
5618 /* Check for ordered list item marks. */
5619 max_end = off + 9;
5620 if(max_end > ctx->size)
5621 max_end = ctx->size;
5622 p_container->start = 0;
5623 while(off < max_end && ISDIGIT(off)) {
5624 p_container->start = p_container->start * 10 + CH(off) - _T('0');
5625 off++;
5626 }
5627 if(off > beg && off+1 < ctx->size &&
5628 (CH(off) == _T('.') || CH(off) == _T(')')) &&
5629 (ISBLANK(off+1) || ISNEWLINE(off+1)))
5630 {
5631 p_container->ch = CH(off);
5632 p_container->is_loose = FALSE;
5633 p_container->is_task = FALSE;
5634 p_container->mark_indent = indent;
5635 p_container->contents_indent = indent + off - beg + 1;
5636 *p_end = off + 1;
5637 return TRUE;
5638 }
5639
5640 return FALSE;
5641}
5642
5643static unsigned
5644md_line_indentation(MD_CTX* ctx, unsigned total_indent, OFF beg, OFF* p_end)
5645{
5646 OFF off = beg;
5647 unsigned indent = total_indent;
5648
5649 while(off < ctx->size && ISBLANK(off)) {
5650 if(CH(off) == _T('\t'))
5651 indent = (indent + 4) & ~3;
5652 else
5653 indent++;
5654 off++;
5655 }
5656
5657 *p_end = off;
5658 return indent - total_indent;
5659}
5660
/* Shared read-only analysis of type MD_LINE_BLANK. The line processing code
 * uses its address as the pivot line whenever the next line has to be
 * analyzed as if it followed a blank line (e.g. at the start of the document,
 * after a block has been ended, or behind a freshly found list item mark). */
static const MD_LINE_ANALYSIS md_dummy_blank_line = { MD_LINE_BLANK, 0 };
5662
5663/* Analyze type of the line and find some its properties. This serves as a
5664 * main input for determining type and boundaries of a block. */
5665static int
5666md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end,
5667 const MD_LINE_ANALYSIS* pivot_line, MD_LINE_ANALYSIS* line)
5668{
5669 unsigned total_indent = 0;
5670 int n_parents = 0;
5671 int n_brothers = 0;
5672 int n_children = 0;
5673 MD_CONTAINER container = { 0 };
5674 int prev_line_has_list_loosening_effect = ctx->last_line_has_list_loosening_effect;
5675 OFF off = beg;
5676 OFF hr_killer = 0;
5677 int ret = 0;
5678
5679 line->indent = md_line_indentation(ctx, total_indent, off, &off);
5680 total_indent += line->indent;
5681 line->beg = off;
5682
5683 /* Given the indentation and block quote marks '>', determine how many of
5684 * the current containers are our parents. */
5685 while(n_parents < ctx->n_containers) {
5686 MD_CONTAINER* c = &ctx->containers[n_parents];
5687
5688 if(c->ch == _T('>') && line->indent < ctx->code_indent_offset &&
5689 off < ctx->size && CH(off) == _T('>'))
5690 {
5691 /* Block quote mark. */
5692 off++;
5693 total_indent++;
5694 line->indent = md_line_indentation(ctx, total_indent, off, &off);
5695 total_indent += line->indent;
5696
5697 /* The optional 1st space after '>' is part of the block quote mark. */
5698 if(line->indent > 0)
5699 line->indent--;
5700
5701 line->beg = off;
5702
5703 } else if(c->ch != _T('>') && line->indent >= c->contents_indent) {
5704 /* List. */
5705 line->indent -= c->contents_indent;
5706 } else {
5707 break;
5708 }
5709
5710 n_parents++;
5711 }
5712
5713 if(off >= ctx->size || ISNEWLINE(off)) {
5714 /* Blank line does not need any real indentation to be nested inside
5715 * a list. */
5716 if(n_brothers + n_children == 0) {
5717 while(n_parents < ctx->n_containers && ctx->containers[n_parents].ch != _T('>'))
5718 n_parents++;
5719 }
5720 }
5721
5722 while(TRUE) {
5723 /* Check whether we are fenced code continuation. */
5724 if(pivot_line->type == MD_LINE_FENCEDCODE) {
5725 line->beg = off;
5726
5727 /* We are another MD_LINE_FENCEDCODE unless we are closing fence
5728 * which we transform into MD_LINE_BLANK. */
5729 if(line->indent < ctx->code_indent_offset) {
5730 if(md_is_closing_code_fence(ctx, CH(pivot_line->beg), off, &off)) {
5731 line->type = MD_LINE_BLANK;
5732 ctx->last_line_has_list_loosening_effect = FALSE;
5733 break;
5734 }
5735 }
5736
5737 /* Change indentation accordingly to the initial code fence. */
5738 if(n_parents == ctx->n_containers) {
5739 if(line->indent > pivot_line->indent)
5740 line->indent -= pivot_line->indent;
5741 else
5742 line->indent = 0;
5743
5744 line->type = MD_LINE_FENCEDCODE;
5745 break;
5746 }
5747 }
5748
5749 /* Check whether we are HTML block continuation. */
5750 if(pivot_line->type == MD_LINE_HTML && ctx->html_block_type > 0) {
5751 int html_block_type;
5752
5753 html_block_type = md_is_html_block_end_condition(ctx, off, &off);
5754 if(html_block_type > 0) {
5755 MD_ASSERT(html_block_type == ctx->html_block_type);
5756
5757 /* Make sure this is the last line of the block. */
5758 ctx->html_block_type = 0;
5759
5760 /* Some end conditions serve as blank lines at the same time. */
5761 if(html_block_type == 6 || html_block_type == 7) {
5762 line->type = MD_LINE_BLANK;
5763 line->indent = 0;
5764 break;
5765 }
5766 }
5767
5768 if(n_parents == ctx->n_containers) {
5769 line->type = MD_LINE_HTML;
5770 break;
5771 }
5772 }
5773
5774 /* Check for blank line. */
5775 if(off >= ctx->size || ISNEWLINE(off)) {
5776 if(pivot_line->type == MD_LINE_INDENTEDCODE && n_parents == ctx->n_containers) {
5777 line->type = MD_LINE_INDENTEDCODE;
5778 if(line->indent > ctx->code_indent_offset)
5779 line->indent -= ctx->code_indent_offset;
5780 else
5781 line->indent = 0;
5782 ctx->last_line_has_list_loosening_effect = FALSE;
5783 } else {
5784 line->type = MD_LINE_BLANK;
5785 ctx->last_line_has_list_loosening_effect = (n_parents > 0 &&
5786 n_brothers + n_children == 0 &&
5787 ctx->containers[n_parents-1].ch != _T('>'));
5788
5789 #if 1
5790 /* See https://github.com/mity/md4c/issues/6
5791 *
5792 * This ugly checking tests we are in (yet empty) list item but not
5793 * its very first line (with the list item mark).
5794 *
5795 * If we are such blank line, then any following non-blank line
5796 * which would be part of this list item actually ends the list
5797 * because "a list item can begin with at most one blank line."
5798 */
5799 if(n_parents > 0 && ctx->containers[n_parents-1].ch != _T('>') &&
5800 n_brothers + n_children == 0 && ctx->current_block == NULL &&
5801 ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5802 {
5803 MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5804 if(top_block->type == MD_BLOCK_LI)
5805 ctx->last_list_item_starts_with_two_blank_lines = TRUE;
5806 }
5807 #endif
5808 }
5809 break;
5810 } else {
5811 #if 1
5812 /* This is 2nd half of the hack. If the flag is set (that is there
5813 * were 2nd blank line at the start of the list item) and we would also
5814 * belonging to such list item, then interrupt the list. */
5815 ctx->last_line_has_list_loosening_effect = FALSE;
5816 if(ctx->last_list_item_starts_with_two_blank_lines) {
5817 if(n_parents > 0 && ctx->containers[n_parents-1].ch != _T('>') &&
5818 n_brothers + n_children == 0 && ctx->current_block == NULL &&
5819 ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5820 {
5821 MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5822 if(top_block->type == MD_BLOCK_LI)
5823 n_parents--;
5824 }
5825
5826 ctx->last_list_item_starts_with_two_blank_lines = FALSE;
5827 }
5828 #endif
5829 }
5830
5831 /* Check whether we are Setext underline. */
5832 if(line->indent < ctx->code_indent_offset && pivot_line->type == MD_LINE_TEXT
5833 && (CH(off) == _T('=') || CH(off) == _T('-'))
5834 && (n_parents == ctx->n_containers))
5835 {
5836 unsigned level;
5837
5838 if(md_is_setext_underline(ctx, off, &off, &level)) {
5839 line->type = MD_LINE_SETEXTUNDERLINE;
5840 line->data = level;
5841 break;
5842 }
5843 }
5844
5845 /* Check for thematic break line. */
5846 if(line->indent < ctx->code_indent_offset && ISANYOF(off, _T("-_*")) && off >= hr_killer) {
5847 if(md_is_hr_line(ctx, off, &off, &hr_killer)) {
5848 line->type = MD_LINE_HR;
5849 break;
5850 }
5851 }
5852
5853 /* Check for "brother" container. I.e. whether we are another list item
5854 * in already started list. */
5855 if(n_parents < ctx->n_containers && n_brothers + n_children == 0) {
5856 OFF tmp;
5857
5858 if(md_is_container_mark(ctx, line->indent, off, &tmp, &container) &&
5859 md_is_container_compatible(&ctx->containers[n_parents], &container))
5860 {
5861 pivot_line = &md_dummy_blank_line;
5862
5863 off = tmp;
5864
5865 total_indent += container.contents_indent - container.mark_indent;
5866 line->indent = md_line_indentation(ctx, total_indent, off, &off);
5867 total_indent += line->indent;
5868 line->beg = off;
5869
5870 /* Some of the following whitespace actually still belongs to the mark. */
5871 if(off >= ctx->size || ISNEWLINE(off)) {
5872 container.contents_indent++;
5873 } else if(line->indent <= ctx->code_indent_offset) {
5874 container.contents_indent += line->indent;
5875 line->indent = 0;
5876 } else {
5877 container.contents_indent += 1;
5878 line->indent--;
5879 }
5880
5881 ctx->containers[n_parents].mark_indent = container.mark_indent;
5882 ctx->containers[n_parents].contents_indent = container.contents_indent;
5883
5884 n_brothers++;
5885 continue;
5886 }
5887 }
5888
5889 /* Check for indented code.
5890 * Note indented code block cannot interrupt a paragraph. */
5891 if(line->indent >= ctx->code_indent_offset &&
5892 (pivot_line->type == MD_LINE_BLANK || pivot_line->type == MD_LINE_INDENTEDCODE))
5893 {
5894 line->type = MD_LINE_INDENTEDCODE;
5895 MD_ASSERT(line->indent >= ctx->code_indent_offset);
5896 line->indent -= ctx->code_indent_offset;
5897 line->data = 0;
5898 break;
5899 }
5900
5901 /* Check for start of a new container block. */
5902 if(line->indent < ctx->code_indent_offset &&
5903 md_is_container_mark(ctx, line->indent, off, &off, &container))
5904 {
5905 if(pivot_line->type == MD_LINE_TEXT && n_parents == ctx->n_containers &&
5906 (off >= ctx->size || ISNEWLINE(off)) && container.ch != _T('>'))
5907 {
5908 /* Noop. List mark followed by a blank line cannot interrupt a paragraph. */
5909 } else if(pivot_line->type == MD_LINE_TEXT && n_parents == ctx->n_containers &&
5910 (container.ch == _T('.') || container.ch == _T(')')) && container.start != 1)
5911 {
5912 /* Noop. Ordered list cannot interrupt a paragraph unless the start index is 1. */
5913 } else {
5914 total_indent += container.contents_indent - container.mark_indent;
5915 line->indent = md_line_indentation(ctx, total_indent, off, &off);
5916 total_indent += line->indent;
5917
5918 line->beg = off;
5919 line->data = container.ch;
5920
5921 /* Some of the following whitespace actually still belongs to the mark. */
5922 if(off >= ctx->size || ISNEWLINE(off)) {
5923 container.contents_indent++;
5924 } else if(line->indent <= ctx->code_indent_offset) {
5925 container.contents_indent += line->indent;
5926 line->indent = 0;
5927 } else {
5928 container.contents_indent += 1;
5929 line->indent--;
5930 }
5931
5932 if(n_brothers + n_children == 0)
5933 pivot_line = &md_dummy_blank_line;
5934
5935 if(n_children == 0)
5936 MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
5937
5938 n_children++;
5939 MD_CHECK(md_push_container(ctx, &container));
5940 continue;
5941 }
5942 }
5943
5944 /* Check whether we are table continuation. */
5945 if(pivot_line->type == MD_LINE_TABLE && n_parents == ctx->n_containers) {
5946 line->type = MD_LINE_TABLE;
5947 break;
5948 }
5949
5950 /* Check for ATX header. */
5951 if(line->indent < ctx->code_indent_offset && CH(off) == _T('#')) {
5952 unsigned level;
5953
5954 if(md_is_atxheader_line(ctx, off, &line->beg, &off, &level)) {
5955 line->type = MD_LINE_ATXHEADER;
5956 line->data = level;
5957 break;
5958 }
5959 }
5960
5961 /* Check whether we are starting code fence. */
5962 if(CH(off) == _T('`') || CH(off) == _T('~')) {
5963 if(md_is_opening_code_fence(ctx, off, &off)) {
5964 line->type = MD_LINE_FENCEDCODE;
5965 line->data = 1;
5966 break;
5967 }
5968 }
5969
5970 /* Check for start of raw HTML block. */
5971 if(CH(off) == _T('<') && !(ctx->parser.flags & MD_FLAG_NOHTMLBLOCKS))
5972 {
5973 ctx->html_block_type = md_is_html_block_start_condition(ctx, off);
5974
5975 /* HTML block type 7 cannot interrupt paragraph. */
5976 if(ctx->html_block_type == 7 && pivot_line->type == MD_LINE_TEXT)
5977 ctx->html_block_type = 0;
5978
5979 if(ctx->html_block_type > 0) {
5980 /* The line itself also may immediately close the block. */
5981 if(md_is_html_block_end_condition(ctx, off, &off) == ctx->html_block_type) {
5982 /* Make sure this is the last line of the block. */
5983 ctx->html_block_type = 0;
5984 }
5985
5986 line->type = MD_LINE_HTML;
5987 break;
5988 }
5989 }
5990
5991 /* Check for table underline. */
5992 if((ctx->parser.flags & MD_FLAG_TABLES) && pivot_line->type == MD_LINE_TEXT &&
5993 (CH(off) == _T('|') || CH(off) == _T('-') || CH(off) == _T(':')) &&
5994 n_parents == ctx->n_containers)
5995 {
5996 unsigned col_count;
5997
5998 if(ctx->current_block != NULL && ctx->current_block->n_lines == 1 &&
5999 md_is_table_underline(ctx, off, &off, &col_count))
6000 {
6001 line->data = col_count;
6002 line->type = MD_LINE_TABLEUNDERLINE;
6003 break;
6004 }
6005 }
6006
6007 /* By default, we are normal text line. */
6008 line->type = MD_LINE_TEXT;
6009 if(pivot_line->type == MD_LINE_TEXT && n_brothers + n_children == 0) {
6010 /* Lazy continuation. */
6011 n_parents = ctx->n_containers;
6012 }
6013
6014 /* Check for task mark. */
6015 if((ctx->parser.flags & MD_FLAG_TASKLISTS) && n_brothers + n_children > 0 &&
6016 ISANYOF_(ctx->containers[ctx->n_containers-1].ch, _T("-+*.)")))
6017 {
6018 OFF tmp = off;
6019
6020 while(tmp < ctx->size && tmp < off + 3 && ISBLANK(tmp))
6021 tmp++;
6022 if(tmp + 2 < ctx->size && CH(tmp) == _T('[') &&
6023 ISANYOF(tmp+1, _T("xX ")) && CH(tmp+2) == _T(']') &&
6024 (tmp + 3 == ctx->size || ISBLANK(tmp+3) || ISNEWLINE(tmp+3)))
6025 {
6026 MD_CONTAINER* task_container = (n_children > 0 ? &ctx->containers[ctx->n_containers-1] : &container);
6027 task_container->is_task = TRUE;
6028 task_container->task_mark_off = tmp + 1;
6029 off = tmp + 3;
6030 while(ISWHITESPACE(off))
6031 off++;
6032 line->beg = off;
6033 }
6034 }
6035
6036 break;
6037 }
6038
6039 /* Scan for end of the line.
6040 *
6041 * Note this is quite a bottleneck of the parsing as we here iterate almost
6042 * over compete document.
6043 */
6044#if defined __linux__ && !defined MD4C_USE_UTF16
6045 /* Recent glibc versions have superbly optimized strcspn(), even using
6046 * vectorization if available. */
6047 if(ctx->doc_ends_with_newline && off < ctx->size) {
6048 while(TRUE) {
6049 off += (OFF) strcspn(STR(off), "\r\n");
6050
6051 /* strcspn() can stop on zero terminator; but that can appear
6052 * anywhere in the Markfown input... */
6053 if(CH(off) == _T('\0'))
6054 off++;
6055 else
6056 break;
6057 }
6058 } else
6059#endif
6060 {
6061 /* Optimization: Use some loop unrolling. */
6062 while(off + 3 < ctx->size && !ISNEWLINE(off+0) && !ISNEWLINE(off+1)
6063 && !ISNEWLINE(off+2) && !ISNEWLINE(off+3))
6064 off += 4;
6065 while(off < ctx->size && !ISNEWLINE(off))
6066 off++;
6067 }
6068
6069 /* Set end of the line. */
6070 line->end = off;
6071
6072 /* But for ATX header, we should exclude the optional trailing mark. */
6073 if(line->type == MD_LINE_ATXHEADER) {
6074 OFF tmp = line->end;
6075 while(tmp > line->beg && CH(tmp-1) == _T(' '))
6076 tmp--;
6077 while(tmp > line->beg && CH(tmp-1) == _T('#'))
6078 tmp--;
6079 if(tmp == line->beg || CH(tmp-1) == _T(' ') || (ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS))
6080 line->end = tmp;
6081 }
6082
6083 /* Trim trailing spaces. */
6084 if(line->type != MD_LINE_INDENTEDCODE && line->type != MD_LINE_FENCEDCODE) {
6085 while(line->end > line->beg && CH(line->end-1) == _T(' '))
6086 line->end--;
6087 }
6088
6089 /* Eat also the new line. */
6090 if(off < ctx->size && CH(off) == _T('\r'))
6091 off++;
6092 if(off < ctx->size && CH(off) == _T('\n'))
6093 off++;
6094
6095 *p_end = off;
6096
6097 /* If we belong to a list after seeing a blank line, the list is loose. */
6098 if(prev_line_has_list_loosening_effect && line->type != MD_LINE_BLANK && n_parents + n_brothers > 0) {
6099 MD_CONTAINER* c = &ctx->containers[n_parents + n_brothers - 1];
6100 if(c->ch != _T('>')) {
6101 MD_BLOCK* block = (MD_BLOCK*) (((char*)ctx->block_bytes) + c->block_byte_off);
6102 block->flags |= MD_BLOCK_LOOSE_LIST;
6103 }
6104 }
6105
6106 /* Leave any containers we are not part of anymore. */
6107 if(n_children == 0 && n_parents + n_brothers < ctx->n_containers)
6108 MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
6109
6110 /* Enter any container we found a mark for. */
6111 if(n_brothers > 0) {
6112 MD_ASSERT(n_brothers == 1);
6113 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6114 ctx->containers[n_parents].task_mark_off,
6115 (ctx->containers[n_parents].is_task ? CH(ctx->containers[n_parents].task_mark_off) : 0),
6116 MD_BLOCK_CONTAINER_CLOSER));
6117 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6118 container.task_mark_off,
6119 (container.is_task ? CH(container.task_mark_off) : 0),
6120 MD_BLOCK_CONTAINER_OPENER));
6121 ctx->containers[n_parents].is_task = container.is_task;
6122 ctx->containers[n_parents].task_mark_off = container.task_mark_off;
6123 }
6124
6125 if(n_children > 0)
6126 MD_CHECK(md_enter_child_containers(ctx, n_children, line->data));
6127
6128abort:
6129 return ret;
6130}
6131
6132static int
6133md_process_line(MD_CTX* ctx, const MD_LINE_ANALYSIS** p_pivot_line, MD_LINE_ANALYSIS* line)
6134{
6135 const MD_LINE_ANALYSIS* pivot_line = *p_pivot_line;
6136 int ret = 0;
6137
6138 /* Blank line ends current leaf block. */
6139 if(line->type == MD_LINE_BLANK) {
6140 MD_CHECK(md_end_current_block(ctx));
6141 *p_pivot_line = &md_dummy_blank_line;
6142 return 0;
6143 }
6144
6145 /* Some line types form block on their own. */
6146 if(line->type == MD_LINE_HR || line->type == MD_LINE_ATXHEADER) {
6147 MD_CHECK(md_end_current_block(ctx));
6148
6149 /* Add our single-line block. */
6150 MD_CHECK(md_start_new_block(ctx, line));
6151 MD_CHECK(md_add_line_into_current_block(ctx, line));
6152 MD_CHECK(md_end_current_block(ctx));
6153 *p_pivot_line = &md_dummy_blank_line;
6154 return 0;
6155 }
6156
6157 /* MD_LINE_SETEXTUNDERLINE changes meaning of the current block and ends it. */
6158 if(line->type == MD_LINE_SETEXTUNDERLINE) {
6159 MD_ASSERT(ctx->current_block != NULL);
6160 ctx->current_block->type = MD_BLOCK_H;
6161 ctx->current_block->data = line->data;
6162 ctx->current_block->flags |= MD_BLOCK_SETEXT_HEADER;
6163 MD_CHECK(md_add_line_into_current_block(ctx, line));
6164 MD_CHECK(md_end_current_block(ctx));
6165 if(ctx->current_block == NULL) {
6166 *p_pivot_line = &md_dummy_blank_line;
6167 } else {
6168 /* This happens if we have consumed all the body as link ref. defs.
6169 * and downgraded the underline into start of a new paragraph block. */
6170 line->type = MD_LINE_TEXT;
6171 *p_pivot_line = line;
6172 }
6173 return 0;
6174 }
6175
6176 /* MD_LINE_TABLEUNDERLINE changes meaning of the current block. */
6177 if(line->type == MD_LINE_TABLEUNDERLINE) {
6178 MD_ASSERT(ctx->current_block != NULL);
6179 MD_ASSERT(ctx->current_block->n_lines == 1);
6180 ctx->current_block->type = MD_BLOCK_TABLE;
6181 ctx->current_block->data = line->data;
6182 MD_ASSERT(pivot_line != &md_dummy_blank_line);
6183 ((MD_LINE_ANALYSIS*)pivot_line)->type = MD_LINE_TABLE;
6184 MD_CHECK(md_add_line_into_current_block(ctx, line));
6185 return 0;
6186 }
6187
6188 /* The current block also ends if the line has different type. */
6189 if(line->type != pivot_line->type)
6190 MD_CHECK(md_end_current_block(ctx));
6191
6192 /* The current line may start a new block. */
6193 if(ctx->current_block == NULL) {
6194 MD_CHECK(md_start_new_block(ctx, line));
6195 *p_pivot_line = line;
6196 }
6197
6198 /* In all other cases the line is just a continuation of the current block. */
6199 MD_CHECK(md_add_line_into_current_block(ctx, line));
6200
6201abort:
6202 return ret;
6203}
6204
6205static int
6206md_process_doc(MD_CTX *ctx)
6207{
6208 const MD_LINE_ANALYSIS* pivot_line = &md_dummy_blank_line;
6209 MD_LINE_ANALYSIS line_buf[2];
6210 MD_LINE_ANALYSIS* line = &line_buf[0];
6211 OFF off = 0;
6212 int ret = 0;
6213
6214 MD_ENTER_BLOCK(MD_BLOCK_DOC, NULL);
6215
6216 while(off < ctx->size) {
6217 if(line == pivot_line)
6218 line = (line == &line_buf[0] ? &line_buf[1] : &line_buf[0]);
6219
6220 MD_CHECK(md_analyze_line(ctx, off, &off, pivot_line, line));
6221 MD_CHECK(md_process_line(ctx, &pivot_line, line));
6222 }
6223
6224 md_end_current_block(ctx);
6225
6226 MD_CHECK(md_build_ref_def_hashtable(ctx));
6227
6228 /* Process all blocks. */
6229 MD_CHECK(md_leave_child_containers(ctx, 0));
6230 MD_CHECK(md_process_all_blocks(ctx));
6231
6232 MD_LEAVE_BLOCK(MD_BLOCK_DOC, NULL);
6233
6234abort:
6235
6236#if 0
6237 /* Output some memory consumption statistics. */
6238 {
6239 char buffer[256];
6240 sprintf(buffer, "Alloced %u bytes for block buffer.",
6241 (unsigned)(ctx->alloc_block_bytes));
6242 MD_LOG(buffer);
6243
6244 sprintf(buffer, "Alloced %u bytes for containers buffer.",
6245 (unsigned)(ctx->alloc_containers * sizeof(MD_CONTAINER)));
6246 MD_LOG(buffer);
6247
6248 sprintf(buffer, "Alloced %u bytes for marks buffer.",
6249 (unsigned)(ctx->alloc_marks * sizeof(MD_MARK)));
6250 MD_LOG(buffer);
6251
6252 sprintf(buffer, "Alloced %u bytes for aux. buffer.",
6253 (unsigned)(ctx->alloc_buffer * sizeof(MD_CHAR)));
6254 MD_LOG(buffer);
6255 }
6256#endif
6257
6258 return ret;
6259}
6260
6261
6262/********************
6263 *** Public API ***
6264 ********************/
6265
6266int
6267md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata)
6268{
6269 MD_CTX ctx;
6270 int i;
6271 int ret;
6272
6273 if(parser->abi_version != 0) {
6274 if(parser->debug_log != NULL)
6275 parser->debug_log("Unsupported abi_version.", userdata);
6276 return -1;
6277 }
6278
6279 /* Setup context structure. */
6280 memset(&ctx, 0, sizeof(MD_CTX));
6281 ctx.text = text;
6282 ctx.size = size;
6283 memcpy(&ctx.parser, parser, sizeof(MD_PARSER));
6284 ctx.userdata = userdata;
6285 ctx.code_indent_offset = (ctx.parser.flags & MD_FLAG_NOINDENTEDCODEBLOCKS) ? (OFF)(-1) : 4;
6286 md_build_mark_char_map(&ctx);
6287 ctx.doc_ends_with_newline = (size > 0 && ISNEWLINE_(text[size-1]));
6288
6289 /* Reset all unresolved opener mark chains. */
6290 for(i = 0; i < (int) SIZEOF_ARRAY(ctx.mark_chains); i++) {
6291 ctx.mark_chains[i].head = -1;
6292 ctx.mark_chains[i].tail = -1;
6293 }
6294 ctx.unresolved_link_head = -1;
6295 ctx.unresolved_link_tail = -1;
6296
6297 /* All the work. */
6298 ret = md_process_doc(&ctx);
6299
6300 /* Clean-up. */
6301 md_free_ref_defs(&ctx);
6302 md_free_ref_def_hashtable(&ctx);
6303 free(ctx.buffer);
6304 free(ctx.marks);
6305 free(ctx.block_bytes);
6306 free(ctx.containers);
6307
6308 return ret;
6309}
6310