1 | /* |
2 | * MD4C: Markdown parser for C |
3 | * (http://github.com/mity/md4c) |
4 | * |
5 | * Copyright (c) 2016-2020 Martin Mitas |
6 | * |
7 | * Permission is hereby granted, free of charge, to any person obtaining a |
8 | * copy of this software and associated documentation files (the "Software"), |
9 | * to deal in the Software without restriction, including without limitation |
10 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
11 | * and/or sell copies of the Software, and to permit persons to whom the |
12 | * Software is furnished to do so, subject to the following conditions: |
13 | * |
14 | * The above copyright notice and this permission notice shall be included in |
15 | * all copies or substantial portions of the Software. |
16 | * |
17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
18 | * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
22 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
23 | * IN THE SOFTWARE. |
24 | */ |
25 | |
26 | #ifndef MD4C_MARKDOWN_H |
27 | #define MD4C_MARKDOWN_H |
28 | |
29 | #ifdef __cplusplus |
30 | extern "C" { |
31 | #endif |
32 | |
/* Character type of the input text.
 *
 * UTF-16 support: note that in order to use it, the macro MD4C_USE_UTF16
 * has to be defined both when building MD4C and when including this header
 * in your code. It is supported on Windows only. */
#if !defined MD4C_USE_UTF16
    typedef char MD_CHAR;
#elif defined _WIN32
    #include <windows.h>
    typedef WCHAR MD_CHAR;
#else
    #error MD4C_USE_UTF16 is only supported on Windows.
#endif
46 | |
/* Unsigned integer types used for sizes of, and offsets into, text. */
typedef unsigned int MD_SIZE;
typedef unsigned int MD_OFFSET;
49 | |
50 | |
/* Block types.
 *
 * A block is one part of the document's hierarchical structure, such as
 * a paragraph or a list item.
 */
typedef enum MD_BLOCKTYPE {
    MD_BLOCK_DOC   = 0,     /* <body>...</body> */
    MD_BLOCK_QUOTE = 1,     /* <blockquote>...</blockquote> */
    MD_BLOCK_UL    = 2,     /* <ul>...</ul>; detail: structure MD_BLOCK_UL_DETAIL. */
    MD_BLOCK_OL    = 3,     /* <ol>...</ol>; detail: structure MD_BLOCK_OL_DETAIL. */
    MD_BLOCK_LI    = 4,     /* <li>...</li>; detail: structure MD_BLOCK_LI_DETAIL. */
    MD_BLOCK_HR    = 5,     /* <hr> */
    MD_BLOCK_H     = 6,     /* <h1>...</h1> (for levels up to 6);
                             * detail: structure MD_BLOCK_H_DETAIL. */

    /* <pre><code>...</code></pre>
     * The text lines inside a code block end with '\n' rather than with
     * an explicit MD_TEXT_BR. */
    MD_BLOCK_CODE  = 7,

    /* Raw HTML block. It does not map to any particular HTML tag; its
     * contents _is_ raw HTML source meant to be copied verbatim into
     * the HTML output. */
    MD_BLOCK_HTML  = 8,

    MD_BLOCK_P     = 9,     /* <p>...</p> */

    /* <table>...</table> and its contents.
     * Detail: structure MD_BLOCK_TD_DETAIL (used with MD_BLOCK_TH and
     * MD_BLOCK_TD).
     * All of these are used only if extension MD_FLAG_TABLES is enabled. */
    MD_BLOCK_TABLE = 10,
    MD_BLOCK_THEAD = 11,
    MD_BLOCK_TBODY = 12,
    MD_BLOCK_TR    = 13,
    MD_BLOCK_TH    = 14,
    MD_BLOCK_TD    = 15
} MD_BLOCKTYPE;
103 | |
/* Span types.
 *
 * A span is an in-line piece of the document that should be rendered with
 * the same font, color and other attributes. A sequence of spans forms
 * a block such as a paragraph or a list item.
 */
typedef enum MD_SPANTYPE {
    MD_SPAN_EM      = 0,    /* <em>...</em> */
    MD_SPAN_STRONG  = 1,    /* <strong>...</strong> */
    MD_SPAN_A       = 2,    /* <a href="xxx">...</a>; detail: structure MD_SPAN_A_DETAIL. */

    /* Image, e.g. <img src="xxx">; the span's contents is the image text.
     * Detail: structure MD_SPAN_IMG_DETAIL.
     * The image text can contain nested spans and even nested images.
     * If it is rendered into the ALT attribute of an HTML <IMG> tag, it is
     * the responsibility of the renderer to deal with it. */
    MD_SPAN_IMG     = 3,

    MD_SPAN_CODE    = 4,    /* <code>...</code> */

    /* <del>...</del>
     * Recognized only when MD_FLAG_STRIKETHROUGH is enabled. */
    MD_SPAN_DEL     = 5,

    /* Inline ($) and display ($$) equations.
     * Recognized only when MD_FLAG_LATEXMATHSPANS is enabled. */
    MD_SPAN_LATEXMATH         = 6,
    MD_SPAN_LATEXMATH_DISPLAY = 7,

    /* Wiki links.
     * Recognized only when MD_FLAG_WIKILINKS is enabled. */
    MD_SPAN_WIKILINK = 8,

    /* <u>...</u>
     * Recognized only when MD_FLAG_UNDERLINE is enabled. */
    MD_SPAN_U       = 9
} MD_SPANTYPE;
149 | |
/* Text types. Text is the actual textual contents of a span. */
typedef enum MD_TEXTTYPE {
    MD_TEXT_NORMAL    = 0,  /* Normal text. */

    /* NULL character. CommonMark requires replacing the NULL character with
     * the replacement char U+FFFD; this type lets the caller do so easily. */
    MD_TEXT_NULLCHAR  = 1,

    /* Line breaks.
     * These are not sent from blocks with verbatim output (MD_BLOCK_CODE
     * or MD_BLOCK_HTML). There, '\n' is part of the text itself. */
    MD_TEXT_BR        = 2,  /* <br> (hard break) */
    MD_TEXT_SOFTBR    = 3,  /* '\n' in source text where it is not
                             * semantically meaningful (soft break) */

    /* Entity.
     * (a) Named entity, e.g. &nbsp;
     *     (MD4C does not have a list of known entities. Anything matching
     *     the regexp /&[A-Za-z][A-Za-z0-9]{1,47};/ is treated as a named
     *     entity.)
     * (b) Numerical entity, e.g. &#1234;
     * (c) Hexadecimal entity, e.g. &#x12AB;
     *
     * As MD4C is mostly encoding agnostic, the application receives the
     * verbatim entity text in the MD_PARSER::text() callback. */
    MD_TEXT_ENTITY    = 4,

    /* Text in a code block (inside MD_BLOCK_CODE) or inlined code (`code`).
     * Inside MD_BLOCK_CODE it includes spaces for indentation and '\n' for
     * new lines. MD_TEXT_BR and MD_TEXT_SOFTBR are not sent for this kind
     * of text. */
    MD_TEXT_CODE      = 5,

    /* Raw HTML text. If it is the contents of a raw HTML block (i.e. not
     * an inline raw HTML), MD_TEXT_BR and MD_TEXT_SOFTBR are not used and
     * the text contains verbatim '\n' for the new lines. */
    MD_TEXT_HTML      = 6,

    /* Text inside an equation. It is processed the same way as inlined
     * code spans (`code`). */
    MD_TEXT_LATEXMATH = 7
} MD_TEXTTYPE;
192 | |
193 | |
/* Alignment. */
typedef enum MD_ALIGN {
    /* Used when no alignment is specified. */
    MD_ALIGN_DEFAULT = 0,
    MD_ALIGN_LEFT    = 1,
    MD_ALIGN_CENTER  = 2,
    MD_ALIGN_RIGHT   = 3
} MD_ALIGN;
201 | |
202 | |
203 | /* String attribute. |
204 | * |
205 | * This wraps strings which are outside of a normal text flow and which are |
206 | * propagated within various detailed structures, but which still may contain |
207 | * string portions of different types like e.g. entities. |
208 | * |
209 | * So, for example, lets consider an image has a title attribute string |
210 | * set to "foo " bar". (Note the string size is 14.) |
211 | * |
212 | * Then the attribute MD_SPAN_IMG_DETAIL::title shall provide the following: |
213 | * -- [0]: "foo " (substr_types[0] == MD_TEXT_NORMAL; substr_offsets[0] == 0) |
214 | * -- [1]: """ (substr_types[1] == MD_TEXT_ENTITY; substr_offsets[1] == 4) |
215 | * -- [2]: " bar" (substr_types[2] == MD_TEXT_NORMAL; substr_offsets[2] == 10) |
216 | * -- [3]: (n/a) (n/a ; substr_offsets[3] == 14) |
217 | * |
218 | * Note that these conditions are guaranteed: |
219 | * -- substr_offsets[0] == 0 |
220 | * -- substr_offsets[LAST+1] == size |
221 | * -- Only MD_TEXT_NORMAL, MD_TEXT_ENTITY, MD_TEXT_NULLCHAR substrings can appear. |
222 | */ |
223 | typedef struct MD_ATTRIBUTE { |
224 | const MD_CHAR* text; |
225 | MD_SIZE size; |
226 | const MD_TEXTTYPE* substr_types; |
227 | const MD_OFFSET* substr_offsets; |
228 | } MD_ATTRIBUTE; |
229 | |
230 | |
231 | /* Detailed info for MD_BLOCK_UL. */ |
232 | typedef struct MD_BLOCK_UL_DETAIL { |
233 | int is_tight; /* Non-zero if tight list, zero if loose. */ |
234 | MD_CHAR mark; /* Item bullet character in MarkDown source of the list, e.g. '-', '+', '*'. */ |
235 | } MD_BLOCK_UL_DETAIL; |
236 | |
237 | /* Detailed info for MD_BLOCK_OL. */ |
238 | typedef struct MD_BLOCK_OL_DETAIL { |
239 | unsigned start; /* Start index of the ordered list. */ |
240 | int is_tight; /* Non-zero if tight list, zero if loose. */ |
241 | MD_CHAR mark_delimiter; /* Character delimiting the item marks in MarkDown source, e.g. '.' or ')' */ |
242 | } MD_BLOCK_OL_DETAIL; |
243 | |
244 | /* Detailed info for MD_BLOCK_LI. */ |
245 | typedef struct MD_BLOCK_LI_DETAIL { |
246 | int is_task; /* Can be non-zero only with MD_FLAG_TASKLISTS */ |
247 | MD_CHAR task_mark; /* If is_task, then one of 'x', 'X' or ' '. Undefined otherwise. */ |
248 | MD_OFFSET task_mark_offset; /* If is_task, then offset in the input of the char between '[' and ']'. */ |
249 | } MD_BLOCK_LI_DETAIL; |
250 | |
/* Detail structure for MD_BLOCK_H. */
typedef struct MD_BLOCK_H_DETAIL {
    /* Header level (1 - 6). */
    unsigned int level;
} MD_BLOCK_H_DETAIL;
255 | |
256 | /* Detailed info for MD_BLOCK_CODE. */ |
257 | typedef struct MD_BLOCK_CODE_DETAIL { |
258 | MD_ATTRIBUTE info; |
259 | MD_ATTRIBUTE lang; |
260 | MD_CHAR fence_char; /* The character used for fenced code block; or zero for indented code block. */ |
261 | } MD_BLOCK_CODE_DETAIL; |
262 | |
263 | /* Detailed info for MD_BLOCK_TH and MD_BLOCK_TD. */ |
264 | typedef struct MD_BLOCK_TD_DETAIL { |
265 | MD_ALIGN align; |
266 | } MD_BLOCK_TD_DETAIL; |
267 | |
268 | /* Detailed info for MD_SPAN_A. */ |
269 | typedef struct MD_SPAN_A_DETAIL { |
270 | MD_ATTRIBUTE href; |
271 | MD_ATTRIBUTE title; |
272 | } MD_SPAN_A_DETAIL; |
273 | |
274 | /* Detailed info for MD_SPAN_IMG. */ |
275 | typedef struct MD_SPAN_IMG_DETAIL { |
276 | MD_ATTRIBUTE src; |
277 | MD_ATTRIBUTE title; |
278 | } MD_SPAN_IMG_DETAIL; |
279 | |
280 | /* Detailed info for MD_SPAN_WIKILINK. */ |
281 | typedef struct MD_SPAN_WIKILINK { |
282 | MD_ATTRIBUTE target; |
283 | } MD_SPAN_WIKILINK_DETAIL; |
284 | |
/* Flags specifying extensions/deviations from CommonMark specification.
 *
 * By default (when MD_PARSER::flags == 0), we follow CommonMark specification.
 * The following flags may allow some extensions or deviations from it.
 *
 * Each flag occupies a single bit so that the flags can be freely combined
 * with bitwise OR. (Bit 0x0080 is currently not assigned.)
 */
#define MD_FLAG_COLLAPSEWHITESPACE          0x0001  /* In MD_TEXT_NORMAL, collapse non-trivial whitespace into single ' ' */
#define MD_FLAG_PERMISSIVEATXHEADERS        0x0002  /* Do not require space in ATX headers ( ###header ) */
#define MD_FLAG_PERMISSIVEURLAUTOLINKS      0x0004  /* Recognize URLs as autolinks even without '<', '>' */
#define MD_FLAG_PERMISSIVEEMAILAUTOLINKS    0x0008  /* Recognize e-mails as autolinks even without '<', '>' and 'mailto:' */
#define MD_FLAG_NOINDENTEDCODEBLOCKS        0x0010  /* Disable indented code blocks. (Only fenced code works.) */
#define MD_FLAG_NOHTMLBLOCKS                0x0020  /* Disable raw HTML blocks. */
#define MD_FLAG_NOHTMLSPANS                 0x0040  /* Disable raw HTML (inline). */
#define MD_FLAG_TABLES                      0x0100  /* Enable tables extension. */
#define MD_FLAG_STRIKETHROUGH               0x0200  /* Enable strikethrough extension. */
#define MD_FLAG_PERMISSIVEWWWAUTOLINKS      0x0400  /* Enable WWW autolinks (even without any scheme prefix, if they begin with 'www.') */
#define MD_FLAG_TASKLISTS                   0x0800  /* Enable task list extension. */
#define MD_FLAG_LATEXMATHSPANS              0x1000  /* Enable $ and $$ containing LaTeX equations. */
#define MD_FLAG_WIKILINKS                   0x2000  /* Enable wiki links extension. */
#define MD_FLAG_UNDERLINE                   0x4000  /* Enable underline extension (and disables '_' for normal emphasis). */

/* Convenience combinations of the flags above. */
#define MD_FLAG_PERMISSIVEAUTOLINKS         (MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS)
#define MD_FLAG_NOHTML                      (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS)
307 | |
/* Convenient sets of flags corresponding to well-known Markdown dialects.
 *
 * We may support only a subset of the features of the referred dialect.
 * Each constant merely enables those extensions which bring us as close to
 * the dialect as possible, given the features we implement.
 *
 * ABI compatibility note: The meaning of these can change in time as new
 * extensions, bringing the dialect closer to the original, are implemented.
 */
#define MD_DIALECT_COMMONMARK   0
#define MD_DIALECT_GITHUB       (MD_FLAG_PERMISSIVEAUTOLINKS    \
                                    | MD_FLAG_TABLES            \
                                    | MD_FLAG_STRIKETHROUGH     \
                                    | MD_FLAG_TASKLISTS)
319 | |
320 | /* Renderer structure. |
321 | */ |
322 | typedef struct MD_PARSER { |
323 | /* Reserved. Set to zero. |
324 | */ |
325 | unsigned abi_version; |
326 | |
327 | /* Dialect options. Bitmask of MD_FLAG_xxxx values. |
328 | */ |
329 | unsigned flags; |
330 | |
331 | /* Caller-provided rendering callbacks. |
332 | * |
333 | * For some block/span types, more detailed information is provided in a |
334 | * type-specific structure pointed by the argument 'detail'. |
335 | * |
336 | * The last argument of all callbacks, 'userdata', is just propagated from |
337 | * md_parse() and is available for any use by the application. |
338 | * |
339 | * Note any strings provided to the callbacks as their arguments or as |
340 | * members of any detail structure are generally not zero-terminated. |
341 | * Application has take the respective size information into account. |
342 | * |
343 | * Callbacks may abort further parsing of the document by returning non-zero. |
344 | */ |
345 | int (*enter_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/); |
346 | int (*leave_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/); |
347 | |
348 | int (*enter_span)(MD_SPANTYPE /*type*/, void* /*detail*/, void* /*userdata*/); |
349 | int (*leave_span)(MD_SPANTYPE /*type*/, void* /*detail*/, void* /*userdata*/); |
350 | |
351 | int (*text)(MD_TEXTTYPE /*type*/, const MD_CHAR* /*text*/, MD_SIZE /*size*/, void* /*userdata*/); |
352 | |
353 | /* Debug callback. Optional (may be NULL). |
354 | * |
355 | * If provided and something goes wrong, this function gets called. |
356 | * This is intended for debugging and problem diagnosis for developers; |
357 | * it is not intended to provide any errors suitable for displaying to an |
358 | * end user. |
359 | */ |
360 | void (*debug_log)(const char* /*msg*/, void* /*userdata*/); |
361 | |
362 | /* Reserved. Set to NULL. |
363 | */ |
364 | void (*syntax)(void); |
365 | } MD_PARSER; |
366 | |
367 | |
/* Old name of MD_PARSER, kept for backward compatibility only.
 * Do not use in new code. */
typedef struct MD_PARSER MD_RENDERER;
370 | |
371 | |
372 | /* Parse the Markdown document stored in the string 'text' of size 'size'. |
373 | * The renderer provides callbacks to be called during the parsing so the |
374 | * caller can render the document on the screen or convert the Markdown |
375 | * to another format. |
376 | * |
377 | * Zero is returned on success. If a runtime error occurs (e.g. a memory |
378 | * fails), -1 is returned. If the processing is aborted due any callback |
379 | * returning non-zero, md_parse() the return value of the callback is returned. |
380 | */ |
381 | int md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata); |
382 | |
383 | |
384 | #ifdef __cplusplus |
385 | } /* extern "C" { */ |
386 | #endif |
387 | |
388 | #endif /* MD4C_MARKDOWN_H */ |
389 | |