1/*
2 * MD4C: Markdown parser for C
3 * (http://github.com/mity/md4c)
4 *
5 * Copyright (c) 2016-2020 Martin Mitas
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 */
25
26#ifndef MD4C_MARKDOWN_H
27#define MD4C_MARKDOWN_H
28
29#ifdef __cplusplus
30 extern "C" {
31#endif
32
#if defined(MD4C_USE_UTF16)
    /* UTF-16 build: MD_CHAR is the Windows wide character type.
     *
     * Note that the macro MD4C_USE_UTF16 has to be defined consistently,
     * i.e. both when MD4C itself is being built and whenever this header
     * is included in your code. */
    #if defined(_WIN32)
        #include <windows.h>
        typedef WCHAR MD_CHAR;
    #else
        #error MD4C_USE_UTF16 is only supported on Windows.
    #endif
#else
    /* Default build: MD_CHAR is a plain char. */
    typedef char MD_CHAR;
#endif
46
/* Unsigned integer types used by this API for string sizes (MD_SIZE) and
 * for offsets (MD_OFFSET). */
typedef unsigned int MD_SIZE;
typedef unsigned int MD_OFFSET;
49
50
/* A block is one piece of the document hierarchy, for instance a paragraph
 * or a list item.
 *
 * The HTML tag named with each value shows the corresponding HTML element.
 * Where a "Detail" structure is named, it describes the block further.
 */
typedef enum MD_BLOCKTYPE {
    MD_BLOCK_DOC    = 0,    /* <body>...</body> */
    MD_BLOCK_QUOTE  = 1,    /* <blockquote>...</blockquote> */

    MD_BLOCK_UL     = 2,    /* <ul>...</ul>; Detail: MD_BLOCK_UL_DETAIL. */
    MD_BLOCK_OL     = 3,    /* <ol>...</ol>; Detail: MD_BLOCK_OL_DETAIL. */
    MD_BLOCK_LI     = 4,    /* <li>...</li>; Detail: MD_BLOCK_LI_DETAIL. */

    MD_BLOCK_HR     = 5,    /* <hr> */

    /* <h1>...</h1> (for levels up to 6); Detail: MD_BLOCK_H_DETAIL. */
    MD_BLOCK_H      = 6,

    /* <pre><code>...</code></pre>
     * Text lines inside a code block are terminated with '\n' rather than
     * being separated by an explicit MD_TEXT_BR. */
    MD_BLOCK_CODE   = 7,

    /* Raw HTML block. It does not map to any particular HTML tag; its
     * contents _are_ raw HTML source meant to be copied verbatim into the
     * HTML output. */
    MD_BLOCK_HTML   = 8,

    MD_BLOCK_P      = 9,    /* <p>...</p> */

    /* <table>...</table> and the parts of a table.
     * Detail: MD_BLOCK_TD_DETAIL (used with MD_BLOCK_TH and MD_BLOCK_TD).
     * All of the following are used only if the extension MD_FLAG_TABLES
     * is enabled. */
    MD_BLOCK_TABLE  = 10,
    MD_BLOCK_THEAD  = 11,
    MD_BLOCK_TBODY  = 12,
    MD_BLOCK_TR     = 13,
    MD_BLOCK_TH     = 14,
    MD_BLOCK_TD     = 15
} MD_BLOCKTYPE;
103
/* A span is an in-line piece of the document which should be rendered with
 * the same font, color and other attributes. A block such as a paragraph or
 * a list item is formed by a sequence of spans.
 */
typedef enum MD_SPANTYPE {
    MD_SPAN_EM                  = 0,    /* <em>...</em> */
    MD_SPAN_STRONG              = 1,    /* <strong>...</strong> */

    /* <a href="xxx">...</a>
     * Detail: MD_SPAN_A_DETAIL. */
    MD_SPAN_A                   = 2,

    /* <img src="xxx">
     * Detail: MD_SPAN_IMG_DETAIL.
     * The image text may contain nested spans and even nested images. When
     * it is rendered into the ALT attribute of an HTML <IMG> tag, dealing
     * with that is the responsibility of the renderer. */
    MD_SPAN_IMG                 = 3,

    MD_SPAN_CODE                = 4,    /* <code>...</code> */

    /* <del>...</del>
     * Recognized only when MD_FLAG_STRIKETHROUGH is enabled. */
    MD_SPAN_DEL                 = 5,

    /* Inline ($) and display ($$) equations.
     * Recognized only when MD_FLAG_LATEXMATHSPANS is enabled. */
    MD_SPAN_LATEXMATH           = 6,
    MD_SPAN_LATEXMATH_DISPLAY   = 7,

    /* Wiki links.
     * Recognized only when MD_FLAG_WIKILINKS is enabled. */
    MD_SPAN_WIKILINK            = 8,

    /* <u>...</u>
     * Recognized only when MD_FLAG_UNDERLINE is enabled. */
    MD_SPAN_U                   = 9
} MD_SPANTYPE;
149
/* Text is the actual textual content of a span. The text type tells the
 * application how a given piece of text is to be interpreted. */
typedef enum MD_TEXTTYPE {
    /* Normal text. */
    MD_TEXT_NORMAL      = 0,

    /* NULL character. CommonMark requires it to be replaced with the
     * replacement character U+FFFD; reporting it separately lets the caller
     * do that easily. */
    MD_TEXT_NULLCHAR    = 1,

    /* Line breaks.
     * These are not sent from blocks with verbatim output (MD_BLOCK_CODE or
     * MD_BLOCK_HTML); there, '\n' is part of the text itself. */
    MD_TEXT_BR          = 2,    /* <br> (hard break) */
    MD_TEXT_SOFTBR      = 3,    /* '\n' in the source text where it is not
                                 * semantically meaningful (soft break) */

    /* Entity, i.e. one of:
     *  (a) Named entity, e.g. &nbsp;
     *      (MD4C has no list of known entities. Anything matching the
     *      regexp /&[A-Za-z][A-Za-z0-9]{1,47};/ is treated as a named
     *      entity.)
     *  (b) Numerical entity, e.g. &#1234;
     *  (c) Hexadecimal entity, e.g. &#x12AB;
     *
     * As MD4C is mostly encoding agnostic, the application receives the
     * verbatim entity text in the text() callback. */
    MD_TEXT_ENTITY      = 4,

    /* Text of a code block (inside MD_BLOCK_CODE) or of inlined code
     * (`code`). Inside MD_BLOCK_CODE it includes the spaces used for
     * indentation and '\n' for new lines. MD_TEXT_BR and MD_TEXT_SOFTBR are
     * not sent for this kind of text. */
    MD_TEXT_CODE        = 5,

    /* Raw HTML. When it is the contents of a raw HTML block (i.e. not an
     * inline raw HTML), MD_TEXT_BR and MD_TEXT_SOFTBR are not used and the
     * text contains verbatim '\n' for the new lines. */
    MD_TEXT_HTML        = 6,

    /* Text inside an equation. It is processed the same way as inlined code
     * spans (`code`). */
    MD_TEXT_LATEXMATH   = 7
} MD_TEXTTYPE;
192
193
/* Alignment enumeration. */
typedef enum MD_ALIGN {
    MD_ALIGN_DEFAULT    = 0,    /* No alignment was specified. */
    MD_ALIGN_LEFT       = 1,
    MD_ALIGN_CENTER     = 2,
    MD_ALIGN_RIGHT      = 3
} MD_ALIGN;
201
202
203/* String attribute.
204 *
205 * This wraps strings which are outside of a normal text flow and which are
206 * propagated within various detailed structures, but which still may contain
207 * string portions of different types like e.g. entities.
208 *
209 * So, for example, lets consider an image has a title attribute string
210 * set to "foo &quot; bar". (Note the string size is 14.)
211 *
212 * Then the attribute MD_SPAN_IMG_DETAIL::title shall provide the following:
213 * -- [0]: "foo " (substr_types[0] == MD_TEXT_NORMAL; substr_offsets[0] == 0)
214 * -- [1]: "&quot;" (substr_types[1] == MD_TEXT_ENTITY; substr_offsets[1] == 4)
215 * -- [2]: " bar" (substr_types[2] == MD_TEXT_NORMAL; substr_offsets[2] == 10)
216 * -- [3]: (n/a) (n/a ; substr_offsets[3] == 14)
217 *
218 * Note that these conditions are guaranteed:
219 * -- substr_offsets[0] == 0
220 * -- substr_offsets[LAST+1] == size
221 * -- Only MD_TEXT_NORMAL, MD_TEXT_ENTITY, MD_TEXT_NULLCHAR substrings can appear.
222 */
223typedef struct MD_ATTRIBUTE {
224 const MD_CHAR* text;
225 MD_SIZE size;
226 const MD_TEXTTYPE* substr_types;
227 const MD_OFFSET* substr_offsets;
228} MD_ATTRIBUTE;
229
230
231/* Detailed info for MD_BLOCK_UL. */
232typedef struct MD_BLOCK_UL_DETAIL {
233 int is_tight; /* Non-zero if tight list, zero if loose. */
234 MD_CHAR mark; /* Item bullet character in MarkDown source of the list, e.g. '-', '+', '*'. */
235} MD_BLOCK_UL_DETAIL;
236
237/* Detailed info for MD_BLOCK_OL. */
238typedef struct MD_BLOCK_OL_DETAIL {
239 unsigned start; /* Start index of the ordered list. */
240 int is_tight; /* Non-zero if tight list, zero if loose. */
241 MD_CHAR mark_delimiter; /* Character delimiting the item marks in MarkDown source, e.g. '.' or ')' */
242} MD_BLOCK_OL_DETAIL;
243
244/* Detailed info for MD_BLOCK_LI. */
245typedef struct MD_BLOCK_LI_DETAIL {
246 int is_task; /* Can be non-zero only with MD_FLAG_TASKLISTS */
247 MD_CHAR task_mark; /* If is_task, then one of 'x', 'X' or ' '. Undefined otherwise. */
248 MD_OFFSET task_mark_offset; /* If is_task, then offset in the input of the char between '[' and ']'. */
249} MD_BLOCK_LI_DETAIL;
250
/* Detail structure accompanying MD_BLOCK_H. */
typedef struct MD_BLOCK_H_DETAIL {
    /* Header level (1 - 6). */
    unsigned int level;
} MD_BLOCK_H_DETAIL;
255
256/* Detailed info for MD_BLOCK_CODE. */
257typedef struct MD_BLOCK_CODE_DETAIL {
258 MD_ATTRIBUTE info;
259 MD_ATTRIBUTE lang;
260 MD_CHAR fence_char; /* The character used for fenced code block; or zero for indented code block. */
261} MD_BLOCK_CODE_DETAIL;
262
263/* Detailed info for MD_BLOCK_TH and MD_BLOCK_TD. */
264typedef struct MD_BLOCK_TD_DETAIL {
265 MD_ALIGN align;
266} MD_BLOCK_TD_DETAIL;
267
268/* Detailed info for MD_SPAN_A. */
269typedef struct MD_SPAN_A_DETAIL {
270 MD_ATTRIBUTE href;
271 MD_ATTRIBUTE title;
272} MD_SPAN_A_DETAIL;
273
274/* Detailed info for MD_SPAN_IMG. */
275typedef struct MD_SPAN_IMG_DETAIL {
276 MD_ATTRIBUTE src;
277 MD_ATTRIBUTE title;
278} MD_SPAN_IMG_DETAIL;
279
280/* Detailed info for MD_SPAN_WIKILINK. */
281typedef struct MD_SPAN_WIKILINK {
282 MD_ATTRIBUTE target;
283} MD_SPAN_WIKILINK_DETAIL;
284
/* Flags selecting extensions to, or deviations from, the CommonMark
 * specification.
 *
 * With no flag set (MD_PARSER::flags == 0) the CommonMark specification is
 * followed. Each flag below enables one extension or one deviation from it.
 * (The bit 0x0080 is not assigned to any flag in this header.)
 */

/* Whitespace handling. */
#define MD_FLAG_COLLAPSEWHITESPACE          0x0001  /* In MD_TEXT_NORMAL, collapse non-trivial whitespace into single ' ' */

/* Permissive variants of standard syntax. */
#define MD_FLAG_PERMISSIVEATXHEADERS        0x0002  /* Do not require space in ATX headers ( ###header ) */
#define MD_FLAG_PERMISSIVEURLAUTOLINKS      0x0004  /* Recognize URLs as autolinks even without '<', '>' */
#define MD_FLAG_PERMISSIVEEMAILAUTOLINKS    0x0008  /* Recognize e-mails as autolinks even without '<', '>' and 'mailto:' */
#define MD_FLAG_PERMISSIVEWWWAUTOLINKS      0x0400  /* Enable WWW autolinks (even without any scheme prefix, if they begin with 'www.') */

/* Standard features which can be switched off. */
#define MD_FLAG_NOINDENTEDCODEBLOCKS        0x0010  /* Disable indented code blocks. (Only fenced code works.) */
#define MD_FLAG_NOHTMLBLOCKS                0x0020  /* Disable raw HTML blocks. */
#define MD_FLAG_NOHTMLSPANS                 0x0040  /* Disable raw HTML (inline). */

/* Extensions. */
#define MD_FLAG_TABLES                      0x0100  /* Enable tables extension. */
#define MD_FLAG_STRIKETHROUGH               0x0200  /* Enable strikethrough extension. */
#define MD_FLAG_TASKLISTS                   0x0800  /* Enable task list extension. */
#define MD_FLAG_LATEXMATHSPANS              0x1000  /* Enable $ and $$ containing LaTeX equations. */
#define MD_FLAG_WIKILINKS                   0x2000  /* Enable wiki links extension. */
#define MD_FLAG_UNDERLINE                   0x4000  /* Enable underline extension (and disables '_' for normal emphasis). */

/* Shortcuts combining several of the flags above. */
#define MD_FLAG_PERMISSIVEAUTOLINKS         (MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS)
#define MD_FLAG_NOHTML                      (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS)
307
/* Ready-made flag sets corresponding to well-known Markdown dialects.
 *
 * Only a subset of the features of the referred dialect may be supported.
 * Each constant merely enables those extensions which bring the parser as
 * close to the dialect as the implemented features allow.
 *
 * ABI compatibility note: The meaning of these constants can change over
 * time, as new extensions bringing the dialect closer to the original get
 * implemented.
 */
#define MD_DIALECT_COMMONMARK   0
#define MD_DIALECT_GITHUB       (MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_TABLES | MD_FLAG_STRIKETHROUGH | MD_FLAG_TASKLISTS)
319
320/* Renderer structure.
321 */
322typedef struct MD_PARSER {
323 /* Reserved. Set to zero.
324 */
325 unsigned abi_version;
326
327 /* Dialect options. Bitmask of MD_FLAG_xxxx values.
328 */
329 unsigned flags;
330
331 /* Caller-provided rendering callbacks.
332 *
333 * For some block/span types, more detailed information is provided in a
334 * type-specific structure pointed by the argument 'detail'.
335 *
336 * The last argument of all callbacks, 'userdata', is just propagated from
337 * md_parse() and is available for any use by the application.
338 *
339 * Note any strings provided to the callbacks as their arguments or as
340 * members of any detail structure are generally not zero-terminated.
341 * Application has take the respective size information into account.
342 *
343 * Callbacks may abort further parsing of the document by returning non-zero.
344 */
345 int (*enter_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
346 int (*leave_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
347
348 int (*enter_span)(MD_SPANTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
349 int (*leave_span)(MD_SPANTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
350
351 int (*text)(MD_TEXTTYPE /*type*/, const MD_CHAR* /*text*/, MD_SIZE /*size*/, void* /*userdata*/);
352
353 /* Debug callback. Optional (may be NULL).
354 *
355 * If provided and something goes wrong, this function gets called.
356 * This is intended for debugging and problem diagnosis for developers;
357 * it is not intended to provide any errors suitable for displaying to an
358 * end user.
359 */
360 void (*debug_log)(const char* /*msg*/, void* /*userdata*/);
361
362 /* Reserved. Set to NULL.
363 */
364 void (*syntax)(void);
365} MD_PARSER;
366
367
368/* For backward compatibility. Do not use in new code. */
369typedef MD_PARSER MD_RENDERER;
370
371
372/* Parse the Markdown document stored in the string 'text' of size 'size'.
373 * The renderer provides callbacks to be called during the parsing so the
374 * caller can render the document on the screen or convert the Markdown
375 * to another format.
376 *
377 * Zero is returned on success. If a runtime error occurs (e.g. a memory
378 * fails), -1 is returned. If the processing is aborted due any callback
379 * returning non-zero, md_parse() the return value of the callback is returned.
380 */
381int md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata);
382
383
384#ifdef __cplusplus
385 } /* extern "C" { */
386#endif
387
388#endif /* MD4C_MARKDOWN_H */
389