1 | /****************************************************************** |
2 | * LexMarkdown.cxx |
3 | * |
4 | * A simple Markdown lexer for scintilla. |
5 | * |
6 | * Includes highlighting for some extra features from the |
7 | * Pandoc implementation; strikeout, using '#.' as a default |
8 | * ordered list item marker, and delimited code blocks. |
9 | * |
10 | * Limitations: |
11 | * |
12 | * Standard indented code blocks are not highlighted at all, |
13 | * as it would conflict with other indentation schemes. Use |
14 | * delimited code blocks for blanket highlighting of an |
15 | * entire code block. Embedded HTML is not highlighted either. |
16 | * Blanket HTML highlighting has issues, because some Markdown |
17 | * implementations allow Markdown markup inside of the HTML. Also, |
18 | * there is a following blank line issue that can't be ignored, |
19 | * explained in the next paragraph. Embedded HTML and code |
20 | * blocks would be better supported with language specific |
21 | * highlighting. |
22 | * |
23 | * The highlighting aims to accurately reflect correct syntax, |
24 | * but a few restrictions are relaxed. Delimited code blocks are |
25 | * highlighted, even if the line following the code block is not blank. |
26 | * Requiring a blank line after a block, breaks the highlighting |
27 | * in certain cases, because of the way Scintilla ends up calling |
28 | * the lexer. |
29 | * |
30 | * Written by Jon Strait - jstrait@moonloop.net |
31 | * |
32 | * The License.txt file describes the conditions under which this |
33 | * software may be distributed. |
34 | * |
35 | *****************************************************************/ |
36 | |
37 | #include <stdlib.h> |
38 | #include <string.h> |
39 | #include <stdio.h> |
40 | #include <stdarg.h> |
41 | #include <assert.h> |
42 | |
43 | #include <string> |
44 | #include <string_view> |
45 | |
46 | #include "ILexer.h" |
47 | #include "Scintilla.h" |
48 | #include "SciLexer.h" |
49 | |
50 | #include "WordList.h" |
51 | #include "LexAccessor.h" |
52 | #include "Accessor.h" |
53 | #include "StyleContext.h" |
54 | #include "CharacterSet.h" |
55 | #include "LexerModule.h" |
56 | |
57 | using namespace Lexilla; |
58 | |
namespace {

// Reports whether ch ends a line. NUL is treated as a line end as well,
// because sc.GetRelative(i) returns '\0' when the position is out of range.
constexpr bool IsNewline(const int ch) {
	switch (ch) {
	case '\n':
	case '\r':
	case '\0':
		return true;
	default:
		return false;
	}
}

}
67 | |
68 | // True if can follow ch down to the end with possibly trailing whitespace |
69 | static bool FollowToLineEnd(const int ch, const int state, const Sci_PositionU endPos, StyleContext &sc) { |
70 | Sci_Position i = 0; |
71 | while (sc.GetRelative(++i) == ch) |
72 | ; |
73 | // Skip over whitespace |
74 | while (IsASpaceOrTab(sc.GetRelative(i)) && sc.currentPos + i < endPos) |
75 | ++i; |
76 | if (IsNewline(sc.GetRelative(i)) || sc.currentPos + i == endPos) { |
77 | sc.Forward(i); |
78 | sc.ChangeState(state); |
79 | sc.SetState(SCE_MARKDOWN_LINE_BEGIN); |
80 | return true; |
81 | } |
82 | else return false; |
83 | } |
84 | |
85 | // Set the state on text section from current to length characters, |
86 | // then set the rest until the newline to default, except for any characters matching token |
87 | static void SetStateAndZoom(const int state, const Sci_Position length, const int token, StyleContext &sc) { |
88 | sc.SetState(state); |
89 | sc.Forward(length); |
90 | sc.SetState(SCE_MARKDOWN_DEFAULT); |
91 | sc.Forward(); |
92 | bool started = false; |
93 | while (sc.More() && !IsNewline(sc.ch)) { |
94 | if (sc.ch == token && !started) { |
95 | sc.SetState(state); |
96 | started = true; |
97 | } |
98 | else if (sc.ch != token) { |
99 | sc.SetState(SCE_MARKDOWN_DEFAULT); |
100 | started = false; |
101 | } |
102 | sc.Forward(); |
103 | } |
104 | sc.SetState(SCE_MARKDOWN_LINE_BEGIN); |
105 | } |
106 | |
107 | // Does the previous line have more than spaces and tabs? |
108 | static bool HasPrevLineContent(StyleContext &sc) { |
109 | Sci_Position i = 0; |
110 | // Go back to the previous newline |
111 | while ((--i + (Sci_Position)sc.currentPos) >= 0 && !IsNewline(sc.GetRelative(i))) |
112 | ; |
113 | while ((--i + (Sci_Position)sc.currentPos) >= 0) { |
114 | const int ch = sc.GetRelative(i); |
115 | if (ch == '\n') |
116 | break; |
117 | if (!((ch == '\r' || IsASpaceOrTab(ch)))) |
118 | return true; |
119 | } |
120 | return false; |
121 | } |
122 | |
123 | static bool AtTermStart(StyleContext &sc) { |
124 | return sc.currentPos == 0 || sc.chPrev == 0 || isspacechar(sc.chPrev); |
125 | } |
126 | |
127 | static bool IsCompleteStyleRegion(StyleContext &sc, const char *token) { |
128 | bool found = false; |
129 | const size_t start = strlen(token); |
130 | Sci_Position i = static_cast<Sci_Position>(start); |
131 | while (!IsNewline(sc.GetRelative(i))) { |
132 | // make sure an empty pair of single-char tokens doesn't match |
133 | // with a longer token: {*}{*} != {**} |
134 | if (sc.GetRelative(i) == *token && sc.GetRelative(i - 1) != *token) { |
135 | found = start > 1U ? sc.GetRelative(i + 1) == token[1] : true; |
136 | break; |
137 | } |
138 | i++; |
139 | } |
140 | return AtTermStart(sc) && found; |
141 | } |
142 | |
143 | static bool IsValidHrule(const Sci_PositionU endPos, StyleContext &sc) { |
144 | int count = 1; |
145 | Sci_Position i = 0; |
146 | for (;;) { |
147 | ++i; |
148 | int c = sc.GetRelative(i); |
149 | if (c == sc.ch) |
150 | ++count; |
151 | // hit a terminating character |
152 | else if (!IsASpaceOrTab(c) || sc.currentPos + i == endPos) { |
153 | // Are we a valid HRULE |
154 | if ((IsNewline(c) || sc.currentPos + i == endPos) && |
155 | count >= 3 && !HasPrevLineContent(sc)) { |
156 | sc.SetState(SCE_MARKDOWN_HRULE); |
157 | sc.Forward(i); |
158 | sc.SetState(SCE_MARKDOWN_LINE_BEGIN); |
159 | return true; |
160 | } |
161 | else { |
162 | sc.SetState(SCE_MARKDOWN_DEFAULT); |
163 | return false; |
164 | } |
165 | } |
166 | } |
167 | } |
168 | |
// Lexer function for Markdown: styles the text from startPos up to
// startPos + length through styler, starting in style initStyle.
// The WordList argument is unnamed and not used.
// Each pass of the main loop looks at the character under the cursor in four
// stages: the handler for the current state, the leading-whitespace
// (PRECHAR) handler, the link handler, and the default-state handler. A stage
// may change the state, in which case a later stage can act on the same
// character in the same pass.
static void ColorizeMarkdownDoc(Sci_PositionU startPos, Sci_Position length, int initStyle,
		WordList **, Accessor &styler) {
	Sci_PositionU endPos = startPos + length;
	// Number of spaces counted so far while in SCE_MARKDOWN_PRECHAR
	int precharCount = 0;
	// Set once "](" has been seen inside a link, cleared at the closing ')'
	bool isLinkNameDetecting = false;
	// Don't advance on a new loop iteration and retry at the same position.
	// Useful in the corner case of having to start at the beginning file position
	// in the default state.
	bool freezeCursor = false;

	StyleContext sc(startPos, static_cast<Sci_PositionU>(length), initStyle, styler);

	while (sc.More()) {
		// Step over a backslash without acting on it. The character after it
		// is handled by the next pass; several checks below refuse to start
		// or end a construct when the preceding character is a backslash.
		if (sc.ch == '\\') {
			sc.Forward();
			continue;
		}

		// A blockquote resets the line semantics
		if (sc.state == SCE_MARKDOWN_BLOCKQUOTE)
			sc.SetState(SCE_MARKDOWN_LINE_BEGIN);

		// Conditional state-based actions
		if (sc.state == SCE_MARKDOWN_CODE2) {
			// Closed by two backticks, or three when a third one follows
			if (sc.Match("``" )) {
				const int closingSpan = (sc.GetRelative(2) == '`') ? 3 : 2;
				sc.Forward(closingSpan);
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
		}
		else if (sc.state == SCE_MARKDOWN_CODE) {
			// Closed by a backtick that does not follow a space
			if (sc.ch == '`' && sc.chPrev != ' ')
				sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
		}
		/* De-activated because it gets in the way of other valid indentation
		 * schemes, for example multiple paragraphs inside a list item.
		// Code block
		else if (sc.state == SCE_MARKDOWN_CODEBK) {
			bool d = true;
			if (IsNewline(sc.ch)) {
				if (sc.chNext != '\t') {
					for (int c = 1; c < 5; ++c) {
						if (sc.GetRelative(c) != ' ')
							d = false;
					}
				}
			}
			else if (sc.atLineStart) {
				if (sc.ch != '\t' ) {
					for (int i = 0; i < 4; ++i) {
						if (sc.GetRelative(i) != ' ')
							d = false;
					}
				}
			}
			if (!d)
				sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
		}
		*/
		// Strong: closed by its two-character marker not preceded by a space,
		// or given up two characters before a line end
		else if (sc.state == SCE_MARKDOWN_STRONG1) {
			if ((sc.Match("**" ) && sc.chPrev != ' ') || IsNewline(sc.GetRelative(2))) {
				sc.Forward(2);
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
		}
		else if (sc.state == SCE_MARKDOWN_STRONG2) {
			if ((sc.Match("__" ) && sc.chPrev != ' ') || IsNewline(sc.GetRelative(2))) {
				sc.Forward(2);
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
		}
		// Emphasis: closed by its marker not preceded by a space, or given up
		// when the next character is a line end
		else if (sc.state == SCE_MARKDOWN_EM1) {
			if ((sc.ch == '*' && sc.chPrev != ' ') || IsNewline(sc.chNext))
				sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
		}
		else if (sc.state == SCE_MARKDOWN_EM2) {
			if ((sc.ch == '_' && sc.chPrev != ' ') || IsNewline(sc.chNext))
				sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
		}
		// Delimited code block: ends with a line that starts with "~~~";
		// the rest of that closing line is included in the block
		else if (sc.state == SCE_MARKDOWN_CODEBK) {
			if (sc.atLineStart && sc.Match("~~~" )) {
				Sci_Position i = 1;
				while (!IsNewline(sc.GetRelative(i)) && sc.currentPos + i < endPos)
					i++;
				sc.Forward(i);
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
		}
		else if (sc.state == SCE_MARKDOWN_STRIKEOUT) {
			if ((sc.Match("~~" ) && sc.chPrev != ' ') || IsNewline(sc.GetRelative(2))) {
				sc.Forward(2);
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
		}
		else if (sc.state == SCE_MARKDOWN_LINE_BEGIN) {
			// Header: the longest run of '#' is tested first, so six or more
			// '#' characters are handled as a level 6 marker
			if (sc.Match("######" ))
				SetStateAndZoom(SCE_MARKDOWN_HEADER6, 6, '#', sc);
			else if (sc.Match("#####" ))
				SetStateAndZoom(SCE_MARKDOWN_HEADER5, 5, '#', sc);
			else if (sc.Match("####" ))
				SetStateAndZoom(SCE_MARKDOWN_HEADER4, 4, '#', sc);
			else if (sc.Match("###" ))
				SetStateAndZoom(SCE_MARKDOWN_HEADER3, 3, '#', sc);
			else if (sc.Match("##" ))
				SetStateAndZoom(SCE_MARKDOWN_HEADER2, 2, '#', sc);
			else if (sc.Match("#" )) {
				// Catch the special case of the '#.' ordered list marker; it
				// is styled by the PRECHAR handling below
				if (sc.chNext == '.' && IsASpaceOrTab(sc.GetRelative(2))) {
					precharCount = 0;
					sc.SetState(SCE_MARKDOWN_PRECHAR);
				}
				else
					SetStateAndZoom(SCE_MARKDOWN_HEADER1, 1, '#', sc);
			}
			// Code block: only opened when the previous line has no content
			else if (sc.Match("~~~" )) {
				if (!HasPrevLineContent(sc))
					sc.SetState(SCE_MARKDOWN_CODEBK);
				else
					sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
			// A line of '=' under a line with content restyles as a level 1 header
			else if (sc.ch == '=') {
				if (HasPrevLineContent(sc) && FollowToLineEnd('=', SCE_MARKDOWN_HEADER1, endPos, sc))
					;
				else
					sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
			// A line of '-' under a line with content restyles as a level 2
			// header; otherwise the '-' is left to the PRECHAR handling below
			else if (sc.ch == '-') {
				if (HasPrevLineContent(sc) && FollowToLineEnd('-', SCE_MARKDOWN_HEADER2, endPos, sc))
					;
				else {
					precharCount = 0;
					sc.SetState(SCE_MARKDOWN_PRECHAR);
				}
			}
			else if (IsNewline(sc.ch))
				sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
			else {
				precharCount = 0;
				sc.SetState(SCE_MARKDOWN_PRECHAR);
			}
		}

		// The header lasts until the newline
		else if (sc.state == SCE_MARKDOWN_HEADER1 || sc.state == SCE_MARKDOWN_HEADER2 ||
				sc.state == SCE_MARKDOWN_HEADER3 || sc.state == SCE_MARKDOWN_HEADER4 ||
				sc.state == SCE_MARKDOWN_HEADER5 || sc.state == SCE_MARKDOWN_HEADER6) {
			if (IsNewline(sc.ch))
				sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
		}

		// New state only within the initial whitespace
		if (sc.state == SCE_MARKDOWN_PRECHAR) {
			// Blockquote
			if (sc.ch == '>' && precharCount < 5)
				sc.SetState(SCE_MARKDOWN_BLOCKQUOTE);
			/*
			// Begin of code block
			else if (!HasPrevLineContent(sc) && (sc.chPrev == '\t' || precharCount >= 4))
				sc.SetState(SCE_MARKDOWN_CODEBK);
			*/
			// HRule - Total of three or more hyphens, asterisks, or underscores
			// on a line by themselves.
			// IsValidHrule changes the state even when it returns false, so the
			// branches below then run with the state it left behind.
			else if ((sc.ch == '-' || sc.ch == '*' || sc.ch == '_') && IsValidHrule(endPos, sc))
				;
			// Unordered list
			else if ((sc.ch == '-' || sc.ch == '*' || sc.ch == '+') && IsASpaceOrTab(sc.chNext)) {
				sc.SetState(SCE_MARKDOWN_ULIST_ITEM);
				sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
			}
			// Ordered list: digits, then '.', then a space or tab
			else if (IsADigit(sc.ch)) {
				int digitCount = 0;
				while (IsADigit(sc.GetRelative(++digitCount)))
					;
				if (sc.GetRelative(digitCount) == '.' &&
						IsASpaceOrTab(sc.GetRelative(digitCount + 1))) {
					sc.SetState(SCE_MARKDOWN_OLIST_ITEM);
					sc.Forward(digitCount + 1);
					sc.SetState(SCE_MARKDOWN_DEFAULT);
				}
			}
			// Alternate Ordered list
			else if (sc.ch == '#' && sc.chNext == '.' && IsASpaceOrTab(sc.GetRelative(2))) {
				sc.SetState(SCE_MARKDOWN_OLIST_ITEM);
				sc.Forward(2);
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
			// Anything other than a space, or a space once more than two have
			// been counted, ends the leading-whitespace handling
			else if (sc.ch != ' ' || precharCount > 2)
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			else
				++precharCount;
		}

		// Any link
		if (sc.state == SCE_MARKDOWN_LINK) {
			// "](" keeps the link style and switches to waiting for ')'
			if (sc.Match("](" ) && sc.GetRelative(-1) != '\\') {
				sc.Forward(2);
				isLinkNameDetecting = true;
			}
			// "]:" ends the link style
			else if (sc.Match("]:" ) && sc.GetRelative(-1) != '\\') {
				sc.Forward(2);
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
			// A lone ']' ends the link style unless ')' is being waited for
			else if (!isLinkNameDetecting && sc.ch == ']' && sc.GetRelative(-1) != '\\') {
				sc.Forward();
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
			else if (isLinkNameDetecting && sc.ch == ')' && sc.GetRelative(-1) != '\\') {
				sc.Forward();
				sc.SetState(SCE_MARKDOWN_DEFAULT);
				isLinkNameDetecting = false;
			}
		}

		// New state anywhere in doc
		if (sc.state == SCE_MARKDOWN_DEFAULT) {
			// A '#' at a line start while in the default state: switch to
			// line-begin handling and retry this position on the next pass
			if (sc.atLineStart && sc.ch == '#') {
				sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
				freezeCursor = true;
			}
			// Links and Images
			if (sc.Match("![" )) {
				sc.SetState(SCE_MARKDOWN_LINK);
				sc.Forward(1);
			}
			else if (sc.ch == '[' && sc.GetRelative(-1) != '\\') {
				sc.SetState(SCE_MARKDOWN_LINK);
			}
			// Code - also a special case for alternate inside spacing
			else if (sc.Match("``" ) && sc.GetRelative(3) != ' ' && AtTermStart(sc)) {
				// Together with the Forward() at the end of the loop this
				// steps over two backticks, or three when a third follows
				const int openingSpan = (sc.GetRelative(2) == '`') ? 2 : 1;
				sc.SetState(SCE_MARKDOWN_CODE2);
				sc.Forward(openingSpan);
			}
			else if (sc.ch == '`' && sc.chNext != ' ' && IsCompleteStyleRegion(sc, "`" )) {
				sc.SetState(SCE_MARKDOWN_CODE);
			}
			// Strong
			else if (sc.Match("**" ) && sc.GetRelative(2) != ' ' && IsCompleteStyleRegion(sc, "**" )) {
				sc.SetState(SCE_MARKDOWN_STRONG1);
				sc.Forward();
			}
			else if (sc.Match("__" ) && sc.GetRelative(2) != ' ' && IsCompleteStyleRegion(sc, "__" )) {
				sc.SetState(SCE_MARKDOWN_STRONG2);
				sc.Forward();
			}
			// Emphasis
			else if (sc.ch == '*' && sc.chNext != ' ' && IsCompleteStyleRegion(sc, "*" )) {
				sc.SetState(SCE_MARKDOWN_EM1);
			} else if (sc.ch == '_' && sc.chNext != ' ' && IsCompleteStyleRegion(sc, "_" )) {
				sc.SetState(SCE_MARKDOWN_EM2);
			}
			// Strikeout: "~~" not followed by a third '~' or by a space
			else if (sc.Match("~~" ) && !(sc.GetRelative(2) == '~' || sc.GetRelative(2) == ' ') &&
					IsCompleteStyleRegion(sc, "~~" )) {
				sc.SetState(SCE_MARKDOWN_STRIKEOUT);
				sc.Forward();
			}
			// Beginning of line
			else if (IsNewline(sc.ch)) {
				sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
			}
		}
		// Advance if not holding back the cursor for this iteration.
		if (!freezeCursor)
			sc.Forward();
		freezeCursor = false;
	}
	sc.Complete();
}
444 | |
// Lexer module object for Markdown, built from the SCLEX_MARKDOWN identifier,
// the ColorizeMarkdownDoc lexing function and the name "markdown".
LexerModule lmMarkdown(SCLEX_MARKDOWN, ColorizeMarkdownDoc, "markdown" );
446 | |