1/******************************************************************
2 * LexMarkdown.cxx
3 *
4 * A simple Markdown lexer for scintilla.
5 *
6 * Includes highlighting for some extra features from the
7 * Pandoc implementation; strikeout, using '#.' as a default
8 * ordered list item marker, and delimited code blocks.
9 *
10 * Limitations:
11 *
12 * Standard indented code blocks are not highlighted at all,
13 * as it would conflict with other indentation schemes. Use
14 * delimited code blocks for blanket highlighting of an
15 * entire code block. Embedded HTML is not highlighted either.
16 * Blanket HTML highlighting has issues, because some Markdown
17 * implementations allow Markdown markup inside of the HTML. Also,
18 * there is a following blank line issue that can't be ignored,
19 * explained in the next paragraph. Embedded HTML and code
20 * blocks would be better supported with language specific
21 * highlighting.
22 *
23 * The highlighting aims to accurately reflect correct syntax,
24 * but a few restrictions are relaxed. Delimited code blocks are
25 * highlighted, even if the line following the code block is not blank.
 * Requiring a blank line after a block breaks the highlighting
27 * in certain cases, because of the way Scintilla ends up calling
28 * the lexer.
29 *
30 * Written by Jon Strait - jstrait@moonloop.net
31 *
32 * The License.txt file describes the conditions under which this
33 * software may be distributed.
34 *
35 *****************************************************************/
36
37#include <stdlib.h>
38#include <string.h>
39#include <stdio.h>
40#include <stdarg.h>
41#include <assert.h>
42
43#include <string>
44#include <string_view>
45
46#include "ILexer.h"
47#include "Scintilla.h"
48#include "SciLexer.h"
49
50#include "WordList.h"
51#include "LexAccessor.h"
52#include "Accessor.h"
53#include "StyleContext.h"
54#include "CharacterSet.h"
55#include "LexerModule.h"
56
57using namespace Lexilla;
58
namespace {

// Reports whether ch ends a line. NUL counts as a terminator as well,
// because sc.GetRelative(i) yields '\0' for positions that are out of range.
constexpr bool IsNewline(const int ch) {
	switch (ch) {
	case '\n':
	case '\r':
	case '\0':
		return true;
	default:
		return false;
	}
}

}
67
68// True if can follow ch down to the end with possibly trailing whitespace
69static bool FollowToLineEnd(const int ch, const int state, const Sci_PositionU endPos, StyleContext &sc) {
70 Sci_Position i = 0;
71 while (sc.GetRelative(++i) == ch)
72 ;
73 // Skip over whitespace
74 while (IsASpaceOrTab(sc.GetRelative(i)) && sc.currentPos + i < endPos)
75 ++i;
76 if (IsNewline(sc.GetRelative(i)) || sc.currentPos + i == endPos) {
77 sc.Forward(i);
78 sc.ChangeState(state);
79 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
80 return true;
81 }
82 else return false;
83}
84
85// Set the state on text section from current to length characters,
86// then set the rest until the newline to default, except for any characters matching token
87static void SetStateAndZoom(const int state, const Sci_Position length, const int token, StyleContext &sc) {
88 sc.SetState(state);
89 sc.Forward(length);
90 sc.SetState(SCE_MARKDOWN_DEFAULT);
91 sc.Forward();
92 bool started = false;
93 while (sc.More() && !IsNewline(sc.ch)) {
94 if (sc.ch == token && !started) {
95 sc.SetState(state);
96 started = true;
97 }
98 else if (sc.ch != token) {
99 sc.SetState(SCE_MARKDOWN_DEFAULT);
100 started = false;
101 }
102 sc.Forward();
103 }
104 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
105}
106
107// Does the previous line have more than spaces and tabs?
108static bool HasPrevLineContent(StyleContext &sc) {
109 Sci_Position i = 0;
110 // Go back to the previous newline
111 while ((--i + (Sci_Position)sc.currentPos) >= 0 && !IsNewline(sc.GetRelative(i)))
112 ;
113 while ((--i + (Sci_Position)sc.currentPos) >= 0) {
114 const int ch = sc.GetRelative(i);
115 if (ch == '\n')
116 break;
117 if (!((ch == '\r' || IsASpaceOrTab(ch))))
118 return true;
119 }
120 return false;
121}
122
123static bool AtTermStart(StyleContext &sc) {
124 return sc.currentPos == 0 || sc.chPrev == 0 || isspacechar(sc.chPrev);
125}
126
127static bool IsCompleteStyleRegion(StyleContext &sc, const char *token) {
128 bool found = false;
129 const size_t start = strlen(token);
130 Sci_Position i = static_cast<Sci_Position>(start);
131 while (!IsNewline(sc.GetRelative(i))) {
132 // make sure an empty pair of single-char tokens doesn't match
133 // with a longer token: {*}{*} != {**}
134 if (sc.GetRelative(i) == *token && sc.GetRelative(i - 1) != *token) {
135 found = start > 1U ? sc.GetRelative(i + 1) == token[1] : true;
136 break;
137 }
138 i++;
139 }
140 return AtTermStart(sc) && found;
141}
142
143static bool IsValidHrule(const Sci_PositionU endPos, StyleContext &sc) {
144 int count = 1;
145 Sci_Position i = 0;
146 for (;;) {
147 ++i;
148 int c = sc.GetRelative(i);
149 if (c == sc.ch)
150 ++count;
151 // hit a terminating character
152 else if (!IsASpaceOrTab(c) || sc.currentPos + i == endPos) {
153 // Are we a valid HRULE
154 if ((IsNewline(c) || sc.currentPos + i == endPos) &&
155 count >= 3 && !HasPrevLineContent(sc)) {
156 sc.SetState(SCE_MARKDOWN_HRULE);
157 sc.Forward(i);
158 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
159 return true;
160 }
161 else {
162 sc.SetState(SCE_MARKDOWN_DEFAULT);
163 return false;
164 }
165 }
166 }
167}
168
// Lexer entry point: styles `length` characters of the document starting at
// `startPos`, resuming in style `initStyle`, and writes the styles through
// `styler`. The unnamed WordList** parameter is not used.
//
// Each pass of the main loop looks at the character under the cursor and runs
// up to four stages in order: (1) actions tied to the current state (closing
// inline spans and code blocks, dispatching at the beginning of a line),
// (2) block-level detection inside the leading whitespace (PRECHAR),
// (3) link termination, and (4) detection of new inline spans from the
// default state. A stage may change the state and thereby enable a later
// stage in the same pass.
static void ColorizeMarkdownDoc(Sci_PositionU startPos, Sci_Position length, int initStyle,
	WordList **, Accessor &styler) {
	Sci_PositionU endPos = startPos + length;
	// Number of leading spaces consumed so far while in SCE_MARKDOWN_PRECHAR.
	int precharCount = 0;
	// Set once "](" has been seen inside a link, so that the link ends at ')'
	// rather than at ']'.
	bool isLinkNameDetecting = false;
	// Don't advance on a new loop iteration and retry at the same position.
	// Useful in the corner case of having to start at the beginning file position
	// in the default state.
	bool freezeCursor = false;

	StyleContext sc(startPos, static_cast<Sci_PositionU>(length), initStyle, styler);

	while (sc.More()) {
		// Skip past escaped characters: only the backslash itself is stepped
		// over here; the character after it is handled on the next iteration,
		// where the checks below can see the preceding backslash.
		if (sc.ch == '\\') {
			sc.Forward();
			continue;
		}

		// A blockquote resets the line semantics
		if (sc.state == SCE_MARKDOWN_BLOCKQUOTE)
			sc.SetState(SCE_MARKDOWN_LINE_BEGIN);

		// Conditional state-based actions
		if (sc.state == SCE_MARKDOWN_CODE2) {
			// Closed by two backticks, or three when a third one follows.
			if (sc.Match("``")) {
				const int closingSpan = (sc.GetRelative(2) == '`') ? 3 : 2;
				sc.Forward(closingSpan);
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
		}
		else if (sc.state == SCE_MARKDOWN_CODE) {
			// Closed by a backtick that is not preceded by a space.
			if (sc.ch == '`' && sc.chPrev != ' ')
				sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
		}
		/* De-activated because it gets in the way of other valid indentation
		 * schemes, for example multiple paragraphs inside a list item.
		// Code block
		else if (sc.state == SCE_MARKDOWN_CODEBK) {
			bool d = true;
			if (IsNewline(sc.ch)) {
				if (sc.chNext != '\t') {
					for (int c = 1; c < 5; ++c) {
						if (sc.GetRelative(c) != ' ')
							d = false;
					}
				}
			}
			else if (sc.atLineStart) {
				if (sc.ch != '\t' ) {
					for (int i = 0; i < 4; ++i) {
						if (sc.GetRelative(i) != ' ')
							d = false;
					}
				}
			}
			if (!d)
				sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
		}
		*/
		// Strong: closed by the matching double delimiter when it is not
		// preceded by a space, or when a line end lies two characters ahead.
		else if (sc.state == SCE_MARKDOWN_STRONG1) {
			if ((sc.Match("**") && sc.chPrev != ' ') || IsNewline(sc.GetRelative(2))) {
				sc.Forward(2);
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
		}
		else if (sc.state == SCE_MARKDOWN_STRONG2) {
			if ((sc.Match("__") && sc.chPrev != ' ') || IsNewline(sc.GetRelative(2))) {
				sc.Forward(2);
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
		}
		// Emphasis: closed by the matching single delimiter when it is not
		// preceded by a space, or when the next character is a line end.
		else if (sc.state == SCE_MARKDOWN_EM1) {
			if ((sc.ch == '*' && sc.chPrev != ' ') || IsNewline(sc.chNext))
				sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
		}
		else if (sc.state == SCE_MARKDOWN_EM2) {
			if ((sc.ch == '_' && sc.chPrev != ' ') || IsNewline(sc.chNext))
				sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
		}
		// Delimited code block: closed by a line that starts with "~~~";
		// the whole closing line is kept in the code block style.
		else if (sc.state == SCE_MARKDOWN_CODEBK) {
			if (sc.atLineStart && sc.Match("~~~")) {
				Sci_Position i = 1;
				while (!IsNewline(sc.GetRelative(i)) && sc.currentPos + i < endPos)
					i++;
				sc.Forward(i);
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
		}
		// Strikeout: same closing rule as Strong, with "~~".
		else if (sc.state == SCE_MARKDOWN_STRIKEOUT) {
			if ((sc.Match("~~") && sc.chPrev != ' ') || IsNewline(sc.GetRelative(2))) {
				sc.Forward(2);
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
		}
		else if (sc.state == SCE_MARKDOWN_LINE_BEGIN) {
			// Header: longest run of '#' is tested first
			if (sc.Match("######"))
				SetStateAndZoom(SCE_MARKDOWN_HEADER6, 6, '#', sc);
			else if (sc.Match("#####"))
				SetStateAndZoom(SCE_MARKDOWN_HEADER5, 5, '#', sc);
			else if (sc.Match("####"))
				SetStateAndZoom(SCE_MARKDOWN_HEADER4, 4, '#', sc);
			else if (sc.Match("###"))
				SetStateAndZoom(SCE_MARKDOWN_HEADER3, 3, '#', sc);
			else if (sc.Match("##"))
				SetStateAndZoom(SCE_MARKDOWN_HEADER2, 2, '#', sc);
			else if (sc.Match("#")) {
				// Catch the special case of the '#.' ordered list marker;
				// it is styled by the PRECHAR stage below.
				if (sc.chNext == '.' && IsASpaceOrTab(sc.GetRelative(2))) {
					precharCount = 0;
					sc.SetState(SCE_MARKDOWN_PRECHAR);
				}
				else
					SetStateAndZoom(SCE_MARKDOWN_HEADER1, 1, '#', sc);
			}
			// Code block: opens only when the previous line has no content
			else if (sc.Match("~~~")) {
				if (!HasPrevLineContent(sc))
					sc.SetState(SCE_MARKDOWN_CODEBK);
				else
					sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
			// Underline of '=' below a non-blank line: level 1 header
			else if (sc.ch == '=') {
				if (HasPrevLineContent(sc) && FollowToLineEnd('=', SCE_MARKDOWN_HEADER1, endPos, sc))
					;
				else
					sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
			// Underline of '-' below a non-blank line: level 2 header;
			// otherwise '-' may still start a rule or a list item
			else if (sc.ch == '-') {
				if (HasPrevLineContent(sc) && FollowToLineEnd('-', SCE_MARKDOWN_HEADER2, endPos, sc))
					;
				else {
					precharCount = 0;
					sc.SetState(SCE_MARKDOWN_PRECHAR);
				}
			}
			else if (IsNewline(sc.ch))
				sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
			else {
				precharCount = 0;
				sc.SetState(SCE_MARKDOWN_PRECHAR);
			}
		}

		// The header lasts until the newline
		else if (sc.state == SCE_MARKDOWN_HEADER1 || sc.state == SCE_MARKDOWN_HEADER2 ||
			sc.state == SCE_MARKDOWN_HEADER3 || sc.state == SCE_MARKDOWN_HEADER4 ||
			sc.state == SCE_MARKDOWN_HEADER5 || sc.state == SCE_MARKDOWN_HEADER6) {
			if (IsNewline(sc.ch))
				sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
		}

		// New state only within the initial whitespace
		if (sc.state == SCE_MARKDOWN_PRECHAR) {
			// Blockquote
			if (sc.ch == '>' && precharCount < 5)
				sc.SetState(SCE_MARKDOWN_BLOCKQUOTE);
			/*
			// Begin of code block
			else if (!HasPrevLineContent(sc) && (sc.chPrev == '\t' || precharCount >= 4))
				sc.SetState(SCE_MARKDOWN_CODEBK);
			*/
			// HRule - Total of three or more hyphens, asterisks, or underscores
			// on a line by themselves
			else if ((sc.ch == '-' || sc.ch == '*' || sc.ch == '_') && IsValidHrule(endPos, sc))
				;
			// Unordered list
			else if ((sc.ch == '-' || sc.ch == '*' || sc.ch == '+') && IsASpaceOrTab(sc.chNext)) {
				sc.SetState(SCE_MARKDOWN_ULIST_ITEM);
				sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
			}
			// Ordered list: digits followed by '.' and a space or tab
			else if (IsADigit(sc.ch)) {
				int digitCount = 0;
				while (IsADigit(sc.GetRelative(++digitCount)))
					;
				if (sc.GetRelative(digitCount) == '.' &&
						IsASpaceOrTab(sc.GetRelative(digitCount + 1))) {
					sc.SetState(SCE_MARKDOWN_OLIST_ITEM);
					sc.Forward(digitCount + 1);
					sc.SetState(SCE_MARKDOWN_DEFAULT);
				}
			}
			// Alternate Ordered list
			else if (sc.ch == '#' && sc.chNext == '.' && IsASpaceOrTab(sc.GetRelative(2))) {
				sc.SetState(SCE_MARKDOWN_OLIST_ITEM);
				sc.Forward(2);
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
			// Anything other than a space, or a space after three counted
			// spaces, ends the leading-whitespace stage
			else if (sc.ch != ' ' || precharCount > 2)
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			else
				++precharCount;
		}

		// Any link; every terminator is ignored when preceded by a backslash
		if (sc.state == SCE_MARKDOWN_LINK) {
			// "](": the link continues up to the closing ')'
			if (sc.Match("](") && sc.GetRelative(-1) != '\\') {
				sc.Forward(2);
				isLinkNameDetecting = true;
			}
			// "]:": the link style ends right after the colon
			else if (sc.Match("]:") && sc.GetRelative(-1) != '\\') {
				sc.Forward(2);
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
			else if (!isLinkNameDetecting && sc.ch == ']' && sc.GetRelative(-1) != '\\') {
				sc.Forward();
				sc.SetState(SCE_MARKDOWN_DEFAULT);
			}
			else if (isLinkNameDetecting && sc.ch == ')' && sc.GetRelative(-1) != '\\') {
				sc.Forward();
				sc.SetState(SCE_MARKDOWN_DEFAULT);
				isLinkNameDetecting = false;
			}
		}

		// New state anywhere in doc
		if (sc.state == SCE_MARKDOWN_DEFAULT) {
			// A '#' at the start of a line seen in the default state: switch
			// to LINE_BEGIN and retry at this position on the next iteration.
			if (sc.atLineStart && sc.ch == '#') {
				sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
				freezeCursor = true;
			}
			// Links and Images
			if (sc.Match("![")) {
				sc.SetState(SCE_MARKDOWN_LINK);
				sc.Forward(1);
			}
			else if (sc.ch == '[' && sc.GetRelative(-1) != '\\') {
				sc.SetState(SCE_MARKDOWN_LINK);
			}
			// Code - also a special case for alternate inside spacing
			else if (sc.Match("``") && sc.GetRelative(3) != ' ' && AtTermStart(sc)) {
				const int openingSpan = (sc.GetRelative(2) == '`') ? 2 : 1;
				sc.SetState(SCE_MARKDOWN_CODE2);
				sc.Forward(openingSpan);
			}
			else if (sc.ch == '`' && sc.chNext != ' ' && IsCompleteStyleRegion(sc, "`")) {
				sc.SetState(SCE_MARKDOWN_CODE);
			}
			// Strong
			else if (sc.Match("**") && sc.GetRelative(2) != ' ' && IsCompleteStyleRegion(sc, "**")) {
				sc.SetState(SCE_MARKDOWN_STRONG1);
				sc.Forward();
			}
			else if (sc.Match("__") && sc.GetRelative(2) != ' ' && IsCompleteStyleRegion(sc, "__")) {
				sc.SetState(SCE_MARKDOWN_STRONG2);
				sc.Forward();
			}
			// Emphasis
			else if (sc.ch == '*' && sc.chNext != ' ' && IsCompleteStyleRegion(sc, "*")) {
				sc.SetState(SCE_MARKDOWN_EM1);
			} else if (sc.ch == '_' && sc.chNext != ' ' && IsCompleteStyleRegion(sc, "_")) {
				sc.SetState(SCE_MARKDOWN_EM2);
			}
			// Strikeout
			else if (sc.Match("~~") && !(sc.GetRelative(2) == '~' || sc.GetRelative(2) == ' ') &&
				IsCompleteStyleRegion(sc, "~~")) {
				sc.SetState(SCE_MARKDOWN_STRIKEOUT);
				sc.Forward();
			}
			// Beginning of line
			else if (IsNewline(sc.ch)) {
				sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
			}
		}
		// Advance if not holding back the cursor for this iteration.
		if (!freezeCursor)
			sc.Forward();
		freezeCursor = false;
	}
	sc.Complete();
}
444
// Lexer module object for Markdown: associates the SCLEX_MARKDOWN identifier
// and the name "markdown" with the ColorizeMarkdownDoc function.
LexerModule lmMarkdown(SCLEX_MARKDOWN, ColorizeMarkdownDoc, "markdown");
446