1 | // Scintilla source code edit control |
2 | /** @file LexLaTeX.cxx |
3 | ** Lexer for LaTeX2e. |
4 | **/ |
5 | // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org> |
6 | // The License.txt file describes the conditions under which this software may be distributed. |
7 | |
8 | // Modified by G. HU in 2013. Added folding, syntax highlighting inside math environments, and changed some minor behaviors. |
9 | |
10 | #include <stdlib.h> |
11 | #include <string.h> |
12 | #include <stdio.h> |
13 | #include <stdarg.h> |
14 | #include <assert.h> |
15 | #include <ctype.h> |
16 | |
17 | #include <string> |
18 | #include <string_view> |
19 | #include <vector> |
20 | |
21 | #include "ILexer.h" |
22 | #include "Scintilla.h" |
23 | #include "SciLexer.h" |
24 | |
25 | #include "PropSetSimple.h" |
26 | #include "WordList.h" |
27 | #include "LexAccessor.h" |
28 | #include "Accessor.h" |
29 | #include "StyleContext.h" |
30 | #include "CharacterSet.h" |
31 | #include "LexerModule.h" |
32 | #include "DefaultLexer.h" |
33 | #include "LexerBase.h" |
34 | |
35 | using namespace Scintilla; |
36 | using namespace Lexilla; |
37 | |
38 | using namespace std; |
39 | |
40 | struct latexFoldSave { |
41 | latexFoldSave() : structLev(0) { |
42 | for (int i = 0; i < 8; ++i) openBegins[i] = 0; |
43 | } |
44 | latexFoldSave(const latexFoldSave &save) : structLev(save.structLev) { |
45 | for (int i = 0; i < 8; ++i) openBegins[i] = save.openBegins[i]; |
46 | } |
47 | latexFoldSave &operator=(const latexFoldSave &save) { |
48 | if (this != &save) { |
49 | structLev = save.structLev; |
50 | for (int i = 0; i < 8; ++i) openBegins[i] = save.openBegins[i]; |
51 | } |
52 | return *this; |
53 | } |
54 | int openBegins[8]; |
55 | Sci_Position structLev; |
56 | }; |
57 | |
58 | class LexerLaTeX : public LexerBase { |
59 | private: |
60 | vector<int> modes; |
61 | void setMode(Sci_Position line, int mode) { |
62 | if (line >= static_cast<Sci_Position>(modes.size())) modes.resize(line + 1, 0); |
63 | modes[line] = mode; |
64 | } |
65 | int getMode(Sci_Position line) { |
66 | if (line >= 0 && line < static_cast<Sci_Position>(modes.size())) return modes[line]; |
67 | return 0; |
68 | } |
69 | void truncModes(Sci_Position numLines) { |
70 | if (static_cast<Sci_Position>(modes.size()) > numLines * 2 + 256) |
71 | modes.resize(numLines + 128); |
72 | } |
73 | |
74 | vector<latexFoldSave> saves; |
75 | void setSave(Sci_Position line, const latexFoldSave &save) { |
76 | if (line >= static_cast<Sci_Position>(saves.size())) saves.resize(line + 1); |
77 | saves[line] = save; |
78 | } |
79 | void getSave(Sci_Position line, latexFoldSave &save) { |
80 | if (line >= 0 && line < static_cast<Sci_Position>(saves.size())) save = saves[line]; |
81 | else { |
82 | save.structLev = 0; |
83 | for (int i = 0; i < 8; ++i) save.openBegins[i] = 0; |
84 | } |
85 | } |
86 | void truncSaves(Sci_Position numLines) { |
87 | if (static_cast<Sci_Position>(saves.size()) > numLines * 2 + 256) |
88 | saves.resize(numLines + 128); |
89 | } |
90 | public: |
91 | static ILexer5 *LexerFactoryLaTeX() { |
92 | return new LexerLaTeX(); |
93 | } |
94 | void SCI_METHOD Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override; |
95 | void SCI_METHOD Fold(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override; |
96 | |
97 | // ILexer5 methods |
98 | const char * SCI_METHOD GetName() override { |
99 | return "latex" ; |
100 | } |
101 | int SCI_METHOD GetIdentifier() override { |
102 | return SCLEX_LATEX; |
103 | } |
104 | }; |
105 | |
// True for the characters that form a "special" escape when they follow a
// backslash: # $ % & _ { } and the space character.
static bool latexIsSpecial(int ch) {
	switch (ch) {
	case '#':
	case '$':
	case '%':
	case '&':
	case '_':
	case '{':
	case '}':
	case ' ':
		return true;
	default:
		return false;
	}
}
110 | |
// True for a space or a horizontal tab (line ends are not blank here).
static bool latexIsBlank(int ch) {
	if (ch == ' ')
		return true;
	return ch == '\t';
}
114 | |
// True for a space, a horizontal tab, a carriage return or a line feed.
static bool latexIsBlankAndNL(int ch) {
	switch (ch) {
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return true;
	default:
		return false;
	}
}
118 | |
// True only for the ASCII letters A-Z and a-z, the characters that may form
// a multi-letter command name. The ranges are tested explicitly so the result
// never depends on the C locale in effect for the process.
static bool latexIsLetter(int ch) {
	return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
}
122 | |
123 | static bool latexIsTagValid(Sci_Position &i, Sci_Position l, Accessor &styler) { |
124 | while (i < l) { |
125 | if (styler.SafeGetCharAt(i) == '{') { |
126 | while (i < l) { |
127 | i++; |
128 | if (styler.SafeGetCharAt(i) == '}') { |
129 | return true; |
130 | } else if (!latexIsLetter(styler.SafeGetCharAt(i)) && |
131 | styler.SafeGetCharAt(i)!='*') { |
132 | return false; |
133 | } |
134 | } |
135 | } else if (!latexIsBlank(styler.SafeGetCharAt(i))) { |
136 | return false; |
137 | } |
138 | i++; |
139 | } |
140 | return false; |
141 | } |
142 | |
143 | static bool latexNextNotBlankIs(Sci_Position i, Accessor &styler, char needle) { |
144 | char ch; |
145 | while (i < styler.Length()) { |
146 | ch = styler.SafeGetCharAt(i); |
147 | if (!latexIsBlankAndNL(ch) && ch != '*') { |
148 | if (ch == needle) |
149 | return true; |
150 | else |
151 | return false; |
152 | } |
153 | i++; |
154 | } |
155 | return false; |
156 | } |
157 | |
158 | static bool latexLastWordIs(Sci_Position start, Accessor &styler, const char *needle) { |
159 | Sci_PositionU i = 0; |
160 | Sci_PositionU l = static_cast<Sci_PositionU>(strlen(needle)); |
161 | Sci_Position ini = start-l+1; |
162 | char s[32]; |
163 | |
164 | while (i < l && i < 31) { |
165 | s[i] = styler.SafeGetCharAt(ini + i); |
166 | i++; |
167 | } |
168 | s[i] = '\0'; |
169 | |
170 | return (strcmp(s, needle) == 0); |
171 | } |
172 | |
173 | static bool latexLastWordIsMathEnv(Sci_Position pos, Accessor &styler) { |
174 | Sci_Position i, j; |
175 | char s[32]; |
176 | const char *mathEnvs[] = { "align" , "alignat" , "flalign" , "gather" , |
177 | "multiline" , "displaymath" , "eqnarray" , "equation" }; |
178 | if (styler.SafeGetCharAt(pos) != '}') return false; |
179 | for (i = pos - 1; i >= 0; --i) { |
180 | if (styler.SafeGetCharAt(i) == '{') break; |
181 | if (pos - i >= 20) return false; |
182 | } |
183 | if (i < 0 || i == pos - 1) return false; |
184 | ++i; |
185 | for (j = 0; i + j < pos; ++j) |
186 | s[j] = styler.SafeGetCharAt(i + j); |
187 | s[j] = '\0'; |
188 | if (j == 0) return false; |
189 | if (s[j - 1] == '*') s[--j] = '\0'; |
190 | for (i = 0; i < static_cast<int>(sizeof(mathEnvs) / sizeof(const char *)); ++i) |
191 | if (strcmp(s, mathEnvs[i]) == 0) return true; |
192 | return false; |
193 | } |
194 | |
195 | static inline void latexStateReset(int &mode, int &state) { |
196 | switch (mode) { |
197 | case 1: state = SCE_L_MATH; break; |
198 | case 2: state = SCE_L_MATH2; break; |
199 | default: state = SCE_L_DEFAULT; break; |
200 | } |
201 | } |
202 | |
203 | // There are cases not handled correctly, like $abcd\textrm{what is $x+y$}z+w$. |
204 | // But I think it's already good enough. |
205 | void SCI_METHOD LexerLaTeX::Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) { |
206 | // startPos is assumed to be the first character of a line |
207 | Accessor styler(pAccess, &props); |
208 | styler.StartAt(startPos); |
209 | int mode = getMode(styler.GetLine(startPos) - 1); |
210 | int state = initStyle; |
211 | if (state == SCE_L_ERROR || state == SCE_L_SHORTCMD || state == SCE_L_SPECIAL) // should not happen |
212 | latexStateReset(mode, state); |
213 | |
214 | char chNext = styler.SafeGetCharAt(startPos); |
215 | char chVerbatimDelim = '\0'; |
216 | styler.StartSegment(startPos); |
217 | Sci_Position lengthDoc = startPos + length; |
218 | |
219 | for (Sci_Position i = startPos; i < lengthDoc; i++) { |
220 | char ch = chNext; |
221 | chNext = styler.SafeGetCharAt(i + 1); |
222 | |
223 | if (styler.IsLeadByte(ch)) { |
224 | i++; |
225 | chNext = styler.SafeGetCharAt(i + 1); |
226 | continue; |
227 | } |
228 | |
229 | if (ch == '\r' || ch == '\n') |
230 | setMode(styler.GetLine(i), mode); |
231 | |
232 | switch (state) { |
233 | case SCE_L_DEFAULT : |
234 | switch (ch) { |
235 | case '\\' : |
236 | styler.ColourTo(i - 1, state); |
237 | if (latexIsLetter(chNext)) { |
238 | state = SCE_L_COMMAND; |
239 | } else if (latexIsSpecial(chNext)) { |
240 | styler.ColourTo(i + 1, SCE_L_SPECIAL); |
241 | i++; |
242 | chNext = styler.SafeGetCharAt(i + 1); |
243 | } else if (chNext == '\r' || chNext == '\n') { |
244 | styler.ColourTo(i, SCE_L_ERROR); |
245 | } else if (IsASCII(chNext)) { |
246 | styler.ColourTo(i + 1, SCE_L_SHORTCMD); |
247 | if (chNext == '(') { |
248 | mode = 1; |
249 | state = SCE_L_MATH; |
250 | } else if (chNext == '[') { |
251 | mode = 2; |
252 | state = SCE_L_MATH2; |
253 | } |
254 | i++; |
255 | chNext = styler.SafeGetCharAt(i + 1); |
256 | } |
257 | break; |
258 | case '$' : |
259 | styler.ColourTo(i - 1, state); |
260 | if (chNext == '$') { |
261 | styler.ColourTo(i + 1, SCE_L_SHORTCMD); |
262 | mode = 2; |
263 | state = SCE_L_MATH2; |
264 | i++; |
265 | chNext = styler.SafeGetCharAt(i + 1); |
266 | } else { |
267 | styler.ColourTo(i, SCE_L_SHORTCMD); |
268 | mode = 1; |
269 | state = SCE_L_MATH; |
270 | } |
271 | break; |
272 | case '%' : |
273 | styler.ColourTo(i - 1, state); |
274 | state = SCE_L_COMMENT; |
275 | break; |
276 | } |
277 | break; |
278 | // These 3 will never be reached. |
279 | case SCE_L_ERROR: |
280 | case SCE_L_SPECIAL: |
281 | case SCE_L_SHORTCMD: |
282 | break; |
283 | case SCE_L_COMMAND : |
284 | if (!latexIsLetter(chNext)) { |
285 | styler.ColourTo(i, state); |
286 | if (latexNextNotBlankIs(i + 1, styler, '[' )) { |
287 | state = SCE_L_CMDOPT; |
288 | } else if (latexLastWordIs(i, styler, "\\begin" )) { |
289 | state = SCE_L_TAG; |
290 | } else if (latexLastWordIs(i, styler, "\\end" )) { |
291 | state = SCE_L_TAG2; |
292 | } else if (latexLastWordIs(i, styler, "\\verb" ) && chNext != '*' && chNext != ' ') { |
293 | chVerbatimDelim = chNext; |
294 | state = SCE_L_VERBATIM; |
295 | } else { |
296 | latexStateReset(mode, state); |
297 | } |
298 | } |
299 | break; |
300 | case SCE_L_CMDOPT : |
301 | if (ch == ']') { |
302 | styler.ColourTo(i, state); |
303 | latexStateReset(mode, state); |
304 | } |
305 | break; |
306 | case SCE_L_TAG : |
307 | if (latexIsTagValid(i, lengthDoc, styler)) { |
308 | styler.ColourTo(i, state); |
309 | latexStateReset(mode, state); |
310 | if (latexLastWordIs(i, styler, "{verbatim}" )) { |
311 | state = SCE_L_VERBATIM; |
312 | } else if (latexLastWordIs(i, styler, "{lstlisting}" )) { |
313 | state = SCE_L_VERBATIM; |
314 | } else if (latexLastWordIs(i, styler, "{comment}" )) { |
315 | state = SCE_L_COMMENT2; |
316 | } else if (latexLastWordIs(i, styler, "{math}" ) && mode == 0) { |
317 | mode = 1; |
318 | state = SCE_L_MATH; |
319 | } else if (latexLastWordIsMathEnv(i, styler) && mode == 0) { |
320 | mode = 2; |
321 | state = SCE_L_MATH2; |
322 | } |
323 | } else { |
324 | styler.ColourTo(i, SCE_L_ERROR); |
325 | latexStateReset(mode, state); |
326 | ch = styler.SafeGetCharAt(i); |
327 | if (ch == '\r' || ch == '\n') setMode(styler.GetLine(i), mode); |
328 | } |
329 | chNext = styler.SafeGetCharAt(i+1); |
330 | break; |
331 | case SCE_L_TAG2 : |
332 | if (latexIsTagValid(i, lengthDoc, styler)) { |
333 | styler.ColourTo(i, state); |
334 | latexStateReset(mode, state); |
335 | } else { |
336 | styler.ColourTo(i, SCE_L_ERROR); |
337 | latexStateReset(mode, state); |
338 | ch = styler.SafeGetCharAt(i); |
339 | if (ch == '\r' || ch == '\n') setMode(styler.GetLine(i), mode); |
340 | } |
341 | chNext = styler.SafeGetCharAt(i+1); |
342 | break; |
343 | case SCE_L_MATH : |
344 | switch (ch) { |
345 | case '\\' : |
346 | styler.ColourTo(i - 1, state); |
347 | if (latexIsLetter(chNext)) { |
348 | Sci_Position match = i + 3; |
349 | if (latexLastWordIs(match, styler, "\\end" )) { |
350 | match++; |
351 | if (latexIsTagValid(match, lengthDoc, styler)) { |
352 | if (latexLastWordIs(match, styler, "{math}" )) |
353 | mode = 0; |
354 | } |
355 | } |
356 | state = SCE_L_COMMAND; |
357 | } else if (latexIsSpecial(chNext)) { |
358 | styler.ColourTo(i + 1, SCE_L_SPECIAL); |
359 | i++; |
360 | chNext = styler.SafeGetCharAt(i + 1); |
361 | } else if (chNext == '\r' || chNext == '\n') { |
362 | styler.ColourTo(i, SCE_L_ERROR); |
363 | } else if (IsASCII(chNext)) { |
364 | if (chNext == ')') { |
365 | mode = 0; |
366 | state = SCE_L_DEFAULT; |
367 | } |
368 | styler.ColourTo(i + 1, SCE_L_SHORTCMD); |
369 | i++; |
370 | chNext = styler.SafeGetCharAt(i + 1); |
371 | } |
372 | break; |
373 | case '$' : |
374 | styler.ColourTo(i - 1, state); |
375 | styler.ColourTo(i, SCE_L_SHORTCMD); |
376 | mode = 0; |
377 | state = SCE_L_DEFAULT; |
378 | break; |
379 | case '%' : |
380 | styler.ColourTo(i - 1, state); |
381 | state = SCE_L_COMMENT; |
382 | break; |
383 | } |
384 | break; |
385 | case SCE_L_MATH2 : |
386 | switch (ch) { |
387 | case '\\' : |
388 | styler.ColourTo(i - 1, state); |
389 | if (latexIsLetter(chNext)) { |
390 | Sci_Position match = i + 3; |
391 | if (latexLastWordIs(match, styler, "\\end" )) { |
392 | match++; |
393 | if (latexIsTagValid(match, lengthDoc, styler)) { |
394 | if (latexLastWordIsMathEnv(match, styler)) |
395 | mode = 0; |
396 | } |
397 | } |
398 | state = SCE_L_COMMAND; |
399 | } else if (latexIsSpecial(chNext)) { |
400 | styler.ColourTo(i + 1, SCE_L_SPECIAL); |
401 | i++; |
402 | chNext = styler.SafeGetCharAt(i + 1); |
403 | } else if (chNext == '\r' || chNext == '\n') { |
404 | styler.ColourTo(i, SCE_L_ERROR); |
405 | } else if (IsASCII(chNext)) { |
406 | if (chNext == ']') { |
407 | mode = 0; |
408 | state = SCE_L_DEFAULT; |
409 | } |
410 | styler.ColourTo(i + 1, SCE_L_SHORTCMD); |
411 | i++; |
412 | chNext = styler.SafeGetCharAt(i + 1); |
413 | } |
414 | break; |
415 | case '$' : |
416 | styler.ColourTo(i - 1, state); |
417 | if (chNext == '$') { |
418 | styler.ColourTo(i + 1, SCE_L_SHORTCMD); |
419 | i++; |
420 | chNext = styler.SafeGetCharAt(i + 1); |
421 | mode = 0; |
422 | state = SCE_L_DEFAULT; |
423 | } else { // This may not be an error, e.g. \begin{equation}\text{$a$}\end{equation} |
424 | styler.ColourTo(i, SCE_L_SHORTCMD); |
425 | } |
426 | break; |
427 | case '%' : |
428 | styler.ColourTo(i - 1, state); |
429 | state = SCE_L_COMMENT; |
430 | break; |
431 | } |
432 | break; |
433 | case SCE_L_COMMENT : |
434 | if (ch == '\r' || ch == '\n') { |
435 | styler.ColourTo(i - 1, state); |
436 | latexStateReset(mode, state); |
437 | } |
438 | break; |
439 | case SCE_L_COMMENT2 : |
440 | if (ch == '\\') { |
441 | Sci_Position match = i + 3; |
442 | if (latexLastWordIs(match, styler, "\\end" )) { |
443 | match++; |
444 | if (latexIsTagValid(match, lengthDoc, styler)) { |
445 | if (latexLastWordIs(match, styler, "{comment}" )) { |
446 | styler.ColourTo(i - 1, state); |
447 | state = SCE_L_COMMAND; |
448 | } |
449 | } |
450 | } |
451 | } |
452 | break; |
453 | case SCE_L_VERBATIM : |
454 | if (ch == '\\') { |
455 | Sci_Position match = i + 3; |
456 | if (latexLastWordIs(match, styler, "\\end" )) { |
457 | match++; |
458 | if (latexIsTagValid(match, lengthDoc, styler)) { |
459 | if (latexLastWordIs(match, styler, "{verbatim}" )) { |
460 | styler.ColourTo(i - 1, state); |
461 | state = SCE_L_COMMAND; |
462 | } else if (latexLastWordIs(match, styler, "{lstlisting}" )) { |
463 | styler.ColourTo(i - 1, state); |
464 | state = SCE_L_COMMAND; |
465 | } |
466 | } |
467 | } |
468 | } else if (chNext == chVerbatimDelim) { |
469 | styler.ColourTo(i + 1, state); |
470 | latexStateReset(mode, state); |
471 | chVerbatimDelim = '\0'; |
472 | i++; |
473 | chNext = styler.SafeGetCharAt(i + 1); |
474 | } else if (chVerbatimDelim != '\0' && (ch == '\n' || ch == '\r')) { |
475 | styler.ColourTo(i, SCE_L_ERROR); |
476 | latexStateReset(mode, state); |
477 | chVerbatimDelim = '\0'; |
478 | } |
479 | break; |
480 | } |
481 | } |
482 | if (lengthDoc == styler.Length()) truncModes(styler.GetLine(lengthDoc - 1)); |
483 | styler.ColourTo(lengthDoc - 1, state); |
484 | styler.Flush(); |
485 | } |
486 | |
487 | static int latexFoldSaveToInt(const latexFoldSave &save) { |
488 | int sum = 0; |
489 | for (int i = 0; i <= save.structLev; ++i) |
490 | sum += save.openBegins[i]; |
491 | return ((sum + save.structLev + SC_FOLDLEVELBASE) & SC_FOLDLEVELNUMBERMASK); |
492 | } |
493 | |
494 | // Update the folding state line by line over the requested range. |
495 | // Each line is given the fold level in effect before its first relevant command. |
496 | void SCI_METHOD LexerLaTeX::Fold(Sci_PositionU startPos, Sci_Position length, int, IDocument *pAccess) { |
497 | const char *structWords[7] = {"part" , "chapter" , "section" , "subsection" , |
498 | "subsubsection" , "paragraph" , "subparagraph" }; |
499 | Accessor styler(pAccess, &props); |
500 | Sci_PositionU endPos = startPos + length; |
501 | Sci_Position curLine = styler.GetLine(startPos); |
502 | latexFoldSave save; |
503 | getSave(curLine - 1, save); |
504 | do { |
505 | char ch, buf[16]; |
506 | Sci_Position i, j; |
507 | int lev = -1; |
508 | bool needFold = false; |
509 | for (i = static_cast<Sci_Position>(startPos); i < static_cast<Sci_Position>(endPos); ++i) { |
510 | ch = styler.SafeGetCharAt(i); |
511 | if (ch == '\r' || ch == '\n') break; |
512 | if (ch != '\\' || styler.StyleAt(i) != SCE_L_COMMAND) continue; |
513 | for (j = 0; j < 15 && i + 1 < static_cast<Sci_Position>(endPos); ++j, ++i) { |
514 | buf[j] = styler.SafeGetCharAt(i + 1); |
515 | if (!latexIsLetter(buf[j])) break; |
516 | } |
517 | buf[j] = '\0'; |
518 | if (strcmp(buf, "begin" ) == 0) { |
519 | if (lev < 0) lev = latexFoldSaveToInt(save); |
520 | ++save.openBegins[save.structLev]; |
521 | needFold = true; |
522 | } |
523 | else if (strcmp(buf, "end" ) == 0) { |
524 | while (save.structLev > 0 && save.openBegins[save.structLev] == 0) |
525 | --save.structLev; |
526 | if (lev < 0) lev = latexFoldSaveToInt(save); |
527 | if (save.openBegins[save.structLev] > 0) --save.openBegins[save.structLev]; |
528 | } |
529 | else { |
530 | for (j = 0; j < 7; ++j) |
531 | if (strcmp(buf, structWords[j]) == 0) break; |
532 | if (j >= 7) continue; |
533 | save.structLev = j; // level before the command |
534 | for (j = save.structLev + 1; j < 8; ++j) { |
535 | save.openBegins[save.structLev] += save.openBegins[j]; |
536 | save.openBegins[j] = 0; |
537 | } |
538 | if (lev < 0) lev = latexFoldSaveToInt(save); |
539 | ++save.structLev; // level after the command |
540 | needFold = true; |
541 | } |
542 | } |
543 | if (lev < 0) lev = latexFoldSaveToInt(save); |
544 | if (needFold) lev |= SC_FOLDLEVELHEADERFLAG; |
545 | styler.SetLevel(curLine, lev); |
546 | setSave(curLine, save); |
547 | ++curLine; |
548 | startPos = styler.LineStart(curLine); |
549 | if (static_cast<Sci_Position>(startPos) == styler.Length()) { |
550 | lev = latexFoldSaveToInt(save); |
551 | styler.SetLevel(curLine, lev); |
552 | setSave(curLine, save); |
553 | truncSaves(curLine); |
554 | } |
555 | } while (startPos < endPos); |
556 | styler.Flush(); |
557 | } |
558 | |
559 | static const char *const emptyWordListDesc[] = { |
560 | 0 |
561 | }; |
562 | |
563 | LexerModule lmLatex(SCLEX_LATEX, LexerLaTeX::LexerFactoryLaTeX, "latex" , emptyWordListDesc); |
564 | |