1// Scintilla source code edit control
2/** @file LexLaTeX.cxx
3 ** Lexer for LaTeX2e.
4 **/
5// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
6// The License.txt file describes the conditions under which this software may be distributed.
7
8// Modified by G. HU in 2013. Added folding, syntax highlighting inside math environments, and changed some minor behaviors.
9
10#include <stdlib.h>
11#include <string.h>
12#include <stdio.h>
13#include <stdarg.h>
14#include <assert.h>
15#include <ctype.h>
16
17#include <string>
18#include <string_view>
19#include <vector>
20
21#include "ILexer.h"
22#include "Scintilla.h"
23#include "SciLexer.h"
24
25#include "PropSetSimple.h"
26#include "WordList.h"
27#include "LexAccessor.h"
28#include "Accessor.h"
29#include "StyleContext.h"
30#include "CharacterSet.h"
31#include "LexerModule.h"
32#include "DefaultLexer.h"
33#include "LexerBase.h"
34
35using namespace Scintilla;
36using namespace Lexilla;
37
38using namespace std;
39
40struct latexFoldSave {
41 latexFoldSave() : structLev(0) {
42 for (int i = 0; i < 8; ++i) openBegins[i] = 0;
43 }
44 latexFoldSave(const latexFoldSave &save) : structLev(save.structLev) {
45 for (int i = 0; i < 8; ++i) openBegins[i] = save.openBegins[i];
46 }
47 latexFoldSave &operator=(const latexFoldSave &save) {
48 if (this != &save) {
49 structLev = save.structLev;
50 for (int i = 0; i < 8; ++i) openBegins[i] = save.openBegins[i];
51 }
52 return *this;
53 }
54 int openBegins[8];
55 Sci_Position structLev;
56};
57
58class LexerLaTeX : public LexerBase {
59private:
60 vector<int> modes;
61 void setMode(Sci_Position line, int mode) {
62 if (line >= static_cast<Sci_Position>(modes.size())) modes.resize(line + 1, 0);
63 modes[line] = mode;
64 }
65 int getMode(Sci_Position line) {
66 if (line >= 0 && line < static_cast<Sci_Position>(modes.size())) return modes[line];
67 return 0;
68 }
69 void truncModes(Sci_Position numLines) {
70 if (static_cast<Sci_Position>(modes.size()) > numLines * 2 + 256)
71 modes.resize(numLines + 128);
72 }
73
74 vector<latexFoldSave> saves;
75 void setSave(Sci_Position line, const latexFoldSave &save) {
76 if (line >= static_cast<Sci_Position>(saves.size())) saves.resize(line + 1);
77 saves[line] = save;
78 }
79 void getSave(Sci_Position line, latexFoldSave &save) {
80 if (line >= 0 && line < static_cast<Sci_Position>(saves.size())) save = saves[line];
81 else {
82 save.structLev = 0;
83 for (int i = 0; i < 8; ++i) save.openBegins[i] = 0;
84 }
85 }
86 void truncSaves(Sci_Position numLines) {
87 if (static_cast<Sci_Position>(saves.size()) > numLines * 2 + 256)
88 saves.resize(numLines + 128);
89 }
90public:
91 static ILexer5 *LexerFactoryLaTeX() {
92 return new LexerLaTeX();
93 }
94 void SCI_METHOD Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
95 void SCI_METHOD Fold(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
96
97 // ILexer5 methods
98 const char * SCI_METHOD GetName() override {
99 return "latex";
100 }
101 int SCI_METHOD GetIdentifier() override {
102 return SCLEX_LATEX;
103 }
104};
105
// True for the characters that form a two-character "special" escape when
// they directly follow a backslash: # $ % & _ { } and space.
static bool latexIsSpecial(int ch) {
	switch (ch) {
	case '#':
	case '$':
	case '%':
	case '&':
	case '_':
	case '{':
	case '}':
	case ' ':
		return true;
	default:
		return false;
	}
}
110
// True for horizontal white space only (space or tab).
static bool latexIsBlank(int ch) {
	if (ch == '\t')
		return true;
	return ch == ' ';
}
114
// True for space, tab, carriage return and line feed.
static bool latexIsBlankAndNL(int ch) {
	switch (ch) {
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return true;
	default:
		return false;
	}
}
118
// True when ch is an ASCII letter (A-Z or a-z), i.e. a character that may
// appear in a LaTeX control word.
// The ranges are tested explicitly rather than through isalpha() so that the
// answer never depends on the process locale, and so that negative values
// (high-bit bytes stored in a signed char) are rejected without a library call.
static bool latexIsLetter(int ch) {
	const bool lower = (ch >= 'a') && (ch <= 'z');
	const bool upper = (ch >= 'A') && (ch <= 'Z');
	return lower || upper;
}
122
123static bool latexIsTagValid(Sci_Position &i, Sci_Position l, Accessor &styler) {
124 while (i < l) {
125 if (styler.SafeGetCharAt(i) == '{') {
126 while (i < l) {
127 i++;
128 if (styler.SafeGetCharAt(i) == '}') {
129 return true;
130 } else if (!latexIsLetter(styler.SafeGetCharAt(i)) &&
131 styler.SafeGetCharAt(i)!='*') {
132 return false;
133 }
134 }
135 } else if (!latexIsBlank(styler.SafeGetCharAt(i))) {
136 return false;
137 }
138 i++;
139 }
140 return false;
141}
142
143static bool latexNextNotBlankIs(Sci_Position i, Accessor &styler, char needle) {
144 char ch;
145 while (i < styler.Length()) {
146 ch = styler.SafeGetCharAt(i);
147 if (!latexIsBlankAndNL(ch) && ch != '*') {
148 if (ch == needle)
149 return true;
150 else
151 return false;
152 }
153 i++;
154 }
155 return false;
156}
157
158static bool latexLastWordIs(Sci_Position start, Accessor &styler, const char *needle) {
159 Sci_PositionU i = 0;
160 Sci_PositionU l = static_cast<Sci_PositionU>(strlen(needle));
161 Sci_Position ini = start-l+1;
162 char s[32];
163
164 while (i < l && i < 31) {
165 s[i] = styler.SafeGetCharAt(ini + i);
166 i++;
167 }
168 s[i] = '\0';
169
170 return (strcmp(s, needle) == 0);
171}
172
173static bool latexLastWordIsMathEnv(Sci_Position pos, Accessor &styler) {
174 Sci_Position i, j;
175 char s[32];
176 const char *mathEnvs[] = { "align", "alignat", "flalign", "gather",
177 "multiline", "displaymath", "eqnarray", "equation" };
178 if (styler.SafeGetCharAt(pos) != '}') return false;
179 for (i = pos - 1; i >= 0; --i) {
180 if (styler.SafeGetCharAt(i) == '{') break;
181 if (pos - i >= 20) return false;
182 }
183 if (i < 0 || i == pos - 1) return false;
184 ++i;
185 for (j = 0; i + j < pos; ++j)
186 s[j] = styler.SafeGetCharAt(i + j);
187 s[j] = '\0';
188 if (j == 0) return false;
189 if (s[j - 1] == '*') s[--j] = '\0';
190 for (i = 0; i < static_cast<int>(sizeof(mathEnvs) / sizeof(const char *)); ++i)
191 if (strcmp(s, mathEnvs[i]) == 0) return true;
192 return false;
193}
194
195static inline void latexStateReset(int &mode, int &state) {
196 switch (mode) {
197 case 1: state = SCE_L_MATH; break;
198 case 2: state = SCE_L_MATH2; break;
199 default: state = SCE_L_DEFAULT; break;
200 }
201}
202
203// There are cases not handled correctly, like $abcd\textrm{what is $x+y$}z+w$.
204// But I think it's already good enough.
// Styles the range [startPos, startPos + length) of a LaTeX document.
//
// Two variables drive the scanner:
//  - state: the SCE_L_* style currently being accumulated, seeded from initStyle;
//  - mode:  0 outside math, 1 for SCE_L_MATH ($...$, \(...\), {math}),
//           2 for SCE_L_MATH2 ($$...$$, \[...\], the environments accepted by
//           latexLastWordIsMathEnv).
// mode is restored from the value cached for the line before startPos and is
// cached again (setMode) at every '\r' / '\n' encountered, so a later call can
// resume on any line.
205void SCI_METHOD LexerLaTeX::Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) {
206 // startPos is assumed to be the first character of a line
207 Accessor styler(pAccess, &props);
208 styler.StartAt(startPos);
209 int mode = getMode(styler.GetLine(startPos) - 1);
210 int state = initStyle;
211 if (state == SCE_L_ERROR || state == SCE_L_SHORTCMD || state == SCE_L_SPECIAL) // should not happen
212 latexStateReset(mode, state);
213
214 char chNext = styler.SafeGetCharAt(startPos);
    // Delimiter of a pending \verb<delim>...<delim>; '\0' when the verbatim
    // state was entered through an environment instead.
215 char chVerbatimDelim = '\0';
216 styler.StartSegment(startPos);
217 Sci_Position lengthDoc = startPos + length;
218
219 for (Sci_Position i = startPos; i < lengthDoc; i++) {
220 char ch = chNext;
221 chNext = styler.SafeGetCharAt(i + 1);
222
    // A lead byte is skipped together with the byte that follows it.
223 if (styler.IsLeadByte(ch)) {
224 i++;
225 chNext = styler.SafeGetCharAt(i + 1);
226 continue;
227 }
228
    // Remember the math mode in effect on this line.
229 if (ch == '\r' || ch == '\n')
230 setMode(styler.GetLine(i), mode);
231
232 switch (state) {
233 case SCE_L_DEFAULT :
234 switch (ch) {
235 case '\\' :
236 styler.ColourTo(i - 1, state);
237 if (latexIsLetter(chNext)) {
238 state = SCE_L_COMMAND;
239 } else if (latexIsSpecial(chNext)) {
240 styler.ColourTo(i + 1, SCE_L_SPECIAL);
241 i++;
242 chNext = styler.SafeGetCharAt(i + 1);
243 } else if (chNext == '\r' || chNext == '\n') {
    // A backslash directly before a line end is styled as an error.
244 styler.ColourTo(i, SCE_L_ERROR);
245 } else if (IsASCII(chNext)) {
    // Two-character command; \( and \[ additionally open math.
246 styler.ColourTo(i + 1, SCE_L_SHORTCMD);
247 if (chNext == '(') {
248 mode = 1;
249 state = SCE_L_MATH;
250 } else if (chNext == '[') {
251 mode = 2;
252 state = SCE_L_MATH2;
253 }
254 i++;
255 chNext = styler.SafeGetCharAt(i + 1);
256 }
257 break;
258 case '$' :
259 styler.ColourTo(i - 1, state);
260 if (chNext == '$') {
261 styler.ColourTo(i + 1, SCE_L_SHORTCMD);
262 mode = 2;
263 state = SCE_L_MATH2;
264 i++;
265 chNext = styler.SafeGetCharAt(i + 1);
266 } else {
267 styler.ColourTo(i, SCE_L_SHORTCMD);
268 mode = 1;
269 state = SCE_L_MATH;
270 }
271 break;
272 case '%' :
273 styler.ColourTo(i - 1, state);
274 state = SCE_L_COMMENT;
275 break;
276 }
277 break;
278 // These 3 will never be reached.
279 case SCE_L_ERROR:
280 case SCE_L_SPECIAL:
281 case SCE_L_SHORTCMD:
282 break;
283 case SCE_L_COMMAND :
    // The command name ends at i when the next character is not a letter;
    // what follows it decides the next state.
284 if (!latexIsLetter(chNext)) {
285 styler.ColourTo(i, state);
286 if (latexNextNotBlankIs(i + 1, styler, '[' )) {
287 state = SCE_L_CMDOPT;
288 } else if (latexLastWordIs(i, styler, "\\begin")) {
289 state = SCE_L_TAG;
290 } else if (latexLastWordIs(i, styler, "\\end")) {
291 state = SCE_L_TAG2;
292 } else if (latexLastWordIs(i, styler, "\\verb") && chNext != '*' && chNext != ' ') {
293 chVerbatimDelim = chNext;
294 state = SCE_L_VERBATIM;
295 } else {
296 latexStateReset(mode, state);
297 }
298 }
299 break;
300 case SCE_L_CMDOPT :
301 if (ch == ']') {
302 styler.ColourTo(i, state);
303 latexStateReset(mode, state);
304 }
305 break;
306 case SCE_L_TAG :
    // After \begin: latexIsTagValid advances i itself. On success the
    // environment name selects verbatim, comment or math handling.
307 if (latexIsTagValid(i, lengthDoc, styler)) {
308 styler.ColourTo(i, state);
309 latexStateReset(mode, state);
310 if (latexLastWordIs(i, styler, "{verbatim}")) {
311 state = SCE_L_VERBATIM;
312 } else if (latexLastWordIs(i, styler, "{lstlisting}")) {
313 state = SCE_L_VERBATIM;
314 } else if (latexLastWordIs(i, styler, "{comment}")) {
315 state = SCE_L_COMMENT2;
316 } else if (latexLastWordIs(i, styler, "{math}") && mode == 0) {
317 mode = 1;
318 state = SCE_L_MATH;
319 } else if (latexLastWordIsMathEnv(i, styler) && mode == 0) {
320 mode = 2;
321 state = SCE_L_MATH2;
322 }
323 } else {
324 styler.ColourTo(i, SCE_L_ERROR);
325 latexStateReset(mode, state);
    // i was moved by latexIsTagValid, so the line-end bookkeeping done at
    // the top of the loop is repeated for the character now at i.
326 ch = styler.SafeGetCharAt(i);
327 if (ch == '\r' || ch == '\n') setMode(styler.GetLine(i), mode);
328 }
329 chNext = styler.SafeGetCharAt(i+1);
330 break;
331 case SCE_L_TAG2 :
    // After \end: the group is only validated and coloured.
332 if (latexIsTagValid(i, lengthDoc, styler)) {
333 styler.ColourTo(i, state);
334 latexStateReset(mode, state);
335 } else {
336 styler.ColourTo(i, SCE_L_ERROR);
337 latexStateReset(mode, state);
338 ch = styler.SafeGetCharAt(i);
339 if (ch == '\r' || ch == '\n') setMode(styler.GetLine(i), mode);
340 }
341 chNext = styler.SafeGetCharAt(i+1);
342 break;
343 case SCE_L_MATH :
344 switch (ch) {
345 case '\\' :
346 styler.ColourTo(i - 1, state);
347 if (latexIsLetter(chNext)) {
    // Look ahead for "\end{math}": i + 3 is where the 'd' of "\end" would
    // be. Clearing mode here makes the later state reset leave math.
348 Sci_Position match = i + 3;
349 if (latexLastWordIs(match, styler, "\\end")) {
350 match++;
351 if (latexIsTagValid(match, lengthDoc, styler)) {
352 if (latexLastWordIs(match, styler, "{math}"))
353 mode = 0;
354 }
355 }
356 state = SCE_L_COMMAND;
357 } else if (latexIsSpecial(chNext)) {
358 styler.ColourTo(i + 1, SCE_L_SPECIAL);
359 i++;
360 chNext = styler.SafeGetCharAt(i + 1);
361 } else if (chNext == '\r' || chNext == '\n') {
362 styler.ColourTo(i, SCE_L_ERROR);
363 } else if (IsASCII(chNext)) {
    // \) closes this math mode.
364 if (chNext == ')') {
365 mode = 0;
366 state = SCE_L_DEFAULT;
367 }
368 styler.ColourTo(i + 1, SCE_L_SHORTCMD);
369 i++;
370 chNext = styler.SafeGetCharAt(i + 1);
371 }
372 break;
373 case '$' :
374 styler.ColourTo(i - 1, state);
375 styler.ColourTo(i, SCE_L_SHORTCMD);
376 mode = 0;
377 state = SCE_L_DEFAULT;
378 break;
379 case '%' :
380 styler.ColourTo(i - 1, state);
381 state = SCE_L_COMMENT;
382 break;
383 }
384 break;
385 case SCE_L_MATH2 :
386 switch (ch) {
387 case '\\' :
388 styler.ColourTo(i - 1, state);
389 if (latexIsLetter(chNext)) {
    // Same look-ahead as in SCE_L_MATH, but for "\end{<math environment>}".
390 Sci_Position match = i + 3;
391 if (latexLastWordIs(match, styler, "\\end")) {
392 match++;
393 if (latexIsTagValid(match, lengthDoc, styler)) {
394 if (latexLastWordIsMathEnv(match, styler))
395 mode = 0;
396 }
397 }
398 state = SCE_L_COMMAND;
399 } else if (latexIsSpecial(chNext)) {
400 styler.ColourTo(i + 1, SCE_L_SPECIAL);
401 i++;
402 chNext = styler.SafeGetCharAt(i + 1);
403 } else if (chNext == '\r' || chNext == '\n') {
404 styler.ColourTo(i, SCE_L_ERROR);
405 } else if (IsASCII(chNext)) {
    // \] closes this math mode.
406 if (chNext == ']') {
407 mode = 0;
408 state = SCE_L_DEFAULT;
409 }
410 styler.ColourTo(i + 1, SCE_L_SHORTCMD);
411 i++;
412 chNext = styler.SafeGetCharAt(i + 1);
413 }
414 break;
415 case '$' :
416 styler.ColourTo(i - 1, state);
417 if (chNext == '$') {
418 styler.ColourTo(i + 1, SCE_L_SHORTCMD);
419 i++;
420 chNext = styler.SafeGetCharAt(i + 1);
421 mode = 0;
422 state = SCE_L_DEFAULT;
423 } else { // This may not be an error, e.g. \begin{equation}\text{$a$}\end{equation}
424 styler.ColourTo(i, SCE_L_SHORTCMD);
425 }
426 break;
427 case '%' :
428 styler.ColourTo(i - 1, state);
429 state = SCE_L_COMMENT;
430 break;
431 }
432 break;
433 case SCE_L_COMMENT :
    // A '%' comment runs to the end of the line.
434 if (ch == '\r' || ch == '\n') {
435 styler.ColourTo(i - 1, state);
436 latexStateReset(mode, state);
437 }
438 break;
439 case SCE_L_COMMENT2 :
    // Inside a comment environment: only "\end{comment}" leaves it.
440 if (ch == '\\') {
441 Sci_Position match = i + 3;
442 if (latexLastWordIs(match, styler, "\\end")) {
443 match++;
444 if (latexIsTagValid(match, lengthDoc, styler)) {
445 if (latexLastWordIs(match, styler, "{comment}")) {
446 styler.ColourTo(i - 1, state);
447 state = SCE_L_COMMAND;
448 }
449 }
450 }
451 }
452 break;
453 case SCE_L_VERBATIM :
    // Ends at "\end{verbatim}" / "\end{lstlisting}", or at the delimiter
    // remembered for \verb. An unterminated \verb is an error at line end.
454 if (ch == '\\') {
455 Sci_Position match = i + 3;
456 if (latexLastWordIs(match, styler, "\\end")) {
457 match++;
458 if (latexIsTagValid(match, lengthDoc, styler)) {
459 if (latexLastWordIs(match, styler, "{verbatim}")) {
460 styler.ColourTo(i - 1, state);
461 state = SCE_L_COMMAND;
462 } else if (latexLastWordIs(match, styler, "{lstlisting}")) {
463 styler.ColourTo(i - 1, state);
464 state = SCE_L_COMMAND;
465 }
466 }
467 }
468 } else if (chNext == chVerbatimDelim) {
469 styler.ColourTo(i + 1, state);
470 latexStateReset(mode, state);
471 chVerbatimDelim = '\0';
472 i++;
473 chNext = styler.SafeGetCharAt(i + 1);
474 } else if (chVerbatimDelim != '\0' && (ch == '\n' || ch == '\r')) {
475 styler.ColourTo(i, SCE_L_ERROR);
476 latexStateReset(mode, state);
477 chVerbatimDelim = '\0';
478 }
479 break;
480 }
481 }
    // When the range reaches the end of the document, let the mode cache shrink;
    // then colour whatever is left in the current state and flush.
482 if (lengthDoc == styler.Length()) truncModes(styler.GetLine(lengthDoc - 1));
483 styler.ColourTo(lengthDoc - 1, state);
484 styler.Flush();
485}
486
487static int latexFoldSaveToInt(const latexFoldSave &save) {
488 int sum = 0;
489 for (int i = 0; i <= save.structLev; ++i)
490 sum += save.openBegins[i];
491 return ((sum + save.structLev + SC_FOLDLEVELBASE) & SC_FOLDLEVELNUMBERMASK);
492}
493
494// Change folding state while processing a line
495// Return the level before the first relevant command
496void SCI_METHOD LexerLaTeX::Fold(Sci_PositionU startPos, Sci_Position length, int, IDocument *pAccess) {
497 const char *structWords[7] = {"part", "chapter", "section", "subsection",
498 "subsubsection", "paragraph", "subparagraph"};
499 Accessor styler(pAccess, &props);
500 Sci_PositionU endPos = startPos + length;
501 Sci_Position curLine = styler.GetLine(startPos);
502 latexFoldSave save;
503 getSave(curLine - 1, save);
504 do {
505 char ch, buf[16];
506 Sci_Position i, j;
507 int lev = -1;
508 bool needFold = false;
509 for (i = static_cast<Sci_Position>(startPos); i < static_cast<Sci_Position>(endPos); ++i) {
510 ch = styler.SafeGetCharAt(i);
511 if (ch == '\r' || ch == '\n') break;
512 if (ch != '\\' || styler.StyleAt(i) != SCE_L_COMMAND) continue;
513 for (j = 0; j < 15 && i + 1 < static_cast<Sci_Position>(endPos); ++j, ++i) {
514 buf[j] = styler.SafeGetCharAt(i + 1);
515 if (!latexIsLetter(buf[j])) break;
516 }
517 buf[j] = '\0';
518 if (strcmp(buf, "begin") == 0) {
519 if (lev < 0) lev = latexFoldSaveToInt(save);
520 ++save.openBegins[save.structLev];
521 needFold = true;
522 }
523 else if (strcmp(buf, "end") == 0) {
524 while (save.structLev > 0 && save.openBegins[save.structLev] == 0)
525 --save.structLev;
526 if (lev < 0) lev = latexFoldSaveToInt(save);
527 if (save.openBegins[save.structLev] > 0) --save.openBegins[save.structLev];
528 }
529 else {
530 for (j = 0; j < 7; ++j)
531 if (strcmp(buf, structWords[j]) == 0) break;
532 if (j >= 7) continue;
533 save.structLev = j; // level before the command
534 for (j = save.structLev + 1; j < 8; ++j) {
535 save.openBegins[save.structLev] += save.openBegins[j];
536 save.openBegins[j] = 0;
537 }
538 if (lev < 0) lev = latexFoldSaveToInt(save);
539 ++save.structLev; // level after the command
540 needFold = true;
541 }
542 }
543 if (lev < 0) lev = latexFoldSaveToInt(save);
544 if (needFold) lev |= SC_FOLDLEVELHEADERFLAG;
545 styler.SetLevel(curLine, lev);
546 setSave(curLine, save);
547 ++curLine;
548 startPos = styler.LineStart(curLine);
549 if (static_cast<Sci_Position>(startPos) == styler.Length()) {
550 lev = latexFoldSaveToInt(save);
551 styler.SetLevel(curLine, lev);
552 setSave(curLine, save);
553 truncSaves(curLine);
554 }
555 } while (startPos < endPos);
556 styler.Flush();
557}
558
// Keyword-list descriptions for the lexer module: just the terminating null
// entry, i.e. no lists are described.
static const char *const emptyWordListDesc[] = {
	nullptr
};
562
// Module object for the LaTeX lexer: ties the SCLEX_LATEX identifier and the
// name "latex" to the LexerLaTeX factory, with no keyword-list descriptions.
563LexerModule lmLatex(SCLEX_LATEX, LexerLaTeX::LexerFactoryLaTeX, "latex", emptyWordListDesc);
564