1// Scintilla source code edit control
2/** @file LexPython.cxx
3 ** Lexer for Python.
4 **/
5// Copyright 1998-2002 by Neil Hodgson <neilh@scintilla.org>
6// The License.txt file describes the conditions under which this software may be distributed.
7
8#include <cstdlib>
9#include <cassert>
10#include <cstring>
11
12#include <string>
13#include <string_view>
14#include <vector>
15#include <map>
16#include <algorithm>
17#include <functional>
18
19#include "ILexer.h"
20#include "Scintilla.h"
21#include "SciLexer.h"
22
23#include "StringCopy.h"
24#include "WordList.h"
25#include "LexAccessor.h"
26#include "Accessor.h"
27#include "StyleContext.h"
28#include "CharacterSet.h"
29#include "CharacterCategory.h"
30#include "LexerModule.h"
31#include "OptionSet.h"
32#include "SubStyles.h"
33#include "DefaultLexer.h"
34
35using namespace Scintilla;
36using namespace Lexilla;
37
38namespace {
39// Use an unnamed namespace to protect the functions and classes from name conflicts
40
41/* Notes on f-strings: f-strings are strings prefixed with f (e.g. f'') that may
42 have arbitrary expressions in {}. The tokens in the expressions are lexed as if
43 they were outside of any string. Expressions may contain { and } characters as
44 long as there is a closing } for every {, may be 2+ lines in a triple quoted
45 string, and may have a formatting specifier following a ! or :, but both !
46 and : are valid inside of a bracketed expression and != is a valid
47 expression token even outside of a bracketed expression.
48
49 When in an f-string expression, the lexer keeps track of the state value of
50 the f-string and the nesting count for the expression (# of [, (, { seen - # of
51 }, ), ] seen). f-strings may be nested (e.g. f'{ a + f"{1+2}"') so a stack of
 states and nesting counts is kept. If an f-string expression continues beyond
53 the end of a line, this stack is saved in a std::map that maps a line number to
54 the stack at the end of that line. std::vector is used for the stack.
55
56 The PEP for f-strings is at https://www.python.org/dev/peps/pep-0498/
57*/
/* One entry of the f-string expression stack.
 Both members have default initialisers so that a default-constructed entry
 never holds indeterminate values; aggregate initialisation such as
 {state, 0} keeps working. */
struct SingleFStringExpState {
	// Style of the f-string that contains the expression; it is the state
	// that is resumed when the expression ends.
	int state = 0;
	// Number of opening brackets minus closing brackets seen so far inside
	// the expression.
	int nestingCount = 0;
};
62
/* Kind of the most recently styled keyword; Lex() uses it to decide how the
 identifier that follows is styled (class name, definition name, "as" after
 import, ...).
 kwCDef, kwCTypeName only used for Cython; kwCPDef is set for the "cpdef"
 keyword in the same way kwCDef is set for "cdef". */
enum kwType { kwOther, kwClass, kwDef, kwImport, kwCDef, kwCTypeName, kwCPDef };

/* Bit flags naming the string prefix letters (u, b, f) that are recognised;
 the values are combined with bitwise or. */
enum literalsAllowed { litNone = 0, litU = 1, litB = 2, litF = 4 };

/* Indicator number passed to IndicatorFill() when marking indentation. */
constexpr int indicatorWhitespace = 1;
69
70bool IsPyComment(Accessor &styler, Sci_Position pos, Sci_Position len) {
71 return len > 0 && styler[pos] == '#';
72}
73
74constexpr bool IsPyStringTypeChar(int ch, literalsAllowed allowed) noexcept {
75 return
76 ((allowed & litB) && (ch == 'b' || ch == 'B')) ||
77 ((allowed & litU) && (ch == 'u' || ch == 'U')) ||
78 ((allowed & litF) && (ch == 'f' || ch == 'F'));
79}
80
81bool IsPyStringStart(int ch, int chNext, int chNext2, literalsAllowed allowed) noexcept {
82 if (ch == '\'' || ch == '"')
83 return true;
84 if (IsPyStringTypeChar(ch, allowed)) {
85 if (chNext == '"' || chNext == '\'')
86 return true;
87 if ((chNext == 'r' || chNext == 'R') && (chNext2 == '"' || chNext2 == '\''))
88 return true;
89 }
90 if ((ch == 'r' || ch == 'R') && (chNext == '"' || chNext == '\''))
91 return true;
92
93 return false;
94}
95
96constexpr bool IsPyFStringState(int st) noexcept {
97 return ((st == SCE_P_FCHARACTER) || (st == SCE_P_FSTRING) ||
98 (st == SCE_P_FTRIPLE) || (st == SCE_P_FTRIPLEDOUBLE));
99}
100
101constexpr bool IsPySingleQuoteStringState(int st) noexcept {
102 return ((st == SCE_P_CHARACTER) || (st == SCE_P_STRING) ||
103 (st == SCE_P_FCHARACTER) || (st == SCE_P_FSTRING));
104}
105
106constexpr bool IsPyTripleQuoteStringState(int st) noexcept {
107 return ((st == SCE_P_TRIPLE) || (st == SCE_P_TRIPLEDOUBLE) ||
108 (st == SCE_P_FTRIPLE) || (st == SCE_P_FTRIPLEDOUBLE));
109}
110
111char GetPyStringQuoteChar(int st) noexcept {
112 if ((st == SCE_P_CHARACTER) || (st == SCE_P_FCHARACTER) ||
113 (st == SCE_P_TRIPLE) || (st == SCE_P_FTRIPLE))
114 return '\'';
115 if ((st == SCE_P_STRING) || (st == SCE_P_FSTRING) ||
116 (st == SCE_P_TRIPLEDOUBLE) || (st == SCE_P_FTRIPLEDOUBLE))
117 return '"';
118
119 return '\0';
120}
121
122void PushStateToStack(int state, std::vector<SingleFStringExpState> &stack, SingleFStringExpState *&currentFStringExp) {
123 SingleFStringExpState single = {state, 0};
124 stack.push_back(single);
125
126 currentFStringExp = &stack.back();
127}
128
129int PopFromStateStack(std::vector<SingleFStringExpState> &stack, SingleFStringExpState *&currentFStringExp) noexcept {
130 int state = 0;
131
132 if (!stack.empty()) {
133 state = stack.back().state;
134 stack.pop_back();
135 }
136
137 if (stack.empty()) {
138 currentFStringExp = nullptr;
139 } else {
140 currentFStringExp = &stack.back();
141 }
142
143 return state;
144}
145
146/* Return the state to use for the string starting at i; *nextIndex will be set to the first index following the quote(s) */
147int GetPyStringState(Accessor &styler, Sci_Position i, Sci_PositionU *nextIndex, literalsAllowed allowed) {
148 char ch = styler.SafeGetCharAt(i);
149 char chNext = styler.SafeGetCharAt(i + 1);
150 const int firstIsF = (ch == 'f' || ch == 'F');
151
152 // Advance beyond r, u, or ur prefix (or r, b, or br in Python 2.7+ and r, f, or fr in Python 3.6+), but bail if there are any unexpected chars
153 if (ch == 'r' || ch == 'R') {
154 i++;
155 ch = styler.SafeGetCharAt(i);
156 chNext = styler.SafeGetCharAt(i + 1);
157 } else if (IsPyStringTypeChar(ch, allowed)) {
158 if (chNext == 'r' || chNext == 'R')
159 i += 2;
160 else
161 i += 1;
162 ch = styler.SafeGetCharAt(i);
163 chNext = styler.SafeGetCharAt(i + 1);
164 }
165
166 if (ch != '"' && ch != '\'') {
167 *nextIndex = i + 1;
168 return SCE_P_DEFAULT;
169 }
170
171 if (ch == chNext && ch == styler.SafeGetCharAt(i + 2)) {
172 *nextIndex = i + 3;
173
174 if (ch == '"')
175 return (firstIsF ? SCE_P_FTRIPLEDOUBLE : SCE_P_TRIPLEDOUBLE);
176 else
177 return (firstIsF ? SCE_P_FTRIPLE : SCE_P_TRIPLE);
178 } else {
179 *nextIndex = i + 1;
180
181 if (ch == '"')
182 return (firstIsF ? SCE_P_FSTRING : SCE_P_STRING);
183 else
184 return (firstIsF ? SCE_P_FCHARACTER : SCE_P_CHARACTER);
185 }
186}
187
188inline bool IsAWordChar(int ch, bool unicodeIdentifiers) {
189 if (IsASCII(ch))
190 return (IsAlphaNumeric(ch) || ch == '.' || ch == '_');
191
192 if (!unicodeIdentifiers)
193 return false;
194
195 // Python uses the XID_Continue set from Unicode data
196 return IsXidContinue(ch);
197}
198
199inline bool IsAWordStart(int ch, bool unicodeIdentifiers) {
200 if (IsASCII(ch))
201 return (IsUpperOrLowerCase(ch) || ch == '_');
202
203 if (!unicodeIdentifiers)
204 return false;
205
206 // Python uses the XID_Start set from Unicode data
207 return IsXidStart(ch);
208}
209
210bool IsFirstNonWhitespace(Sci_Position pos, Accessor &styler) {
211 const Sci_Position line = styler.GetLine(pos);
212 const Sci_Position start_pos = styler.LineStart(line);
213 for (Sci_Position i = start_pos; i < pos; i++) {
214 const char ch = styler[i];
215 if (!(ch == ' ' || ch == '\t'))
216 return false;
217 }
218 return true;
219}
220
221// Options used for LexerPython
222struct OptionsPython {
223 int whingeLevel;
224 bool base2or8Literals;
225 bool stringsU;
226 bool stringsB;
227 bool stringsF;
228 bool stringsOverNewline;
229 bool keywords2NoSubIdentifiers;
230 bool fold;
231 bool foldQuotes;
232 bool foldCompact;
233 bool unicodeIdentifiers;
234
235 OptionsPython() noexcept {
236 whingeLevel = 0;
237 base2or8Literals = true;
238 stringsU = true;
239 stringsB = true;
240 stringsF = true;
241 stringsOverNewline = false;
242 keywords2NoSubIdentifiers = false;
243 fold = false;
244 foldQuotes = false;
245 foldCompact = false;
246 unicodeIdentifiers = true;
247 }
248
249 literalsAllowed AllowedLiterals() const noexcept {
250 literalsAllowed allowedLiterals = stringsU ? litU : litNone;
251 if (stringsB)
252 allowedLiterals = static_cast<literalsAllowed>(allowedLiterals | litB);
253 if (stringsF)
254 allowedLiterals = static_cast<literalsAllowed>(allowedLiterals | litF);
255 return allowedLiterals;
256 }
257};
258
// Descriptions of the word lists, terminated by nullptr. The order matches
// the indices handled by LexerPython::WordListSet: 0 is the keyword list and
// 1 is the second (highlighted identifiers) list.
const char *const pythonWordListDesc[] = {
	"Keywords",
	"Highlighted identifiers",
	nullptr
};
264
// Binds each property name to the OptionsPython member it controls, together
// with the description text reported for that property, and registers the
// word list descriptions.
struct OptionSetPython : public OptionSet<OptionsPython> {
	OptionSetPython() {
		// Indentation consistency checking
		DefineProperty("tab.timmy.whinge.level", &OptionsPython::whingeLevel,
			       "For Python code, checks whether indenting is consistent. "
			       "The default, 0 turns off indentation checking, "
			       "1 checks whether each line is potentially inconsistent with the previous line, "
			       "2 checks whether any space characters occur before a tab character in the indentation, "
			       "3 checks whether any spaces are in the indentation, and "
			       "4 checks for any tab characters in the indentation. "
			       "1 is a good level to use.");

		// Literal recognition switches
		DefineProperty("lexer.python.literals.binary", &OptionsPython::base2or8Literals,
			       "Set to 0 to not recognise Python 3 binary and octal literals: 0b1011 0o712.");

		DefineProperty("lexer.python.strings.u", &OptionsPython::stringsU,
			       "Set to 0 to not recognise Python Unicode literals u\"x\" as used before Python 3.");

		DefineProperty("lexer.python.strings.b", &OptionsPython::stringsB,
			       "Set to 0 to not recognise Python 3 bytes literals b\"x\".");

		DefineProperty("lexer.python.strings.f", &OptionsPython::stringsF,
			       "Set to 0 to not recognise Python 3.6 f-string literals f\"var={var}\".");

		DefineProperty("lexer.python.strings.over.newline", &OptionsPython::stringsOverNewline,
			       "Set to 1 to allow strings to span newline characters.");

		DefineProperty("lexer.python.keywords2.no.sub.identifiers", &OptionsPython::keywords2NoSubIdentifiers,
			       "When enabled, it will not style keywords2 items that are used as a sub-identifier. "
			       "Example: when set, will not highlight \"foo.open\" when \"open\" is a keywords2 item.");

		// Folding switches; "fold" and "fold.compact" are defined without a
		// description
		DefineProperty("fold", &OptionsPython::fold);

		DefineProperty("fold.quotes.python", &OptionsPython::foldQuotes,
			       "This option enables folding multi-line quoted strings when using the Python lexer.");

		DefineProperty("fold.compact", &OptionsPython::foldCompact);

		DefineProperty("lexer.python.unicode.identifiers", &OptionsPython::unicodeIdentifiers,
			       "Set to 0 to not recognise Python 3 Unicode identifiers.");

		DefineWordListSets(pythonWordListDesc);
	}
};
308
// Zero-terminated list of the styles that may have sub-styles allocated;
// only SCE_P_IDENTIFIER is listed.
const char styleSubable[] = { SCE_P_IDENTIFIER, 0 };

// One row per style: style number, style name, tags and description.
// The table is handed to the DefaultLexer constructor by LexerPython.
LexicalClass lexicalClasses[] = {
	// Lexer Python SCLEX_PYTHON SCE_P_:
	0, "SCE_P_DEFAULT", "default", "White space",
	1, "SCE_P_COMMENTLINE", "comment line", "Comment",
	2, "SCE_P_NUMBER", "literal numeric", "Number",
	3, "SCE_P_STRING", "literal string", "String",
	4, "SCE_P_CHARACTER", "literal string", "Single quoted string",
	5, "SCE_P_WORD", "keyword", "Keyword",
	6, "SCE_P_TRIPLE", "literal string", "Triple quotes",
	7, "SCE_P_TRIPLEDOUBLE", "literal string", "Triple double quotes",
	8, "SCE_P_CLASSNAME", "identifier", "Class name definition",
	9, "SCE_P_DEFNAME", "identifier", "Function or method name definition",
	10, "SCE_P_OPERATOR", "operator", "Operators",
	11, "SCE_P_IDENTIFIER", "identifier", "Identifiers",
	12, "SCE_P_COMMENTBLOCK", "comment", "Comment-blocks",
	13, "SCE_P_STRINGEOL", "error literal string", "End of line where string is not closed",
	14, "SCE_P_WORD2", "identifier", "Highlighted identifiers",
	15, "SCE_P_DECORATOR", "preprocessor", "Decorators",
	16, "SCE_P_FSTRING", "literal string interpolated", "F-String",
	17, "SCE_P_FCHARACTER", "literal string interpolated", "Single quoted f-string",
	18, "SCE_P_FTRIPLE", "literal string interpolated", "Triple quoted f-string",
	19, "SCE_P_FTRIPLEDOUBLE", "literal string interpolated", "Triple double quoted f-string",
};
334
335}
336
337class LexerPython : public DefaultLexer {
338 WordList keywords;
339 WordList keywords2;
340 OptionsPython options;
341 OptionSetPython osPython;
342 enum { ssIdentifier };
343 SubStyles subStyles;
344 std::map<Sci_Position, std::vector<SingleFStringExpState> > ftripleStateAtEol;
345public:
346 explicit LexerPython() :
347 DefaultLexer("python", SCLEX_PYTHON, lexicalClasses, ELEMENTS(lexicalClasses)),
348 subStyles(styleSubable, 0x80, 0x40, 0) {
349 }
350 ~LexerPython() override {
351 }
352 void SCI_METHOD Release() override {
353 delete this;
354 }
355 int SCI_METHOD Version() const override {
356 return lvRelease5;
357 }
358 const char *SCI_METHOD PropertyNames() override {
359 return osPython.PropertyNames();
360 }
361 int SCI_METHOD PropertyType(const char *name) override {
362 return osPython.PropertyType(name);
363 }
364 const char *SCI_METHOD DescribeProperty(const char *name) override {
365 return osPython.DescribeProperty(name);
366 }
367 Sci_Position SCI_METHOD PropertySet(const char *key, const char *val) override;
368 const char * SCI_METHOD PropertyGet(const char *key) override {
369 return osPython.PropertyGet(key);
370 }
371 const char *SCI_METHOD DescribeWordListSets() override {
372 return osPython.DescribeWordListSets();
373 }
374 Sci_Position SCI_METHOD WordListSet(int n, const char *wl) override;
375 void SCI_METHOD Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
376 void SCI_METHOD Fold(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
377
378 void *SCI_METHOD PrivateCall(int, void *) override {
379 return nullptr;
380 }
381
382 int SCI_METHOD LineEndTypesSupported() override {
383 return SC_LINE_END_TYPE_UNICODE;
384 }
385
386 int SCI_METHOD AllocateSubStyles(int styleBase, int numberStyles) override {
387 return subStyles.Allocate(styleBase, numberStyles);
388 }
389 int SCI_METHOD SubStylesStart(int styleBase) override {
390 return subStyles.Start(styleBase);
391 }
392 int SCI_METHOD SubStylesLength(int styleBase) override {
393 return subStyles.Length(styleBase);
394 }
395 int SCI_METHOD StyleFromSubStyle(int subStyle) override {
396 const int styleBase = subStyles.BaseStyle(subStyle);
397 return styleBase;
398 }
399 int SCI_METHOD PrimaryStyleFromStyle(int style) override {
400 return style;
401 }
402 void SCI_METHOD FreeSubStyles() override {
403 subStyles.Free();
404 }
405 void SCI_METHOD SetIdentifiers(int style, const char *identifiers) override {
406 subStyles.SetIdentifiers(style, identifiers);
407 }
408 int SCI_METHOD DistanceToSecondaryStyles() override {
409 return 0;
410 }
411 const char *SCI_METHOD GetSubStyleBases() override {
412 return styleSubable;
413 }
414
415 static ILexer5 *LexerFactoryPython() {
416 return new LexerPython();
417 }
418
419private:
420 void ProcessLineEnd(StyleContext &sc, std::vector<SingleFStringExpState> &fstringStateStack, SingleFStringExpState *&currentFStringExp, bool &inContinuedString);
421};
422
423Sci_Position SCI_METHOD LexerPython::PropertySet(const char *key, const char *val) {
424 if (osPython.PropertySet(&options, key, val)) {
425 return 0;
426 }
427 return -1;
428}
429
430Sci_Position SCI_METHOD LexerPython::WordListSet(int n, const char *wl) {
431 WordList *wordListN = nullptr;
432 switch (n) {
433 case 0:
434 wordListN = &keywords;
435 break;
436 case 1:
437 wordListN = &keywords2;
438 break;
439 default:
440 break;
441 }
442 Sci_Position firstModification = -1;
443 if (wordListN) {
444 WordList wlNew;
445 wlNew.Set(wl);
446 if (*wordListN != wlNew) {
447 wordListN->Set(wl);
448 firstModification = 0;
449 }
450 }
451 return firstModification;
452}
453
454void LexerPython::ProcessLineEnd(StyleContext &sc, std::vector<SingleFStringExpState> &fstringStateStack, SingleFStringExpState *&currentFStringExp, bool &inContinuedString) {
455 long deepestSingleStateIndex = -1;
456 unsigned long i;
457
458 // Find the deepest single quote state because that string will end; no \ continuation in f-string
459 for (i = 0; i < fstringStateStack.size(); i++) {
460 if (IsPySingleQuoteStringState(fstringStateStack[i].state)) {
461 deepestSingleStateIndex = i;
462 break;
463 }
464 }
465
466 if (deepestSingleStateIndex != -1) {
467 sc.SetState(fstringStateStack[deepestSingleStateIndex].state);
468 while (fstringStateStack.size() > static_cast<unsigned long>(deepestSingleStateIndex)) {
469 PopFromStateStack(fstringStateStack, currentFStringExp);
470 }
471 }
472 if (!fstringStateStack.empty()) {
473 std::pair<Sci_Position, std::vector<SingleFStringExpState> > val;
474 val.first = sc.currentLine;
475 val.second = fstringStateStack;
476
477 ftripleStateAtEol.insert(val);
478 }
479
480 if ((sc.state == SCE_P_DEFAULT)
481 || IsPyTripleQuoteStringState(sc.state)) {
482 // Perform colourisation of white space and triple quoted strings at end of each line to allow
483 // tab marking to work inside white space and triple quoted strings
484 sc.SetState(sc.state);
485 }
486 if (IsPySingleQuoteStringState(sc.state)) {
487 if (inContinuedString || options.stringsOverNewline) {
488 inContinuedString = false;
489 } else {
490 sc.ChangeState(SCE_P_STRINGEOL);
491 sc.ForwardSetState(SCE_P_DEFAULT);
492 }
493 }
494}
495
// Style the document range that starts at startPos and is length long.
// The start is first moved back by at least one line (further over lines that
// end in a single quoted string or string-EOL style) so that earlier tab
// whinging can be corrected. The loop then walks the text one character at a
// time: it first checks whether the current state ends, then handles f-string
// expression boundaries, and finally looks for the start of a new state.
void SCI_METHOD LexerPython::Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) {
	Accessor styler(pAccess, nullptr);

	// Track whether in f-string expression; vector is used for a stack to
	// handle nested f-strings such as f"""{f'''{f"{f'{1}'}"}'''}"""
	std::vector<SingleFStringExpState> fstringStateStack;
	SingleFStringExpState *currentFStringExp = nullptr;

	// Computed before startPos is moved back, so the range still ends at the
	// position that was requested
	const Sci_Position endPos = startPos + length;

	// Backtrack to previous line in case need to fix its tab whinging
	Sci_Position lineCurrent = styler.GetLine(startPos);
	if (startPos > 0) {
		if (lineCurrent > 0) {
			lineCurrent--;
			// Look for backslash-continued lines
			while (lineCurrent > 0) {
				const Sci_Position eolPos = styler.LineStart(lineCurrent) - 1;
				const int eolStyle = styler.StyleAt(eolPos);
				if (eolStyle == SCE_P_STRING
						|| eolStyle == SCE_P_CHARACTER
						|| eolStyle == SCE_P_STRINGEOL) {
					lineCurrent -= 1;
				} else {
					break;
				}
			}
			startPos = styler.LineStart(lineCurrent);
		}
		// Resume in whatever style precedes the (possibly moved) start
		initStyle = startPos == 0 ? SCE_P_DEFAULT : styler.StyleAt(startPos - 1);
	}

	const literalsAllowed allowedLiterals = options.AllowedLiterals();

	// Keep only the low five bits of the style; an unterminated-string style
	// does not carry over to the next line
	initStyle = initStyle & 31;
	if (initStyle == SCE_P_STRINGEOL) {
		initStyle = SCE_P_DEFAULT;
	}

	// Set up fstate stack from last line and remove any subsequent ftriple at eol states
	std::map<Sci_Position, std::vector<SingleFStringExpState> >::iterator it;
	it = ftripleStateAtEol.find(lineCurrent - 1);
	if (it != ftripleStateAtEol.end() && !it->second.empty()) {
		fstringStateStack = it->second;
		currentFStringExp = &fstringStateStack.back();
	}
	it = ftripleStateAtEol.lower_bound(lineCurrent);
	if (it != ftripleStateAtEol.end()) {
		ftripleStateAtEol.erase(it, ftripleStateAtEol.end());
	}

	// Kind of the last keyword seen; decides how the next identifier is styled
	kwType kwLast = kwOther;
	int spaceFlags = 0;
	styler.IndentAmount(lineCurrent, &spaceFlags, IsPyComment);
	// Set for 0x, 0o and 0b literals; while set, a sign following 'e' or 'E'
	// does not continue the number
	bool base_n_number = false;

	const WordClassifier &classifierIdentifiers = subStyles.Classifier(SCE_P_IDENTIFIER);

	StyleContext sc(startPos, endPos - startPos, initStyle, styler);

	bool indentGood = true;
	Sci_Position startIndicator = sc.currentPos;
	// Set when a backslash immediately precedes the line end inside a single
	// quoted string; consumed by ProcessLineEnd
	bool inContinuedString = false;

	for (; sc.More(); sc.Forward()) {

		if (sc.atLineStart) {
			// Evaluate the indentation of the new line against the
			// configured whinge level
			styler.IndentAmount(lineCurrent, &spaceFlags, IsPyComment);
			indentGood = true;
			if (options.whingeLevel == 1) {
				indentGood = (spaceFlags & wsInconsistent) == 0;
			} else if (options.whingeLevel == 2) {
				indentGood = (spaceFlags & wsSpaceTab) == 0;
			} else if (options.whingeLevel == 3) {
				indentGood = (spaceFlags & wsSpace) == 0;
			} else if (options.whingeLevel == 4) {
				indentGood = (spaceFlags & wsTab) == 0;
			}
			if (!indentGood) {
				// Clear the indicator up to here; the bad indentation itself
				// is filled once the first non-blank character is reached
				styler.IndicatorFill(startIndicator, sc.currentPos, indicatorWhitespace, 0);
				startIndicator = sc.currentPos;
			}
		}

		if (sc.atLineEnd) {
			ProcessLineEnd(sc, fstringStateStack, currentFStringExp, inContinuedString);
			lineCurrent++;
			if (!sc.More())
				break;
		}

		// Set whenever the code below moves forward, since that may have
		// reached the end of the line
		bool needEOLCheck = false;


		// Check whether the current state ends at this character
		if (sc.state == SCE_P_OPERATOR) {
			// Operators are a single character long
			kwLast = kwOther;
			sc.SetState(SCE_P_DEFAULT);
		} else if (sc.state == SCE_P_NUMBER) {
			if (!IsAWordChar(sc.ch, false) &&
					!(!base_n_number && ((sc.ch == '+' || sc.ch == '-') && (sc.chPrev == 'e' || sc.chPrev == 'E')))) {
				sc.SetState(SCE_P_DEFAULT);
			}
		} else if (sc.state == SCE_P_IDENTIFIER) {
			if ((sc.ch == '.') || (!IsAWordChar(sc.ch, options.unicodeIdentifiers))) {
				// The identifier is complete; classify it
				char s[100];
				sc.GetCurrent(s, sizeof(s));
				int style = SCE_P_IDENTIFIER;
				if ((kwLast == kwImport) && (strcmp(s, "as") == 0)) {
					style = SCE_P_WORD;
				} else if (keywords.InList(s)) {
					style = SCE_P_WORD;
				} else if (kwLast == kwClass) {
					style = SCE_P_CLASSNAME;
				} else if (kwLast == kwDef) {
					style = SCE_P_DEFNAME;
				} else if (kwLast == kwCDef || kwLast == kwCPDef) {
					// Look past white space for the next character:
					// '(' gives a definition name, ':' gives a class name
					Sci_Position pos = sc.currentPos;
					unsigned char ch = styler.SafeGetCharAt(pos, '\0');
					while (ch != '\0') {
						if (ch == '(') {
							style = SCE_P_DEFNAME;
							break;
						} else if (ch == ':') {
							style = SCE_P_CLASSNAME;
							break;
						} else if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') {
							pos++;
							ch = styler.SafeGetCharAt(pos, '\0');
						} else {
							break;
						}
					}
				} else if (keywords2.InList(s)) {
					if (options.keywords2NoSubIdentifiers) {
						// We don't want to highlight keywords2
						// that are used as a sub-identifier,
						// i.e. not open in "foo.open".
						const Sci_Position pos = styler.GetStartSegment() - 1;
						if (pos < 0 || (styler.SafeGetCharAt(pos, '\0') != '.'))
							style = SCE_P_WORD2;
					} else {
						style = SCE_P_WORD2;
					}
				} else {
					// Fall back to the sub-style classifier for identifiers
					const int subStyle = classifierIdentifiers.ValueFor(s);
					if (subStyle >= 0) {
						style = subStyle;
					}
				}
				sc.ChangeState(style);
				sc.SetState(SCE_P_DEFAULT);
				// Remember which keyword this was; a cdef/cpdef marker is
				// kept until one of the keywords below replaces it or the
				// line ends
				if (style == SCE_P_WORD) {
					if (0 == strcmp(s, "class"))
						kwLast = kwClass;
					else if (0 == strcmp(s, "def"))
						kwLast = kwDef;
					else if (0 == strcmp(s, "import"))
						kwLast = kwImport;
					else if (0 == strcmp(s, "cdef"))
						kwLast = kwCDef;
					else if (0 == strcmp(s, "cpdef"))
						kwLast = kwCPDef;
					else if (0 == strcmp(s, "cimport"))
						kwLast = kwImport;
					else if (kwLast != kwCDef && kwLast != kwCPDef)
						kwLast = kwOther;
				} else if (kwLast != kwCDef && kwLast != kwCPDef) {
					kwLast = kwOther;
				}
			}
		} else if ((sc.state == SCE_P_COMMENTLINE) || (sc.state == SCE_P_COMMENTBLOCK)) {
			if (sc.ch == '\r' || sc.ch == '\n') {
				sc.SetState(SCE_P_DEFAULT);
			}
		} else if (sc.state == SCE_P_DECORATOR) {
			if (!IsAWordStart(sc.ch, options.unicodeIdentifiers)) {
				sc.SetState(SCE_P_DEFAULT);
			}
		} else if (IsPySingleQuoteStringState(sc.state)) {
			if (sc.ch == '\\') {
				// For backslash + CR LF, step onto the CR so the check
				// below sees the LF as the next character
				if ((sc.chNext == '\r') && (sc.GetRelative(2) == '\n')) {
					sc.Forward();
				}
				if (sc.chNext == '\n' || sc.chNext == '\r') {
					inContinuedString = true;
				} else {
					// Don't roll over the newline.
					sc.Forward();
				}
			} else if (sc.ch == GetPyStringQuoteChar(sc.state)) {
				sc.ForwardSetState(SCE_P_DEFAULT);
				needEOLCheck = true;
			}
		} else if ((sc.state == SCE_P_TRIPLE) || (sc.state == SCE_P_FTRIPLE)) {
			if (sc.ch == '\\') {
				// Skip the escaped character
				sc.Forward();
			} else if (sc.Match(R"(''')")) {
				sc.Forward();
				sc.Forward();
				sc.ForwardSetState(SCE_P_DEFAULT);
				needEOLCheck = true;
			}
		} else if ((sc.state == SCE_P_TRIPLEDOUBLE) || (sc.state == SCE_P_FTRIPLEDOUBLE)) {
			if (sc.ch == '\\') {
				// Skip the escaped character
				sc.Forward();
			} else if (sc.Match(R"(""")")) {
				sc.Forward();
				sc.Forward();
				sc.ForwardSetState(SCE_P_DEFAULT);
				needEOLCheck = true;
			}
		}

		// Note if used and not if else because string states also match
		// some of the above clauses
		if (IsPyFStringState(sc.state) && sc.ch == '{') {
			if (sc.chNext == '{') {
				// A doubled brace does not open an expression; step over it
				sc.Forward();
			} else {
				// An expression starts: remember the f-string style and lex
				// what follows as ordinary code
				PushStateToStack(sc.state, fstringStateStack, currentFStringExp);
				sc.ForwardSetState(SCE_P_DEFAULT);
			}
			needEOLCheck = true;
		}

		// If in an f-string expression, check for the ending quote(s)
		// and end f-string to handle syntactically incorrect cases like
		// f'{' and f"""{"""
		if (!fstringStateStack.empty() && (sc.ch == '\'' || sc.ch == '"')) {
			// Search from the bottom of the stack for the first f-string
			// that the quote(s) at the current position would close
			long matching_stack_i = -1;
			for (unsigned long stack_i = 0; stack_i < fstringStateStack.size() && matching_stack_i == -1; stack_i++) {
				const int stack_state = fstringStateStack[stack_i].state;
				const char quote = GetPyStringQuoteChar(stack_state);
				if (sc.ch == quote) {
					if (IsPySingleQuoteStringState(stack_state)) {
						matching_stack_i = stack_i;
					} else if (quote == '"' ? sc.Match(R"(""")") : sc.Match("'''")) {
						matching_stack_i = stack_i;
					}
				}
			}

			if (matching_stack_i != -1) {
				// Style the closing quote(s) as that f-string and drop it,
				// together with everything above it, from the stack
				sc.SetState(fstringStateStack[matching_stack_i].state);
				if (IsPyTripleQuoteStringState(fstringStateStack[matching_stack_i].state)) {
					sc.Forward();
					sc.Forward();
				}
				sc.ForwardSetState(SCE_P_DEFAULT);
				needEOLCheck = true;

				while (fstringStateStack.size() > static_cast<unsigned long>(matching_stack_i)) {
					PopFromStateStack(fstringStateStack, currentFStringExp);
				}
			}
		}
		// End of code to find the end of a state

		if (!indentGood && !IsASpaceOrTab(sc.ch)) {
			// First non-blank character on a badly indented line: mark the
			// indentation with the indicator
			styler.IndicatorFill(startIndicator, sc.currentPos, indicatorWhitespace, 1);
			startIndicator = sc.currentPos;
			indentGood = true;
		}

		// One cdef or cpdef line, clear kwLast only at end of line
		if ((kwLast == kwCDef || kwLast == kwCPDef) && sc.atLineEnd) {
			kwLast = kwOther;
		}

		// State exit code may have moved on to end of line
		if (needEOLCheck && sc.atLineEnd) {
			ProcessLineEnd(sc, fstringStateStack, currentFStringExp, inContinuedString);
			lineCurrent++;
			styler.IndentAmount(lineCurrent, &spaceFlags, IsPyComment);
			if (!sc.More())
				break;
		}

		// If in f-string expression, check for }, :, ! to resume f-string state or update nesting count
		if (currentFStringExp && !IsPySingleQuoteStringState(sc.state) && !IsPyTripleQuoteStringState(sc.state)) {
			if (currentFStringExp->nestingCount == 0 && (sc.ch == '}' || sc.ch == ':' || (sc.ch == '!' && sc.chNext != '='))) {
				// The expression is over; return to the enclosing f-string
				sc.SetState(PopFromStateStack(fstringStateStack, currentFStringExp));
			} else {
				if (sc.ch == '{' || sc.ch == '[' || sc.ch == '(') {
					currentFStringExp->nestingCount++;
				} else if (sc.ch == '}' || sc.ch == ']' || sc.ch == ')') {
					currentFStringExp->nestingCount--;
				}
			}
		}

		// Check for a new state starting character
		if (sc.state == SCE_P_DEFAULT) {
			if (IsADigit(sc.ch) || (sc.ch == '.' && IsADigit(sc.chNext))) {
				if (sc.ch == '0' && (sc.chNext == 'x' || sc.chNext == 'X')) {
					base_n_number = true;
					sc.SetState(SCE_P_NUMBER);
				} else if (sc.ch == '0' &&
						(sc.chNext == 'o' || sc.chNext == 'O' || sc.chNext == 'b' || sc.chNext == 'B')) {
					if (options.base2or8Literals) {
						base_n_number = true;
						sc.SetState(SCE_P_NUMBER);
					} else {
						// Binary/octal literals are disabled: the '0' alone
						// is the number and an identifier starts right after
						sc.SetState(SCE_P_NUMBER);
						sc.ForwardSetState(SCE_P_IDENTIFIER);
					}
				} else {
					base_n_number = false;
					sc.SetState(SCE_P_NUMBER);
				}
			} else if (isoperator(sc.ch) || sc.ch == '`') {
				sc.SetState(SCE_P_OPERATOR);
			} else if (sc.ch == '#') {
				// "##" starts a comment block, a single '#' a comment line
				sc.SetState(sc.chNext == '#' ? SCE_P_COMMENTBLOCK : SCE_P_COMMENTLINE);
			} else if (sc.ch == '@') {
				// '@' is a decorator only when it is the first non-blank
				// character of the line; otherwise it is an operator
				if (IsFirstNonWhitespace(sc.currentPos, styler))
					sc.SetState(SCE_P_DECORATOR);
				else
					sc.SetState(SCE_P_OPERATOR);
			} else if (IsPyStringStart(sc.ch, sc.chNext, sc.GetRelative(2), allowedLiterals)) {
				Sci_PositionU nextIndex = 0;
				sc.SetState(GetPyStringState(styler, sc.currentPos, &nextIndex, allowedLiterals));
				// Advance to the last character of the prefix and opening
				// quote(s); the loop's own Forward() then steps past it
				while (nextIndex > (sc.currentPos + 1) && sc.More()) {
					sc.Forward();
				}
			} else if (IsAWordStart(sc.ch, options.unicodeIdentifiers)) {
				sc.SetState(SCE_P_IDENTIFIER);
			}
		}
	}
	// Clear the indicator over whatever remains and flush the final segment
	styler.IndicatorFill(startIndicator, sc.currentPos, indicatorWhitespace, 0);
	sc.Complete();
}
829
830static bool IsCommentLine(Sci_Position line, Accessor &styler) {
831 const Sci_Position pos = styler.LineStart(line);
832 const Sci_Position eol_pos = styler.LineStart(line + 1) - 1;
833 for (Sci_Position i = pos; i < eol_pos; i++) {
834 const char ch = styler[i];
835 if (ch == '#')
836 return true;
837 else if (ch != ' ' && ch != '\t')
838 return false;
839 }
840 return false;
841}
842
843static bool IsQuoteLine(Sci_Position line, const Accessor &styler) {
844 const int style = styler.StyleAt(styler.LineStart(line)) & 31;
845 return IsPyTripleQuoteStringState(style);
846}
847
848
// Compute fold levels for the lines covered by [startPos, startPos + length).
// Does nothing unless the "fold" option is set. Levels are derived from the
// indentation reported by Accessor::IndentAmount; blank lines and comment
// lines take their level from the surrounding code, and triple quoted strings
// form fold blocks of their own when the foldQuotes option is set.
void SCI_METHOD LexerPython::Fold(Sci_PositionU startPos, Sci_Position length, int /*initStyle - unused*/, IDocument *pAccess) {
	if (!options.fold)
		return;

	Accessor styler(pAccess, nullptr);

	const Sci_Position maxPos = startPos + length;
	const Sci_Position maxLines = (maxPos == styler.Length()) ? styler.GetLine(maxPos) : styler.GetLine(maxPos - 1);	// Requested last line
	const Sci_Position docLines = styler.GetLine(styler.Length());	// Available last line

	// Backtrack to previous non-blank line so we can determine indent level
	// for any white space lines (needed esp. within triple quoted strings)
	// and so we can fix any preceding fold level (which is why we go back
	// at least one line in all cases)
	int spaceFlags = 0;
	Sci_Position lineCurrent = styler.GetLine(startPos);
	int indentCurrent = styler.IndentAmount(lineCurrent, &spaceFlags, nullptr);
	while (lineCurrent > 0) {
		lineCurrent--;
		indentCurrent = styler.IndentAmount(lineCurrent, &spaceFlags, nullptr);
		if (!(indentCurrent & SC_FOLDLEVELWHITEFLAG) &&
				(!IsCommentLine(lineCurrent, styler)) &&
				(!IsQuoteLine(lineCurrent, styler)))
			break;
	}
	int indentCurrentLevel = indentCurrent & SC_FOLDLEVELNUMBERMASK;

	// Set up initial loop state
	startPos = styler.LineStart(lineCurrent);
	int prev_state = SCE_P_DEFAULT & 31;
	if (lineCurrent >= 1)
		prev_state = styler.StyleAt(startPos - 1) & 31;
	// Non-zero when the character before the starting line is inside a triple
	// quoted string and quote folding is enabled
	int prevQuote = options.foldQuotes && IsPyTripleQuoteStringState(prev_state);

	// Process all characters to end of requested range or end of any triple quote
	//that hangs over the end of the range.  Cap processing in all cases
	// to end of document (in case of unclosed quote at end).
	while ((lineCurrent <= docLines) && ((lineCurrent <= maxLines) || prevQuote)) {

		// Gather info
		int lev = indentCurrent;
		Sci_Position lineNext = lineCurrent + 1;
		int indentNext = indentCurrent;
		int quote = false;
		if (lineNext <= docLines) {
			// Information about next line is only available if not at end of document
			indentNext = styler.IndentAmount(lineNext, &spaceFlags, nullptr);
			// When the next line starts at the very end of the document,
			// look at the last character instead
			const Sci_Position lookAtPos = (styler.LineStart(lineNext) == styler.Length()) ? styler.Length() - 1 : styler.LineStart(lineNext);
			const int style = styler.StyleAt(lookAtPos) & 31;
			quote = options.foldQuotes && IsPyTripleQuoteStringState(style);
		}
		const bool quote_start = (quote && !prevQuote);
		const bool quote_continue = (quote && prevQuote);
		// Inside a continuing quote the level of the line that opened it is kept
		if (!quote || !prevQuote)
			indentCurrentLevel = indentCurrent & SC_FOLDLEVELNUMBERMASK;
		if (quote)
			indentNext = indentCurrentLevel;
		if (indentNext & SC_FOLDLEVELWHITEFLAG)
			indentNext = SC_FOLDLEVELWHITEFLAG | indentCurrentLevel;

		if (quote_start) {
			// Place fold point at start of triple quoted string
			lev |= SC_FOLDLEVELHEADERFLAG;
		} else if (quote_continue || prevQuote) {
			// Add level to rest of lines in the string
			lev = lev + 1;
		}

		// Skip past any blank lines for next indent level info; we skip also
		// comments (all comments, not just those starting in column 0)
		// which effectively folds them into surrounding code rather
		// than screwing up folding.  If comments end file, use the min
		// comment indent as the level after

		int minCommentLevel = indentCurrentLevel;
		while (!quote &&
				(lineNext < docLines) &&
				((indentNext & SC_FOLDLEVELWHITEFLAG) || (IsCommentLine(lineNext, styler)))) {

			if (IsCommentLine(lineNext, styler) && indentNext < minCommentLevel) {
				minCommentLevel = indentNext;
			}

			lineNext++;
			indentNext = styler.IndentAmount(lineNext, &spaceFlags, nullptr);
		}

		const int levelAfterComments = ((lineNext < docLines) ? indentNext & SC_FOLDLEVELNUMBERMASK : minCommentLevel);
		const int levelBeforeComments = std::max(indentCurrentLevel, levelAfterComments);

		// Now set all the indent levels on the lines we skipped
		// Do this from end to start.  Once we encounter one line
		// which is indented more than the line after the end of
		// the comment-block, use the level of the block before

		Sci_Position skipLine = lineNext;
		int skipLevel = levelAfterComments;

		while (--skipLine > lineCurrent) {
			const int skipLineIndent = styler.IndentAmount(skipLine, &spaceFlags, nullptr);

			if (options.foldCompact) {
				if ((skipLineIndent & SC_FOLDLEVELNUMBERMASK) > levelAfterComments)
					skipLevel = levelBeforeComments;

				// In compact mode skipped lines keep their white flag
				const int whiteFlag = skipLineIndent & SC_FOLDLEVELWHITEFLAG;

				styler.SetLevel(skipLine, skipLevel | whiteFlag);
			} else {
				if ((skipLineIndent & SC_FOLDLEVELNUMBERMASK) > levelAfterComments &&
						!(skipLineIndent & SC_FOLDLEVELWHITEFLAG) &&
						!IsCommentLine(skipLine, styler))
					skipLevel = levelBeforeComments;

				styler.SetLevel(skipLine, skipLevel);
			}
		}

		// Set fold header on non-quote line
		if (!quote && !(indentCurrent & SC_FOLDLEVELWHITEFLAG)) {
			if ((indentCurrent & SC_FOLDLEVELNUMBERMASK) < (indentNext & SC_FOLDLEVELNUMBERMASK))
				lev |= SC_FOLDLEVELHEADERFLAG;
		}

		// Keep track of triple quote state of previous line
		prevQuote = quote;

		// Set fold level for this line and move to next line
		styler.SetLevel(lineCurrent, options.foldCompact ? lev : lev & ~SC_FOLDLEVELWHITEFLAG);
		indentCurrent = indentNext;
		lineCurrent = lineNext;
	}

	// NOTE: Cannot set level of last line here because indentCurrent doesn't have
	// header flag set; the loop above is crafted to take care of this case!
	//styler.SetLevel(lineCurrent, indentCurrent);
}
986
// Module object for the Python lexer: ties the SCLEX_PYTHON identifier and
// the name "python" to the LexerPython factory and the word list descriptions.
LexerModule lmPython(SCLEX_PYTHON, LexerPython::LexerFactoryPython, "python",
		     pythonWordListDesc);
989