1// Scintilla source code edit control
2/** @file LexOScript.cxx
3 ** Lexer for OScript sources; ocx files and/or OSpace dumps.
4 ** OScript is a programming language used to develop applications for the
5 ** Livelink server platform.
6 **/
7// Written by Ferdinand Prantl <prantlf@gmail.com>, inspired by the code from
8// LexVB.cxx and LexPascal.cxx. The License.txt file describes the conditions
9// under which this software may be distributed.
10
11#include <stdlib.h>
12#include <string.h>
13#include <stdio.h>
14#include <stdarg.h>
15#include <assert.h>
16#include <ctype.h>
17
18#include <string>
19#include <string_view>
20
21#include "ILexer.h"
22#include "Scintilla.h"
23#include "SciLexer.h"
24
25#include "WordList.h"
26#include "LexAccessor.h"
27#include "Accessor.h"
28#include "StyleContext.h"
29#include "CharacterSet.h"
30#include "LexerModule.h"
31
32using namespace Lexilla;
33
34// -----------------------------------------
35// Functions classifying a single character.
36
// Tells whether the character is an unaccented ASCII letter (a-z or A-Z).
// Nothing in this helper is specific to OScript; it would probably fit better
// in CharSet.h, where IsAlphaNumeric and the other classifiers reside.
inline bool IsAlpha(int ch) {
    const bool isLowerCase = ch >= 'a' && ch <= 'z';
    const bool isUpperCase = ch >= 'A' && ch <= 'Z';
    return isLowerCase || isUpperCase;
}
42
43static inline bool IsIdentifierChar(int ch) {
44 // Identifiers cannot contain non-ASCII letters; a word with non-English
45 // language-specific characters cannot be an identifier.
46 return IsAlphaNumeric(ch) || ch == '_';
47}
48
49static inline bool IsIdentifierStart(int ch) {
50 // Identifiers cannot contain non-ASCII letters; a word with non-English
51 // language-specific characters cannot be an identifier.
52 return IsAlpha(ch) || ch == '_';
53}
54
55static inline bool IsNumberChar(int ch, int chNext) {
56 // Numeric constructs are not checked for lexical correctness. They are
57 // expected to look like +1.23-E9 but actually any bunch of the following
58 // characters will be styled as number.
59 // KNOWN PROBLEM: if you put + or - operators immediately after a number
60 // and the next operand starts with the letter E, the operator will not be
61 // recognized and it will be styled together with the preceding number.
62 // This should not occur; at least not often. The coding style recommends
63 // putting spaces around operators.
64 return IsADigit(ch) || toupper(ch) == 'E' || ch == '.' ||
65 ((ch == '-' || ch == '+') && toupper(chNext) == 'E');
66}
67
68// This function checks for the start or a natural number without any symbols
69// or operators as a prefix; the IsPrefixedNumberStart should be called
70// immediately after this one to cover all possible numeric constructs.
71static inline bool IsNaturalNumberStart(int ch) {
72 return IsADigit(ch) != 0;
73}
74
75static inline bool IsPrefixedNumberStart(int ch, int chNext) {
76 // KNOWN PROBLEM: if you put + or - operators immediately before a number
77 // the operator will not be recognized and it will be styled together with
78 // the succeeding number. This should not occur; at least not often. The
79 // coding style recommends putting spaces around operators.
80 return (ch == '.' || ch == '-' || ch == '+') && IsADigit(chNext);
81}
82
static inline bool IsOperator(int ch) {
    // Tells whether ch is one of the single-character operator symbols.
    //
    // strchr() converts the searched value to char and treats the terminating
    // NUL as part of the string. Without the range check a zero character
    // would be reported as an operator, and so would any wider value whose
    // low byte happens to equal one of the symbols (0x12B would match '+').
    if (ch <= 0 || ch >= 0x80) {
        return false;
    }
    return strchr("%^&*()-+={}[]:;<>,/?!.~|\\", ch) != nullptr;
}
86
87// ---------------------------------------------------------------
88// Functions classifying a token currently processed in the lexer.
89
90// Checks if the current line starts with the preprocessor directive used
91// usually to introduce documentation comments: #ifdef DOC. This method is
92// supposed to be called if the line has been recognized as a preprocessor
93// directive already.
94static bool IsDocCommentStart(StyleContext &sc) {
95 // Check the line back to its start only if the end looks promising.
96 if (sc.LengthCurrent() == 10 && !IsAlphaNumeric(sc.ch)) {
97 char s[11];
98 sc.GetCurrentLowered(s, sizeof(s));
99 return strcmp(s, "#ifdef doc") == 0;
100 }
101 return false;
102}
103
104// Checks if the current line starts with the preprocessor directive that
105// is complementary to the #ifdef DOC start: #endif. This method is supposed
106// to be called if the current state point to the documentation comment.
107// QUESTIONAL ASSUMPTION: The complete #endif directive is not checked; just
108// the starting #e. However, there is no other preprocessor directive with
109// the same starting letter and thus this optimization should always work.
110static bool IsDocCommentEnd(StyleContext &sc) {
111 return sc.ch == '#' && sc.chNext == 'e';
112}
113
114class IdentifierClassifier {
115 WordList &keywords; // Passed from keywords property.
116 WordList &constants; // Passed from keywords2 property.
117 WordList &operators; // Passed from keywords3 property.
118 WordList &types; // Passed from keywords4 property.
119 WordList &functions; // Passed from keywords5 property.
120 WordList &objects; // Passed from keywords6 property.
121
122 IdentifierClassifier(IdentifierClassifier const&);
123 IdentifierClassifier& operator=(IdentifierClassifier const&);
124
125public:
126 IdentifierClassifier(WordList *keywordlists[]) :
127 keywords(*keywordlists[0]), constants(*keywordlists[1]),
128 operators(*keywordlists[2]), types(*keywordlists[3]),
129 functions(*keywordlists[4]), objects(*keywordlists[5])
130 {}
131
132 void ClassifyIdentifier(StyleContext &sc) {
133 // Opening parenthesis following an identifier makes it a possible
134 // function call.
135 // KNOWN PROBLEM: If some whitespace is inserted between the
136 // identifier and the parenthesis they will not be able to be
137 // recognized as a function call. This should not occur; at
138 // least not often. Such coding style would be weird.
139 if (sc.Match('(')) {
140 char s[100];
141 sc.GetCurrentLowered(s, sizeof(s));
142 // Before an opening brace can be control statements and
143 // operators too; function call is the last option.
144 if (keywords.InList(s)) {
145 sc.ChangeState(SCE_OSCRIPT_KEYWORD);
146 } else if (operators.InList(s)) {
147 sc.ChangeState(SCE_OSCRIPT_OPERATOR);
148 } else if (functions.InList(s)) {
149 sc.ChangeState(SCE_OSCRIPT_FUNCTION);
150 } else {
151 sc.ChangeState(SCE_OSCRIPT_METHOD);
152 }
153 sc.SetState(SCE_OSCRIPT_OPERATOR);
154 } else {
155 char s[100];
156 sc.GetCurrentLowered(s, sizeof(s));
157 // A dot following an identifier means an access to an object
158 // member. The related object identifier can be special.
159 // KNOWN PROBLEM: If there is whitespace between the identifier
160 // and the following dot, the identifier will not be recognized
161 // as an object in an object member access. If it is one of the
162 // listed static objects it will not be styled.
163 if (sc.Match('.') && objects.InList(s)) {
164 sc.ChangeState(SCE_OSCRIPT_OBJECT);
165 sc.SetState(SCE_OSCRIPT_OPERATOR);
166 } else {
167 if (keywords.InList(s)) {
168 sc.ChangeState(SCE_OSCRIPT_KEYWORD);
169 } else if (constants.InList(s)) {
170 sc.ChangeState(SCE_OSCRIPT_CONSTANT);
171 } else if (operators.InList(s)) {
172 sc.ChangeState(SCE_OSCRIPT_OPERATOR);
173 } else if (types.InList(s)) {
174 sc.ChangeState(SCE_OSCRIPT_TYPE);
175 } else if (functions.InList(s)) {
176 sc.ChangeState(SCE_OSCRIPT_FUNCTION);
177 }
178 sc.SetState(SCE_OSCRIPT_DEFAULT);
179 }
180 }
181 }
182};
183
184// ------------------------------------------------
185// Function colourising an excerpt of OScript code.
186
187static void ColouriseOScriptDoc(Sci_PositionU startPos, Sci_Position length,
188 int initStyle, WordList *keywordlists[],
189 Accessor &styler) {
190 // I wonder how whole-line styles ended by EOLN can escape the resetting
191 // code in the loop below and overflow to the next line. Let us make sure
192 // that a new line does not start with them carried from the previous one.
193 // NOTE: An overflowing string is intentionally not checked; it reminds
194 // the developer that the string must be ended on the same line.
195 if (initStyle == SCE_OSCRIPT_LINE_COMMENT ||
196 initStyle == SCE_OSCRIPT_PREPROCESSOR) {
197 initStyle = SCE_OSCRIPT_DEFAULT;
198 }
199
200 styler.StartAt(startPos);
201 StyleContext sc(startPos, length, initStyle, styler);
202 IdentifierClassifier identifierClassifier(keywordlists);
203
204 // It starts with true at the beginning of a line and changes to false as
205 // soon as the first non-whitespace character has been processed.
206 bool isFirstToken = true;
207 // It starts with true at the beginning of a line and changes to false as
208 // soon as the first identifier on the line is passed by.
209 bool isFirstIdentifier = true;
210 // It becomes false when #ifdef DOC (the preprocessor directive often
211 // used to start a documentation comment) is encountered and remain false
212 // until the end of the documentation block is not detected. This is done
213 // by checking for the complementary #endif preprocessor directive.
214 bool endDocComment = false;
215
216 for (; sc.More(); sc.Forward()) {
217
218 if (sc.atLineStart) {
219 isFirstToken = true;
220 isFirstIdentifier = true;
221 // Detect the current state is neither whitespace nor identifier. It
222 // means that no next identifier can be the first token on the line.
223 } else if (isFirstIdentifier && sc.state != SCE_OSCRIPT_DEFAULT &&
224 sc.state != SCE_OSCRIPT_IDENTIFIER) {
225 isFirstIdentifier = false;
226 }
227
228 // Check if the current state should be changed.
229 if (sc.state == SCE_OSCRIPT_OPERATOR) {
230 // Multiple-symbol operators are marked by single characters.
231 sc.SetState(SCE_OSCRIPT_DEFAULT);
232 } else if (sc.state == SCE_OSCRIPT_IDENTIFIER) {
233 if (!IsIdentifierChar(sc.ch)) {
234 // Colon after an identifier makes it a label if it is the
235 // first token on the line.
236 // KNOWN PROBLEM: If some whitespace is inserted between the
237 // identifier and the colon they will not be recognized as a
238 // label. This should not occur; at least not often. It would
239 // make the code structure less legible and examples in the
240 // Livelink documentation do not show it.
241 if (sc.Match(':') && isFirstIdentifier) {
242 sc.ChangeState(SCE_OSCRIPT_LABEL);
243 sc.ForwardSetState(SCE_OSCRIPT_DEFAULT);
244 } else {
245 identifierClassifier.ClassifyIdentifier(sc);
246 }
247 // Avoid a sequence of two words be mistaken for a label. A
248 // switch case would be an example.
249 isFirstIdentifier = false;
250 }
251 } else if (sc.state == SCE_OSCRIPT_GLOBAL) {
252 if (!IsIdentifierChar(sc.ch)) {
253 sc.SetState(SCE_OSCRIPT_DEFAULT);
254 }
255 } else if (sc.state == SCE_OSCRIPT_PROPERTY) {
256 if (!IsIdentifierChar(sc.ch)) {
257 // Any member access introduced by the dot operator is
258 // initially marked as a property access. If an opening
259 // parenthesis is detected later it is changed to method call.
260 // KNOWN PROBLEM: The same as at the function call recognition
261 // for SCE_OSCRIPT_IDENTIFIER above.
262 if (sc.Match('(')) {
263 sc.ChangeState(SCE_OSCRIPT_METHOD);
264 }
265 sc.SetState(SCE_OSCRIPT_DEFAULT);
266 }
267 } else if (sc.state == SCE_OSCRIPT_NUMBER) {
268 if (!IsNumberChar(sc.ch, sc.chNext)) {
269 sc.SetState(SCE_OSCRIPT_DEFAULT);
270 }
271 } else if (sc.state == SCE_OSCRIPT_SINGLEQUOTE_STRING) {
272 if (sc.ch == '\'') {
273 // Two consequential apostrophes convert to a single one.
274 if (sc.chNext == '\'') {
275 sc.Forward();
276 } else {
277 sc.ForwardSetState(SCE_OSCRIPT_DEFAULT);
278 }
279 } else if (sc.atLineEnd) {
280 sc.ForwardSetState(SCE_OSCRIPT_DEFAULT);
281 }
282 } else if (sc.state == SCE_OSCRIPT_DOUBLEQUOTE_STRING) {
283 if (sc.ch == '\"') {
284 // Two consequential quotation marks convert to a single one.
285 if (sc.chNext == '\"') {
286 sc.Forward();
287 } else {
288 sc.ForwardSetState(SCE_OSCRIPT_DEFAULT);
289 }
290 } else if (sc.atLineEnd) {
291 sc.ForwardSetState(SCE_OSCRIPT_DEFAULT);
292 }
293 } else if (sc.state == SCE_OSCRIPT_BLOCK_COMMENT) {
294 if (sc.Match('*', '/')) {
295 sc.Forward();
296 sc.ForwardSetState(SCE_OSCRIPT_DEFAULT);
297 }
298 } else if (sc.state == SCE_OSCRIPT_LINE_COMMENT) {
299 if (sc.atLineEnd) {
300 sc.ForwardSetState(SCE_OSCRIPT_DEFAULT);
301 }
302 } else if (sc.state == SCE_OSCRIPT_PREPROCESSOR) {
303 if (IsDocCommentStart(sc)) {
304 sc.ChangeState(SCE_OSCRIPT_DOC_COMMENT);
305 endDocComment = false;
306 } else if (sc.atLineEnd) {
307 sc.ForwardSetState(SCE_OSCRIPT_DEFAULT);
308 }
309 } else if (sc.state == SCE_OSCRIPT_DOC_COMMENT) {
310 // KNOWN PROBLEM: The first line detected that would close a
311 // conditional preprocessor block (#endif) the documentation
312 // comment block will end. (Nested #if-#endif blocks are not
313 // supported. Hopefully it will not occur often that a line
314 // within the text block would stat with #endif.
315 if (isFirstToken && IsDocCommentEnd(sc)) {
316 endDocComment = true;
317 } else if (sc.atLineEnd && endDocComment) {
318 sc.ForwardSetState(SCE_OSCRIPT_DEFAULT);
319 }
320 }
321
322 // Check what state starts with the current character.
323 if (sc.state == SCE_OSCRIPT_DEFAULT) {
324 if (sc.Match('\'')) {
325 sc.SetState(SCE_OSCRIPT_SINGLEQUOTE_STRING);
326 } else if (sc.Match('\"')) {
327 sc.SetState(SCE_OSCRIPT_DOUBLEQUOTE_STRING);
328 } else if (sc.Match('/', '/')) {
329 sc.SetState(SCE_OSCRIPT_LINE_COMMENT);
330 sc.Forward();
331 } else if (sc.Match('/', '*')) {
332 sc.SetState(SCE_OSCRIPT_BLOCK_COMMENT);
333 sc.Forward();
334 } else if (isFirstToken && sc.Match('#')) {
335 sc.SetState(SCE_OSCRIPT_PREPROCESSOR);
336 } else if (sc.Match('$')) {
337 // Both process-global ($xxx) and thread-global ($$xxx)
338 // variables are handled as one global.
339 sc.SetState(SCE_OSCRIPT_GLOBAL);
340 } else if (IsNaturalNumberStart(sc.ch)) {
341 sc.SetState(SCE_OSCRIPT_NUMBER);
342 } else if (IsPrefixedNumberStart(sc.ch, sc.chNext)) {
343 sc.SetState(SCE_OSCRIPT_NUMBER);
344 sc.Forward();
345 } else if (sc.Match('.') && IsIdentifierStart(sc.chNext)) {
346 // Every object member access is marked as a property access
347 // initially. The decision between property and method is made
348 // after parsing the identifier and looking what comes then.
349 // KNOWN PROBLEM: If there is whitespace between the following
350 // identifier and the dot, the dot will not be recognized
351 // as a member accessing operator. In turn, the identifier
352 // will not be recognizable as a property or a method too.
353 sc.SetState(SCE_OSCRIPT_OPERATOR);
354 sc.Forward();
355 sc.SetState(SCE_OSCRIPT_PROPERTY);
356 } else if (IsIdentifierStart(sc.ch)) {
357 sc.SetState(SCE_OSCRIPT_IDENTIFIER);
358 } else if (IsOperator(sc.ch)) {
359 sc.SetState(SCE_OSCRIPT_OPERATOR);
360 }
361 }
362
363 if (isFirstToken && !IsASpaceOrTab(sc.ch)) {
364 isFirstToken = false;
365 }
366 }
367
368 sc.Complete();
369}
370
371// ------------------------------------------
372// Functions supporting OScript code folding.
373
374static inline bool IsBlockComment(int style) {
375 return style == SCE_OSCRIPT_BLOCK_COMMENT;
376}
377
378static bool IsLineComment(Sci_Position line, Accessor &styler) {
379 Sci_Position pos = styler.LineStart(line);
380 Sci_Position eolPos = styler.LineStart(line + 1) - 1;
381 for (Sci_Position i = pos; i < eolPos; i++) {
382 char ch = styler[i];
383 char chNext = styler.SafeGetCharAt(i + 1);
384 int style = styler.StyleAt(i);
385 if (ch == '/' && chNext == '/' && style == SCE_OSCRIPT_LINE_COMMENT) {
386 return true;
387 } else if (!IsASpaceOrTab(ch)) {
388 return false;
389 }
390 }
391 return false;
392}
393
394static inline bool IsPreprocessor(int style) {
395 return style == SCE_OSCRIPT_PREPROCESSOR ||
396 style == SCE_OSCRIPT_DOC_COMMENT;
397}
398
399static void GetRangeLowered(Sci_PositionU start, Sci_PositionU end,
400 Accessor &styler, char *s, Sci_PositionU len) {
401 Sci_PositionU i = 0;
402 while (i < end - start + 1 && i < len - 1) {
403 s[i] = static_cast<char>(tolower(styler[start + i]));
404 i++;
405 }
406 s[i] = '\0';
407}
408
409static void GetForwardWordLowered(Sci_PositionU start, Accessor &styler,
410 char *s, Sci_PositionU len) {
411 Sci_PositionU i = 0;
412 while (i < len - 1 && IsAlpha(styler.SafeGetCharAt(start + i))) {
413 s[i] = static_cast<char>(tolower(styler.SafeGetCharAt(start + i)));
414 i++;
415 }
416 s[i] = '\0';
417}
418
419static void UpdatePreprocessorFoldLevel(int &levelCurrent,
420 Sci_PositionU startPos, Accessor &styler) {
421 char s[7]; // Size of the longest possible keyword + null.
422 GetForwardWordLowered(startPos, styler, s, sizeof(s));
423
424 if (strcmp(s, "ifdef") == 0 ||
425 strcmp(s, "ifndef") == 0) {
426 levelCurrent++;
427 } else if (strcmp(s, "endif") == 0) {
428 levelCurrent--;
429 if (levelCurrent < SC_FOLDLEVELBASE) {
430 levelCurrent = SC_FOLDLEVELBASE;
431 }
432 }
433}
434
435static void UpdateKeywordFoldLevel(int &levelCurrent, Sci_PositionU lastStart,
436 Sci_PositionU currentPos, Accessor &styler) {
437 char s[9];
438 GetRangeLowered(lastStart, currentPos, styler, s, sizeof(s));
439
440 if (strcmp(s, "if") == 0 || strcmp(s, "for") == 0 ||
441 strcmp(s, "switch") == 0 || strcmp(s, "function") == 0 ||
442 strcmp(s, "while") == 0 || strcmp(s, "repeat") == 0) {
443 levelCurrent++;
444 } else if (strcmp(s, "end") == 0 || strcmp(s, "until") == 0) {
445 levelCurrent--;
446 if (levelCurrent < SC_FOLDLEVELBASE) {
447 levelCurrent = SC_FOLDLEVELBASE;
448 }
449 }
450}
451
452// ------------------------------
453// Function folding OScript code.
454
455static void FoldOScriptDoc(Sci_PositionU startPos, Sci_Position length, int initStyle,
456 WordList *[], Accessor &styler) {
457 bool foldComment = styler.GetPropertyInt("fold.comment") != 0;
458 bool foldPreprocessor = styler.GetPropertyInt("fold.preprocessor") != 0;
459 bool foldCompact = styler.GetPropertyInt("fold.compact", 1) != 0;
460 Sci_Position endPos = startPos + length;
461 int visibleChars = 0;
462 Sci_Position lineCurrent = styler.GetLine(startPos);
463 int levelPrev = styler.LevelAt(lineCurrent) & SC_FOLDLEVELNUMBERMASK;
464 int levelCurrent = levelPrev;
465 char chNext = styler[startPos];
466 int styleNext = styler.StyleAt(startPos);
467 int style = initStyle;
468 Sci_Position lastStart = 0;
469
470 for (Sci_Position i = startPos; i < endPos; i++) {
471 char ch = chNext;
472 chNext = styler.SafeGetCharAt(i + 1);
473 int stylePrev = style;
474 style = styleNext;
475 styleNext = styler.StyleAt(i + 1);
476 bool atLineEnd = (ch == '\r' && chNext != '\n') || (ch == '\n');
477
478 if (foldComment && IsBlockComment(style)) {
479 if (!IsBlockComment(stylePrev)) {
480 levelCurrent++;
481 } else if (!IsBlockComment(styleNext) && !atLineEnd) {
482 // Comments do not end at end of line and the next character
483 // may not be styled.
484 levelCurrent--;
485 }
486 }
487 if (foldComment && atLineEnd && IsLineComment(lineCurrent, styler)) {
488 if (!IsLineComment(lineCurrent - 1, styler) &&
489 IsLineComment(lineCurrent + 1, styler))
490 levelCurrent++;
491 else if (IsLineComment(lineCurrent - 1, styler) &&
492 !IsLineComment(lineCurrent+1, styler))
493 levelCurrent--;
494 }
495 if (foldPreprocessor) {
496 if (ch == '#' && IsPreprocessor(style)) {
497 UpdatePreprocessorFoldLevel(levelCurrent, i + 1, styler);
498 }
499 }
500
501 if (stylePrev != SCE_OSCRIPT_KEYWORD && style == SCE_OSCRIPT_KEYWORD) {
502 lastStart = i;
503 }
504 if (stylePrev == SCE_OSCRIPT_KEYWORD) {
505 if(IsIdentifierChar(ch) && !IsIdentifierChar(chNext)) {
506 UpdateKeywordFoldLevel(levelCurrent, lastStart, i, styler);
507 }
508 }
509
510 if (!IsASpace(ch))
511 visibleChars++;
512
513 if (atLineEnd) {
514 int level = levelPrev;
515 if (visibleChars == 0 && foldCompact)
516 level |= SC_FOLDLEVELWHITEFLAG;
517 if ((levelCurrent > levelPrev) && (visibleChars > 0))
518 level |= SC_FOLDLEVELHEADERFLAG;
519 if (level != styler.LevelAt(lineCurrent)) {
520 styler.SetLevel(lineCurrent, level);
521 }
522 lineCurrent++;
523 levelPrev = levelCurrent;
524 visibleChars = 0;
525 }
526 }
527
528 // If we did not reach EOLN in the previous loop, store the line level and
529 // whitespace information. The rest will be filled in later.
530 int lev = levelPrev;
531 if (visibleChars == 0 && foldCompact)
532 lev |= SC_FOLDLEVELWHITEFLAG;
533 styler.SetLevel(lineCurrent, lev);
534}
535
536// --------------------------------------------
537// Declaration of the OScript lexer descriptor.
538
539static const char * const oscriptWordListDesc[] = {
540 "Keywords and reserved words",
541 "Literal constants",
542 "Literal operators",
543 "Built-in value and reference types",
544 "Built-in global functions",
545 "Built-in static objects",
546 0
547};
548
549LexerModule lmOScript(SCLEX_OSCRIPT, ColouriseOScriptDoc, "oscript", FoldOScriptDoc, oscriptWordListDesc);
550