1/** @file LexRaku.cxx
2 ** Lexer for Raku
3 **
4 ** Copyright (c) 2019 Mark Reay <mark@reay.net.au>
5 **/
6// Copyright 1998-2005 by Neil Hodgson <neilh@scintilla.org>
7// The License.txt file describes the conditions under which this software may be distributed.
8
9/*
10 * Raku (Perl6) Lexer for Scintilla
11 * ---------------------------------
12 * ---------------------------------
13 * 06-Dec-2019: More Unicode support:
14 * - Added a full scope of allowed numbers and letters
15 * 29-Nov-2019: More highlighting / implemented basic folding:
16 * - Operators (blanket cover, no sequence checking)
17 * - Class / Grammar name highlighting
18 * - Folding:
19 * - Comments: line / multi-line
20 * - POD sections
21 * - Code blocks {}
22 * 26-Nov-2019: Basic syntax highlighting covering the following:
23 * - Comments, both line and embedded (multi-line)
24 * - POD, no inline highlighting as yet...
25 * - Heredoc block string, with variable highlighting (with qq)
26 * - Strings, with variable highlighting (with ")
27 * - Q Language, including adverbs (also basic q and qq)
28 * - Regex, including adverbs
29 * - Numbers
30 * - Bareword / identifiers
31 * - Types
32 * - Variables: mu, positional, associative, callable
33 * TODO:
34 * - POD inline
35 * - Better operator sequence coverage
36 */
37
38#include <stdlib.h>
39#include <string.h>
40#include <stdio.h>
41#include <stdarg.h>
42#include <assert.h>
43#include <ctype.h>
44
45#include <string>
46#include <string_view>
47#include <vector>
48#include <map>
49#include <functional>
50
51#include "ILexer.h"
52#include "Scintilla.h"
53#include "SciLexer.h"
54
55#include "WordList.h"
56#include "LexAccessor.h"
57#include "StyleContext.h"
58#include "CharacterSet.h"
59#include "CharacterCategory.h"
60#include "LexerModule.h"
61#include "OptionSet.h"
62#include "DefaultLexer.h"
63
64using namespace Scintilla;
65using namespace Lexilla;
66
67namespace { // anonymous namespace to isolate any name clashes
68/*----------------------------------------------------------------------------*
69 * --- DEFINITIONS: OPTIONS / CONSTANTS ---
70 *----------------------------------------------------------------------------*/
71
// Number types
// NOTE: the numeric order of these values is significant (see the notes on
// the individual values); do not renumber without checking every user.
#define RAKUNUM_BINARY		1	// order is significant: 1-3 cannot have a dot
#define RAKUNUM_OCTAL		2
#define RAKUNUM_FLOAT_EXP	3	// exponent part only
#define RAKUNUM_HEX			4	// may be a hex float
#define RAKUNUM_DECIMAL		5	// 1-5 are numbers; 6-7 are strings
#define RAKUNUM_VECTOR		6
#define RAKUNUM_V_VECTOR	7
#define RAKUNUM_VERSION		8	// can contain multiple '.'s
#define RAKUNUM_BAD			9

// Regex / Q string types
// - the detection helpers (IsRegexStartAtScPos / IsQLangStartAtScPos) use -1
//   for "no type detected", so every valid type is >= 0
// - values below RAKUTYPE_QLANG are regex types; values from RAKUTYPE_QLANG
//   upwards are Q language / string types (ProcessValidRegQlangStart relies
//   on this ordering)
#define RAKUTYPE_REGEX_NORM	0	// 0 char ident
#define RAKUTYPE_REGEX_S	1	// order is significant:
#define RAKUTYPE_REGEX_M	2	// 1 char ident
#define RAKUTYPE_REGEX_Y	3	// 1 char ident
#define RAKUTYPE_REGEX		4	// > RAKUTYPE_REGEX == 2 char identifiers
#define RAKUTYPE_REGEX_RX	5	// 2 char ident
#define RAKUTYPE_REGEX_TR	6	// 2 char ident
#define RAKUTYPE_QLANG		7	// < RAKUTYPE_QLANG == RAKUTYPE_REGEX_?
#define RAKUTYPE_STR_WQ		8	// 0 char ident < word quote >
#define RAKUTYPE_STR_Q		9	// 1 char ident
#define RAKUTYPE_STR_QX		10	// 2 char ident
#define RAKUTYPE_STR_QW		11	// 2 char ident
#define RAKUTYPE_STR_QQ		12	// 2 char ident
#define RAKUTYPE_STR_QQX	13	// 3 char ident
#define RAKUTYPE_STR_QQW	14	// 3 char ident
#define RAKUTYPE_STR_QQWW	15	// 4 char ident

// Delimiter types (the 'type' argument of IsValidQuoteOpener)
#define RAKUDELIM_BRACKET	0	// bracket: regex, Q language
#define RAKUDELIM_QUOTE		1	// quote: normal string
104
// rakuWordLists: descriptions of the keyword sets as defined in config
// - registered with the option set through DefineWordListSets()
// - the position of each entry is the index 'n' handled by
//   LexerRaku::WordListSet (0 = keywords ... 6 = adverbs)
// - the list is terminated by nullptr
const char *const rakuWordLists[] = {
	"Keywords and identifiers",
	"Functions",
	"Types basic",
	"Types composite",
	"Types domain-specific",
	"Types exception",
	"Adverbs",
	nullptr,
};
116
// Lexer options together with their default values
// - each field is bound to a property name in OptionSetRaku
struct OptionsRaku {
	bool fold;					// folding enabled
	bool foldCompact;			// compact folding
	bool foldComment;			// fold comments
	bool foldCommentMultiline;	// fold multi-line comments
	bool foldCommentPOD;		// fold POD comments

	// Everything is switched on by default, except compact folding.
	OptionsRaku() :
		fold(true),
		foldCompact(false),
		foldComment(true),
		foldCommentMultiline(true),
		foldCommentPOD(true) {
	}
};
132
// Option set: binds property names to the OptionsRaku fields and registers
// the word list descriptions (rakuWordLists).
struct OptionSetRaku : public OptionSet<OptionsRaku> {
	OptionSetRaku() {
		// generic folding properties (registered without a description)
		DefineProperty("fold", &OptionsRaku::fold);
		DefineProperty("fold.comment", &OptionsRaku::foldComment);
		DefineProperty("fold.compact", &OptionsRaku::foldCompact);

		// Raku specific folding properties
		DefineProperty("fold.raku.comment.multiline", &OptionsRaku::foldCommentMultiline,
			"Set this property to 0 to disable folding multi-line comments when fold.comment=1.");
		DefineProperty("fold.raku.comment.pod", &OptionsRaku::foldCommentPOD,
			"Set this property to 0 to disable folding POD comments when fold.comment=1.");

		// init word lists
		DefineWordListSets(rakuWordLists);
	}
};
149
// Delimiter pair: an opening delimiter together with its closing delimiter(s)
struct DelimPair {
	int opener;		// opener char
	int closer[2];	// closer chars (the second one is an optional alternative)
	bool interpol;	// can variables be interpolated?
	short count;	// delimiter char count

	// Starts out empty: no opener, no closers, no interpolation, zero count.
	DelimPair() : opener(0), closer{0, 0}, interpol(false), count(0) {
	}

	// Returns true if ch equals either of the two closer chars.
	bool isCloser(int ch) const {
		for (const int candidate : closer) {
			if (candidate == ch)
				return true;
		}
		return false;
	}
};
167
168/*----------------------------------------------------------------------------*
169 * --- FUNCTIONS ---
170 *----------------------------------------------------------------------------*/
171
/*
 * IsANewLine
 *  - returns true if ch is a line ending char: carriage return or line feed
 */
constexpr bool IsANewLine(int ch) noexcept {
	switch (ch) {
	case '\n':
	case '\r':
		return true;
	default:
		return false;
	}
}
179
180/*
181 * IsAWhitespace
182 * - returns true if this is a whitespace (or newline) char
183 */
184bool IsAWhitespace(int ch) noexcept {
185 return IsASpaceOrTab(ch) || IsANewLine(ch);
186}
187
/*
 * IsAlphabet
 *  - returns true if ch is an ASCII letter (a-z or A-Z)
 */
constexpr bool IsAlphabet(int ch) noexcept {
	const bool isLower = ('a' <= ch) && (ch <= 'z');
	const bool isUpper = ('A' <= ch) && (ch <= 'Z');
	return isLower || isUpper;
}
195
196/*
197 * IsCommentLine
198 * - returns true if this is a comment line
199 * - tests: SCE_RAKU_COMMENTLINE or SCE_RAKU_COMMENTEMBED
200 * modified from: LexPerl.cxx
201 */
202bool IsCommentLine(Sci_Position line, LexAccessor &styler, int type = SCE_RAKU_COMMENTLINE) {
203 Sci_Position pos = styler.LineStart(line);
204 Sci_Position eol_pos = styler.LineStart(line + 1) - 1;
205 for (Sci_Position i = pos; i < eol_pos; i++) {
206 char ch = styler[i];
207 int style = styler.StyleAt(i);
208 if (type == SCE_RAKU_COMMENTEMBED) {
209 if (i == (eol_pos - 1) && style == type)
210 return true;
211 } else { // make sure the line is NOT a SCE_RAKU_COMMENTEMBED
212 if (ch == '#' && style == type && styler[i+1] != '`' )
213 return true;
214 else if (!IsASpaceOrTab(ch))
215 return false;
216 }
217 }
218 return false;
219}
220
221/*
222 * ContainsQTo
223 * - returns true if this range contains ":to" in style SCE_RAKU_ADVERB indicating the start
224 * of a SCE_RAKU_HEREDOC_Q or SCE_RAKU_HEREDOC_QQ.
225 */
226bool ContainsQTo(Sci_Position start, Sci_Position end, LexAccessor &styler) {
227 std::string adverb;
228 for (Sci_Position i = start; i < end; i++) {
229 if (styler.StyleAt(i) == SCE_RAKU_ADVERB) {
230 adverb.push_back(styler[i]);
231 }
232 }
233 return adverb.find(":to") != std::string::npos;
234}
235
236/*
237 * GetBracketCloseChar
238 * - returns the end bracket char: opposite of start
239 * - see: http://www.unicode.org/Public/5.1.0/ucd/BidiMirroring.txt (first section)
240 * - Categories are general matches for valid BiDi types
241 * - Most closer chars are opener + 1
242 */
243int GetBracketCloseChar(const int ch) noexcept {
244 const CharacterCategory cc = CategoriseCharacter(ch);
245 switch (cc) {
246 case ccSm:
247 switch (ch) {
248 case 0x3C: return 0x3E; // LESS-THAN SIGN
249 case 0x2208: return 0x220B; // ELEMENT OF
250 case 0x2209: return 0x220C; // NOT AN ELEMENT OF
251 case 0x220A: return 0x220D; // SMALL ELEMENT OF
252 case 0x2215: return 0x29F5; // DIVISION SLASH
253 case 0x2243: return 0x22CD; // ASYMPTOTICALLY EQUAL TO
254 case 0x2298: return 0x29B8; // CIRCLED DIVISION SLASH
255 case 0x22A6: return 0x2ADE; // ASSERTION
256 case 0x22A8: return 0x2AE4; // TRUE
257 case 0x22A9: return 0x2AE3; // FORCES
258 case 0x22AB: return 0x2AE5; // DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
259 case 0x22F2: return 0x22FA; // ELEMENT OF WITH LONG HORIZONTAL STROKE
260 case 0x22F3: return 0x22FB; // ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
261 case 0x22F4: return 0x22FC; // SMALL ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
262 case 0x22F6: return 0x22FD; // ELEMENT OF WITH OVERBAR
263 case 0x22F7: return 0x22FE; // SMALL ELEMENT OF WITH OVERBAR
264 case 0xFF1C: return 0xFF1E; // FULLWIDTH LESS-THAN SIGN
265 }
266 break;
267 case ccPs:
268 switch (ch) {
269 case 0x5B: return 0x5D; // LEFT SQUARE BRACKET
270 case 0x7B: return 0x7D; // LEFT CURLY BRACKET
271 case 0x298D: return 0x2990; // LEFT SQUARE BRACKET WITH TICK IN TOP CORNER
272 case 0x298F: return 0x298E; // LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
273 case 0xFF3B: return 0xFF3D; // FULLWIDTH LEFT SQUARE BRACKET
274 case 0xFF5B: return 0xFF5D; // FULLWIDTH LEFT CURLY BRACKET
275 }
276 break;
277 case ccPi:
278 break;
279 default: return 0;
280 }
281 return ch + 1;
282}
283
284/*
285 * IsValidQuoteOpener
286 * -
287 */
288bool IsValidQuoteOpener(const int ch, DelimPair &dp, int type = RAKUDELIM_BRACKET) noexcept {
289 dp.closer[0] = 0;
290 dp.closer[1] = 0;
291 dp.interpol = true;
292 if (type == RAKUDELIM_QUOTE) {
293 switch (ch) {
294 // Opener Closer Description
295 case '\'': dp.closer[0] = '\''; // APOSTROPHE
296 dp.interpol = false;
297 break;
298 case '"': dp.closer[0] = '"'; // QUOTATION MARK
299 break;
300 case 0x2018: dp.closer[0] = 0x2019; // LEFT SINGLE QUOTATION MARK
301 dp.interpol = false;
302 break;
303 case 0x201C: dp.closer[0] = 0x201D; // LEFT DOUBLE QUOTATION MARK
304 break;
305 case 0x201D: dp.closer[0] = 0x201C; // RIGHT DOUBLE QUOTATION MARK
306 break;
307 case 0x201E: dp.closer[0] = 0x201C; // DOUBLE LOW-9 QUOTATION MARK
308 dp.closer[1] = 0x201D;
309 break;
310 case 0xFF62: dp.closer[0] = 0xFF63; // HALFWIDTH LEFT CORNER BRACKET
311 dp.interpol = false;
312 break;
313 default: return false;
314 }
315 } else if (type == RAKUDELIM_BRACKET) {
316 dp.closer[0] = GetBracketCloseChar(ch);
317 }
318 dp.opener = ch;
319 dp.count = 1;
320 return dp.closer[0] > 0;
321}
322
323/*
324 * IsBracketOpenChar
325 * - true if this is a valid start bracket character
326 */
327bool IsBracketOpenChar(int ch) noexcept {
328 return GetBracketCloseChar(ch) > 0;
329}
330
331/*
332 * IsValidRegOrQAdjacent
333 * - returns true if ch is a valid character to put directly after Q / q
334 * * ref: Q Language: https://docs.raku.org/language/quoting
335 */
336bool IsValidRegOrQAdjacent(int ch) noexcept {
337 return !(IsAlphaNumeric(ch) || ch == '_' || ch == '(' || ch == ')' || ch == '\'' );
338}
339
340/*
341 * IsValidRegOrQPrecede
342 * - returns true if ch is a valid preceeding character to put directly before Q / q
343 * * ref: Q Language: https://docs.raku.org/language/quoting
344 */
345bool IsValidRegOrQPrecede(int ch) noexcept {
346 return !(IsAlphaNumeric(ch) || ch == '_');
347}
348
349/*
350 * MatchCharInRange
351 * - returns true if the mach character is found in range (of length)
352 * - ignoreDelim (default false)
353 */
354bool MatchCharInRange(StyleContext &sc, const Sci_Position length,
355 const int match, bool ignoreDelim = false) {
356 Sci_Position len = 0;
357 int chPrev = sc.chPrev;
358 while (++len < length) {
359 const int ch = sc.GetRelativeCharacter(len);
360 if (ch == match && (ignoreDelim || chPrev != '\\'))
361 return true;
362 }
363 return false;
364}
365
366/*
367 * PrevNonWhitespaceChar
368 * - returns the last non-whitespace char
369 */
370int PrevNonWhitespaceChar(StyleContext &sc) {
371 Sci_Position rel = 0;
372 Sci_Position max_back = 0 - sc.currentPos;
373 while (--rel > max_back) {
374 const int ch = sc.GetRelativeCharacter(rel);
375 if (!IsAWhitespace(ch))
376 return ch;
377 }
378 return 0; // no matching char
379}
380
381/*
382 * IsQLangStartAtScPos
383 * - returns true if this is a valid Q Language sc position
384 * - ref: https://docs.raku.org/language/quoting
385 * - Q :adverb :adverb //;
386 * - q,qx,qw,qq,qqx,qqw,qqww :adverb /:adverb /;
387 */
388bool IsQLangStartAtScPos(StyleContext &sc, int &type, const Sci_Position length) {
389 const bool valid_adj = IsValidRegOrQAdjacent(sc.chNext);
390 const int chFw2 = sc.GetRelativeCharacter(2);
391 const int chFw3 = sc.GetRelativeCharacter(3);
392 type = -1;
393 if (IsValidRegOrQPrecede(sc.chPrev)) {
394 if (sc.ch == 'Q' && valid_adj) {
395 type = RAKUTYPE_QLANG;
396 } else if (sc.ch == 'q') {
397 switch (sc.chNext) {
398 case 'x':
399 type = RAKUTYPE_STR_QX;
400 break;
401 case 'w':
402 type = RAKUTYPE_STR_QW;
403 break;
404 case 'q':
405 if (chFw2 == 'x') {
406 type = RAKUTYPE_STR_QQX;
407 } else if (chFw2 == 'w') {
408 if (chFw3 == 'w') {
409 type = RAKUTYPE_STR_QQWW;
410 } else {
411 type = RAKUTYPE_STR_QQW;
412 }
413 } else {
414 type = RAKUTYPE_STR_QQ;
415 }
416 break;
417 default:
418 type = RAKUTYPE_STR_Q;
419 }
420 } else if (sc.ch == '<' && MatchCharInRange(sc, length, '>')) {
421 type = RAKUTYPE_STR_WQ; // < word quote >
422 }
423 }
424 return type >= 0;
425}
426
427/*
428 * IsRegexStartAtScPos
429 * - returns true if this is a valid Regex sc position
430 * - ref: https://docs.raku.org/language/regexes
431 * - Regex: (rx/s/m/tr/y) :adverb /:adverb /;
432 * - regex R :adverb //;
433 * - /:adverb /;
434 */
435bool IsRegexStartAtScPos(StyleContext &sc, int &type, CharacterSet &set) {
436 const bool valid_adj = IsValidRegOrQAdjacent(sc.chNext);
437 type = -1;
438 if (IsValidRegOrQPrecede(sc.chPrev)) {
439 switch (sc.ch) {
440 case 'r':
441 if (sc.chNext == 'x')
442 type = RAKUTYPE_REGEX_RX;
443 break;
444 case 't':
445 case 'T':
446 if (sc.chNext == 'r' || sc.chNext == 'R')
447 type = RAKUTYPE_REGEX_TR;
448 break;
449 case 'm':
450 if (valid_adj)
451 type = RAKUTYPE_REGEX_M;
452 break;
453 case 's':
454 case 'S':
455 if (valid_adj)
456 type = RAKUTYPE_REGEX_S;
457 break;
458 case 'y':
459 if (valid_adj)
460 type = RAKUTYPE_REGEX_Y;
461 break;
462 case '/':
463 if (set.Contains(PrevNonWhitespaceChar(sc)))
464 type = RAKUTYPE_REGEX_NORM;
465 }
466 }
467 return type >= 0;
468}
469
470/*
471 * IsValidIdentPrecede
472 * - returns if ch is a valid preceeding char to put directly before an identifier
473 */
474bool IsValidIdentPrecede(int ch) noexcept {
475 return !(IsAlphaNumeric(ch) || ch == '_' || ch == '@' || ch == '$' || ch == '%');
476}
477
478/*
479 * IsValidDelimiter
480 * - returns if ch is a valid delimiter (most chars are valid)
481 * * ref: Q Language: https://docs.raku.org/language/quoting
482 */
483bool IsValidDelimiter(int ch) noexcept {
484 return !(IsAlphaNumeric(ch) || ch == ':');
485}
486
487/*
488 * GetDelimiterCloseChar
489 * - returns the corrisponding close char for a given delimiter (could be the same char)
490 */
491int GetDelimiterCloseChar(int ch) noexcept {
492 int ch_end = GetBracketCloseChar(ch);
493 if (ch_end == 0 && IsValidDelimiter(ch)) {
494 ch_end = ch;
495 }
496 return ch_end;
497}
498
499/*
500 * GetRepeatCharCount
501 * - returns the occurence count of match
502 */
503Sci_Position GetRepeatCharCount(StyleContext &sc, int chMatch, Sci_Position length) {
504 Sci_Position cnt = 0;
505 while (cnt < length) {
506 if (sc.GetRelativeCharacter(cnt) != chMatch) {
507 break;
508 }
509 cnt++;
510 }
511 return cnt;
512}
513
/*
 * LengthToDelimiter
 *  - returns the length until the end of a delimited string section, i.e. the
 *    relative offset of the char directly after the last closer char
 *  - returns -1 if the end condition is not met within 'length' chars
 *  - nested opener / closer pairs are skipped (only if opener != closer)
 *  - openers / closers directly preceded by a backslash are ignored
 *  - dp.count closer chars are required; all counters are reset at every new line
 *  - noTrailing (default false): the last closer only counts when it is
 *    followed by whitespace
 */
Sci_Position LengthToDelimiter(StyleContext &sc, const DelimPair &dp,
		Sci_Position length, bool noTrailing = false) {
	short cnt_open = 0;			// count open bracket
	short cnt_close = 0;		// count close bracket
	Sci_Position len = 0;		// count characters
	int chOpener = dp.opener;	// look for nested opener / closer
	if (dp.opener == dp.closer[0])
		chOpener = 0;			// no opening delimiter (no nesting possible)

	while (len < length) {
		const int chPrev = sc.GetRelativeCharacter(len - 1);
		const int ch = sc.GetRelativeCharacter(len);
		const int chNext = sc.GetRelativeCharacter(len+1);

		// The end condition is tested BEFORE the char at 'len' is processed,
		// so the value returned is the offset just past the final closer.
		// NOTE(review): a closer completed on the very last iteration is not
		// tested again after the loop and yields -1 - confirm this is intended.
		if (cnt_open == 0 && cnt_close == dp.count) {
			return len;				// end condition has been met
		} else {
			if (chPrev != '\\' && ch == chOpener) {				// ignore escape sequence
				cnt_open++;			// open nested bracket
			} else if (chPrev != '\\' && dp.isCloser(ch)) {		// ignore escape sequence
				if ( cnt_open > 0 ) {
					cnt_open--;		// close nested bracket
				} else if (dp.count > 1 && cnt_close < (dp.count - 1)) {
					// multi char closer: every closer except the last one
					if (cnt_close > 1) {
						// from the third closer on, the run must be contiguous
						if (dp.isCloser(chPrev)) {
							cnt_close++;
						} else {	// reset if previous char was not close
							cnt_close = 0;
						}
					} else {
						cnt_close++;
					}
				} else if (!noTrailing || (IsAWhitespace(chNext))) {
					cnt_close++;		// found last close
					if (cnt_close > 1 && !dp.isCloser(chPrev)) {
						cnt_close = 0;	// reset if previous char was not close
					}
				} else {
					cnt_close = 0;		// non handled close: reset
				}
			} else if (IsANewLine(ch)) {
				cnt_open = 0;			// reset after each line
				cnt_close = 0;
			}
		}
		len++;
	}
	return -1; // end condition has NOT been met
}
569
570/*
571 * LengthToEndHeredoc
572 * - returns the length until the end of a heredoc section
573 * - delimiter string MUST begin on a new line
574 */
575Sci_Position LengthToEndHeredoc(const StyleContext &sc, LexAccessor &styler,
576 const Sci_Position length, const char *delim) {
577 bool on_new_ln = false;
578 int i = 0; // str index
579 for (int n = 0; n < length; n++) {
580 const char ch = styler.SafeGetCharAt(sc.currentPos + n, 0);
581 if (on_new_ln) {
582 if (delim[i] == '\0')
583 return n; // at end of str, match found!
584 if (ch != delim[i++])
585 i = 0; // no char match, reset 'i'ndex
586 }
587 if (i == 0) // detect new line
588 on_new_ln = IsANewLine(ch);
589 }
590 return -1; // no match found
591}
592
593/*
594 * LengthToNextChar
595 * - returns the length until the next character
596 */
597Sci_Position LengthToNextChar(StyleContext &sc, const Sci_Position length) {
598 Sci_Position len = 0;
599 while (++len < length) {
600 const int ch = sc.GetRelativeCharacter(len);
601 if (!IsASpaceOrTab(ch) && !IsANewLine(ch)) {
602 break;
603 }
604 }
605 return len;
606}
607
608/*
609 * GetRelativeString
610 * - gets a relitive string and sets it in &str
611 * - resets string before seting
612 */
613void GetRelativeString(StyleContext &sc, Sci_Position offset, Sci_Position length,
614 std::string &str) {
615 Sci_Position pos = offset;
616 str.clear();
617 while (pos < length) {
618 str += sc.GetRelativeCharacter(pos++);
619 }
620}
621
622} // end anonymous namespace
623
624/*----------------------------------------------------------------------------*
625 * --- class: LexerRaku ---
626 *----------------------------------------------------------------------------*/
//class LexerRaku : public ILexerWithMetaData {
// Raku lexer object: holds the character sets, the options and the word lists
// and implements the lexer interface on top of DefaultLexer.
class LexerRaku : public DefaultLexer {
	CharacterSet setWord;			// ASCII word chars: alphanumerics, '-' and '_'
	CharacterSet setSigil;			// $ & % @
	CharacterSet setTwigil;			// ! * . : < = ? ^ ~
	CharacterSet setOperator;		// ASCII operator chars
	CharacterSet setSpecialVar;		// _ / !
	WordList regexIdent;			// identifiers that specify a regex
	OptionsRaku options;			// Options from config
	OptionSetRaku osRaku;			// property / word list set definitions
	WordList keywords;				// Word Lists from config
	WordList functions;
	WordList typesBasic;
	WordList typesComposite;
	WordList typesDomainSpecific;
	WordList typesExceptions;
	WordList adverbs;

public:
	// Sets up the fixed character sets and the regex identifier words.
	explicit LexerRaku() :
		DefaultLexer("raku", SCLEX_RAKU),
		setWord(CharacterSet::setAlphaNum, "-_", 0x80),
		setSigil(CharacterSet::setNone, "$&%@"),
		setTwigil(CharacterSet::setNone, "!*.:<=?^~"),
		setOperator(CharacterSet::setNone, "^&\\()-+=|{}[]:;<>,?!.~"),
		setSpecialVar(CharacterSet::setNone, "_/!") {
		regexIdent.Set("regex rule token");
	}
	// Deleted so LexerRaku objects can not be copied or moved.
	LexerRaku(const LexerRaku &) = delete;
	LexerRaku(LexerRaku &&) = delete;
	void operator=(const LexerRaku &) = delete;
	void operator=(LexerRaku &&) = delete;
	virtual ~LexerRaku() {
	}
	// Destroys this lexer object.
	void SCI_METHOD Release() noexcept override {
		delete this;
	}
	// Reports the lexer interface version implemented.
	int SCI_METHOD Version() const noexcept override {
		return lvRelease5;
	}
	// The property queries below are forwarded to the option set.
	const char *SCI_METHOD PropertyNames() override {
		return osRaku.PropertyNames();
	}
	int SCI_METHOD PropertyType(const char *name) override {
		return osRaku.PropertyType(name);
	}
	const char *SCI_METHOD DescribeProperty(const char *name) override {
		return osRaku.DescribeProperty(name);
	}
	Sci_Position SCI_METHOD PropertySet(const char *key, const char *val) override;
	const char *SCI_METHOD PropertyGet(const char *key) override {
		return osRaku.PropertyGet(key);
	}
	const char *SCI_METHOD DescribeWordListSets() override {
		return osRaku.DescribeWordListSets();
	}
	Sci_Position SCI_METHOD WordListSet(int n, const char *wl) override;
	void SCI_METHOD Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
	void SCI_METHOD Fold(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;

	// Factory: returns a newly allocated lexer (deleted again by Release()).
	static ILexer5 *LexerFactoryRaku() {
		return new LexerRaku();
	}

protected:
	// Character classification helpers (ASCII and Unicode)
	bool IsOperatorChar(const int ch);
	bool IsWordChar(const int ch, bool allowNumber = true);
	bool IsWordStartChar(const int ch);
	bool IsNumberChar(const int ch, int base = 10);
	// Lexing helpers working on the style context
	bool ProcessRegexTwinCapture(StyleContext &sc, const Sci_Position length,
		int &type, const DelimPair &dp);
	void ProcessStringVars(StyleContext &sc, const Sci_Position length, const int varState);
	bool ProcessValidRegQlangStart(StyleContext &sc, Sci_Position length, const int type,
		WordList &wordsAdverbs, DelimPair &dp);
	Sci_Position LengthToNonWordChar(StyleContext &sc, Sci_Position length,
		char *s, const int size, Sci_Position offset = 0);
};
706
707/*----------------------------------------------------------------------------*
708 * --- METHODS: LexerRaku ---
709 *----------------------------------------------------------------------------*/
710
711/*
712 * LexerRaku::IsOperatorChar
713 * - Test for both ASCII and Unicode operators
714 * see: https://docs.raku.org/language/unicode_entry
715 */
716bool LexerRaku::IsOperatorChar(const int ch) {
717 if (ch > 0x7F) {
718 switch (ch) {
719 // Unicode ASCII Equiv.
720 case 0x2208: // (elem)
721 case 0x2209: // !(elem)
722 case 0x220B: // (cont)
723 case 0x220C: // !(cont)
724 case 0x2216: // (-)
725 case 0x2229: // (&)
726 case 0x222A: // (|)
727 case 0x2282: // (<)
728 case 0x2283: // (>)
729 case 0x2284: // !(<)
730 case 0x2285: // !(>)
731 case 0x2286: // (<=)
732 case 0x2287: // (>=)
733 case 0x2288: // !(<=)
734 case 0x2289: // !(>=)
735 case 0x228D: // (.)
736 case 0x228E: // (+)
737 case 0x2296: // (^)
738 return true;
739 }
740 }
741 return setOperator.Contains(ch);
742}
743
744/*
745 * LexerRaku::IsWordChar
746 * - Test for both ASCII and Unicode identifier characters
747 * see: https://docs.raku.org/language/unicode_ascii
748 * also: ftp://ftp.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
749 * FIXME: *still* may not contain all valid characters
750 */
751bool LexerRaku::IsWordChar(const int ch, bool allowNumber) {
752 // Unicode numbers should not apear in word identifiers
753 if (ch > 0x7F) {
754 const CharacterCategory cc = CategoriseCharacter(ch);
755 switch (cc) {
756 // Letters
757 case ccLu:
758 case ccLl:
759 case ccLt:
760 case ccLm:
761 case ccLo:
762 return true;
763 default:
764 return false;
765 }
766 } else if (allowNumber && IsADigit(ch)) {
767 return true; // an ASCII number type
768 }
769 return setWord.Contains(ch);
770}
771
772/*
773 * LexerRaku::IsWordStartChar
774 * - Test for both ASCII and Unicode identifier "start / first" characters
775 */
776bool LexerRaku::IsWordStartChar(const int ch) {
777 return ch != '-' && IsWordChar(ch, false); // no numbers allowed
778}
779
780/*
781 * LexerRaku::IsNumberChar
782 * - Test for both ASCII and Unicode identifier number characters
783 * see: https://docs.raku.org/language/unicode_ascii
784 * also: ftp://ftp.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
785 * FILTERED by Unicode letters that are NUMBER
786 * and NOT PARENTHESIZED or CIRCLED
787 * FIXME: *still* may not contain all valid number characters
788 */
789bool LexerRaku::IsNumberChar(const int ch, int base) {
790 if (ch > 0x7F) {
791 const CharacterCategory cc = CategoriseCharacter(ch);
792 switch (cc) {
793 // Numbers
794 case ccNd:
795 case ccNl:
796 case ccNo:
797 return true;
798 default:
799 return false;
800 }
801 }
802 return IsADigit(ch, base);
803}
804
805/*
806 * LexerRaku::PropertySet
807 * -
808 */
809Sci_Position SCI_METHOD LexerRaku::PropertySet(const char *key, const char *val) {
810 if (osRaku.PropertySet(&options, key, val))
811 return 0;
812 return -1;
813}
814
815/*
816 * LexerRaku::WordListSet
817 * -
818 */
819Sci_Position SCI_METHOD LexerRaku::WordListSet(int n, const char *wl) {
820 WordList *wordListN = nullptr;
821 switch (n) {
822 case 0:
823 wordListN = &keywords;
824 break;
825 case 1:
826 wordListN = &functions;
827 break;
828 case 2:
829 wordListN = &typesBasic;
830 break;
831 case 3:
832 wordListN = &typesComposite;
833 break;
834 case 4:
835 wordListN = &typesDomainSpecific;
836 break;
837 case 5:
838 wordListN = &typesExceptions;
839 break;
840 case 6:
841 wordListN = &adverbs;
842 break;
843 }
844 Sci_Position firstModification = -1;
845 if (wordListN) {
846 WordList wlNew;
847 wlNew.Set(wl);
848 if (*wordListN != wlNew) {
849 wordListN->Set(wl);
850 firstModification = 0;
851 }
852 }
853 return firstModification;
854}
855
856/*
857 * LexerRaku::ProcessRegexTwinCapture
858 * - processes the transition between a regex pair (two sets of delimiters)
859 * - moves to first new delimiter, if a bracket
860 * - returns true when valid delimiter start found (if bracket)
861 */
862bool LexerRaku::ProcessRegexTwinCapture(StyleContext &sc, const Sci_Position length,
863 int &type, const DelimPair &dp) {
864
865 if (type == RAKUTYPE_REGEX_S || type == RAKUTYPE_REGEX_TR || type == RAKUTYPE_REGEX_Y) {
866 type = -1; // clear type
867
868 // move past chRegQClose if it was the previous char
869 if (dp.isCloser(sc.chPrev))
870 sc.Forward();
871
872 // no processing needed for non-bracket
873 if (dp.isCloser(dp.opener))
874 return true;
875
876 // move to next opening bracket
877 const Sci_Position len = LengthToNextChar(sc, length);
878 if (sc.GetRelativeCharacter(len) == dp.opener) {
879 sc.Forward(len);
880 return true;
881 }
882 }
883 return false;
884}
885
886/*
887 * LexerRaku::ProcessStringVars
888 * - processes a string and highlights any valid variables
889 */
890void LexerRaku::ProcessStringVars(StyleContext &sc, const Sci_Position length, const int varState) {
891 const int state = sc.state;
892 for (Sci_Position pos = 0; pos < length; pos++) {
893 if (sc.state == varState && !IsWordChar(sc.ch)) {
894 sc.SetState(state);
895 } else if (sc.chPrev != '\\'
896 && (sc.ch == '$' || sc.ch == '@')
897 && IsWordStartChar(sc.chNext)) {
898 sc.SetState(varState);
899 }
900 sc.Forward(); // Next character
901 }
902}
/*
 * LexerRaku::ProcessValidRegQlangStart
 *  - processes a section of the document range from after a Regex / Q identifier:
 *    an optional identifier (type RAKUTYPE_REGEX only), adverbs and the
 *    opening delimiter
 *  - the state on entry is the state the delimited section starts in;
 *    everything before the opening delimiter is styled default, identifier,
 *    adverb or operator
 *  - returns true on success: a closing delimiter could be determined
 *  - sets: wordsAdverbs (names of the adverbs seen, without the ':'),
 *    dp.opener, dp.closer and dp.count (the latter only when an opener is found)
 *  ref: https://docs.raku.org/language/regexes
 */
bool LexerRaku::ProcessValidRegQlangStart(StyleContext &sc, Sci_Position length, const int type,
		WordList &wordsAdverbs, DelimPair &dp) {
	Sci_Position startPos = sc.currentPos;
	Sci_Position startLen = length;
	const int target_state = sc.state;	// state to use from the opener on
	int state = SCE_RAKU_DEFAULT;
	std::string str;					// space separated adverb names

	// find our opening delimiter (and occurrences) / save any adverbs
	dp.opener = 0;					// adverbs can be after the first delimiter
	bool got_all_adverbs = false;	// in Regex statements
	bool got_ident = false;			// regex can have an identifier: 'regex R'
	sc.SetState(state);				// set state default to avoid pre-highlights
	while ((dp.opener == 0 || !got_all_adverbs) && sc.More()) {

		// move to the next non-space character
		const bool was_space = IsAWhitespace(sc.ch);
		if (!got_all_adverbs && was_space) {
			sc.Forward(LengthToNextChar(sc, length));
		}
		length = startLen - (sc.currentPos - startPos); // update length remaining

		// parse / eat an identifier (if type == RAKUTYPE_REGEX)
		if (dp.opener == 0 && !got_ident && type == RAKUTYPE_REGEX && IsAlphabet(sc.ch)) {

			// eat identifier / account for special adverb :sym<name>
			bool got_sym = false;
			while (sc.More()) {
				sc.SetState(SCE_RAKU_IDENTIFIER);
				while (sc.More() && (IsAlphaNumeric(sc.chNext)
						|| sc.chNext == '_' || sc.chNext == '-')) {
					sc.Forward();
				}
				sc.Forward();	// step past the last identifier char
				if (got_sym && sc.ch == '>') {
					sc.SetState(SCE_RAKU_OPERATOR);	// '>'
					sc.Forward();
					break;
				} else if (type == RAKUTYPE_REGEX && sc.Match(":sym<")) {
					sc.SetState(SCE_RAKU_ADVERB);	// ':sym'
					sc.Forward(4);
					sc.SetState(SCE_RAKU_OPERATOR);	// '<'
					sc.Forward();
					got_sym = true;		// loop again to eat the name inside < >
				} else {
					break;
				}
			}
			sc.SetState(state);
			got_ident = true;
		}

		// parse / save an adverb: RAKUTYPE_REGEX only has adverbs after delim
		//                      >= RAKUTYPE_QLANG only has adverbs before delim
		else if (!got_all_adverbs && sc.ch == ':' && (!(dp.opener == 0 && got_ident)
			&& !(dp.opener > 0 && type >= RAKUTYPE_QLANG))) {
			sc.SetState(SCE_RAKU_ADVERB);
			// collect the alphanumeric adverb name that follows the ':'
			while (IsAlphaNumeric(sc.chNext) && sc.More()) {
				sc.Forward();
				str += sc.ch;
			}
			str += ' ';		// separator between adverb names
			sc.Forward();
			sc.SetState(state);
		}

		// find starting delimiter
		else if (dp.opener == 0 && (was_space || IsValidRegOrQAdjacent(sc.ch))
			&& IsValidDelimiter(sc.ch)) {	// make sure the delimiter is legal (most are)
			sc.SetState((state = target_state));// start state here...
			dp.opener = sc.ch;				// this is our delimiter, get count
			if (type < RAKUTYPE_QLANG)		// type is Regex
				dp.count = 1;				// has only one delimiter
			else
				dp.count = GetRepeatCharCount(sc, dp.opener, length);
			sc.Forward(dp.count);
		}

		// we must have all the adverbs by now...
		else {
			if (got_all_adverbs)
				break; // prevent infinite loop: occurs on missing open char
			got_all_adverbs = true;
		}
	}

	// set word list / find a valid closing delimiter (or bomb!)
	wordsAdverbs.Set(str.c_str());
	dp.closer[0] = GetDelimiterCloseChar(dp.opener);
	dp.closer[1] = 0; // no other closer char
	return dp.closer[0] > 0;
}
1002
1003/*
1004 * LexerRaku::LengthToNonWordChar
1005 * - returns the length until the next non "word" character: AlphaNum + '_'
1006 * - also sets all the parsed chars in 's'
1007 */
1008Sci_Position LexerRaku::LengthToNonWordChar(StyleContext &sc, Sci_Position length,
1009 char *s, const int size, Sci_Position offset) {
1010 Sci_Position len = 0;
1011 Sci_Position max_length = size < length ? size : length;
1012 while (len <= max_length) {
1013 const int ch = sc.GetRelativeCharacter(len + offset);
1014 if (!IsWordChar(ch)) {
1015 s[len] = '\0';
1016 break;
1017 }
1018 s[len] = ch;
1019 len++;
1020 }
1021 s[len + 1] = '\0';
1022 return len;
1023}
1024
1025/*
1026 * LexerRaku::Lex
1027 * - Main lexer method
1028 */
1029void SCI_METHOD LexerRaku::Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) {
1030 LexAccessor styler(pAccess);
1031 DelimPair dpEmbeded; // delimiter pair: embeded comments
1032 DelimPair dpString; // delimiter pair: string
1033 DelimPair dpRegQ; // delimiter pair: Regex / Q Lang
1034 std::string hereDelim; // heredoc delimiter (if in heredoc)
1035 int hereState = 0; // heredoc state to use (Q / QQ)
1036 int numState = 0; // number state / type
1037 short cntDecimal = 0; // number decinal count
1038 std::string wordLast; // last word seen
1039 std::string identLast; // last identifier seen
1040 std::string adverbLast; // last (single) adverb seen
1041 WordList lastAdverbs; // last adverbs seen
1042 Sci_Position len; // temp length value
1043 char s[100]; // temp char string
1044 int typeDetect; // temp type detected (for regex and Q lang)
1045 Sci_Position lengthToEnd; // length until the end of range
1046
1047 // Backtrack to safe start position before complex quoted elements
1048
1049 Sci_PositionU newStartPos = startPos;
1050 if (initStyle != SCE_RAKU_DEFAULT) {
1051 // Backtrack to last SCE_RAKU_DEFAULT or 0
1052 while (newStartPos > 0) {
1053 newStartPos--;
1054 if (styler.StyleAt(newStartPos) == SCE_RAKU_DEFAULT)
1055 break;
1056 }
1057 // Backtrack to start of line before SCE_RAKU_HEREDOC_Q?
1058 if (initStyle == SCE_RAKU_HEREDOC_Q || initStyle == SCE_RAKU_HEREDOC_QQ) {
1059 if (newStartPos > 0) {
1060 newStartPos = styler.LineStart(styler.GetLine(newStartPos));
1061 }
1062 }
1063 } else {
1064 const Sci_Position line = styler.GetLine(newStartPos);
1065 if (line > 0) {
1066 // If the previous line is a start of a q or qq heredoc, backtrack to start of line
1067 const Sci_Position startPreviousLine = styler.LineStart(line-1);
1068 if (ContainsQTo(startPreviousLine, newStartPos, styler)) {
1069 newStartPos = startPreviousLine;
1070 }
1071 }
1072 }
1073
1074
1075 // Re-calculate (any) changed startPos, length and initStyle state
1076 if (newStartPos < startPos) {
1077 initStyle = SCE_RAKU_DEFAULT;
1078 length += startPos - newStartPos;
1079 startPos = newStartPos;
1080 }
1081
1082 // init StyleContext
1083 StyleContext sc(startPos, length, initStyle, styler);
1084
1085 // StyleContext Loop
1086 for (; sc.More(); sc.Forward()) {
1087 lengthToEnd = (length - (sc.currentPos - startPos)); // end of range
1088
1089 /* *** Determine if the current state should terminate ************** *
1090 * Everything within the 'switch' statement processes characters up
1091 * until the end of a syntax highlight section / state.
1092 * ****************************************************************** */
1093 switch (sc.state) {
1094 case SCE_RAKU_OPERATOR:
1095 sc.SetState(SCE_RAKU_DEFAULT);
1096 break; // FIXME: better valid operator sequences needed?
1097 case SCE_RAKU_COMMENTLINE:
1098 if (IsANewLine(sc.ch)) {
1099 sc.SetState(SCE_RAKU_DEFAULT);
1100 }
1101 break;
1102 case SCE_RAKU_COMMENTEMBED:
1103 if ((len = LengthToDelimiter(sc, dpEmbeded, lengthToEnd)) >= 0) {
1104 sc.Forward(len); // Move to end delimiter
1105 sc.SetState(SCE_RAKU_DEFAULT);
1106 } else {
1107 sc.Forward(lengthToEnd); // no end delimiter found
1108 }
1109 break;
1110 case SCE_RAKU_POD:
1111 if (sc.atLineStart && sc.Match("=end pod")) {
1112 sc.Forward(8);
1113 sc.SetState(SCE_RAKU_DEFAULT);
1114 }
1115 break;
1116 case SCE_RAKU_STRING:
1117
1118 // Process the string for variables: move to end delimiter
1119 if ((len = LengthToDelimiter(sc, dpString, lengthToEnd)) >= 0) {
1120 if (dpString.interpol) {
1121 ProcessStringVars(sc, len, SCE_RAKU_STRING_VAR);
1122 } else {
1123 sc.Forward(len);
1124 }
1125 sc.SetState(SCE_RAKU_DEFAULT);
1126 } else {
1127 sc.Forward(lengthToEnd); // no end delimiter found
1128 }
1129 break;
1130 case SCE_RAKU_STRING_Q:
1131 case SCE_RAKU_STRING_QQ:
1132 case SCE_RAKU_STRING_Q_LANG:
1133
1134 // No string: previous char was the delimiter
1135 if (dpRegQ.count == 1 && dpRegQ.isCloser(sc.chPrev)) {
1136 sc.SetState(SCE_RAKU_DEFAULT);
1137 }
1138
1139 // Process the string for variables: move to end delimiter
1140 else if ((len = LengthToDelimiter(sc, dpRegQ, lengthToEnd)) >= 0) {
1141
1142 // set (any) heredoc delimiter string
1143 if (lastAdverbs.InList("to")) {
1144 GetRelativeString(sc, -1, len - dpRegQ.count, hereDelim);
1145 hereState = SCE_RAKU_HEREDOC_Q; // default heredoc state
1146 }
1147
1148 // select variable identifiers
1149 if (sc.state == SCE_RAKU_STRING_QQ || lastAdverbs.InList("qq")) {
1150 ProcessStringVars(sc, len, SCE_RAKU_STRING_VAR);
1151 hereState = SCE_RAKU_HEREDOC_QQ; // potential heredoc state
1152 } else {
1153 sc.Forward(len);
1154 }
1155 sc.SetState(SCE_RAKU_DEFAULT);
1156 } else {
1157 sc.Forward(lengthToEnd); // no end delimiter found
1158 }
1159 break;
1160 case SCE_RAKU_HEREDOC_Q:
1161 case SCE_RAKU_HEREDOC_QQ:
1162 if ((len = LengthToEndHeredoc(sc, styler, lengthToEnd, hereDelim.c_str())) >= 0) {
1163 // select variable identifiers
1164 if (sc.state == SCE_RAKU_HEREDOC_QQ) {
1165 ProcessStringVars(sc, len, SCE_RAKU_STRING_VAR);
1166 } else {
1167 sc.Forward(len);
1168 }
1169 sc.SetState(SCE_RAKU_DEFAULT);
1170 } else {
1171 sc.Forward(lengthToEnd); // no end delimiter found
1172 }
1173 hereDelim.clear(); // clear heredoc delimiter
1174 break;
1175 case SCE_RAKU_REGEX:
1176 // account for typeDetect = RAKUTYPE_REGEX_S/TR/Y
1177 while (sc.state == SCE_RAKU_REGEX) {
1178
1179 // No string: previous char was the delimiter
1180 if (dpRegQ.count == 1 && dpRegQ.isCloser(sc.chPrev)) {
1181 if (ProcessRegexTwinCapture(sc, lengthToEnd, typeDetect, dpRegQ))
1182 continue;
1183 sc.SetState(SCE_RAKU_DEFAULT);
1184 break;
1185 }
1186
1187 // Process the string for variables: move to end delimiter
1188 else if ((len = LengthToDelimiter(sc, dpRegQ, lengthToEnd)) >= 0) {
1189 ProcessStringVars(sc, len, SCE_RAKU_REGEX_VAR);
1190 if (ProcessRegexTwinCapture(sc, lengthToEnd, typeDetect, dpRegQ))
1191 continue;
1192 sc.SetState(SCE_RAKU_DEFAULT);
1193 break;
1194 } else {
1195 sc.Forward(lengthToEnd); // no end delimiter found
1196 break;
1197 }
1198 }
1199 break;
1200 case SCE_RAKU_NUMBER:
1201 if (sc.ch == '.') {
1202 if (sc.chNext == '.') { // '..' is an operator
1203 sc.SetState(SCE_RAKU_OPERATOR);
1204 sc.Forward();
1205 if (sc.chNext == '.') // '...' is also an operator
1206 sc.Forward();
1207 break;
1208 } else if (numState > RAKUNUM_FLOAT_EXP
1209 && (cntDecimal < 1 || numState == RAKUNUM_VERSION)) {
1210 cntDecimal++;
1211 sc.Forward();
1212 } else {
1213 sc.SetState(SCE_RAKU_DEFAULT);
1214 break; // too many decinal places
1215 }
1216 }
1217 switch (numState) {
1218 case RAKUNUM_BINARY:
1219 if (!IsNumberChar(sc.ch, 2))
1220 sc.SetState(SCE_RAKU_DEFAULT);
1221 break;
1222 case RAKUNUM_OCTAL:
1223 if (!IsNumberChar(sc.ch, 8))
1224 sc.SetState(SCE_RAKU_DEFAULT);
1225 break;
1226 case RAKUNUM_HEX:
1227 if (!IsNumberChar(sc.ch, 16))
1228 sc.SetState(SCE_RAKU_DEFAULT);
1229 break;
1230 case RAKUNUM_DECIMAL:
1231 case RAKUNUM_VERSION:
1232 if (!IsNumberChar(sc.ch))
1233 sc.SetState(SCE_RAKU_DEFAULT);
1234 }
1235 break;
1236 case SCE_RAKU_WORD:
1237 case SCE_RAKU_FUNCTION:
1238 case SCE_RAKU_TYPEDEF:
1239 case SCE_RAKU_ADVERB:
1240 sc.SetState(SCE_RAKU_DEFAULT);
1241 break;
1242 case SCE_RAKU_MU:
1243 case SCE_RAKU_POSITIONAL:
1244 case SCE_RAKU_ASSOCIATIVE:
1245 case SCE_RAKU_CALLABLE:
1246 case SCE_RAKU_IDENTIFIER:
1247 case SCE_RAKU_GRAMMAR:
1248 case SCE_RAKU_CLASS:
1249 sc.SetState(SCE_RAKU_DEFAULT);
1250 break;
1251 }
1252
1253 /* *** Determine if a new state should be entered ******************* *
1254 * Everything below here identifies the beginning of a state, all or part
1255 * of the characters within this state are processed here, the rest are
1256 * completed above in the terminate state section.
1257 * ****************************************************************** */
1258 if (sc.state == SCE_RAKU_DEFAULT) {
1259
1260 // --- Single line comment
1261 if (sc.ch == '#') {
1262 sc.SetState(SCE_RAKU_COMMENTLINE);
1263 }
1264
1265 // --- POD block
1266 else if (sc.atLineStart && sc.Match("=begin pod")) {
1267 sc.SetState(SCE_RAKU_POD);
1268 sc.Forward(10);
1269 }
1270
1271 // --- String (normal)
1272 else if (sc.chPrev != '\\' && (IsValidQuoteOpener(sc.ch, dpString, RAKUDELIM_QUOTE))) {
1273 sc.SetState(SCE_RAKU_STRING);
1274 }
1275
1276 // --- String (Q Language) ----------------------------------------
1277 // - https://docs.raku.org/language/quoting
1278 // - Q :adverb :adverb //;
1279 // - q,qx,qw,qq,qqx,qqw,qqww :adverb :adverb //;
1280 else if (IsQLangStartAtScPos(sc, typeDetect, lengthToEnd)) {
1281 int state = SCE_RAKU_STRING_Q_LANG;
1282 Sci_Position forward = 1; // single char ident (default)
1283 if (typeDetect > RAKUTYPE_QLANG) {
1284 state = SCE_RAKU_STRING_Q;
1285 if (typeDetect == RAKUTYPE_STR_WQ)
1286 forward = 0; // no char ident
1287 }
1288 if (typeDetect > RAKUTYPE_STR_Q) {
1289 if (typeDetect == RAKUTYPE_STR_QQ)
1290 state = SCE_RAKU_STRING_QQ;
1291 forward++; // two char ident
1292 }
1293 if (typeDetect > RAKUTYPE_STR_QQ)
1294 forward++; // three char ident
1295 if (typeDetect == RAKUTYPE_STR_QQWW)
1296 forward++; // four char ident
1297
1298 // Proceed: check for a valid character after statement
1299 if (IsValidRegOrQAdjacent(sc.GetRelative(forward)) || typeDetect == RAKUTYPE_QLANG) {
1300 sc.SetState(state);
1301 sc.Forward(forward);
1302 lastAdverbs.Clear();
1303
1304 // Process: adverbs / opening delimiter / adverbs after delim
1305 if (ProcessValidRegQlangStart(sc, lengthToEnd, typeDetect,
1306 lastAdverbs, dpRegQ))
1307 sc.SetState(state);
1308 }
1309 }
1310
1311 // --- Regex (rx/s/m/tr/y) ----------------------------------------
1312 // - https://docs.raku.org/language/regexes
1313 else if ((IsRegexStartAtScPos(sc, typeDetect, setOperator) || regexIdent.InList(wordLast.c_str()))) {
1314 if (typeDetect == -1) { // must be a regex identifier word
1315 wordLast.clear();
1316 typeDetect = RAKUTYPE_REGEX;
1317 }
1318 Sci_Position forward = 0; // no ident (RAKUTYPE_REGEX, RAKUTYPE_REGEX_NORM)
1319 if (typeDetect > 0 && typeDetect != RAKUTYPE_REGEX)
1320 forward++; // single char ident
1321 if (typeDetect > RAKUTYPE_REGEX)
1322 forward++; // two char ident
1323
1324 // Proceed: check for a valid character after statement
1325 if (IsValidRegOrQAdjacent(sc.GetRelative(forward)) || typeDetect == RAKUTYPE_REGEX_NORM) {
1326 sc.SetState(SCE_RAKU_REGEX);
1327 sc.Forward(forward);
1328 lastAdverbs.Clear();
1329
1330 // Process: adverbs / opening delimiter / adverbs after delim
1331 if (ProcessValidRegQlangStart(sc, lengthToEnd, typeDetect,
1332 lastAdverbs, dpRegQ))
1333 sc.SetState(SCE_RAKU_REGEX);
1334 }
1335 }
1336
1337 // --- Numbers ----------------------------------------------------
1338 else if (IsValidIdentPrecede(sc.chPrev) && (IsNumberChar(sc.ch)
1339 || (sc.ch == 'v' && IsNumberChar(sc.chNext) && wordLast == "use"))) {
1340 numState = RAKUNUM_DECIMAL; // default: decimal (base 10)
1341 cntDecimal = 0;
1342 sc.SetState(SCE_RAKU_NUMBER);
1343 if (sc.ch == 'v') // forward past 'v'
1344 sc.Forward();
1345 if (wordLast == "use") { // package version number
1346 numState = RAKUNUM_VERSION;
1347 } else if (sc.ch == '0') { // other type of number
1348 switch (sc.chNext) {
1349 case 'b': // binary (base 2)
1350 numState = RAKUNUM_BINARY;
1351 break;
1352 case 'o': // octal (base 8)
1353 numState = RAKUNUM_OCTAL;
1354 break;
1355 case 'x': // hexadecimal (base 16)
1356 numState = RAKUNUM_HEX;
1357 }
1358 if (numState != RAKUNUM_DECIMAL)
1359 sc.Forward(); // forward to number type char
1360 }
1361 }
1362
1363 // --- Keywords / functions / types / barewords -------------------
1364 else if ((sc.currentPos == 0 || sc.atLineStart || IsValidIdentPrecede(sc.chPrev))
1365 && IsWordStartChar(sc.ch)) {
1366 len = LengthToNonWordChar(sc, lengthToEnd, s, sizeof(s));
1367 if (keywords.InList(s)) {
1368 sc.SetState(SCE_RAKU_WORD); // Keywords
1369 } else if(functions.InList(s)) {
1370 sc.SetState(SCE_RAKU_FUNCTION); // Functions
1371 } else if(typesBasic.InList(s)) {
1372 sc.SetState(SCE_RAKU_TYPEDEF); // Types (basic)
1373 } else if(typesComposite.InList(s)) {
1374 sc.SetState(SCE_RAKU_TYPEDEF); // Types (composite)
1375 } else if(typesDomainSpecific.InList(s)) {
1376 sc.SetState(SCE_RAKU_TYPEDEF); // Types (domain-specific)
1377 } else if(typesExceptions.InList(s)) {
1378 sc.SetState(SCE_RAKU_TYPEDEF); // Types (exceptions)
1379 } else {
1380 if (wordLast == "class")
1381 sc.SetState(SCE_RAKU_CLASS); // a Class ident
1382 else if (wordLast == "grammar")
1383 sc.SetState(SCE_RAKU_GRAMMAR); // a Grammar ident
1384 else
1385 sc.SetState(SCE_RAKU_IDENTIFIER); // Bareword
1386 identLast = s; // save identifier
1387 }
1388 if (adverbLast == "sym") { // special adverb ":sym"
1389 sc.SetState(SCE_RAKU_IDENTIFIER); // treat as identifier
1390 identLast = s; // save identifier
1391 }
1392 if (sc.state != SCE_RAKU_IDENTIFIER)
1393 wordLast = s; // save word
1394 sc.Forward(len - 1); // ...forward past word
1395 }
1396
1397 // --- Adverbs ----------------------------------------------------
1398 else if (sc.ch == ':' && IsWordStartChar(sc.chNext)) {
1399 len = LengthToNonWordChar(sc, lengthToEnd, s, sizeof(s), 1);
1400 if (adverbs.InList(s)) {
1401 sc.SetState(SCE_RAKU_ADVERB); // Adverbs (begin with ':')
1402 adverbLast = s; // save word
1403 sc.Forward(len); // ...forward past word (less offset: 1)
1404 }
1405 }
1406
1407 // --- Identifiers: $mu / @positional / %associative / &callable --
1408 // see: https://docs.raku.org/language/variables
1409 else if (setSigil.Contains(sc.ch) && (setTwigil.Contains(sc.chNext)
1410 || setSpecialVar.Contains(sc.chNext)
1411 || IsWordStartChar(sc.chNext))) {
1412
1413 // State based on sigil
1414 switch (sc.ch) {
1415 case '$': sc.SetState(SCE_RAKU_MU);
1416 break;
1417 case '@': sc.SetState(SCE_RAKU_POSITIONAL);
1418 break;
1419 case '%': sc.SetState(SCE_RAKU_ASSOCIATIVE);
1420 break;
1421 case '&': sc.SetState(SCE_RAKU_CALLABLE);
1422 }
1423 const int state = sc.state;
1424 sc.Forward();
1425 char ch_delim = 0;
1426 if (setSpecialVar.Contains(sc.ch)
1427 && !setWord.Contains(sc.chNext)) { // Process Special Var
1428 ch_delim = -1;
1429 } else if (setTwigil.Contains(sc.ch)) { // Process Twigil
1430 sc.SetState(SCE_RAKU_OPERATOR);
1431 if (sc.ch == '<' && setWord.Contains(sc.chNext))
1432 ch_delim = '>';
1433 sc.Forward();
1434 sc.SetState(state);
1435 }
1436
1437 // Process (any) identifier
1438 if (ch_delim >= 0) {
1439 sc.Forward(LengthToNonWordChar(sc, lengthToEnd, s, sizeof(s)) - 1);
1440 if (ch_delim > 0 && sc.chNext == ch_delim) {
1441 sc.Forward();
1442 sc.SetState(SCE_RAKU_OPERATOR);
1443 }
1444 identLast = s; // save identifier
1445 }
1446 }
1447
1448 // --- Operators --------------------------------------------------
1449 else if (IsOperatorChar(sc.ch)) {
1450 // FIXME: better valid operator sequences needed?
1451 sc.SetState(SCE_RAKU_OPERATOR);
1452 }
1453
1454 // --- Heredoc: begin ---------------------------------------------
1455 else if (!hereDelim.empty() && sc.atLineEnd) {
1456 if (IsANewLine(sc.ch))
1457 sc.Forward(); // skip a possible CRLF situation
1458 sc.SetState(hereState);
1459 }
1460
1461 // Reset words: on operator simi-colon OR '}' (end of statement)
1462 if (sc.state == SCE_RAKU_OPERATOR && (sc.ch == ';' || sc.ch == '}')) {
1463 wordLast.clear();
1464 identLast.clear();
1465 adverbLast.clear();
1466 }
1467 }
1468
1469 /* *** Determine if an "embedded comment" is to be entered ********** *
1470 * This type of embedded comment section, or multi-line comment comes
1471 * after a normal comment has begun... e.g: #`[ ... ]
1472 * ****************************************************************** */
1473 else if (sc.state == SCE_RAKU_COMMENTLINE && sc.chPrev == '#' && sc.ch == '`') {
1474 if (IsBracketOpenChar(sc.chNext)) {
1475 sc.Forward(); // Condition met for "embedded comment"
1476 dpEmbeded.opener = sc.ch;
1477
1478 // Find the opposite (termination) closeing bracket (if any)
1479 dpEmbeded.closer[0] = GetBracketCloseChar(dpEmbeded.opener);
1480 if (dpEmbeded.closer[0] > 0) { // Enter "embedded comment"
1481
1482 // Find multiple opening character occurence
1483 dpEmbeded.count = GetRepeatCharCount(sc, dpEmbeded.opener, lengthToEnd);
1484 sc.SetState(SCE_RAKU_COMMENTEMBED);
1485 sc.Forward(dpEmbeded.count - 1); // incremented in the next loop
1486 }
1487 }
1488 }
1489 }
1490
1491 // And we're done...
1492 sc.Complete();
1493}
1494
1495/*
1496 * LexerRaku::Lex
1497 * - Main fold method
1498 * NOTE: although Raku uses and supports UNICODE characters, we're only looking
1499 * at normal chars here, using 'SafeGetCharAt' - for folding purposes
1500 * that is all we need.
1501 */
1502#define RAKU_HEADFOLD_SHIFT 4
1503#define RAKU_HEADFOLD_MASK 0xF0
1504void SCI_METHOD LexerRaku::Fold(Sci_PositionU startPos, Sci_Position length, int /* initStyle */, IDocument *pAccess) {
1505
1506 // init LexAccessor / return if fold option is off
1507 if (!options.fold) return;
1508 LexAccessor styler(pAccess);
1509
1510 // init char and line positions
1511 const Sci_PositionU endPos = startPos + length;
1512 Sci_Position lineCurrent = styler.GetLine(startPos);
1513
1514 // Backtrack to last SCE_RAKU_DEFAULT line
1515 if (startPos > 0 && lineCurrent > 0) {
1516 while (lineCurrent > 0 && styler.StyleAt(startPos) != SCE_RAKU_DEFAULT) {
1517 lineCurrent--;
1518 startPos = styler.LineStart(lineCurrent);
1519 }
1520 lineCurrent = styler.GetLine(startPos);
1521 }
1522 Sci_PositionU lineStart = startPos;
1523 Sci_PositionU lineStartNext = styler.LineStart(lineCurrent + 1);
1524
1525 // init line folding level
1526 int levelPrev = SC_FOLDLEVELBASE;
1527 if (lineCurrent > 0)
1528 levelPrev = styler.LevelAt(lineCurrent - 1) >> 16;
1529 int levelCurrent = levelPrev;
1530
1531 // init char and style variables
1532 char chNext = styler[startPos];
1533 int stylePrev = styler.StyleAt(startPos - 1);
1534 int styleNext = styler.StyleAt(startPos);
1535 int styleNextStartLine = styler.StyleAt(lineStartNext);
1536 int visibleChars = 0;
1537 bool wasCommentMulti = false;
1538
1539 // main loop
1540 for (Sci_PositionU i = startPos; i < endPos; i++) {
1541
1542 // next char, style and flags
1543 const char ch = chNext;
1544 chNext = styler.SafeGetCharAt(i + 1);
1545 const int style = styleNext;
1546 styleNext = styler.StyleAt(i + 1);
1547 const bool atEOL = i == (lineStartNext - 1);
1548 const bool atLineStart = i == lineStart;
1549
1550 // --- Comments / Multi-line / POD ------------------------------------
1551 if (options.foldComment) {
1552
1553 // Multi-line
1554 if (options.foldCommentMultiline) {
1555 if (style == SCE_RAKU_COMMENTLINE && atLineStart && ch == '#' && chNext == '`'
1556 && styleNextStartLine == SCE_RAKU_COMMENTEMBED) {
1557 levelCurrent++;
1558 wasCommentMulti = true; // don't confuse line comments
1559 } else if (style == SCE_RAKU_COMMENTEMBED && atLineStart
1560 && styleNextStartLine != SCE_RAKU_COMMENTEMBED) {
1561 levelCurrent--;
1562 }
1563 }
1564
1565 // Line comments
1566 if (!wasCommentMulti && atEOL && stylePrev == SCE_RAKU_COMMENTLINE
1567 && IsCommentLine(lineCurrent, styler)) {
1568 if (!IsCommentLine(lineCurrent - 1, styler)
1569 && IsCommentLine(lineCurrent + 1, styler))
1570 levelCurrent++;
1571 else if (IsCommentLine(lineCurrent - 1, styler)
1572 && !IsCommentLine(lineCurrent + 1, styler))
1573 levelCurrent--;
1574 }
1575
1576 // POD
1577 if (options.foldCommentPOD && atLineStart && style == SCE_RAKU_POD) {
1578 if (styler.Match(i, "=begin"))
1579 levelCurrent++;
1580 else if (styler.Match(i, "=end"))
1581 levelCurrent--;
1582 }
1583 }
1584
1585 // --- Code block -----------------------------------------------------
1586 if (style == SCE_RAKU_OPERATOR) {
1587 if (ch == '{') {
1588 if (levelCurrent < levelPrev) levelPrev--;
1589 levelCurrent++;
1590 } else if (ch == '}') {
1591 levelCurrent--;
1592 }
1593 }
1594
1595 // --- at end of line / range / apply fold ----------------------------
1596 if (atEOL) {
1597 int level = levelPrev;
1598
1599 // set level flags
1600 level |= levelCurrent << 16;
1601 if (visibleChars == 0 && options.foldCompact)
1602 level |= SC_FOLDLEVELWHITEFLAG;
1603 if ((levelCurrent > levelPrev) && (visibleChars > 0))
1604 level |= SC_FOLDLEVELHEADERFLAG;
1605 if (level != styler.LevelAt(lineCurrent)) {
1606 styler.SetLevel(lineCurrent, level);
1607 }
1608 lineCurrent++;
1609 lineStart = lineStartNext;
1610 lineStartNext = styler.LineStart(lineCurrent + 1);
1611 styleNextStartLine = styler.StyleAt(lineStartNext);
1612 levelPrev = levelCurrent;
1613 visibleChars = 0;
1614 wasCommentMulti = false;
1615 }
1616
1617 // increment visibleChars / set previous char
1618 if (!isspacechar(ch))
1619 visibleChars++;
1620 stylePrev = style;
1621 }
1622
1623 // Done: set real level of the next line
1624 int flagsNext = styler.LevelAt(lineCurrent) & ~SC_FOLDLEVELNUMBERMASK;
1625 styler.SetLevel(lineCurrent, levelPrev | flagsNext);
1626}
1627
1628/*----------------------------------------------------------------------------*
1629 * --- Scintilla: LexerModule ---
1630 *----------------------------------------------------------------------------*/
1631
// Lexer module object for Raku: built from the SCLEX_RAKU identifier, the
// LexerRaku::LexerFactoryRaku factory function, the language name "raku"
// and the word list descriptions in rakuWordLists.
LexerModule lmRaku(SCLEX_RAKU, LexerRaku::LexerFactoryRaku, "raku", rakuWordLists);
1633