1 | // Scintilla source code edit control |
2 | /** |
3 | * @file LexJSON.cxx |
4 | * @date February 19, 2016 |
5 | * @brief Lexer for JSON and JSON-LD formats |
6 | * @author nkmathew |
7 | * |
8 | * The License.txt file describes the conditions under which this software may |
9 | * be distributed. |
10 | * |
11 | */ |
12 | |
13 | #include <cstdlib> |
14 | #include <cassert> |
15 | #include <cctype> |
16 | #include <cstdio> |
17 | |
18 | #include <string> |
19 | #include <string_view> |
20 | #include <vector> |
21 | #include <map> |
22 | #include <functional> |
23 | |
24 | #include "ILexer.h" |
25 | #include "Scintilla.h" |
26 | #include "SciLexer.h" |
27 | #include "WordList.h" |
28 | #include "LexAccessor.h" |
29 | #include "StyleContext.h" |
30 | #include "CharacterSet.h" |
31 | #include "LexerModule.h" |
32 | #include "OptionSet.h" |
33 | #include "DefaultLexer.h" |
34 | |
35 | using namespace Scintilla; |
36 | using namespace Lexilla; |
37 | |
// Descriptions of the keyword lists understood by this lexer, in word list
// index order. The array is terminated by a null pointer.
static const char *const JSONWordListDesc[] = {
	"JSON Keywords",
	"JSON-LD Keywords",
	nullptr
};
43 | |
44 | /** |
45 | * Used to detect compact IRI/URLs in JSON-LD without first looking ahead for the |
46 | * colon separating the prefix and suffix |
47 | * |
48 | * https://www.w3.org/TR/json-ld/#dfn-compact-iri |
49 | */ |
50 | struct CompactIRI { |
51 | int colonCount; |
52 | bool foundInvalidChar; |
53 | CharacterSet setCompactIRI; |
54 | CompactIRI() { |
55 | colonCount = 0; |
56 | foundInvalidChar = false; |
57 | setCompactIRI = CharacterSet(CharacterSet::setAlpha, "$_-" ); |
58 | } |
59 | void resetState() { |
60 | colonCount = 0; |
61 | foundInvalidChar = false; |
62 | } |
63 | void checkChar(int ch) { |
64 | if (ch == ':') { |
65 | colonCount++; |
66 | } else { |
67 | foundInvalidChar |= !setCompactIRI.Contains(ch); |
68 | } |
69 | } |
70 | bool shouldHighlight() const { |
71 | return !foundInvalidChar && colonCount == 1; |
72 | } |
73 | }; |
74 | |
75 | /** |
76 | * Keeps track of escaped characters in strings as per: |
77 | * |
78 | * https://tools.ietf.org/html/rfc7159#section-7 |
79 | */ |
80 | struct EscapeSequence { |
81 | int digitsLeft; |
82 | CharacterSet setHexDigits; |
83 | CharacterSet setEscapeChars; |
84 | EscapeSequence() { |
85 | digitsLeft = 0; |
86 | setHexDigits = CharacterSet(CharacterSet::setDigits, "ABCDEFabcdef" ); |
87 | setEscapeChars = CharacterSet(CharacterSet::setNone, "\\\"tnbfru/" ); |
88 | } |
89 | // Returns true if the following character is a valid escaped character |
90 | bool newSequence(int nextChar) { |
91 | digitsLeft = 0; |
92 | if (nextChar == 'u') { |
93 | digitsLeft = 5; |
94 | } else if (!setEscapeChars.Contains(nextChar)) { |
95 | return false; |
96 | } |
97 | return true; |
98 | } |
99 | bool atEscapeEnd() const { |
100 | return digitsLeft <= 0; |
101 | } |
102 | bool isInvalidChar(int currChar) const { |
103 | return !setHexDigits.Contains(currChar); |
104 | } |
105 | }; |
106 | |
// User-configurable lexer options. Every option is off by default; the
// property names that map onto these members are registered in OptionSetJSON.
struct OptionsJSON {
	// "fold.compact"
	bool foldCompact = false;
	// "fold": folding is skipped entirely when false
	bool fold = false;
	// "lexer.json.allow.comments": highlight // and /* */ comments
	bool allowComments = false;
	// "lexer.json.escape.sequence": highlight escape sequences in strings
	bool escapeSequence = false;
};
119 | |
120 | struct OptionSetJSON : public OptionSet<OptionsJSON> { |
121 | OptionSetJSON() { |
122 | DefineProperty("lexer.json.escape.sequence" , &OptionsJSON::escapeSequence, |
123 | "Set to 1 to enable highlighting of escape sequences in strings" ); |
124 | |
125 | DefineProperty("lexer.json.allow.comments" , &OptionsJSON::allowComments, |
126 | "Set to 1 to enable highlighting of line/block comments in JSON" ); |
127 | |
128 | DefineProperty("fold.compact" , &OptionsJSON::foldCompact); |
129 | DefineProperty("fold" , &OptionsJSON::fold); |
130 | DefineWordListSets(JSONWordListDesc); |
131 | } |
132 | }; |
133 | |
134 | class LexerJSON : public DefaultLexer { |
135 | OptionsJSON options; |
136 | OptionSetJSON optSetJSON; |
137 | EscapeSequence escapeSeq; |
138 | WordList keywordsJSON; |
139 | WordList keywordsJSONLD; |
140 | CharacterSet setOperators; |
141 | CharacterSet setURL; |
142 | CharacterSet setKeywordJSONLD; |
143 | CharacterSet setKeywordJSON; |
144 | CompactIRI compactIRI; |
145 | |
146 | static bool IsNextNonWhitespace(LexAccessor &styler, Sci_Position start, char ch) { |
147 | Sci_Position i = 0; |
148 | while (i < 50) { |
149 | i++; |
150 | char curr = styler.SafeGetCharAt(start+i, '\0'); |
151 | char next = styler.SafeGetCharAt(start+i+1, '\0'); |
152 | bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n'); |
153 | if (curr == ch) { |
154 | return true; |
155 | } else if (!isspacechar(curr) || atEOL) { |
156 | return false; |
157 | } |
158 | } |
159 | return false; |
160 | } |
161 | |
162 | /** |
163 | * Looks for the colon following the end quote |
164 | * |
165 | * Assumes property names of lengths no longer than a 100 characters. |
166 | * The colon is also expected to be less than 50 spaces after the end |
167 | * quote for the string to be considered a property name |
168 | */ |
169 | static bool AtPropertyName(LexAccessor &styler, Sci_Position start) { |
170 | Sci_Position i = 0; |
171 | bool escaped = false; |
172 | while (i < 100) { |
173 | i++; |
174 | char curr = styler.SafeGetCharAt(start+i, '\0'); |
175 | if (escaped) { |
176 | escaped = false; |
177 | continue; |
178 | } |
179 | escaped = curr == '\\'; |
180 | if (curr == '"') { |
181 | return IsNextNonWhitespace(styler, start+i, ':'); |
182 | } else if (!curr) { |
183 | return false; |
184 | } |
185 | } |
186 | return false; |
187 | } |
188 | |
189 | static bool IsNextWordInList(WordList &keywordList, CharacterSet wordSet, |
190 | StyleContext &context, LexAccessor &styler) { |
191 | char word[51]; |
192 | Sci_Position currPos = (Sci_Position) context.currentPos; |
193 | int i = 0; |
194 | while (i < 50) { |
195 | char ch = styler.SafeGetCharAt(currPos + i); |
196 | if (!wordSet.Contains(ch)) { |
197 | break; |
198 | } |
199 | word[i] = ch; |
200 | i++; |
201 | } |
202 | word[i] = '\0'; |
203 | return keywordList.InList(word); |
204 | } |
205 | |
206 | public: |
207 | LexerJSON() : |
208 | DefaultLexer("json" , SCLEX_JSON), |
209 | setOperators(CharacterSet::setNone, "[{}]:," ), |
210 | setURL(CharacterSet::setAlphaNum, "-._~:/?#[]@!$&'()*+,),=" ), |
211 | setKeywordJSONLD(CharacterSet::setAlpha, ":@" ), |
212 | setKeywordJSON(CharacterSet::setAlpha, "$_" ) { |
213 | } |
214 | virtual ~LexerJSON() {} |
215 | int SCI_METHOD Version() const override { |
216 | return lvRelease5; |
217 | } |
218 | void SCI_METHOD Release() override { |
219 | delete this; |
220 | } |
221 | const char *SCI_METHOD PropertyNames() override { |
222 | return optSetJSON.PropertyNames(); |
223 | } |
224 | int SCI_METHOD PropertyType(const char *name) override { |
225 | return optSetJSON.PropertyType(name); |
226 | } |
227 | const char *SCI_METHOD DescribeProperty(const char *name) override { |
228 | return optSetJSON.DescribeProperty(name); |
229 | } |
230 | Sci_Position SCI_METHOD PropertySet(const char *key, const char *val) override { |
231 | if (optSetJSON.PropertySet(&options, key, val)) { |
232 | return 0; |
233 | } |
234 | return -1; |
235 | } |
236 | const char * SCI_METHOD PropertyGet(const char *key) override { |
237 | return optSetJSON.PropertyGet(key); |
238 | } |
239 | Sci_Position SCI_METHOD WordListSet(int n, const char *wl) override { |
240 | WordList *wordListN = 0; |
241 | switch (n) { |
242 | case 0: |
243 | wordListN = &keywordsJSON; |
244 | break; |
245 | case 1: |
246 | wordListN = &keywordsJSONLD; |
247 | break; |
248 | } |
249 | Sci_Position firstModification = -1; |
250 | if (wordListN) { |
251 | WordList wlNew; |
252 | wlNew.Set(wl); |
253 | if (*wordListN != wlNew) { |
254 | wordListN->Set(wl); |
255 | firstModification = 0; |
256 | } |
257 | } |
258 | return firstModification; |
259 | } |
260 | void *SCI_METHOD PrivateCall(int, void *) override { |
261 | return 0; |
262 | } |
263 | static ILexer5 *LexerFactoryJSON() { |
264 | return new LexerJSON; |
265 | } |
266 | const char *SCI_METHOD DescribeWordListSets() override { |
267 | return optSetJSON.DescribeWordListSets(); |
268 | } |
269 | void SCI_METHOD Lex(Sci_PositionU startPos, |
270 | Sci_Position length, |
271 | int initStyle, |
272 | IDocument *pAccess) override; |
273 | void SCI_METHOD Fold(Sci_PositionU startPos, |
274 | Sci_Position length, |
275 | int initStyle, |
276 | IDocument *pAccess) override; |
277 | }; |
278 | |
279 | void SCI_METHOD LexerJSON::Lex(Sci_PositionU startPos, |
280 | Sci_Position length, |
281 | int initStyle, |
282 | IDocument *pAccess) { |
283 | LexAccessor styler(pAccess); |
284 | StyleContext context(startPos, length, initStyle, styler); |
285 | int stringStyleBefore = SCE_JSON_STRING; |
286 | while (context.More()) { |
287 | switch (context.state) { |
288 | case SCE_JSON_BLOCKCOMMENT: |
289 | if (context.Match("*/" )) { |
290 | context.Forward(); |
291 | context.ForwardSetState(SCE_JSON_DEFAULT); |
292 | } |
293 | break; |
294 | case SCE_JSON_LINECOMMENT: |
295 | if (context.atLineEnd) { |
296 | context.SetState(SCE_JSON_DEFAULT); |
297 | } |
298 | break; |
299 | case SCE_JSON_STRINGEOL: |
300 | if (context.atLineStart) { |
301 | context.SetState(SCE_JSON_DEFAULT); |
302 | } |
303 | break; |
304 | case SCE_JSON_ESCAPESEQUENCE: |
305 | escapeSeq.digitsLeft--; |
306 | if (!escapeSeq.atEscapeEnd()) { |
307 | if (escapeSeq.isInvalidChar(context.ch)) { |
308 | context.SetState(SCE_JSON_ERROR); |
309 | } |
310 | break; |
311 | } |
312 | if (context.ch == '"') { |
313 | context.SetState(stringStyleBefore); |
314 | context.ForwardSetState(SCE_C_DEFAULT); |
315 | } else if (context.ch == '\\') { |
316 | if (!escapeSeq.newSequence(context.chNext)) { |
317 | context.SetState(SCE_JSON_ERROR); |
318 | } |
319 | context.Forward(); |
320 | } else { |
321 | context.SetState(stringStyleBefore); |
322 | if (context.atLineEnd) { |
323 | context.ChangeState(SCE_JSON_STRINGEOL); |
324 | } |
325 | } |
326 | break; |
327 | case SCE_JSON_PROPERTYNAME: |
328 | case SCE_JSON_STRING: |
329 | if (context.ch == '"') { |
330 | if (compactIRI.shouldHighlight()) { |
331 | context.ChangeState(SCE_JSON_COMPACTIRI); |
332 | context.ForwardSetState(SCE_JSON_DEFAULT); |
333 | compactIRI.resetState(); |
334 | } else { |
335 | context.ForwardSetState(SCE_JSON_DEFAULT); |
336 | } |
337 | } else if (context.atLineEnd) { |
338 | context.ChangeState(SCE_JSON_STRINGEOL); |
339 | } else if (context.ch == '\\') { |
340 | stringStyleBefore = context.state; |
341 | if (options.escapeSequence) { |
342 | context.SetState(SCE_JSON_ESCAPESEQUENCE); |
343 | if (!escapeSeq.newSequence(context.chNext)) { |
344 | context.SetState(SCE_JSON_ERROR); |
345 | } |
346 | } |
347 | context.Forward(); |
348 | } else if (context.Match("https://" ) || |
349 | context.Match("http://" ) || |
350 | context.Match("ssh://" ) || |
351 | context.Match("git://" ) || |
352 | context.Match("svn://" ) || |
353 | context.Match("ftp://" ) || |
354 | context.Match("mailto:" )) { |
355 | // Handle most common URI schemes only |
356 | stringStyleBefore = context.state; |
357 | context.SetState(SCE_JSON_URI); |
358 | } else if (context.ch == '@') { |
359 | // https://www.w3.org/TR/json-ld/#dfn-keyword |
360 | if (IsNextWordInList(keywordsJSONLD, setKeywordJSONLD, context, styler)) { |
361 | stringStyleBefore = context.state; |
362 | context.SetState(SCE_JSON_LDKEYWORD); |
363 | } |
364 | } else { |
365 | compactIRI.checkChar(context.ch); |
366 | } |
367 | break; |
368 | case SCE_JSON_LDKEYWORD: |
369 | case SCE_JSON_URI: |
370 | if ((!setKeywordJSONLD.Contains(context.ch) && |
371 | (context.state == SCE_JSON_LDKEYWORD)) || |
372 | (!setURL.Contains(context.ch))) { |
373 | context.SetState(stringStyleBefore); |
374 | } |
375 | if (context.ch == '"') { |
376 | context.ForwardSetState(SCE_JSON_DEFAULT); |
377 | } else if (context.atLineEnd) { |
378 | context.ChangeState(SCE_JSON_STRINGEOL); |
379 | } |
380 | break; |
381 | case SCE_JSON_OPERATOR: |
382 | case SCE_JSON_NUMBER: |
383 | context.SetState(SCE_JSON_DEFAULT); |
384 | break; |
385 | case SCE_JSON_ERROR: |
386 | if (context.atLineEnd) { |
387 | context.SetState(SCE_JSON_DEFAULT); |
388 | } |
389 | break; |
390 | case SCE_JSON_KEYWORD: |
391 | if (!setKeywordJSON.Contains(context.ch)) { |
392 | context.SetState(SCE_JSON_DEFAULT); |
393 | } |
394 | break; |
395 | } |
396 | if (context.state == SCE_JSON_DEFAULT) { |
397 | if (context.ch == '"') { |
398 | compactIRI.resetState(); |
399 | context.SetState(SCE_JSON_STRING); |
400 | Sci_Position currPos = static_cast<Sci_Position>(context.currentPos); |
401 | if (AtPropertyName(styler, currPos)) { |
402 | context.SetState(SCE_JSON_PROPERTYNAME); |
403 | } |
404 | } else if (setOperators.Contains(context.ch)) { |
405 | context.SetState(SCE_JSON_OPERATOR); |
406 | } else if (options.allowComments && context.Match("/*" )) { |
407 | context.SetState(SCE_JSON_BLOCKCOMMENT); |
408 | context.Forward(); |
409 | } else if (options.allowComments && context.Match("//" )) { |
410 | context.SetState(SCE_JSON_LINECOMMENT); |
411 | } else if (setKeywordJSON.Contains(context.ch)) { |
412 | if (IsNextWordInList(keywordsJSON, setKeywordJSON, context, styler)) { |
413 | context.SetState(SCE_JSON_KEYWORD); |
414 | } |
415 | } |
416 | bool numberStart = |
417 | IsADigit(context.ch) && (context.chPrev == '+'|| |
418 | context.chPrev == '-' || |
419 | context.atLineStart || |
420 | IsASpace(context.chPrev) || |
421 | setOperators.Contains(context.chPrev)); |
422 | bool exponentPart = |
423 | tolower(context.ch) == 'e' && |
424 | IsADigit(context.chPrev) && |
425 | (IsADigit(context.chNext) || |
426 | context.chNext == '+' || |
427 | context.chNext == '-'); |
428 | bool signPart = |
429 | (context.ch == '-' || context.ch == '+') && |
430 | ((tolower(context.chPrev) == 'e' && IsADigit(context.chNext)) || |
431 | ((IsASpace(context.chPrev) || setOperators.Contains(context.chPrev)) |
432 | && IsADigit(context.chNext))); |
433 | bool adjacentDigit = |
434 | IsADigit(context.ch) && IsADigit(context.chPrev); |
435 | bool afterExponent = IsADigit(context.ch) && tolower(context.chPrev) == 'e'; |
436 | bool dotPart = context.ch == '.' && |
437 | IsADigit(context.chPrev) && |
438 | IsADigit(context.chNext); |
439 | bool afterDot = IsADigit(context.ch) && context.chPrev == '.'; |
440 | if (numberStart || |
441 | exponentPart || |
442 | signPart || |
443 | adjacentDigit || |
444 | dotPart || |
445 | afterExponent || |
446 | afterDot) { |
447 | context.SetState(SCE_JSON_NUMBER); |
448 | } else if (context.state == SCE_JSON_DEFAULT && !IsASpace(context.ch)) { |
449 | context.SetState(SCE_JSON_ERROR); |
450 | } |
451 | } |
452 | context.Forward(); |
453 | } |
454 | context.Complete(); |
455 | } |
456 | |
457 | void SCI_METHOD LexerJSON::Fold(Sci_PositionU startPos, |
458 | Sci_Position length, |
459 | int, |
460 | IDocument *pAccess) { |
461 | if (!options.fold) { |
462 | return; |
463 | } |
464 | LexAccessor styler(pAccess); |
465 | Sci_PositionU currLine = styler.GetLine(startPos); |
466 | Sci_PositionU endPos = startPos + length; |
467 | int currLevel = SC_FOLDLEVELBASE; |
468 | if (currLine > 0) |
469 | currLevel = styler.LevelAt(currLine - 1) >> 16; |
470 | int nextLevel = currLevel; |
471 | int visibleChars = 0; |
472 | for (Sci_PositionU i = startPos; i < endPos; i++) { |
473 | char curr = styler.SafeGetCharAt(i); |
474 | char next = styler.SafeGetCharAt(i+1); |
475 | bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n'); |
476 | if (styler.StyleAt(i) == SCE_JSON_OPERATOR) { |
477 | if (curr == '{' || curr == '[') { |
478 | nextLevel++; |
479 | } else if (curr == '}' || curr == ']') { |
480 | nextLevel--; |
481 | } |
482 | } |
483 | if (atEOL || i == (endPos-1)) { |
484 | int level = currLevel | nextLevel << 16; |
485 | if (!visibleChars && options.foldCompact) { |
486 | level |= SC_FOLDLEVELWHITEFLAG; |
487 | } else if (nextLevel > currLevel) { |
488 | level |= SC_FOLDLEVELHEADERFLAG; |
489 | } |
490 | if (level != styler.LevelAt(currLine)) { |
491 | styler.SetLevel(currLine, level); |
492 | } |
493 | currLine++; |
494 | currLevel = nextLevel; |
495 | visibleChars = 0; |
496 | } |
497 | if (!isspacechar(curr)) { |
498 | visibleChars++; |
499 | } |
500 | } |
501 | } |
502 | |
503 | LexerModule lmJSON(SCLEX_JSON, |
504 | LexerJSON::LexerFactoryJSON, |
505 | "json" , |
506 | JSONWordListDesc); |
507 | |