1// Scintilla source code edit control
2/**
3 * @file LexJSON.cxx
4 * @date February 19, 2016
5 * @brief Lexer for JSON and JSON-LD formats
6 * @author nkmathew
7 *
8 * The License.txt file describes the conditions under which this software may
9 * be distributed.
10 *
11 */
12
13#include <cstdlib>
14#include <cassert>
15#include <cctype>
16#include <cstdio>
17
18#include <string>
19#include <string_view>
20#include <vector>
21#include <map>
22#include <functional>
23
24#include "ILexer.h"
25#include "Scintilla.h"
26#include "SciLexer.h"
27#include "WordList.h"
28#include "LexAccessor.h"
29#include "StyleContext.h"
30#include "CharacterSet.h"
31#include "LexerModule.h"
32#include "OptionSet.h"
33#include "DefaultLexer.h"
34
35using namespace Scintilla;
36using namespace Lexilla;
37
// Descriptions of the word lists this lexer accepts, in word list index
// order. The table ends with a null pointer sentinel.
static const char *const JSONWordListDesc[] = {
	"JSON Keywords",
	"JSON-LD Keywords",
	nullptr
};
43
44/**
45 * Used to detect compact IRI/URLs in JSON-LD without first looking ahead for the
46 * colon separating the prefix and suffix
47 *
48 * https://www.w3.org/TR/json-ld/#dfn-compact-iri
49 */
50struct CompactIRI {
51 int colonCount;
52 bool foundInvalidChar;
53 CharacterSet setCompactIRI;
54 CompactIRI() {
55 colonCount = 0;
56 foundInvalidChar = false;
57 setCompactIRI = CharacterSet(CharacterSet::setAlpha, "$_-");
58 }
59 void resetState() {
60 colonCount = 0;
61 foundInvalidChar = false;
62 }
63 void checkChar(int ch) {
64 if (ch == ':') {
65 colonCount++;
66 } else {
67 foundInvalidChar |= !setCompactIRI.Contains(ch);
68 }
69 }
70 bool shouldHighlight() const {
71 return !foundInvalidChar && colonCount == 1;
72 }
73};
74
75/**
76 * Keeps track of escaped characters in strings as per:
77 *
78 * https://tools.ietf.org/html/rfc7159#section-7
79 */
80struct EscapeSequence {
81 int digitsLeft;
82 CharacterSet setHexDigits;
83 CharacterSet setEscapeChars;
84 EscapeSequence() {
85 digitsLeft = 0;
86 setHexDigits = CharacterSet(CharacterSet::setDigits, "ABCDEFabcdef");
87 setEscapeChars = CharacterSet(CharacterSet::setNone, "\\\"tnbfru/");
88 }
89 // Returns true if the following character is a valid escaped character
90 bool newSequence(int nextChar) {
91 digitsLeft = 0;
92 if (nextChar == 'u') {
93 digitsLeft = 5;
94 } else if (!setEscapeChars.Contains(nextChar)) {
95 return false;
96 }
97 return true;
98 }
99 bool atEscapeEnd() const {
100 return digitsLeft <= 0;
101 }
102 bool isInvalidChar(int currChar) const {
103 return !setHexDigits.Contains(currChar);
104 }
105};
106
/**
 * Switches controlling the JSON lexer and folder. Every switch starts
 * out disabled.
 */
struct OptionsJSON {
	/// Bound to the "fold.compact" property
	bool foldCompact = false;
	/// Bound to the "fold" property; folding does nothing while this is off
	bool fold = false;
	/// Bound to "lexer.json.allow.comments"
	bool allowComments = false;
	/// Bound to "lexer.json.escape.sequence"
	bool escapeSequence = false;
};
119
120struct OptionSetJSON : public OptionSet<OptionsJSON> {
121 OptionSetJSON() {
122 DefineProperty("lexer.json.escape.sequence", &OptionsJSON::escapeSequence,
123 "Set to 1 to enable highlighting of escape sequences in strings");
124
125 DefineProperty("lexer.json.allow.comments", &OptionsJSON::allowComments,
126 "Set to 1 to enable highlighting of line/block comments in JSON");
127
128 DefineProperty("fold.compact", &OptionsJSON::foldCompact);
129 DefineProperty("fold", &OptionsJSON::fold);
130 DefineWordListSets(JSONWordListDesc);
131 }
132};
133
134class LexerJSON : public DefaultLexer {
135 OptionsJSON options;
136 OptionSetJSON optSetJSON;
137 EscapeSequence escapeSeq;
138 WordList keywordsJSON;
139 WordList keywordsJSONLD;
140 CharacterSet setOperators;
141 CharacterSet setURL;
142 CharacterSet setKeywordJSONLD;
143 CharacterSet setKeywordJSON;
144 CompactIRI compactIRI;
145
146 static bool IsNextNonWhitespace(LexAccessor &styler, Sci_Position start, char ch) {
147 Sci_Position i = 0;
148 while (i < 50) {
149 i++;
150 char curr = styler.SafeGetCharAt(start+i, '\0');
151 char next = styler.SafeGetCharAt(start+i+1, '\0');
152 bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n');
153 if (curr == ch) {
154 return true;
155 } else if (!isspacechar(curr) || atEOL) {
156 return false;
157 }
158 }
159 return false;
160 }
161
162 /**
163 * Looks for the colon following the end quote
164 *
165 * Assumes property names of lengths no longer than a 100 characters.
166 * The colon is also expected to be less than 50 spaces after the end
167 * quote for the string to be considered a property name
168 */
169 static bool AtPropertyName(LexAccessor &styler, Sci_Position start) {
170 Sci_Position i = 0;
171 bool escaped = false;
172 while (i < 100) {
173 i++;
174 char curr = styler.SafeGetCharAt(start+i, '\0');
175 if (escaped) {
176 escaped = false;
177 continue;
178 }
179 escaped = curr == '\\';
180 if (curr == '"') {
181 return IsNextNonWhitespace(styler, start+i, ':');
182 } else if (!curr) {
183 return false;
184 }
185 }
186 return false;
187 }
188
189 static bool IsNextWordInList(WordList &keywordList, CharacterSet wordSet,
190 StyleContext &context, LexAccessor &styler) {
191 char word[51];
192 Sci_Position currPos = (Sci_Position) context.currentPos;
193 int i = 0;
194 while (i < 50) {
195 char ch = styler.SafeGetCharAt(currPos + i);
196 if (!wordSet.Contains(ch)) {
197 break;
198 }
199 word[i] = ch;
200 i++;
201 }
202 word[i] = '\0';
203 return keywordList.InList(word);
204 }
205
206 public:
207 LexerJSON() :
208 DefaultLexer("json", SCLEX_JSON),
209 setOperators(CharacterSet::setNone, "[{}]:,"),
210 setURL(CharacterSet::setAlphaNum, "-._~:/?#[]@!$&'()*+,),="),
211 setKeywordJSONLD(CharacterSet::setAlpha, ":@"),
212 setKeywordJSON(CharacterSet::setAlpha, "$_") {
213 }
214 virtual ~LexerJSON() {}
215 int SCI_METHOD Version() const override {
216 return lvRelease5;
217 }
218 void SCI_METHOD Release() override {
219 delete this;
220 }
221 const char *SCI_METHOD PropertyNames() override {
222 return optSetJSON.PropertyNames();
223 }
224 int SCI_METHOD PropertyType(const char *name) override {
225 return optSetJSON.PropertyType(name);
226 }
227 const char *SCI_METHOD DescribeProperty(const char *name) override {
228 return optSetJSON.DescribeProperty(name);
229 }
230 Sci_Position SCI_METHOD PropertySet(const char *key, const char *val) override {
231 if (optSetJSON.PropertySet(&options, key, val)) {
232 return 0;
233 }
234 return -1;
235 }
236 const char * SCI_METHOD PropertyGet(const char *key) override {
237 return optSetJSON.PropertyGet(key);
238 }
239 Sci_Position SCI_METHOD WordListSet(int n, const char *wl) override {
240 WordList *wordListN = 0;
241 switch (n) {
242 case 0:
243 wordListN = &keywordsJSON;
244 break;
245 case 1:
246 wordListN = &keywordsJSONLD;
247 break;
248 }
249 Sci_Position firstModification = -1;
250 if (wordListN) {
251 WordList wlNew;
252 wlNew.Set(wl);
253 if (*wordListN != wlNew) {
254 wordListN->Set(wl);
255 firstModification = 0;
256 }
257 }
258 return firstModification;
259 }
260 void *SCI_METHOD PrivateCall(int, void *) override {
261 return 0;
262 }
263 static ILexer5 *LexerFactoryJSON() {
264 return new LexerJSON;
265 }
266 const char *SCI_METHOD DescribeWordListSets() override {
267 return optSetJSON.DescribeWordListSets();
268 }
269 void SCI_METHOD Lex(Sci_PositionU startPos,
270 Sci_Position length,
271 int initStyle,
272 IDocument *pAccess) override;
273 void SCI_METHOD Fold(Sci_PositionU startPos,
274 Sci_Position length,
275 int initStyle,
276 IDocument *pAccess) override;
277};
278
279void SCI_METHOD LexerJSON::Lex(Sci_PositionU startPos,
280 Sci_Position length,
281 int initStyle,
282 IDocument *pAccess) {
283 LexAccessor styler(pAccess);
284 StyleContext context(startPos, length, initStyle, styler);
285 int stringStyleBefore = SCE_JSON_STRING;
286 while (context.More()) {
287 switch (context.state) {
288 case SCE_JSON_BLOCKCOMMENT:
289 if (context.Match("*/")) {
290 context.Forward();
291 context.ForwardSetState(SCE_JSON_DEFAULT);
292 }
293 break;
294 case SCE_JSON_LINECOMMENT:
295 if (context.atLineEnd) {
296 context.SetState(SCE_JSON_DEFAULT);
297 }
298 break;
299 case SCE_JSON_STRINGEOL:
300 if (context.atLineStart) {
301 context.SetState(SCE_JSON_DEFAULT);
302 }
303 break;
304 case SCE_JSON_ESCAPESEQUENCE:
305 escapeSeq.digitsLeft--;
306 if (!escapeSeq.atEscapeEnd()) {
307 if (escapeSeq.isInvalidChar(context.ch)) {
308 context.SetState(SCE_JSON_ERROR);
309 }
310 break;
311 }
312 if (context.ch == '"') {
313 context.SetState(stringStyleBefore);
314 context.ForwardSetState(SCE_C_DEFAULT);
315 } else if (context.ch == '\\') {
316 if (!escapeSeq.newSequence(context.chNext)) {
317 context.SetState(SCE_JSON_ERROR);
318 }
319 context.Forward();
320 } else {
321 context.SetState(stringStyleBefore);
322 if (context.atLineEnd) {
323 context.ChangeState(SCE_JSON_STRINGEOL);
324 }
325 }
326 break;
327 case SCE_JSON_PROPERTYNAME:
328 case SCE_JSON_STRING:
329 if (context.ch == '"') {
330 if (compactIRI.shouldHighlight()) {
331 context.ChangeState(SCE_JSON_COMPACTIRI);
332 context.ForwardSetState(SCE_JSON_DEFAULT);
333 compactIRI.resetState();
334 } else {
335 context.ForwardSetState(SCE_JSON_DEFAULT);
336 }
337 } else if (context.atLineEnd) {
338 context.ChangeState(SCE_JSON_STRINGEOL);
339 } else if (context.ch == '\\') {
340 stringStyleBefore = context.state;
341 if (options.escapeSequence) {
342 context.SetState(SCE_JSON_ESCAPESEQUENCE);
343 if (!escapeSeq.newSequence(context.chNext)) {
344 context.SetState(SCE_JSON_ERROR);
345 }
346 }
347 context.Forward();
348 } else if (context.Match("https://") ||
349 context.Match("http://") ||
350 context.Match("ssh://") ||
351 context.Match("git://") ||
352 context.Match("svn://") ||
353 context.Match("ftp://") ||
354 context.Match("mailto:")) {
355 // Handle most common URI schemes only
356 stringStyleBefore = context.state;
357 context.SetState(SCE_JSON_URI);
358 } else if (context.ch == '@') {
359 // https://www.w3.org/TR/json-ld/#dfn-keyword
360 if (IsNextWordInList(keywordsJSONLD, setKeywordJSONLD, context, styler)) {
361 stringStyleBefore = context.state;
362 context.SetState(SCE_JSON_LDKEYWORD);
363 }
364 } else {
365 compactIRI.checkChar(context.ch);
366 }
367 break;
368 case SCE_JSON_LDKEYWORD:
369 case SCE_JSON_URI:
370 if ((!setKeywordJSONLD.Contains(context.ch) &&
371 (context.state == SCE_JSON_LDKEYWORD)) ||
372 (!setURL.Contains(context.ch))) {
373 context.SetState(stringStyleBefore);
374 }
375 if (context.ch == '"') {
376 context.ForwardSetState(SCE_JSON_DEFAULT);
377 } else if (context.atLineEnd) {
378 context.ChangeState(SCE_JSON_STRINGEOL);
379 }
380 break;
381 case SCE_JSON_OPERATOR:
382 case SCE_JSON_NUMBER:
383 context.SetState(SCE_JSON_DEFAULT);
384 break;
385 case SCE_JSON_ERROR:
386 if (context.atLineEnd) {
387 context.SetState(SCE_JSON_DEFAULT);
388 }
389 break;
390 case SCE_JSON_KEYWORD:
391 if (!setKeywordJSON.Contains(context.ch)) {
392 context.SetState(SCE_JSON_DEFAULT);
393 }
394 break;
395 }
396 if (context.state == SCE_JSON_DEFAULT) {
397 if (context.ch == '"') {
398 compactIRI.resetState();
399 context.SetState(SCE_JSON_STRING);
400 Sci_Position currPos = static_cast<Sci_Position>(context.currentPos);
401 if (AtPropertyName(styler, currPos)) {
402 context.SetState(SCE_JSON_PROPERTYNAME);
403 }
404 } else if (setOperators.Contains(context.ch)) {
405 context.SetState(SCE_JSON_OPERATOR);
406 } else if (options.allowComments && context.Match("/*")) {
407 context.SetState(SCE_JSON_BLOCKCOMMENT);
408 context.Forward();
409 } else if (options.allowComments && context.Match("//")) {
410 context.SetState(SCE_JSON_LINECOMMENT);
411 } else if (setKeywordJSON.Contains(context.ch)) {
412 if (IsNextWordInList(keywordsJSON, setKeywordJSON, context, styler)) {
413 context.SetState(SCE_JSON_KEYWORD);
414 }
415 }
416 bool numberStart =
417 IsADigit(context.ch) && (context.chPrev == '+'||
418 context.chPrev == '-' ||
419 context.atLineStart ||
420 IsASpace(context.chPrev) ||
421 setOperators.Contains(context.chPrev));
422 bool exponentPart =
423 tolower(context.ch) == 'e' &&
424 IsADigit(context.chPrev) &&
425 (IsADigit(context.chNext) ||
426 context.chNext == '+' ||
427 context.chNext == '-');
428 bool signPart =
429 (context.ch == '-' || context.ch == '+') &&
430 ((tolower(context.chPrev) == 'e' && IsADigit(context.chNext)) ||
431 ((IsASpace(context.chPrev) || setOperators.Contains(context.chPrev))
432 && IsADigit(context.chNext)));
433 bool adjacentDigit =
434 IsADigit(context.ch) && IsADigit(context.chPrev);
435 bool afterExponent = IsADigit(context.ch) && tolower(context.chPrev) == 'e';
436 bool dotPart = context.ch == '.' &&
437 IsADigit(context.chPrev) &&
438 IsADigit(context.chNext);
439 bool afterDot = IsADigit(context.ch) && context.chPrev == '.';
440 if (numberStart ||
441 exponentPart ||
442 signPart ||
443 adjacentDigit ||
444 dotPart ||
445 afterExponent ||
446 afterDot) {
447 context.SetState(SCE_JSON_NUMBER);
448 } else if (context.state == SCE_JSON_DEFAULT && !IsASpace(context.ch)) {
449 context.SetState(SCE_JSON_ERROR);
450 }
451 }
452 context.Forward();
453 }
454 context.Complete();
455}
456
457void SCI_METHOD LexerJSON::Fold(Sci_PositionU startPos,
458 Sci_Position length,
459 int,
460 IDocument *pAccess) {
461 if (!options.fold) {
462 return;
463 }
464 LexAccessor styler(pAccess);
465 Sci_PositionU currLine = styler.GetLine(startPos);
466 Sci_PositionU endPos = startPos + length;
467 int currLevel = SC_FOLDLEVELBASE;
468 if (currLine > 0)
469 currLevel = styler.LevelAt(currLine - 1) >> 16;
470 int nextLevel = currLevel;
471 int visibleChars = 0;
472 for (Sci_PositionU i = startPos; i < endPos; i++) {
473 char curr = styler.SafeGetCharAt(i);
474 char next = styler.SafeGetCharAt(i+1);
475 bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n');
476 if (styler.StyleAt(i) == SCE_JSON_OPERATOR) {
477 if (curr == '{' || curr == '[') {
478 nextLevel++;
479 } else if (curr == '}' || curr == ']') {
480 nextLevel--;
481 }
482 }
483 if (atEOL || i == (endPos-1)) {
484 int level = currLevel | nextLevel << 16;
485 if (!visibleChars && options.foldCompact) {
486 level |= SC_FOLDLEVELWHITEFLAG;
487 } else if (nextLevel > currLevel) {
488 level |= SC_FOLDLEVELHEADERFLAG;
489 }
490 if (level != styler.LevelAt(currLine)) {
491 styler.SetLevel(currLine, level);
492 }
493 currLine++;
494 currLevel = nextLevel;
495 visibleChars = 0;
496 }
497 if (!isspacechar(curr)) {
498 visibleChars++;
499 }
500 }
501}
502
503LexerModule lmJSON(SCLEX_JSON,
504 LexerJSON::LexerFactoryJSON,
505 "json",
506 JSONWordListDesc);
507