| 1 | // Scintilla source code edit control |
| 2 | /** |
| 3 | * @file LexJSON.cxx |
| 4 | * @date February 19, 2016 |
| 5 | * @brief Lexer for JSON and JSON-LD formats |
| 6 | * @author nkmathew |
| 7 | * |
| 8 | * The License.txt file describes the conditions under which this software may |
| 9 | * be distributed. |
| 10 | * |
| 11 | */ |
| 12 | |
| 13 | #include <cstdlib> |
| 14 | #include <cassert> |
| 15 | #include <cctype> |
| 16 | #include <cstdio> |
| 17 | |
| 18 | #include <string> |
| 19 | #include <string_view> |
| 20 | #include <vector> |
| 21 | #include <map> |
| 22 | #include <functional> |
| 23 | |
| 24 | #include "ILexer.h" |
| 25 | #include "Scintilla.h" |
| 26 | #include "SciLexer.h" |
| 27 | #include "WordList.h" |
| 28 | #include "LexAccessor.h" |
| 29 | #include "StyleContext.h" |
| 30 | #include "CharacterSet.h" |
| 31 | #include "LexerModule.h" |
| 32 | #include "OptionSet.h" |
| 33 | #include "DefaultLexer.h" |
| 34 | |
| 35 | using namespace Scintilla; |
| 36 | using namespace Lexilla; |
| 37 | |
// Descriptions of the keyword lists accepted by this lexer, in index order.
// The array is terminated by a null pointer.
static const char *const JSONWordListDesc[] = {
	"JSON Keywords",
	"JSON-LD Keywords",
	nullptr
};
| 43 | |
| 44 | /** |
| 45 | * Used to detect compact IRI/URLs in JSON-LD without first looking ahead for the |
| 46 | * colon separating the prefix and suffix |
| 47 | * |
| 48 | * https://www.w3.org/TR/json-ld/#dfn-compact-iri |
| 49 | */ |
| 50 | struct CompactIRI { |
| 51 | int colonCount; |
| 52 | bool foundInvalidChar; |
| 53 | CharacterSet setCompactIRI; |
| 54 | CompactIRI() { |
| 55 | colonCount = 0; |
| 56 | foundInvalidChar = false; |
| 57 | setCompactIRI = CharacterSet(CharacterSet::setAlpha, "$_-" ); |
| 58 | } |
| 59 | void resetState() { |
| 60 | colonCount = 0; |
| 61 | foundInvalidChar = false; |
| 62 | } |
| 63 | void checkChar(int ch) { |
| 64 | if (ch == ':') { |
| 65 | colonCount++; |
| 66 | } else { |
| 67 | foundInvalidChar |= !setCompactIRI.Contains(ch); |
| 68 | } |
| 69 | } |
| 70 | bool shouldHighlight() const { |
| 71 | return !foundInvalidChar && colonCount == 1; |
| 72 | } |
| 73 | }; |
| 74 | |
| 75 | /** |
| 76 | * Keeps track of escaped characters in strings as per: |
| 77 | * |
| 78 | * https://tools.ietf.org/html/rfc7159#section-7 |
| 79 | */ |
| 80 | struct EscapeSequence { |
| 81 | int digitsLeft; |
| 82 | CharacterSet setHexDigits; |
| 83 | CharacterSet setEscapeChars; |
| 84 | EscapeSequence() { |
| 85 | digitsLeft = 0; |
| 86 | setHexDigits = CharacterSet(CharacterSet::setDigits, "ABCDEFabcdef" ); |
| 87 | setEscapeChars = CharacterSet(CharacterSet::setNone, "\\\"tnbfru/" ); |
| 88 | } |
| 89 | // Returns true if the following character is a valid escaped character |
| 90 | bool newSequence(int nextChar) { |
| 91 | digitsLeft = 0; |
| 92 | if (nextChar == 'u') { |
| 93 | digitsLeft = 5; |
| 94 | } else if (!setEscapeChars.Contains(nextChar)) { |
| 95 | return false; |
| 96 | } |
| 97 | return true; |
| 98 | } |
| 99 | bool atEscapeEnd() const { |
| 100 | return digitsLeft <= 0; |
| 101 | } |
| 102 | bool isInvalidChar(int currChar) const { |
| 103 | return !setHexDigits.Contains(currChar); |
| 104 | } |
| 105 | }; |
| 106 | |
// Lexer options; each member is bound to a property name in OptionSetJSON.
// All options start out disabled.
struct OptionsJSON {
	bool foldCompact;
	bool fold;
	// Bound to "lexer.json.allow.comments". The member name was missing from
	// the declaration although it is assigned below and referenced by
	// OptionSetJSON.
	bool allowComments;
	bool escapeSequence;
	OptionsJSON() :
		foldCompact(false),
		fold(false),
		allowComments(false),
		escapeSequence(false) {
	}
};
| 119 | |
| 120 | struct OptionSetJSON : public OptionSet<OptionsJSON> { |
| 121 | OptionSetJSON() { |
| 122 | DefineProperty("lexer.json.escape.sequence" , &OptionsJSON::escapeSequence, |
| 123 | "Set to 1 to enable highlighting of escape sequences in strings" ); |
| 124 | |
| 125 | DefineProperty("lexer.json.allow.comments" , &OptionsJSON::allowComments, |
| 126 | "Set to 1 to enable highlighting of line/block comments in JSON" ); |
| 127 | |
| 128 | DefineProperty("fold.compact" , &OptionsJSON::foldCompact); |
| 129 | DefineProperty("fold" , &OptionsJSON::fold); |
| 130 | DefineWordListSets(JSONWordListDesc); |
| 131 | } |
| 132 | }; |
| 133 | |
| 134 | class LexerJSON : public DefaultLexer { |
| 135 | OptionsJSON options; |
| 136 | OptionSetJSON optSetJSON; |
| 137 | EscapeSequence escapeSeq; |
| 138 | WordList keywordsJSON; |
| 139 | WordList keywordsJSONLD; |
| 140 | CharacterSet setOperators; |
| 141 | CharacterSet setURL; |
| 142 | CharacterSet setKeywordJSONLD; |
| 143 | CharacterSet setKeywordJSON; |
| 144 | CompactIRI compactIRI; |
| 145 | |
| 146 | static bool IsNextNonWhitespace(LexAccessor &styler, Sci_Position start, char ch) { |
| 147 | Sci_Position i = 0; |
| 148 | while (i < 50) { |
| 149 | i++; |
| 150 | char curr = styler.SafeGetCharAt(start+i, '\0'); |
| 151 | char next = styler.SafeGetCharAt(start+i+1, '\0'); |
| 152 | bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n'); |
| 153 | if (curr == ch) { |
| 154 | return true; |
| 155 | } else if (!isspacechar(curr) || atEOL) { |
| 156 | return false; |
| 157 | } |
| 158 | } |
| 159 | return false; |
| 160 | } |
| 161 | |
| 162 | /** |
| 163 | * Looks for the colon following the end quote |
| 164 | * |
| 165 | * Assumes property names of lengths no longer than a 100 characters. |
| 166 | * The colon is also expected to be less than 50 spaces after the end |
| 167 | * quote for the string to be considered a property name |
| 168 | */ |
| 169 | static bool AtPropertyName(LexAccessor &styler, Sci_Position start) { |
| 170 | Sci_Position i = 0; |
| 171 | bool escaped = false; |
| 172 | while (i < 100) { |
| 173 | i++; |
| 174 | char curr = styler.SafeGetCharAt(start+i, '\0'); |
| 175 | if (escaped) { |
| 176 | escaped = false; |
| 177 | continue; |
| 178 | } |
| 179 | escaped = curr == '\\'; |
| 180 | if (curr == '"') { |
| 181 | return IsNextNonWhitespace(styler, start+i, ':'); |
| 182 | } else if (!curr) { |
| 183 | return false; |
| 184 | } |
| 185 | } |
| 186 | return false; |
| 187 | } |
| 188 | |
| 189 | static bool IsNextWordInList(WordList &keywordList, CharacterSet wordSet, |
| 190 | StyleContext &context, LexAccessor &styler) { |
| 191 | char word[51]; |
| 192 | Sci_Position currPos = (Sci_Position) context.currentPos; |
| 193 | int i = 0; |
| 194 | while (i < 50) { |
| 195 | char ch = styler.SafeGetCharAt(currPos + i); |
| 196 | if (!wordSet.Contains(ch)) { |
| 197 | break; |
| 198 | } |
| 199 | word[i] = ch; |
| 200 | i++; |
| 201 | } |
| 202 | word[i] = '\0'; |
| 203 | return keywordList.InList(word); |
| 204 | } |
| 205 | |
| 206 | public: |
| 207 | LexerJSON() : |
| 208 | DefaultLexer("json" , SCLEX_JSON), |
| 209 | setOperators(CharacterSet::setNone, "[{}]:," ), |
| 210 | setURL(CharacterSet::setAlphaNum, "-._~:/?#[]@!$&'()*+,),=" ), |
| 211 | setKeywordJSONLD(CharacterSet::setAlpha, ":@" ), |
| 212 | setKeywordJSON(CharacterSet::setAlpha, "$_" ) { |
| 213 | } |
| 214 | virtual ~LexerJSON() {} |
| 215 | int SCI_METHOD Version() const override { |
| 216 | return lvRelease5; |
| 217 | } |
| 218 | void SCI_METHOD Release() override { |
| 219 | delete this; |
| 220 | } |
| 221 | const char *SCI_METHOD PropertyNames() override { |
| 222 | return optSetJSON.PropertyNames(); |
| 223 | } |
| 224 | int SCI_METHOD PropertyType(const char *name) override { |
| 225 | return optSetJSON.PropertyType(name); |
| 226 | } |
| 227 | const char *SCI_METHOD DescribeProperty(const char *name) override { |
| 228 | return optSetJSON.DescribeProperty(name); |
| 229 | } |
| 230 | Sci_Position SCI_METHOD PropertySet(const char *key, const char *val) override { |
| 231 | if (optSetJSON.PropertySet(&options, key, val)) { |
| 232 | return 0; |
| 233 | } |
| 234 | return -1; |
| 235 | } |
| 236 | const char * SCI_METHOD PropertyGet(const char *key) override { |
| 237 | return optSetJSON.PropertyGet(key); |
| 238 | } |
| 239 | Sci_Position SCI_METHOD WordListSet(int n, const char *wl) override { |
| 240 | WordList *wordListN = 0; |
| 241 | switch (n) { |
| 242 | case 0: |
| 243 | wordListN = &keywordsJSON; |
| 244 | break; |
| 245 | case 1: |
| 246 | wordListN = &keywordsJSONLD; |
| 247 | break; |
| 248 | } |
| 249 | Sci_Position firstModification = -1; |
| 250 | if (wordListN) { |
| 251 | WordList wlNew; |
| 252 | wlNew.Set(wl); |
| 253 | if (*wordListN != wlNew) { |
| 254 | wordListN->Set(wl); |
| 255 | firstModification = 0; |
| 256 | } |
| 257 | } |
| 258 | return firstModification; |
| 259 | } |
| 260 | void *SCI_METHOD PrivateCall(int, void *) override { |
| 261 | return 0; |
| 262 | } |
| 263 | static ILexer5 *LexerFactoryJSON() { |
| 264 | return new LexerJSON; |
| 265 | } |
| 266 | const char *SCI_METHOD DescribeWordListSets() override { |
| 267 | return optSetJSON.DescribeWordListSets(); |
| 268 | } |
| 269 | void SCI_METHOD Lex(Sci_PositionU startPos, |
| 270 | Sci_Position length, |
| 271 | int initStyle, |
| 272 | IDocument *pAccess) override; |
| 273 | void SCI_METHOD Fold(Sci_PositionU startPos, |
| 274 | Sci_Position length, |
| 275 | int initStyle, |
| 276 | IDocument *pAccess) override; |
| 277 | }; |
| 278 | |
| 279 | void SCI_METHOD LexerJSON::Lex(Sci_PositionU startPos, |
| 280 | Sci_Position length, |
| 281 | int initStyle, |
| 282 | IDocument *pAccess) { |
| 283 | LexAccessor styler(pAccess); |
| 284 | StyleContext context(startPos, length, initStyle, styler); |
| 285 | int stringStyleBefore = SCE_JSON_STRING; |
| 286 | while (context.More()) { |
| 287 | switch (context.state) { |
| 288 | case SCE_JSON_BLOCKCOMMENT: |
| 289 | if (context.Match("*/" )) { |
| 290 | context.Forward(); |
| 291 | context.ForwardSetState(SCE_JSON_DEFAULT); |
| 292 | } |
| 293 | break; |
| 294 | case SCE_JSON_LINECOMMENT: |
| 295 | if (context.atLineEnd) { |
| 296 | context.SetState(SCE_JSON_DEFAULT); |
| 297 | } |
| 298 | break; |
| 299 | case SCE_JSON_STRINGEOL: |
| 300 | if (context.atLineStart) { |
| 301 | context.SetState(SCE_JSON_DEFAULT); |
| 302 | } |
| 303 | break; |
| 304 | case SCE_JSON_ESCAPESEQUENCE: |
| 305 | escapeSeq.digitsLeft--; |
| 306 | if (!escapeSeq.atEscapeEnd()) { |
| 307 | if (escapeSeq.isInvalidChar(context.ch)) { |
| 308 | context.SetState(SCE_JSON_ERROR); |
| 309 | } |
| 310 | break; |
| 311 | } |
| 312 | if (context.ch == '"') { |
| 313 | context.SetState(stringStyleBefore); |
| 314 | context.ForwardSetState(SCE_C_DEFAULT); |
| 315 | } else if (context.ch == '\\') { |
| 316 | if (!escapeSeq.newSequence(context.chNext)) { |
| 317 | context.SetState(SCE_JSON_ERROR); |
| 318 | } |
| 319 | context.Forward(); |
| 320 | } else { |
| 321 | context.SetState(stringStyleBefore); |
| 322 | if (context.atLineEnd) { |
| 323 | context.ChangeState(SCE_JSON_STRINGEOL); |
| 324 | } |
| 325 | } |
| 326 | break; |
| 327 | case SCE_JSON_PROPERTYNAME: |
| 328 | case SCE_JSON_STRING: |
| 329 | if (context.ch == '"') { |
| 330 | if (compactIRI.shouldHighlight()) { |
| 331 | context.ChangeState(SCE_JSON_COMPACTIRI); |
| 332 | context.ForwardSetState(SCE_JSON_DEFAULT); |
| 333 | compactIRI.resetState(); |
| 334 | } else { |
| 335 | context.ForwardSetState(SCE_JSON_DEFAULT); |
| 336 | } |
| 337 | } else if (context.atLineEnd) { |
| 338 | context.ChangeState(SCE_JSON_STRINGEOL); |
| 339 | } else if (context.ch == '\\') { |
| 340 | stringStyleBefore = context.state; |
| 341 | if (options.escapeSequence) { |
| 342 | context.SetState(SCE_JSON_ESCAPESEQUENCE); |
| 343 | if (!escapeSeq.newSequence(context.chNext)) { |
| 344 | context.SetState(SCE_JSON_ERROR); |
| 345 | } |
| 346 | } |
| 347 | context.Forward(); |
| 348 | } else if (context.Match("https://" ) || |
| 349 | context.Match("http://" ) || |
| 350 | context.Match("ssh://" ) || |
| 351 | context.Match("git://" ) || |
| 352 | context.Match("svn://" ) || |
| 353 | context.Match("ftp://" ) || |
| 354 | context.Match("mailto:" )) { |
| 355 | // Handle most common URI schemes only |
| 356 | stringStyleBefore = context.state; |
| 357 | context.SetState(SCE_JSON_URI); |
| 358 | } else if (context.ch == '@') { |
| 359 | // https://www.w3.org/TR/json-ld/#dfn-keyword |
| 360 | if (IsNextWordInList(keywordsJSONLD, setKeywordJSONLD, context, styler)) { |
| 361 | stringStyleBefore = context.state; |
| 362 | context.SetState(SCE_JSON_LDKEYWORD); |
| 363 | } |
| 364 | } else { |
| 365 | compactIRI.checkChar(context.ch); |
| 366 | } |
| 367 | break; |
| 368 | case SCE_JSON_LDKEYWORD: |
| 369 | case SCE_JSON_URI: |
| 370 | if ((!setKeywordJSONLD.Contains(context.ch) && |
| 371 | (context.state == SCE_JSON_LDKEYWORD)) || |
| 372 | (!setURL.Contains(context.ch))) { |
| 373 | context.SetState(stringStyleBefore); |
| 374 | } |
| 375 | if (context.ch == '"') { |
| 376 | context.ForwardSetState(SCE_JSON_DEFAULT); |
| 377 | } else if (context.atLineEnd) { |
| 378 | context.ChangeState(SCE_JSON_STRINGEOL); |
| 379 | } |
| 380 | break; |
| 381 | case SCE_JSON_OPERATOR: |
| 382 | case SCE_JSON_NUMBER: |
| 383 | context.SetState(SCE_JSON_DEFAULT); |
| 384 | break; |
| 385 | case SCE_JSON_ERROR: |
| 386 | if (context.atLineEnd) { |
| 387 | context.SetState(SCE_JSON_DEFAULT); |
| 388 | } |
| 389 | break; |
| 390 | case SCE_JSON_KEYWORD: |
| 391 | if (!setKeywordJSON.Contains(context.ch)) { |
| 392 | context.SetState(SCE_JSON_DEFAULT); |
| 393 | } |
| 394 | break; |
| 395 | } |
| 396 | if (context.state == SCE_JSON_DEFAULT) { |
| 397 | if (context.ch == '"') { |
| 398 | compactIRI.resetState(); |
| 399 | context.SetState(SCE_JSON_STRING); |
| 400 | Sci_Position currPos = static_cast<Sci_Position>(context.currentPos); |
| 401 | if (AtPropertyName(styler, currPos)) { |
| 402 | context.SetState(SCE_JSON_PROPERTYNAME); |
| 403 | } |
| 404 | } else if (setOperators.Contains(context.ch)) { |
| 405 | context.SetState(SCE_JSON_OPERATOR); |
| 406 | } else if (options.allowComments && context.Match("/*" )) { |
| 407 | context.SetState(SCE_JSON_BLOCKCOMMENT); |
| 408 | context.Forward(); |
| 409 | } else if (options.allowComments && context.Match("//" )) { |
| 410 | context.SetState(SCE_JSON_LINECOMMENT); |
| 411 | } else if (setKeywordJSON.Contains(context.ch)) { |
| 412 | if (IsNextWordInList(keywordsJSON, setKeywordJSON, context, styler)) { |
| 413 | context.SetState(SCE_JSON_KEYWORD); |
| 414 | } |
| 415 | } |
| 416 | bool numberStart = |
| 417 | IsADigit(context.ch) && (context.chPrev == '+'|| |
| 418 | context.chPrev == '-' || |
| 419 | context.atLineStart || |
| 420 | IsASpace(context.chPrev) || |
| 421 | setOperators.Contains(context.chPrev)); |
| 422 | bool exponentPart = |
| 423 | tolower(context.ch) == 'e' && |
| 424 | IsADigit(context.chPrev) && |
| 425 | (IsADigit(context.chNext) || |
| 426 | context.chNext == '+' || |
| 427 | context.chNext == '-'); |
| 428 | bool signPart = |
| 429 | (context.ch == '-' || context.ch == '+') && |
| 430 | ((tolower(context.chPrev) == 'e' && IsADigit(context.chNext)) || |
| 431 | ((IsASpace(context.chPrev) || setOperators.Contains(context.chPrev)) |
| 432 | && IsADigit(context.chNext))); |
| 433 | bool adjacentDigit = |
| 434 | IsADigit(context.ch) && IsADigit(context.chPrev); |
| 435 | bool afterExponent = IsADigit(context.ch) && tolower(context.chPrev) == 'e'; |
| 436 | bool dotPart = context.ch == '.' && |
| 437 | IsADigit(context.chPrev) && |
| 438 | IsADigit(context.chNext); |
| 439 | bool afterDot = IsADigit(context.ch) && context.chPrev == '.'; |
| 440 | if (numberStart || |
| 441 | exponentPart || |
| 442 | signPart || |
| 443 | adjacentDigit || |
| 444 | dotPart || |
| 445 | afterExponent || |
| 446 | afterDot) { |
| 447 | context.SetState(SCE_JSON_NUMBER); |
| 448 | } else if (context.state == SCE_JSON_DEFAULT && !IsASpace(context.ch)) { |
| 449 | context.SetState(SCE_JSON_ERROR); |
| 450 | } |
| 451 | } |
| 452 | context.Forward(); |
| 453 | } |
| 454 | context.Complete(); |
| 455 | } |
| 456 | |
| 457 | void SCI_METHOD LexerJSON::Fold(Sci_PositionU startPos, |
| 458 | Sci_Position length, |
| 459 | int, |
| 460 | IDocument *pAccess) { |
| 461 | if (!options.fold) { |
| 462 | return; |
| 463 | } |
| 464 | LexAccessor styler(pAccess); |
| 465 | Sci_PositionU currLine = styler.GetLine(startPos); |
| 466 | Sci_PositionU endPos = startPos + length; |
| 467 | int currLevel = SC_FOLDLEVELBASE; |
| 468 | if (currLine > 0) |
| 469 | currLevel = styler.LevelAt(currLine - 1) >> 16; |
| 470 | int nextLevel = currLevel; |
| 471 | int visibleChars = 0; |
| 472 | for (Sci_PositionU i = startPos; i < endPos; i++) { |
| 473 | char curr = styler.SafeGetCharAt(i); |
| 474 | char next = styler.SafeGetCharAt(i+1); |
| 475 | bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n'); |
| 476 | if (styler.StyleAt(i) == SCE_JSON_OPERATOR) { |
| 477 | if (curr == '{' || curr == '[') { |
| 478 | nextLevel++; |
| 479 | } else if (curr == '}' || curr == ']') { |
| 480 | nextLevel--; |
| 481 | } |
| 482 | } |
| 483 | if (atEOL || i == (endPos-1)) { |
| 484 | int level = currLevel | nextLevel << 16; |
| 485 | if (!visibleChars && options.foldCompact) { |
| 486 | level |= SC_FOLDLEVELWHITEFLAG; |
| 487 | } else if (nextLevel > currLevel) { |
| 488 | level |= SC_FOLDLEVELHEADERFLAG; |
| 489 | } |
| 490 | if (level != styler.LevelAt(currLine)) { |
| 491 | styler.SetLevel(currLine, level); |
| 492 | } |
| 493 | currLine++; |
| 494 | currLevel = nextLevel; |
| 495 | visibleChars = 0; |
| 496 | } |
| 497 | if (!isspacechar(curr)) { |
| 498 | visibleChars++; |
| 499 | } |
| 500 | } |
| 501 | } |
| 502 | |
| 503 | LexerModule lmJSON(SCLEX_JSON, |
| 504 | LexerJSON::LexerFactoryJSON, |
| 505 | "json" , |
| 506 | JSONWordListDesc); |
| 507 | |