| 1 | // Scintilla Lexer for EDIFACT |
| 2 | // @file LexEDIFACT.cxx |
| 3 | // Written by Iain Clarke, IMCSoft & Inobiz AB. |
| 4 | // EDIFACT documented here: https://www.unece.org/cefact/edifact/welcome.html |
| 5 | // and more readably here: https://en.wikipedia.org/wiki/EDIFACT |
| 6 | // This code is subject to the same license terms as the rest of the scintilla project: |
| 7 | // The License.txt file describes the conditions under which this software may be distributed. |
| 8 | // |
| 9 | |
| 10 | // Header order must match order in scripts/HeaderOrder.txt |
| 11 | #include <cstdlib> |
| 12 | #include <cassert> |
| 13 | #include <cstring> |
| 14 | #include <cctype> |
| 15 | |
| 16 | #include <string> |
| 17 | #include <string_view> |
| 18 | |
| 19 | #include "ILexer.h" |
| 20 | #include "Scintilla.h" |
| 21 | #include "SciLexer.h" |
| 22 | |
| 23 | #include "LexAccessor.h" |
| 24 | #include "LexerModule.h" |
| 25 | #include "DefaultLexer.h" |
| 26 | |
| 27 | using namespace Scintilla; |
| 28 | using namespace Lexilla; |
| 29 | |
| 30 | class LexerEDIFACT : public DefaultLexer |
| 31 | { |
| 32 | public: |
| 33 | LexerEDIFACT(); |
| 34 | virtual ~LexerEDIFACT() {} // virtual destructor, as we inherit from ILexer |
| 35 | |
| 36 | static ILexer5 *Factory() { |
| 37 | return new LexerEDIFACT; |
| 38 | } |
| 39 | |
| 40 | int SCI_METHOD Version() const override |
| 41 | { |
| 42 | return lvRelease5; |
| 43 | } |
| 44 | void SCI_METHOD Release() override |
| 45 | { |
| 46 | delete this; |
| 47 | } |
| 48 | |
| 49 | const char * SCI_METHOD PropertyNames() override |
| 50 | { |
| 51 | return "fold\nlexer.edifact.highlight.un.all" ; |
| 52 | } |
| 53 | int SCI_METHOD PropertyType(const char *) override |
| 54 | { |
| 55 | return SC_TYPE_BOOLEAN; // Only one property! |
| 56 | } |
| 57 | const char * SCI_METHOD DescribeProperty(const char *name) override |
| 58 | { |
| 59 | if (!strcmp(name, "fold" )) |
| 60 | return "Whether to apply folding to document or not" ; |
| 61 | if (!strcmp(name, "lexer.edifact.highlight.un.all" )) |
| 62 | return "Whether to apply UN* highlighting to all UN segments, or just to UNH" ; |
| 63 | return NULL; |
| 64 | } |
| 65 | |
| 66 | Sci_Position SCI_METHOD PropertySet(const char *key, const char *val) override |
| 67 | { |
| 68 | if (!strcmp(key, "fold" )) |
| 69 | { |
| 70 | m_bFold = strcmp(val, "0" ) ? true : false; |
| 71 | return 0; |
| 72 | } |
| 73 | if (!strcmp(key, "lexer.edifact.highlight.un.all" )) // GetProperty |
| 74 | { |
| 75 | m_bHighlightAllUN = strcmp(val, "0" ) ? true : false; |
| 76 | return 0; |
| 77 | } |
| 78 | return -1; |
| 79 | } |
| 80 | |
| 81 | const char * SCI_METHOD PropertyGet(const char *key) override |
| 82 | { |
| 83 | m_lastPropertyValue = "" ; |
| 84 | if (!strcmp(key, "fold" )) |
| 85 | { |
| 86 | m_lastPropertyValue = m_bFold ? "1" : "0" ; |
| 87 | } |
| 88 | if (!strcmp(key, "lexer.edifact.highlight.un.all" )) // GetProperty |
| 89 | { |
| 90 | m_lastPropertyValue = m_bHighlightAllUN ? "1" : "0" ; |
| 91 | } |
| 92 | return m_lastPropertyValue.c_str(); |
| 93 | } |
| 94 | |
| 95 | const char * SCI_METHOD DescribeWordListSets() override |
| 96 | { |
| 97 | return NULL; |
| 98 | } |
| 99 | Sci_Position SCI_METHOD WordListSet(int, const char *) override |
| 100 | { |
| 101 | return -1; |
| 102 | } |
| 103 | void SCI_METHOD Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override; |
| 104 | void SCI_METHOD Fold(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override; |
| 105 | void * SCI_METHOD PrivateCall(int, void *) override |
| 106 | { |
| 107 | return NULL; |
| 108 | } |
| 109 | |
| 110 | protected: |
| 111 | Sci_Position InitialiseFromUNA(IDocument *pAccess, Sci_PositionU MaxLength); |
| 112 | Sci_Position FindPreviousEnd(IDocument *pAccess, Sci_Position startPos) const; |
| 113 | Sci_Position ForwardPastWhitespace(IDocument *pAccess, Sci_Position startPos, Sci_Position MaxLength) const; |
| 114 | int DetectSegmentHeader(char [3]) const; |
| 115 | |
| 116 | bool m_bFold; |
| 117 | |
| 118 | // property lexer.edifact.highlight.un.all |
| 119 | // Set to 0 to highlight only UNA segments, or 1 to highlight all UNx segments. |
| 120 | bool m_bHighlightAllUN; |
| 121 | |
| 122 | char m_chComponent; |
| 123 | char m_chData; |
| 124 | char m_chDecimal; |
| 125 | char m_chRelease; |
| 126 | char m_chSegment; |
| 127 | |
| 128 | std::string m_lastPropertyValue; |
| 129 | }; |
| 130 | |
| 131 | LexerModule lmEDIFACT(SCLEX_EDIFACT, LexerEDIFACT::Factory, "edifact" ); |
| 132 | |
| 133 | /////////////////////////////////////////////////////////////////////////////// |
| 134 | |
| 135 | |
| 136 | |
| 137 | /////////////////////////////////////////////////////////////////////////////// |
| 138 | |
| 139 | LexerEDIFACT::LexerEDIFACT() : DefaultLexer("edifact" , SCLEX_EDIFACT) |
| 140 | { |
| 141 | m_bFold = false; |
| 142 | m_bHighlightAllUN = false; |
| 143 | m_chComponent = ':'; |
| 144 | m_chData = '+'; |
| 145 | m_chDecimal = '.'; |
| 146 | m_chRelease = '?'; |
| 147 | m_chSegment = '\''; |
| 148 | } |
| 149 | |
| 150 | void LexerEDIFACT::Lex(Sci_PositionU startPos, Sci_Position length, int, IDocument *pAccess) |
| 151 | { |
| 152 | Sci_PositionU posFinish = startPos + length; |
| 153 | InitialiseFromUNA(pAccess, posFinish); |
| 154 | |
| 155 | // Look backwards for a ' or a document beginning |
| 156 | Sci_PositionU posCurrent = FindPreviousEnd(pAccess, startPos); |
| 157 | // And jump past the ' if this was not the beginning of the document |
| 158 | if (posCurrent != 0) |
| 159 | posCurrent++; |
| 160 | |
| 161 | // Style buffer, so we're not issuing loads of notifications |
| 162 | LexAccessor styler (pAccess); |
| 163 | pAccess->StartStyling(posCurrent); |
| 164 | styler.StartSegment(posCurrent); |
| 165 | Sci_Position posSegmentStart = -1; |
| 166 | |
| 167 | while ((posCurrent < posFinish) && (posSegmentStart == -1)) |
| 168 | { |
| 169 | posCurrent = ForwardPastWhitespace(pAccess, posCurrent, posFinish); |
| 170 | // Mark whitespace as default |
| 171 | styler.ColourTo(posCurrent - 1, SCE_EDI_DEFAULT); |
| 172 | if (posCurrent >= posFinish) |
| 173 | break; |
| 174 | |
| 175 | // Does is start with 3 charaters? ie, UNH |
| 176 | char [4] = { 0 }; |
| 177 | pAccess->GetCharRange(SegmentHeader, posCurrent, 3); |
| 178 | |
| 179 | int SegmentStyle = DetectSegmentHeader(SegmentHeader); |
| 180 | if (SegmentStyle == SCE_EDI_BADSEGMENT) |
| 181 | break; |
| 182 | if (SegmentStyle == SCE_EDI_UNA) |
| 183 | { |
| 184 | posCurrent += 9; |
| 185 | styler.ColourTo(posCurrent - 1, SCE_EDI_UNA); // UNA |
| 186 | continue; |
| 187 | } |
| 188 | posSegmentStart = posCurrent; |
| 189 | posCurrent += 3; |
| 190 | |
| 191 | styler.ColourTo(posCurrent - 1, SegmentStyle); // UNH etc |
| 192 | |
| 193 | // Colour in the rest of the segment |
| 194 | for (char c; posCurrent < posFinish; posCurrent++) |
| 195 | { |
| 196 | pAccess->GetCharRange(&c, posCurrent, 1); |
| 197 | |
| 198 | if (c == m_chRelease) // ? escape character, check first, in case of ?' |
| 199 | posCurrent++; |
| 200 | else if (c == m_chSegment) // ' |
| 201 | { |
| 202 | // Make sure the whole segment is on one line. styler won't let us go back in time, so we'll settle for marking the ' as bad. |
| 203 | Sci_Position lineSegmentStart = pAccess->LineFromPosition(posSegmentStart); |
| 204 | Sci_Position lineSegmentEnd = pAccess->LineFromPosition(posCurrent); |
| 205 | if (lineSegmentStart == lineSegmentEnd) |
| 206 | styler.ColourTo(posCurrent, SCE_EDI_SEGMENTEND); |
| 207 | else |
| 208 | styler.ColourTo(posCurrent, SCE_EDI_BADSEGMENT); |
| 209 | posSegmentStart = -1; |
| 210 | posCurrent++; |
| 211 | break; |
| 212 | } |
| 213 | else if (c == m_chComponent) // : |
| 214 | styler.ColourTo(posCurrent, SCE_EDI_SEP_COMPOSITE); |
| 215 | else if (c == m_chData) // + |
| 216 | styler.ColourTo(posCurrent, SCE_EDI_SEP_ELEMENT); |
| 217 | else |
| 218 | styler.ColourTo(posCurrent, SCE_EDI_DEFAULT); |
| 219 | } |
| 220 | } |
| 221 | styler.Flush(); |
| 222 | |
| 223 | if (posSegmentStart == -1) |
| 224 | return; |
| 225 | |
| 226 | pAccess->StartStyling(posSegmentStart); |
| 227 | pAccess->SetStyleFor(posFinish - posSegmentStart, SCE_EDI_BADSEGMENT); |
| 228 | } |
| 229 | |
| 230 | void LexerEDIFACT::Fold(Sci_PositionU startPos, Sci_Position length, int, IDocument *pAccess) |
| 231 | { |
| 232 | if (!m_bFold) |
| 233 | return; |
| 234 | |
| 235 | Sci_PositionU endPos = startPos + length; |
| 236 | startPos = FindPreviousEnd(pAccess, startPos); |
| 237 | char c; |
| 238 | char [4] = { 0 }; |
| 239 | |
| 240 | bool AwaitingSegment = true; |
| 241 | Sci_PositionU currLine = pAccess->LineFromPosition(startPos); |
| 242 | int levelCurrentStyle = SC_FOLDLEVELBASE; |
| 243 | if (currLine > 0) |
| 244 | levelCurrentStyle = pAccess->GetLevel(currLine - 1); // bottom 12 bits are level |
| 245 | int indentCurrent = levelCurrentStyle & SC_FOLDLEVELNUMBERMASK; |
| 246 | int indentNext = indentCurrent; |
| 247 | |
| 248 | while (startPos < endPos) |
| 249 | { |
| 250 | pAccess->GetCharRange(&c, startPos, 1); |
| 251 | switch (c) |
| 252 | { |
| 253 | case '\t': |
| 254 | case '\r': |
| 255 | case ' ': |
| 256 | startPos++; |
| 257 | continue; |
| 258 | case '\n': |
| 259 | currLine = pAccess->LineFromPosition(startPos); |
| 260 | pAccess->SetLevel(currLine, levelCurrentStyle | indentCurrent); |
| 261 | startPos++; |
| 262 | levelCurrentStyle = SC_FOLDLEVELBASE; |
| 263 | indentCurrent = indentNext; |
| 264 | continue; |
| 265 | } |
| 266 | if (c == m_chRelease) |
| 267 | { |
| 268 | startPos += 2; |
| 269 | continue; |
| 270 | } |
| 271 | if (c == m_chSegment) |
| 272 | { |
| 273 | AwaitingSegment = true; |
| 274 | startPos++; |
| 275 | continue; |
| 276 | } |
| 277 | |
| 278 | if (!AwaitingSegment) |
| 279 | { |
| 280 | startPos++; |
| 281 | continue; |
| 282 | } |
| 283 | |
| 284 | // Segment! |
| 285 | pAccess->GetCharRange(SegmentHeader, startPos, 3); |
| 286 | if (SegmentHeader[0] != 'U' || SegmentHeader[1] != 'N') |
| 287 | { |
| 288 | startPos++; |
| 289 | continue; |
| 290 | } |
| 291 | |
| 292 | AwaitingSegment = false; |
| 293 | switch (SegmentHeader[2]) |
| 294 | { |
| 295 | case 'H': |
| 296 | case 'G': |
| 297 | indentNext++; |
| 298 | levelCurrentStyle = SC_FOLDLEVELBASE | SC_FOLDLEVELHEADERFLAG; |
| 299 | break; |
| 300 | |
| 301 | case 'T': |
| 302 | case 'E': |
| 303 | if (indentNext > 0) |
| 304 | indentNext--; |
| 305 | break; |
| 306 | } |
| 307 | |
| 308 | startPos += 3; |
| 309 | } |
| 310 | } |
| 311 | |
| 312 | Sci_Position LexerEDIFACT::InitialiseFromUNA(IDocument *pAccess, Sci_PositionU MaxLength) |
| 313 | { |
| 314 | MaxLength -= 9; // drop 9 chars, to give us room for UNA:+.? ' |
| 315 | |
| 316 | Sci_PositionU startPos = 0; |
| 317 | startPos += ForwardPastWhitespace(pAccess, 0, MaxLength); |
| 318 | if (startPos < MaxLength) |
| 319 | { |
| 320 | char bufUNA[9]; |
| 321 | pAccess->GetCharRange(bufUNA, startPos, 9); |
| 322 | |
| 323 | // Check it's UNA segment |
| 324 | if (!memcmp(bufUNA, "UNA" , 3)) |
| 325 | { |
| 326 | m_chComponent = bufUNA[3]; |
| 327 | m_chData = bufUNA[4]; |
| 328 | m_chDecimal = bufUNA[5]; |
| 329 | m_chRelease = bufUNA[6]; |
| 330 | // bufUNA [7] should be space - reserved. |
| 331 | m_chSegment = bufUNA[8]; |
| 332 | |
| 333 | return 0; // success! |
| 334 | } |
| 335 | } |
| 336 | |
| 337 | // We failed to find a UNA, so drop to defaults |
| 338 | m_chComponent = ':'; |
| 339 | m_chData = '+'; |
| 340 | m_chDecimal = '.'; |
| 341 | m_chRelease = '?'; |
| 342 | m_chSegment = '\''; |
| 343 | |
| 344 | return -1; |
| 345 | } |
| 346 | |
| 347 | Sci_Position LexerEDIFACT::ForwardPastWhitespace(IDocument *pAccess, Sci_Position startPos, Sci_Position MaxLength) const |
| 348 | { |
| 349 | char c; |
| 350 | |
| 351 | while (startPos < MaxLength) |
| 352 | { |
| 353 | pAccess->GetCharRange(&c, startPos, 1); |
| 354 | switch (c) |
| 355 | { |
| 356 | case '\t': |
| 357 | case '\r': |
| 358 | case '\n': |
| 359 | case ' ': |
| 360 | break; |
| 361 | default: |
| 362 | return startPos; |
| 363 | } |
| 364 | |
| 365 | startPos++; |
| 366 | } |
| 367 | |
| 368 | return MaxLength; |
| 369 | } |
| 370 | |
| 371 | int LexerEDIFACT::(char [3]) const |
| 372 | { |
| 373 | if ( |
| 374 | SegmentHeader[0] < 'A' || SegmentHeader[0] > 'Z' || |
| 375 | SegmentHeader[1] < 'A' || SegmentHeader[1] > 'Z' || |
| 376 | SegmentHeader[2] < 'A' || SegmentHeader[2] > 'Z') |
| 377 | return SCE_EDI_BADSEGMENT; |
| 378 | |
| 379 | if (!memcmp(SegmentHeader, "UNA" , 3)) |
| 380 | return SCE_EDI_UNA; |
| 381 | |
| 382 | if (m_bHighlightAllUN && !memcmp(SegmentHeader, "UN" , 2)) |
| 383 | return SCE_EDI_UNH; |
| 384 | else if (!memcmp(SegmentHeader, "UNH" , 3)) |
| 385 | return SCE_EDI_UNH; |
| 386 | else if (!memcmp(SegmentHeader, "UNG" , 3)) |
| 387 | return SCE_EDI_UNH; |
| 388 | |
| 389 | return SCE_EDI_SEGMENTSTART; |
| 390 | } |
| 391 | |
| 392 | // Look backwards for a ' or a document beginning |
| 393 | Sci_Position LexerEDIFACT::FindPreviousEnd(IDocument *pAccess, Sci_Position startPos) const |
| 394 | { |
| 395 | for (char c; startPos > 0; startPos--) |
| 396 | { |
| 397 | pAccess->GetCharRange(&c, startPos, 1); |
| 398 | if (c == m_chSegment) |
| 399 | return startPos; |
| 400 | } |
| 401 | // We didn't find a ', so just go with the beginning |
| 402 | return 0; |
| 403 | } |
| 404 | |
| 405 | |
| 406 | |