1 | // Scintilla Lexer for EDIFACT |
2 | // @file LexEDIFACT.cxx |
3 | // Written by Iain Clarke, IMCSoft & Inobiz AB. |
4 | // EDIFACT documented here: https://www.unece.org/cefact/edifact/welcome.html |
5 | // and more readably here: https://en.wikipedia.org/wiki/EDIFACT |
6 | // This code is subject to the same license terms as the rest of the scintilla project: |
7 | // The License.txt file describes the conditions under which this software may be distributed. |
8 | // |
9 | |
10 | // Header order must match order in scripts/HeaderOrder.txt |
11 | #include <cstdlib> |
12 | #include <cassert> |
13 | #include <cstring> |
14 | #include <cctype> |
15 | |
16 | #include <string> |
17 | #include <string_view> |
18 | |
19 | #include "ILexer.h" |
20 | #include "Scintilla.h" |
21 | #include "SciLexer.h" |
22 | |
23 | #include "LexAccessor.h" |
24 | #include "LexerModule.h" |
25 | #include "DefaultLexer.h" |
26 | |
27 | using namespace Scintilla; |
28 | using namespace Lexilla; |
29 | |
30 | class LexerEDIFACT : public DefaultLexer |
31 | { |
32 | public: |
33 | LexerEDIFACT(); |
34 | virtual ~LexerEDIFACT() {} // virtual destructor, as we inherit from ILexer |
35 | |
36 | static ILexer5 *Factory() { |
37 | return new LexerEDIFACT; |
38 | } |
39 | |
40 | int SCI_METHOD Version() const override |
41 | { |
42 | return lvRelease5; |
43 | } |
44 | void SCI_METHOD Release() override |
45 | { |
46 | delete this; |
47 | } |
48 | |
49 | const char * SCI_METHOD PropertyNames() override |
50 | { |
51 | return "fold\nlexer.edifact.highlight.un.all" ; |
52 | } |
53 | int SCI_METHOD PropertyType(const char *) override |
54 | { |
55 | return SC_TYPE_BOOLEAN; // Only one property! |
56 | } |
57 | const char * SCI_METHOD DescribeProperty(const char *name) override |
58 | { |
59 | if (!strcmp(name, "fold" )) |
60 | return "Whether to apply folding to document or not" ; |
61 | if (!strcmp(name, "lexer.edifact.highlight.un.all" )) |
62 | return "Whether to apply UN* highlighting to all UN segments, or just to UNH" ; |
63 | return NULL; |
64 | } |
65 | |
66 | Sci_Position SCI_METHOD PropertySet(const char *key, const char *val) override |
67 | { |
68 | if (!strcmp(key, "fold" )) |
69 | { |
70 | m_bFold = strcmp(val, "0" ) ? true : false; |
71 | return 0; |
72 | } |
73 | if (!strcmp(key, "lexer.edifact.highlight.un.all" )) // GetProperty |
74 | { |
75 | m_bHighlightAllUN = strcmp(val, "0" ) ? true : false; |
76 | return 0; |
77 | } |
78 | return -1; |
79 | } |
80 | |
81 | const char * SCI_METHOD PropertyGet(const char *key) override |
82 | { |
83 | m_lastPropertyValue = "" ; |
84 | if (!strcmp(key, "fold" )) |
85 | { |
86 | m_lastPropertyValue = m_bFold ? "1" : "0" ; |
87 | } |
88 | if (!strcmp(key, "lexer.edifact.highlight.un.all" )) // GetProperty |
89 | { |
90 | m_lastPropertyValue = m_bHighlightAllUN ? "1" : "0" ; |
91 | } |
92 | return m_lastPropertyValue.c_str(); |
93 | } |
94 | |
95 | const char * SCI_METHOD DescribeWordListSets() override |
96 | { |
97 | return NULL; |
98 | } |
99 | Sci_Position SCI_METHOD WordListSet(int, const char *) override |
100 | { |
101 | return -1; |
102 | } |
103 | void SCI_METHOD Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override; |
104 | void SCI_METHOD Fold(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override; |
105 | void * SCI_METHOD PrivateCall(int, void *) override |
106 | { |
107 | return NULL; |
108 | } |
109 | |
110 | protected: |
111 | Sci_Position InitialiseFromUNA(IDocument *pAccess, Sci_PositionU MaxLength); |
112 | Sci_Position FindPreviousEnd(IDocument *pAccess, Sci_Position startPos) const; |
113 | Sci_Position ForwardPastWhitespace(IDocument *pAccess, Sci_Position startPos, Sci_Position MaxLength) const; |
114 | int DetectSegmentHeader(char [3]) const; |
115 | |
116 | bool m_bFold; |
117 | |
118 | // property lexer.edifact.highlight.un.all |
119 | // Set to 0 to highlight only UNA segments, or 1 to highlight all UNx segments. |
120 | bool m_bHighlightAllUN; |
121 | |
122 | char m_chComponent; |
123 | char m_chData; |
124 | char m_chDecimal; |
125 | char m_chRelease; |
126 | char m_chSegment; |
127 | |
128 | std::string m_lastPropertyValue; |
129 | }; |
130 | |
// Module descriptor for the EDIFACT lexer, built from the SCLEX_EDIFACT id,
// the LexerEDIFACT::Factory creation function and the name "edifact".
LexerModule lmEDIFACT(SCLEX_EDIFACT, LexerEDIFACT::Factory, "edifact" );
132 | |
133 | /////////////////////////////////////////////////////////////////////////////// |
134 | |
135 | |
136 | |
137 | /////////////////////////////////////////////////////////////////////////////// |
138 | |
139 | LexerEDIFACT::LexerEDIFACT() : DefaultLexer("edifact" , SCLEX_EDIFACT) |
140 | { |
141 | m_bFold = false; |
142 | m_bHighlightAllUN = false; |
143 | m_chComponent = ':'; |
144 | m_chData = '+'; |
145 | m_chDecimal = '.'; |
146 | m_chRelease = '?'; |
147 | m_chSegment = '\''; |
148 | } |
149 | |
150 | void LexerEDIFACT::Lex(Sci_PositionU startPos, Sci_Position length, int, IDocument *pAccess) |
151 | { |
152 | Sci_PositionU posFinish = startPos + length; |
153 | InitialiseFromUNA(pAccess, posFinish); |
154 | |
155 | // Look backwards for a ' or a document beginning |
156 | Sci_PositionU posCurrent = FindPreviousEnd(pAccess, startPos); |
157 | // And jump past the ' if this was not the beginning of the document |
158 | if (posCurrent != 0) |
159 | posCurrent++; |
160 | |
161 | // Style buffer, so we're not issuing loads of notifications |
162 | LexAccessor styler (pAccess); |
163 | pAccess->StartStyling(posCurrent); |
164 | styler.StartSegment(posCurrent); |
165 | Sci_Position posSegmentStart = -1; |
166 | |
167 | while ((posCurrent < posFinish) && (posSegmentStart == -1)) |
168 | { |
169 | posCurrent = ForwardPastWhitespace(pAccess, posCurrent, posFinish); |
170 | // Mark whitespace as default |
171 | styler.ColourTo(posCurrent - 1, SCE_EDI_DEFAULT); |
172 | if (posCurrent >= posFinish) |
173 | break; |
174 | |
175 | // Does is start with 3 charaters? ie, UNH |
176 | char [4] = { 0 }; |
177 | pAccess->GetCharRange(SegmentHeader, posCurrent, 3); |
178 | |
179 | int SegmentStyle = DetectSegmentHeader(SegmentHeader); |
180 | if (SegmentStyle == SCE_EDI_BADSEGMENT) |
181 | break; |
182 | if (SegmentStyle == SCE_EDI_UNA) |
183 | { |
184 | posCurrent += 9; |
185 | styler.ColourTo(posCurrent - 1, SCE_EDI_UNA); // UNA |
186 | continue; |
187 | } |
188 | posSegmentStart = posCurrent; |
189 | posCurrent += 3; |
190 | |
191 | styler.ColourTo(posCurrent - 1, SegmentStyle); // UNH etc |
192 | |
193 | // Colour in the rest of the segment |
194 | for (char c; posCurrent < posFinish; posCurrent++) |
195 | { |
196 | pAccess->GetCharRange(&c, posCurrent, 1); |
197 | |
198 | if (c == m_chRelease) // ? escape character, check first, in case of ?' |
199 | posCurrent++; |
200 | else if (c == m_chSegment) // ' |
201 | { |
202 | // Make sure the whole segment is on one line. styler won't let us go back in time, so we'll settle for marking the ' as bad. |
203 | Sci_Position lineSegmentStart = pAccess->LineFromPosition(posSegmentStart); |
204 | Sci_Position lineSegmentEnd = pAccess->LineFromPosition(posCurrent); |
205 | if (lineSegmentStart == lineSegmentEnd) |
206 | styler.ColourTo(posCurrent, SCE_EDI_SEGMENTEND); |
207 | else |
208 | styler.ColourTo(posCurrent, SCE_EDI_BADSEGMENT); |
209 | posSegmentStart = -1; |
210 | posCurrent++; |
211 | break; |
212 | } |
213 | else if (c == m_chComponent) // : |
214 | styler.ColourTo(posCurrent, SCE_EDI_SEP_COMPOSITE); |
215 | else if (c == m_chData) // + |
216 | styler.ColourTo(posCurrent, SCE_EDI_SEP_ELEMENT); |
217 | else |
218 | styler.ColourTo(posCurrent, SCE_EDI_DEFAULT); |
219 | } |
220 | } |
221 | styler.Flush(); |
222 | |
223 | if (posSegmentStart == -1) |
224 | return; |
225 | |
226 | pAccess->StartStyling(posSegmentStart); |
227 | pAccess->SetStyleFor(posFinish - posSegmentStart, SCE_EDI_BADSEGMENT); |
228 | } |
229 | |
230 | void LexerEDIFACT::Fold(Sci_PositionU startPos, Sci_Position length, int, IDocument *pAccess) |
231 | { |
232 | if (!m_bFold) |
233 | return; |
234 | |
235 | Sci_PositionU endPos = startPos + length; |
236 | startPos = FindPreviousEnd(pAccess, startPos); |
237 | char c; |
238 | char [4] = { 0 }; |
239 | |
240 | bool AwaitingSegment = true; |
241 | Sci_PositionU currLine = pAccess->LineFromPosition(startPos); |
242 | int levelCurrentStyle = SC_FOLDLEVELBASE; |
243 | if (currLine > 0) |
244 | levelCurrentStyle = pAccess->GetLevel(currLine - 1); // bottom 12 bits are level |
245 | int indentCurrent = levelCurrentStyle & SC_FOLDLEVELNUMBERMASK; |
246 | int indentNext = indentCurrent; |
247 | |
248 | while (startPos < endPos) |
249 | { |
250 | pAccess->GetCharRange(&c, startPos, 1); |
251 | switch (c) |
252 | { |
253 | case '\t': |
254 | case '\r': |
255 | case ' ': |
256 | startPos++; |
257 | continue; |
258 | case '\n': |
259 | currLine = pAccess->LineFromPosition(startPos); |
260 | pAccess->SetLevel(currLine, levelCurrentStyle | indentCurrent); |
261 | startPos++; |
262 | levelCurrentStyle = SC_FOLDLEVELBASE; |
263 | indentCurrent = indentNext; |
264 | continue; |
265 | } |
266 | if (c == m_chRelease) |
267 | { |
268 | startPos += 2; |
269 | continue; |
270 | } |
271 | if (c == m_chSegment) |
272 | { |
273 | AwaitingSegment = true; |
274 | startPos++; |
275 | continue; |
276 | } |
277 | |
278 | if (!AwaitingSegment) |
279 | { |
280 | startPos++; |
281 | continue; |
282 | } |
283 | |
284 | // Segment! |
285 | pAccess->GetCharRange(SegmentHeader, startPos, 3); |
286 | if (SegmentHeader[0] != 'U' || SegmentHeader[1] != 'N') |
287 | { |
288 | startPos++; |
289 | continue; |
290 | } |
291 | |
292 | AwaitingSegment = false; |
293 | switch (SegmentHeader[2]) |
294 | { |
295 | case 'H': |
296 | case 'G': |
297 | indentNext++; |
298 | levelCurrentStyle = SC_FOLDLEVELBASE | SC_FOLDLEVELHEADERFLAG; |
299 | break; |
300 | |
301 | case 'T': |
302 | case 'E': |
303 | if (indentNext > 0) |
304 | indentNext--; |
305 | break; |
306 | } |
307 | |
308 | startPos += 3; |
309 | } |
310 | } |
311 | |
312 | Sci_Position LexerEDIFACT::InitialiseFromUNA(IDocument *pAccess, Sci_PositionU MaxLength) |
313 | { |
314 | MaxLength -= 9; // drop 9 chars, to give us room for UNA:+.? ' |
315 | |
316 | Sci_PositionU startPos = 0; |
317 | startPos += ForwardPastWhitespace(pAccess, 0, MaxLength); |
318 | if (startPos < MaxLength) |
319 | { |
320 | char bufUNA[9]; |
321 | pAccess->GetCharRange(bufUNA, startPos, 9); |
322 | |
323 | // Check it's UNA segment |
324 | if (!memcmp(bufUNA, "UNA" , 3)) |
325 | { |
326 | m_chComponent = bufUNA[3]; |
327 | m_chData = bufUNA[4]; |
328 | m_chDecimal = bufUNA[5]; |
329 | m_chRelease = bufUNA[6]; |
330 | // bufUNA [7] should be space - reserved. |
331 | m_chSegment = bufUNA[8]; |
332 | |
333 | return 0; // success! |
334 | } |
335 | } |
336 | |
337 | // We failed to find a UNA, so drop to defaults |
338 | m_chComponent = ':'; |
339 | m_chData = '+'; |
340 | m_chDecimal = '.'; |
341 | m_chRelease = '?'; |
342 | m_chSegment = '\''; |
343 | |
344 | return -1; |
345 | } |
346 | |
347 | Sci_Position LexerEDIFACT::ForwardPastWhitespace(IDocument *pAccess, Sci_Position startPos, Sci_Position MaxLength) const |
348 | { |
349 | char c; |
350 | |
351 | while (startPos < MaxLength) |
352 | { |
353 | pAccess->GetCharRange(&c, startPos, 1); |
354 | switch (c) |
355 | { |
356 | case '\t': |
357 | case '\r': |
358 | case '\n': |
359 | case ' ': |
360 | break; |
361 | default: |
362 | return startPos; |
363 | } |
364 | |
365 | startPos++; |
366 | } |
367 | |
368 | return MaxLength; |
369 | } |
370 | |
371 | int LexerEDIFACT::(char [3]) const |
372 | { |
373 | if ( |
374 | SegmentHeader[0] < 'A' || SegmentHeader[0] > 'Z' || |
375 | SegmentHeader[1] < 'A' || SegmentHeader[1] > 'Z' || |
376 | SegmentHeader[2] < 'A' || SegmentHeader[2] > 'Z') |
377 | return SCE_EDI_BADSEGMENT; |
378 | |
379 | if (!memcmp(SegmentHeader, "UNA" , 3)) |
380 | return SCE_EDI_UNA; |
381 | |
382 | if (m_bHighlightAllUN && !memcmp(SegmentHeader, "UN" , 2)) |
383 | return SCE_EDI_UNH; |
384 | else if (!memcmp(SegmentHeader, "UNH" , 3)) |
385 | return SCE_EDI_UNH; |
386 | else if (!memcmp(SegmentHeader, "UNG" , 3)) |
387 | return SCE_EDI_UNH; |
388 | |
389 | return SCE_EDI_SEGMENTSTART; |
390 | } |
391 | |
392 | // Look backwards for a ' or a document beginning |
393 | Sci_Position LexerEDIFACT::FindPreviousEnd(IDocument *pAccess, Sci_Position startPos) const |
394 | { |
395 | for (char c; startPos > 0; startPos--) |
396 | { |
397 | pAccess->GetCharRange(&c, startPos, 1); |
398 | if (c == m_chSegment) |
399 | return startPos; |
400 | } |
401 | // We didn't find a ', so just go with the beginning |
402 | return 0; |
403 | } |
404 | |
405 | |
406 | |