1 | /* |
2 | * Copyright (C) 2020-2022 Roy Qu (royqh1979@gmail.com) |
3 | * |
4 | * This program is free software: you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License as published by |
6 | * the Free Software Foundation, either version 3 of the License, or |
7 | * (at your option) any later version. |
8 | * |
9 | * This program is distributed in the hope that it will be useful, |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | * GNU General Public License for more details. |
13 | * |
14 | * You should have received a copy of the GNU General Public License |
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>. |
16 | */ |
17 | #include "cpptokenizer.h" |
18 | |
19 | #include <QFile> |
20 | #include <QTextStream> |
21 | |
22 | CppTokenizer::CppTokenizer() |
23 | { |
24 | |
25 | } |
26 | |
27 | void CppTokenizer::reset() |
28 | { |
29 | mTokenList.clear(); |
30 | mBuffer.clear(); |
31 | mBufferStr.clear(); |
32 | } |
33 | |
34 | void CppTokenizer::tokenize(const QStringList &buffer) |
35 | { |
36 | reset(); |
37 | |
38 | mBuffer = buffer; |
39 | if (mBuffer.isEmpty()) |
40 | return; |
41 | mBufferStr = mBuffer[0]; |
42 | for (int i=1;i<mBuffer.size();i++) { |
43 | mBufferStr+='\n'; |
44 | mBufferStr+=mBuffer[i]; |
45 | } |
46 | mStart = mBufferStr.data(); |
47 | mCurrent = mStart; |
48 | mLineCount = mStart; |
49 | QString s = "" ; |
50 | bool bSkipBlocks = false; |
51 | mCurrentLine = 1; |
52 | while (true) { |
53 | mLastToken = s; |
54 | s = getNextToken(true, true, bSkipBlocks); |
55 | simplify(s); |
56 | if (s.isEmpty()) |
57 | break; |
58 | else |
59 | addToken(s,mCurrentLine); |
60 | } |
61 | } |
62 | |
63 | void CppTokenizer::dumpTokens(const QString &fileName) |
64 | { |
65 | QFile file(fileName); |
66 | |
67 | if (file.open(QIODevice::WriteOnly | QIODevice::Truncate)) { |
68 | QTextStream stream(&file); |
69 | foreach (const PToken& token,mTokenList) { |
70 | stream<<QString("%1,%2" ).arg(token->line).arg(token->text) |
71 | #if QT_VERSION >= QT_VERSION_CHECK(5,15,0) |
72 | <<Qt::endl; |
73 | #else |
74 | <<endl; |
75 | #endif |
76 | } |
77 | } |
78 | } |
79 | |
80 | const CppTokenizer::TokenList &CppTokenizer::tokens() |
81 | { |
82 | return mTokenList; |
83 | } |
84 | |
85 | CppTokenizer::PToken CppTokenizer::operator[](int i) |
86 | { |
87 | return mTokenList[i]; |
88 | } |
89 | |
90 | int CppTokenizer::tokenCount() |
91 | { |
92 | return mTokenList.count(); |
93 | } |
94 | |
95 | void CppTokenizer::addToken(const QString &sText, int iLine) |
96 | { |
97 | PToken token = std::make_shared<Token>(); |
98 | token->text = sText; |
99 | token->line = iLine; |
100 | mTokenList.append(token); |
101 | } |
102 | |
103 | void CppTokenizer::countLines() |
104 | { |
105 | while ((*mLineCount != 0) && (mLineCount < mCurrent)) { |
106 | if (*mLineCount == '\n') |
107 | mCurrentLine ++; |
108 | mLineCount++; |
109 | } |
110 | } |
111 | |
112 | QString CppTokenizer::getArguments() |
113 | { |
114 | QChar* offset = mCurrent; |
115 | skipPair('(', ')'); |
116 | QString result(offset,mCurrent-offset); |
117 | simplifyArgs(result); |
118 | if ((*mCurrent == '.') || ((*mCurrent == '-') && (*(mCurrent + 1) == '>'))) { |
119 | // skip '.' and '->' |
120 | while ( !( *mCurrent == 0 |
121 | || *mCurrent == '(' |
122 | || *mCurrent == ';' |
123 | || *mCurrent == '{' |
124 | || *mCurrent == '}' |
125 | || *mCurrent == ')' |
126 | || isLineChar(*mCurrent) |
127 | || isSpaceChar(*mCurrent)) ) |
128 | mCurrent++; |
129 | } |
130 | skipToNextToken(); |
131 | return result; |
132 | } |
133 | |
134 | QString CppTokenizer::getForInit() |
135 | { |
136 | QChar* startOffset = mCurrent; |
137 | |
138 | // Step into the init statement |
139 | mCurrent++; |
140 | |
141 | // Process until ; or end of file |
142 | while (true) { |
143 | QString s = getNextToken(true, true, false); |
144 | simplify(s); |
145 | if (!s.isEmpty()) |
146 | addToken(s,mCurrentLine); |
147 | if ( (s == "" ) || (s == ";" ) || (s==":" )) |
148 | break; |
149 | // : is used in for-each loop |
150 | } |
151 | |
152 | // Skip to end of for loop |
153 | mCurrent = startOffset; |
154 | skipPair('(', ')'); |
155 | return "" ; |
156 | } |
157 | |
158 | QString CppTokenizer::getNextToken(bool /* bSkipParenthesis */, bool bSkipArray, bool bSkipBlock) |
159 | { |
160 | QString result; |
161 | bool done = false; |
162 | while (true) { |
163 | skipToNextToken(); |
164 | if (*mCurrent == 0) |
165 | break; |
166 | if (isPreprocessor()) { |
167 | countLines(); |
168 | result = getPreprocessor(); // don't count preprocessor lines |
169 | if (result.startsWith("#include" )) { // if we find |
170 | int delimPos = result.lastIndexOf(':'); |
171 | if (delimPos >= 0) { |
172 | bool ok; |
173 | mCurrentLine = result.midRef(delimPos+1).toInt(&ok)-1; // fCurrLine is 0 based |
174 | } |
175 | } |
176 | done = (result != "" ); |
177 | } else if (isForInit()) { |
178 | countLines(); |
179 | result = getForInit(); |
180 | done = (result != "" ); |
181 | } else if (isArguments()) { |
182 | countLines(); |
183 | result = getArguments(); |
184 | done = (result != "" ); |
185 | } else if (isWord()) { |
186 | countLines(); |
187 | result = getWord(false, bSkipArray, bSkipBlock); |
188 | done = (result != "" ); |
189 | } else if (isNumber()) { |
190 | countLines(); |
191 | result = getNumber(); |
192 | done = (result != "" ); |
193 | } else { |
194 | switch((*mCurrent).unicode()) { |
195 | case 0: |
196 | done = true; |
197 | break; |
198 | case '/': |
199 | advance(); |
200 | break; |
201 | case ':': |
202 | if (*(mCurrent + 1) == ':') { |
203 | countLines(); |
204 | mCurrent+=2; |
205 | // Append next token to this one |
206 | result = "::" +getWord(true, bSkipArray, bSkipBlock); |
207 | done = true; |
208 | } else { |
209 | countLines(); |
210 | result = *mCurrent; |
211 | advance(); |
212 | done = true; |
213 | } |
214 | break; |
215 | case '{': |
216 | case '}': |
217 | case ';': |
218 | case ',': //just return the brace or the ';' |
219 | countLines(); |
220 | result = *mCurrent; |
221 | advance(); |
222 | done = true; |
223 | break; |
224 | case '>': // keep stream operators |
225 | if (*(mCurrent + 1) == '>') { |
226 | countLines(); |
227 | result = ">>" ; |
228 | advance(); |
229 | done = true; |
230 | } else |
231 | advance(); |
232 | break; |
233 | case '<': |
234 | if (*(mCurrent + 1) == '<') { |
235 | countLines(); |
236 | result = "<<" ; |
237 | advance(); |
238 | done = true; |
239 | } else |
240 | advance(); |
241 | break; |
242 | default: |
243 | advance(); |
244 | } |
245 | } |
246 | if (done) |
247 | break; |
248 | } |
249 | return result; |
250 | } |
251 | |
252 | QString CppTokenizer::getNumber() |
253 | { |
254 | QChar* offset = mCurrent; |
255 | |
256 | if (isDigitChar(*mCurrent)) { |
257 | while (isDigitChar(*mCurrent) || isHexChar(*mCurrent)) { |
258 | advance(); |
259 | } |
260 | } |
261 | |
262 | QString result; |
263 | if (offset != mCurrent) { |
264 | result = QString(offset,mCurrent-offset); |
265 | if (*mCurrent=='.') // keep '.' for decimal |
266 | result += *mCurrent; |
267 | } |
268 | return result; |
269 | } |
270 | |
271 | QString CppTokenizer::getPreprocessor() |
272 | { |
273 | QChar *offset = mCurrent; |
274 | skipToEOL(); |
275 | return QString(offset, mCurrent-offset); |
276 | } |
277 | |
278 | QString CppTokenizer::getWord(bool bSkipParenthesis, bool bSkipArray, bool bSkipBlock) |
279 | { |
280 | bool bFoundTemplate = false; |
281 | // bIsSmartPointer:=False; |
282 | |
283 | // Skip spaces |
284 | skipToNextToken(); |
285 | |
286 | // Get next word... |
287 | QChar* offset = mCurrent; |
288 | |
289 | mCurrent++; |
290 | // Copy the word ahead of us |
291 | while (isIdentChar(*mCurrent) || isDigitChar(*mCurrent)) |
292 | mCurrent++; |
293 | |
294 | QString currentWord; |
295 | if (offset != mCurrent) { |
296 | currentWord = QString(offset,mCurrent-offset); |
297 | } |
298 | // Append the operator characters and argument list to the operator word |
299 | if ((currentWord == "operator" ) || |
300 | (currentWord == "operator*" ) || |
301 | (currentWord == "operator&" )) { |
302 | // Spaces between 'operator' and the operator itself are allowed |
303 | while (isSpaceChar(*mCurrent)) |
304 | mCurrent++; |
305 | // Find end of operator |
306 | while (isOperatorChar(*mCurrent)) |
307 | mCurrent++; |
308 | currentWord = QString(offset,mCurrent-offset); |
309 | } else if (currentWord == "template" ) { |
310 | bFoundTemplate = true; |
311 | } |
312 | |
313 | |
314 | QString result; |
315 | // We found a word... |
316 | if (!currentWord.isEmpty()) { |
317 | result = currentWord; |
318 | // Skip whitespace |
319 | skipToNextToken(); |
320 | |
321 | // Skip template contents, but keep template variable types |
322 | if (*mCurrent == '<') { |
323 | offset = mCurrent; //we don't skip |
324 | skipTemplateArgs(); |
325 | |
326 | if (!bFoundTemplate) { |
327 | result += QString(offset, mCurrent-offset); |
328 | skipToNextToken(); |
329 | } |
330 | } else if (bSkipArray && (*mCurrent == '[')) { |
331 | // Append array stuff |
332 | while(true) { |
333 | offset = mCurrent; |
334 | skipPair('[', ']'); |
335 | result += QString(offset,mCurrent-offset); |
336 | simplifyArgs(result); |
337 | skipToNextToken(); |
338 | if (*mCurrent!='[') //maybe multi-dimension array |
339 | break; |
340 | } |
341 | } else if (bSkipBlock && (*mCurrent == '{')) { |
342 | skipPair('{', '}'); |
343 | skipToNextToken(); |
344 | } |
345 | |
346 | // Keep parent/child operators |
347 | if (*mCurrent == '.') { |
348 | result+=*mCurrent; |
349 | mCurrent++; |
350 | } else if ((*mCurrent == '-') && (*(mCurrent + 1) == '>')) { |
351 | result+=QString(mCurrent,2); |
352 | mCurrent+=2; |
353 | } else if ((*mCurrent == ':') && (*(mCurrent + 1) == ':')) { |
354 | if (result != "using" ) { |
355 | result+=QString(mCurrent,2); |
356 | mCurrent+=2; |
357 | // Append next token to this one |
358 | QString s = getWord(bSkipParenthesis, bSkipArray, bSkipBlock); |
359 | result += s; |
360 | } |
361 | } |
362 | } |
363 | return result; |
364 | } |
365 | |
366 | bool CppTokenizer::isArguments() |
367 | { |
368 | return *mCurrent == '('; |
369 | } |
370 | |
371 | bool CppTokenizer::isForInit() |
372 | { |
373 | return (*mCurrent == '(') && (mLastToken == "for" ); |
374 | } |
375 | |
376 | bool CppTokenizer::isNumber() |
377 | { |
378 | return isDigitChar(*mCurrent); |
379 | } |
380 | |
381 | bool CppTokenizer::isPreprocessor() |
382 | { |
383 | return *mCurrent=='#'; |
384 | } |
385 | |
386 | bool CppTokenizer::isWord() |
387 | { |
388 | bool result = isLetterChar(*mCurrent); |
389 | if (result && (*(mCurrent+1) == '"')) |
390 | result = false; |
391 | return result; |
392 | } |
393 | |
394 | void CppTokenizer::simplify(QString &output) |
395 | { |
396 | //remove \n \r; |
397 | QString temp; |
398 | for (const QChar& ch:output) { |
399 | if (!isLineChar(ch)) |
400 | temp+=ch; |
401 | } |
402 | output = temp.trimmed(); |
403 | } |
404 | |
405 | void CppTokenizer::simplifyArgs(QString &output) |
406 | { |
407 | QString temp; |
408 | QString lastSpace = "" ; |
409 | bool parentheseStart = true; |
410 | foreach (const QChar& ch,output.trimmed()) { |
411 | if (isSpaceChar(ch)) { |
412 | if (!parentheseStart) |
413 | lastSpace+=ch; |
414 | } else if (ch==','){ |
415 | temp+=ch; |
416 | lastSpace = "" ; |
417 | parentheseStart = false; |
418 | } else if (ch=='(') { |
419 | temp+=ch; |
420 | lastSpace = "" ; |
421 | parentheseStart=true; |
422 | } else if (ch==')') { |
423 | temp+=ch; |
424 | lastSpace = "" ; |
425 | parentheseStart = false; |
426 | } else { |
427 | parentheseStart=false; |
428 | if (!lastSpace.isEmpty()) { |
429 | temp+=" " ; |
430 | } |
431 | lastSpace = "" ; |
432 | temp+=ch; |
433 | } |
434 | } |
435 | output = temp; |
436 | } |
437 | |
438 | void CppTokenizer::skipAssignment() |
439 | { |
440 | while (true) { |
441 | switch ((*mCurrent).unicode()) { |
442 | case '(': skipPair('(', ')'); |
443 | break; |
444 | case '"': skipDoubleQuotes(); |
445 | break; |
446 | case '\'': skipSingleQuote(); |
447 | break; |
448 | case '{': skipPair('{', '}'); // support struct initializers |
449 | break; |
450 | case '/': |
451 | mCurrent++; |
452 | break; |
453 | default: |
454 | if ((*mCurrent == 'R') && (*(mCurrent+1) == '"')) |
455 | skipRawString(); |
456 | else |
457 | mCurrent++; |
458 | } |
459 | if (*mCurrent == ',' |
460 | || *mCurrent ==';' |
461 | || *mCurrent ==')' |
462 | || *mCurrent =='}' |
463 | || *mCurrent ==0) |
464 | break; |
465 | } |
466 | } |
467 | |
468 | void CppTokenizer::skipDoubleQuotes() |
469 | { |
470 | mCurrent++; |
471 | while (!(*mCurrent=='"' || *mCurrent == 0)) { |
472 | if (*mCurrent == '\\') |
473 | mCurrent+=2; // skip escaped char |
474 | else |
475 | mCurrent++; |
476 | } |
477 | if (*mCurrent!=0) { |
478 | mCurrent++; |
479 | } |
480 | } |
481 | |
482 | void CppTokenizer::skipPair(const QChar &cStart, const QChar cEnd, const QSet<QChar>& failChars) |
483 | { |
484 | mCurrent++; |
485 | while (*mCurrent != 0) { |
486 | if ((*mCurrent == '(') && !failChars.contains('(')) { |
487 | skipPair('(', ')', failChars); |
488 | } else if ((*mCurrent == '[') && !failChars.contains('[')) { |
489 | skipPair('[', ']', failChars); |
490 | } else if ((*mCurrent == '{') && !failChars.contains('{')) { |
491 | skipPair('{', '}', failChars); |
492 | } else if (*mCurrent == cStart) { |
493 | skipPair(cStart, cEnd, failChars); |
494 | } else if (*mCurrent == cEnd) { |
495 | mCurrent++; // skip over end |
496 | break; |
497 | } else if ((*mCurrent == 'R') && (*(mCurrent+1) == '"')) { |
498 | if (cStart != '\'' && cStart!='\"') |
499 | skipRawString(); // don't do it inside AnsiString! |
500 | else |
501 | mCurrent++; |
502 | } else if (*mCurrent == '"') { |
503 | if (cStart != '\'' && cStart!='\"') |
504 | skipDoubleQuotes(); // don't do it inside AnsiString! |
505 | else |
506 | mCurrent++; |
507 | } else if (*mCurrent == '\'') { |
508 | if (cStart != '\'' && cStart!='\"') |
509 | skipSingleQuote(); // don't do it inside AnsiString! |
510 | else |
511 | mCurrent++; |
512 | } else if (failChars.contains(*mCurrent)) { |
513 | break; |
514 | } else { |
515 | mCurrent++; |
516 | } |
517 | } |
518 | } |
519 | |
520 | void CppTokenizer::skipRawString() |
521 | { |
522 | mCurrent++; //skip R |
523 | bool noEscape = false; |
524 | while(true) { |
525 | mCurrent++; |
526 | switch(mCurrent->unicode()) { |
527 | case '(': |
528 | noEscape = true; |
529 | break; |
530 | case ')': |
531 | noEscape = false; |
532 | break; |
533 | } |
534 | if (*mCurrent == 0) |
535 | break; |
536 | if ((*mCurrent == '"') && !noEscape) |
537 | break; |
538 | } |
539 | if (*mCurrent!=0) |
540 | mCurrent++; |
541 | } |
542 | |
543 | void CppTokenizer::skipSingleQuote() |
544 | { |
545 | mCurrent++; |
546 | while (!(*mCurrent=='\'' || *mCurrent == 0)) { |
547 | if (*mCurrent == '\\') |
548 | mCurrent+=2; // skip escaped char |
549 | else |
550 | mCurrent++; |
551 | } |
552 | if (*mCurrent!=0) { |
553 | mCurrent++; |
554 | } |
555 | } |
556 | |
557 | void CppTokenizer::skipSplitLine() |
558 | { |
559 | mCurrent++; // skip '\' |
560 | while ( isLineChar(*mCurrent)) // skip newline |
561 | mCurrent++; |
562 | } |
563 | |
564 | void CppTokenizer::skipTemplateArgs() |
565 | { |
566 | if (*mCurrent != '<') |
567 | return; |
568 | QChar* start = mCurrent; |
569 | |
570 | QSet<QChar> failSet; |
571 | failSet.insert('{'); |
572 | failSet.insert('}'); |
573 | failSet.insert(';'); |
574 | skipPair('<', '>', failSet); |
575 | |
576 | // if we failed, return to where we came from |
577 | if (start!=mCurrent && *(mCurrent - 1) != '>') |
578 | mCurrent = start; |
579 | } |
580 | |
581 | void CppTokenizer::skipToEOL() |
582 | { |
583 | while (true) { |
584 | while (!isLineChar(*mCurrent) && (*mCurrent!=0)) { |
585 | mCurrent++; |
586 | } |
587 | if (*mCurrent==0) |
588 | return; |
589 | |
590 | bool splitLine = (*(mCurrent - 1) == '\\'); |
591 | |
592 | while (isLineChar(*mCurrent)) |
593 | mCurrent++; |
594 | |
595 | if (!splitLine || *mCurrent==0) |
596 | break; |
597 | } |
598 | } |
599 | |
600 | void CppTokenizer::skipToNextToken() |
601 | { |
602 | while (isSpaceChar(*mCurrent) || isLineChar(*mCurrent)) |
603 | advance(); |
604 | } |
605 | |
606 | bool CppTokenizer::isIdentChar(const QChar &ch) |
607 | { |
608 | return ch=='_' || ch.isLetter() ; |
609 | } |
610 | |
611 | void CppTokenizer::advance() |
612 | { |
613 | switch(mCurrent->unicode()) { |
614 | case '\"': skipDoubleQuotes(); |
615 | break; |
616 | case '\'': skipSingleQuote(); |
617 | break; |
618 | case '/': |
619 | if (*(mCurrent + 1) == '=') |
620 | skipAssignment(); |
621 | else |
622 | mCurrent++; |
623 | break; |
624 | case '=': { |
625 | if (mTokenList.size()>2 |
626 | && mTokenList[mTokenList.size()-2]->text == "using" ) { |
627 | addToken("=" ,mCurrentLine); |
628 | mCurrent++; |
629 | } else |
630 | skipAssignment(); |
631 | break; |
632 | } |
633 | case '&': |
634 | case '*': |
635 | case '!': |
636 | case '|': |
637 | case '+': |
638 | case '-': |
639 | case '~': |
640 | if (*(mCurrent + 1) == '=') |
641 | skipAssignment(); |
642 | else |
643 | mCurrent++; |
644 | break; |
645 | case '\\': |
646 | if (isLineChar(*(mCurrent + 1))) |
647 | skipSplitLine(); |
648 | else |
649 | mCurrent++; |
650 | break; |
651 | default: |
652 | if ((*mCurrent == 'R') && (*(mCurrent+1) == '"')) |
653 | skipRawString(); |
654 | else |
655 | mCurrent++; |
656 | } |
657 | } |
658 | |
659 | bool CppTokenizer::isLetterChar(const QChar &ch) |
660 | { |
661 | // return (ch>= 'A' && ch<='Z') |
662 | // || (ch>='a' && ch<='z') |
663 | return isIdentChar(ch) |
664 | || ch == '_' |
665 | || ch == '*' |
666 | || ch == '&' |
667 | || ch == '~'; |
668 | } |
669 | |
670 | bool CppTokenizer::isHexChar(const QChar &ch) |
671 | { |
672 | return (ch >= 'A' && ch<='F') |
673 | || (ch>='a' && ch<='f') |
674 | || ch == 'x' |
675 | || ch == 'L'; |
676 | } |
677 | |
678 | bool CppTokenizer::isDigitChar(const QChar &ch) |
679 | { |
680 | return (ch>='0' && ch<='9'); |
681 | } |
682 | |
683 | bool CppTokenizer::isSpaceChar(const QChar &ch) |
684 | { |
685 | return (ch == ' ' || ch == '\t'); |
686 | } |
687 | |
688 | bool CppTokenizer::isLineChar(const QChar &ch) |
689 | { |
690 | return (ch=='\n' || ch=='\r'); |
691 | } |
692 | |
693 | bool CppTokenizer::isBlankChar(const QChar &ch) |
694 | { |
695 | return (ch<=32); |
696 | } |
697 | |
698 | bool CppTokenizer::isOperatorChar(const QChar &ch) |
699 | { |
700 | switch (ch.unicode()) { |
701 | case '+': |
702 | case '-': |
703 | case '/': |
704 | case '*': |
705 | case '[': |
706 | case ']': |
707 | case '=': |
708 | case '%': |
709 | case '!': |
710 | case '&': |
711 | case '|': |
712 | case '>': |
713 | case '<': |
714 | case '^': |
715 | return true; |
716 | default: |
717 | return false; |
718 | } |
719 | } |
720 | |
721 | bool CppTokenizer::currentWordEquals(QChar *wordStart, QChar *wordEnd, const QString& text) |
722 | { |
723 | QString currentWord(wordStart, wordEnd-wordStart); |
724 | return currentWord == text; |
725 | } |
726 | |