1/*
2 * Copyright (C) 2020-2022 Roy Qu (royqh1979@gmail.com)
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17#include "cpptokenizer.h"
18
19#include <QFile>
20#include <QTextStream>
21
22CppTokenizer::CppTokenizer()
23{
24
25}
26
27void CppTokenizer::reset()
28{
29 mTokenList.clear();
30 mBuffer.clear();
31 mBufferStr.clear();
32}
33
34void CppTokenizer::tokenize(const QStringList &buffer)
35{
36 reset();
37
38 mBuffer = buffer;
39 if (mBuffer.isEmpty())
40 return;
41 mBufferStr = mBuffer[0];
42 for (int i=1;i<mBuffer.size();i++) {
43 mBufferStr+='\n';
44 mBufferStr+=mBuffer[i];
45 }
46 mStart = mBufferStr.data();
47 mCurrent = mStart;
48 mLineCount = mStart;
49 QString s = "";
50 bool bSkipBlocks = false;
51 mCurrentLine = 1;
52 while (true) {
53 mLastToken = s;
54 s = getNextToken(true, true, bSkipBlocks);
55 simplify(s);
56 if (s.isEmpty())
57 break;
58 else
59 addToken(s,mCurrentLine);
60 }
61}
62
63void CppTokenizer::dumpTokens(const QString &fileName)
64{
65 QFile file(fileName);
66
67 if (file.open(QIODevice::WriteOnly | QIODevice::Truncate)) {
68 QTextStream stream(&file);
69 foreach (const PToken& token,mTokenList) {
70 stream<<QString("%1,%2").arg(token->line).arg(token->text)
71#if QT_VERSION >= QT_VERSION_CHECK(5,15,0)
72 <<Qt::endl;
73#else
74 <<endl;
75#endif
76 }
77 }
78}
79
80const CppTokenizer::TokenList &CppTokenizer::tokens()
81{
82 return mTokenList;
83}
84
85CppTokenizer::PToken CppTokenizer::operator[](int i)
86{
87 return mTokenList[i];
88}
89
90int CppTokenizer::tokenCount()
91{
92 return mTokenList.count();
93}
94
95void CppTokenizer::addToken(const QString &sText, int iLine)
96{
97 PToken token = std::make_shared<Token>();
98 token->text = sText;
99 token->line = iLine;
100 mTokenList.append(token);
101}
102
103void CppTokenizer::countLines()
104{
105 while ((*mLineCount != 0) && (mLineCount < mCurrent)) {
106 if (*mLineCount == '\n')
107 mCurrentLine ++;
108 mLineCount++;
109 }
110}
111
112QString CppTokenizer::getArguments()
113{
114 QChar* offset = mCurrent;
115 skipPair('(', ')');
116 QString result(offset,mCurrent-offset);
117 simplifyArgs(result);
118 if ((*mCurrent == '.') || ((*mCurrent == '-') && (*(mCurrent + 1) == '>'))) {
119 // skip '.' and '->'
120 while ( !( *mCurrent == 0
121 || *mCurrent == '('
122 || *mCurrent == ';'
123 || *mCurrent == '{'
124 || *mCurrent == '}'
125 || *mCurrent == ')'
126 || isLineChar(*mCurrent)
127 || isSpaceChar(*mCurrent)) )
128 mCurrent++;
129 }
130 skipToNextToken();
131 return result;
132}
133
134QString CppTokenizer::getForInit()
135{
136 QChar* startOffset = mCurrent;
137
138 // Step into the init statement
139 mCurrent++;
140
141 // Process until ; or end of file
142 while (true) {
143 QString s = getNextToken(true, true, false);
144 simplify(s);
145 if (!s.isEmpty())
146 addToken(s,mCurrentLine);
147 if ( (s == "") || (s == ";") || (s==":"))
148 break;
149 // : is used in for-each loop
150 }
151
152 // Skip to end of for loop
153 mCurrent = startOffset;
154 skipPair('(', ')');
155 return "";
156}
157
158QString CppTokenizer::getNextToken(bool /* bSkipParenthesis */, bool bSkipArray, bool bSkipBlock)
159{
160 QString result;
161 bool done = false;
162 while (true) {
163 skipToNextToken();
164 if (*mCurrent == 0)
165 break;
166 if (isPreprocessor()) {
167 countLines();
168 result = getPreprocessor(); // don't count preprocessor lines
169 if (result.startsWith("#include")) { // if we find
170 int delimPos = result.lastIndexOf(':');
171 if (delimPos >= 0) {
172 bool ok;
173 mCurrentLine = result.midRef(delimPos+1).toInt(&ok)-1; // fCurrLine is 0 based
174 }
175 }
176 done = (result != "");
177 } else if (isForInit()) {
178 countLines();
179 result = getForInit();
180 done = (result != "");
181 } else if (isArguments()) {
182 countLines();
183 result = getArguments();
184 done = (result != "");
185 } else if (isWord()) {
186 countLines();
187 result = getWord(false, bSkipArray, bSkipBlock);
188 done = (result != "");
189 } else if (isNumber()) {
190 countLines();
191 result = getNumber();
192 done = (result != "");
193 } else {
194 switch((*mCurrent).unicode()) {
195 case 0:
196 done = true;
197 break;
198 case '/':
199 advance();
200 break;
201 case ':':
202 if (*(mCurrent + 1) == ':') {
203 countLines();
204 mCurrent+=2;
205 // Append next token to this one
206 result = "::"+getWord(true, bSkipArray, bSkipBlock);
207 done = true;
208 } else {
209 countLines();
210 result = *mCurrent;
211 advance();
212 done = true;
213 }
214 break;
215 case '{':
216 case '}':
217 case ';':
218 case ',': //just return the brace or the ';'
219 countLines();
220 result = *mCurrent;
221 advance();
222 done = true;
223 break;
224 case '>': // keep stream operators
225 if (*(mCurrent + 1) == '>') {
226 countLines();
227 result = ">>";
228 advance();
229 done = true;
230 } else
231 advance();
232 break;
233 case '<':
234 if (*(mCurrent + 1) == '<') {
235 countLines();
236 result = "<<";
237 advance();
238 done = true;
239 } else
240 advance();
241 break;
242 default:
243 advance();
244 }
245 }
246 if (done)
247 break;
248 }
249 return result;
250}
251
252QString CppTokenizer::getNumber()
253{
254 QChar* offset = mCurrent;
255
256 if (isDigitChar(*mCurrent)) {
257 while (isDigitChar(*mCurrent) || isHexChar(*mCurrent)) {
258 advance();
259 }
260 }
261
262 QString result;
263 if (offset != mCurrent) {
264 result = QString(offset,mCurrent-offset);
265 if (*mCurrent=='.') // keep '.' for decimal
266 result += *mCurrent;
267 }
268 return result;
269}
270
271QString CppTokenizer::getPreprocessor()
272{
273 QChar *offset = mCurrent;
274 skipToEOL();
275 return QString(offset, mCurrent-offset);
276}
277
278QString CppTokenizer::getWord(bool bSkipParenthesis, bool bSkipArray, bool bSkipBlock)
279{
280 bool bFoundTemplate = false;
281 // bIsSmartPointer:=False;
282
283 // Skip spaces
284 skipToNextToken();
285
286 // Get next word...
287 QChar* offset = mCurrent;
288
289 mCurrent++;
290 // Copy the word ahead of us
291 while (isIdentChar(*mCurrent) || isDigitChar(*mCurrent))
292 mCurrent++;
293
294 QString currentWord;
295 if (offset != mCurrent) {
296 currentWord = QString(offset,mCurrent-offset);
297 }
298 // Append the operator characters and argument list to the operator word
299 if ((currentWord == "operator") ||
300 (currentWord == "operator*") ||
301 (currentWord == "operator&")) {
302 // Spaces between 'operator' and the operator itself are allowed
303 while (isSpaceChar(*mCurrent))
304 mCurrent++;
305 // Find end of operator
306 while (isOperatorChar(*mCurrent))
307 mCurrent++;
308 currentWord = QString(offset,mCurrent-offset);
309 } else if (currentWord == "template") {
310 bFoundTemplate = true;
311 }
312
313
314 QString result;
315 // We found a word...
316 if (!currentWord.isEmpty()) {
317 result = currentWord;
318 // Skip whitespace
319 skipToNextToken();
320
321 // Skip template contents, but keep template variable types
322 if (*mCurrent == '<') {
323 offset = mCurrent; //we don't skip
324 skipTemplateArgs();
325
326 if (!bFoundTemplate) {
327 result += QString(offset, mCurrent-offset);
328 skipToNextToken();
329 }
330 } else if (bSkipArray && (*mCurrent == '[')) {
331 // Append array stuff
332 while(true) {
333 offset = mCurrent;
334 skipPair('[', ']');
335 result += QString(offset,mCurrent-offset);
336 simplifyArgs(result);
337 skipToNextToken();
338 if (*mCurrent!='[') //maybe multi-dimension array
339 break;
340 }
341 } else if (bSkipBlock && (*mCurrent == '{')) {
342 skipPair('{', '}');
343 skipToNextToken();
344 }
345
346 // Keep parent/child operators
347 if (*mCurrent == '.') {
348 result+=*mCurrent;
349 mCurrent++;
350 } else if ((*mCurrent == '-') && (*(mCurrent + 1) == '>')) {
351 result+=QString(mCurrent,2);
352 mCurrent+=2;
353 } else if ((*mCurrent == ':') && (*(mCurrent + 1) == ':')) {
354 if (result != "using") {
355 result+=QString(mCurrent,2);
356 mCurrent+=2;
357 // Append next token to this one
358 QString s = getWord(bSkipParenthesis, bSkipArray, bSkipBlock);
359 result += s;
360 }
361 }
362 }
363 return result;
364}
365
366bool CppTokenizer::isArguments()
367{
368 return *mCurrent == '(';
369}
370
371bool CppTokenizer::isForInit()
372{
373 return (*mCurrent == '(') && (mLastToken == "for");
374}
375
376bool CppTokenizer::isNumber()
377{
378 return isDigitChar(*mCurrent);
379}
380
381bool CppTokenizer::isPreprocessor()
382{
383 return *mCurrent=='#';
384}
385
386bool CppTokenizer::isWord()
387{
388 bool result = isLetterChar(*mCurrent);
389 if (result && (*(mCurrent+1) == '"'))
390 result = false;
391 return result;
392}
393
394void CppTokenizer::simplify(QString &output)
395{
396 //remove \n \r;
397 QString temp;
398 for (const QChar& ch:output) {
399 if (!isLineChar(ch))
400 temp+=ch;
401 }
402 output = temp.trimmed();
403}
404
405void CppTokenizer::simplifyArgs(QString &output)
406{
407 QString temp;
408 QString lastSpace = "";
409 bool parentheseStart = true;
410 foreach (const QChar& ch,output.trimmed()) {
411 if (isSpaceChar(ch)) {
412 if (!parentheseStart)
413 lastSpace+=ch;
414 } else if (ch==','){
415 temp+=ch;
416 lastSpace = "";
417 parentheseStart = false;
418 } else if (ch=='(') {
419 temp+=ch;
420 lastSpace = "";
421 parentheseStart=true;
422 } else if (ch==')') {
423 temp+=ch;
424 lastSpace = "";
425 parentheseStart = false;
426 } else {
427 parentheseStart=false;
428 if (!lastSpace.isEmpty()) {
429 temp+=" ";
430 }
431 lastSpace = "";
432 temp+=ch;
433 }
434 }
435 output = temp;
436}
437
438void CppTokenizer::skipAssignment()
439{
440 while (true) {
441 switch ((*mCurrent).unicode()) {
442 case '(': skipPair('(', ')');
443 break;
444 case '"': skipDoubleQuotes();
445 break;
446 case '\'': skipSingleQuote();
447 break;
448 case '{': skipPair('{', '}'); // support struct initializers
449 break;
450 case '/':
451 mCurrent++;
452 break;
453 default:
454 if ((*mCurrent == 'R') && (*(mCurrent+1) == '"'))
455 skipRawString();
456 else
457 mCurrent++;
458 }
459 if (*mCurrent == ','
460 || *mCurrent ==';'
461 || *mCurrent ==')'
462 || *mCurrent =='}'
463 || *mCurrent ==0)
464 break;
465 }
466}
467
468void CppTokenizer::skipDoubleQuotes()
469{
470 mCurrent++;
471 while (!(*mCurrent=='"' || *mCurrent == 0)) {
472 if (*mCurrent == '\\')
473 mCurrent+=2; // skip escaped char
474 else
475 mCurrent++;
476 }
477 if (*mCurrent!=0) {
478 mCurrent++;
479 }
480}
481
482void CppTokenizer::skipPair(const QChar &cStart, const QChar cEnd, const QSet<QChar>& failChars)
483{
484 mCurrent++;
485 while (*mCurrent != 0) {
486 if ((*mCurrent == '(') && !failChars.contains('(')) {
487 skipPair('(', ')', failChars);
488 } else if ((*mCurrent == '[') && !failChars.contains('[')) {
489 skipPair('[', ']', failChars);
490 } else if ((*mCurrent == '{') && !failChars.contains('{')) {
491 skipPair('{', '}', failChars);
492 } else if (*mCurrent == cStart) {
493 skipPair(cStart, cEnd, failChars);
494 } else if (*mCurrent == cEnd) {
495 mCurrent++; // skip over end
496 break;
497 } else if ((*mCurrent == 'R') && (*(mCurrent+1) == '"')) {
498 if (cStart != '\'' && cStart!='\"')
499 skipRawString(); // don't do it inside AnsiString!
500 else
501 mCurrent++;
502 } else if (*mCurrent == '"') {
503 if (cStart != '\'' && cStart!='\"')
504 skipDoubleQuotes(); // don't do it inside AnsiString!
505 else
506 mCurrent++;
507 } else if (*mCurrent == '\'') {
508 if (cStart != '\'' && cStart!='\"')
509 skipSingleQuote(); // don't do it inside AnsiString!
510 else
511 mCurrent++;
512 } else if (failChars.contains(*mCurrent)) {
513 break;
514 } else {
515 mCurrent++;
516 }
517 }
518}
519
520void CppTokenizer::skipRawString()
521{
522 mCurrent++; //skip R
523 bool noEscape = false;
524 while(true) {
525 mCurrent++;
526 switch(mCurrent->unicode()) {
527 case '(':
528 noEscape = true;
529 break;
530 case ')':
531 noEscape = false;
532 break;
533 }
534 if (*mCurrent == 0)
535 break;
536 if ((*mCurrent == '"') && !noEscape)
537 break;
538 }
539 if (*mCurrent!=0)
540 mCurrent++;
541}
542
543void CppTokenizer::skipSingleQuote()
544{
545 mCurrent++;
546 while (!(*mCurrent=='\'' || *mCurrent == 0)) {
547 if (*mCurrent == '\\')
548 mCurrent+=2; // skip escaped char
549 else
550 mCurrent++;
551 }
552 if (*mCurrent!=0) {
553 mCurrent++;
554 }
555}
556
557void CppTokenizer::skipSplitLine()
558{
559 mCurrent++; // skip '\'
560 while ( isLineChar(*mCurrent)) // skip newline
561 mCurrent++;
562}
563
564void CppTokenizer::skipTemplateArgs()
565{
566 if (*mCurrent != '<')
567 return;
568 QChar* start = mCurrent;
569
570 QSet<QChar> failSet;
571 failSet.insert('{');
572 failSet.insert('}');
573 failSet.insert(';');
574 skipPair('<', '>', failSet);
575
576 // if we failed, return to where we came from
577 if (start!=mCurrent && *(mCurrent - 1) != '>')
578 mCurrent = start;
579}
580
581void CppTokenizer::skipToEOL()
582{
583 while (true) {
584 while (!isLineChar(*mCurrent) && (*mCurrent!=0)) {
585 mCurrent++;
586 }
587 if (*mCurrent==0)
588 return;
589
590 bool splitLine = (*(mCurrent - 1) == '\\');
591
592 while (isLineChar(*mCurrent))
593 mCurrent++;
594
595 if (!splitLine || *mCurrent==0)
596 break;
597 }
598}
599
600void CppTokenizer::skipToNextToken()
601{
602 while (isSpaceChar(*mCurrent) || isLineChar(*mCurrent))
603 advance();
604}
605
606bool CppTokenizer::isIdentChar(const QChar &ch)
607{
608 return ch=='_' || ch.isLetter() ;
609}
610
611void CppTokenizer::advance()
612{
613 switch(mCurrent->unicode()) {
614 case '\"': skipDoubleQuotes();
615 break;
616 case '\'': skipSingleQuote();
617 break;
618 case '/':
619 if (*(mCurrent + 1) == '=')
620 skipAssignment();
621 else
622 mCurrent++;
623 break;
624 case '=': {
625 if (mTokenList.size()>2
626 && mTokenList[mTokenList.size()-2]->text == "using") {
627 addToken("=",mCurrentLine);
628 mCurrent++;
629 } else
630 skipAssignment();
631 break;
632 }
633 case '&':
634 case '*':
635 case '!':
636 case '|':
637 case '+':
638 case '-':
639 case '~':
640 if (*(mCurrent + 1) == '=')
641 skipAssignment();
642 else
643 mCurrent++;
644 break;
645 case '\\':
646 if (isLineChar(*(mCurrent + 1)))
647 skipSplitLine();
648 else
649 mCurrent++;
650 break;
651 default:
652 if ((*mCurrent == 'R') && (*(mCurrent+1) == '"'))
653 skipRawString();
654 else
655 mCurrent++;
656 }
657}
658
659bool CppTokenizer::isLetterChar(const QChar &ch)
660{
661// return (ch>= 'A' && ch<='Z')
662// || (ch>='a' && ch<='z')
663 return isIdentChar(ch)
664 || ch == '_'
665 || ch == '*'
666 || ch == '&'
667 || ch == '~';
668}
669
670bool CppTokenizer::isHexChar(const QChar &ch)
671{
672 return (ch >= 'A' && ch<='F')
673 || (ch>='a' && ch<='f')
674 || ch == 'x'
675 || ch == 'L';
676}
677
678bool CppTokenizer::isDigitChar(const QChar &ch)
679{
680 return (ch>='0' && ch<='9');
681}
682
683bool CppTokenizer::isSpaceChar(const QChar &ch)
684{
685 return (ch == ' ' || ch == '\t');
686}
687
688bool CppTokenizer::isLineChar(const QChar &ch)
689{
690 return (ch=='\n' || ch=='\r');
691}
692
693bool CppTokenizer::isBlankChar(const QChar &ch)
694{
695 return (ch<=32);
696}
697
698bool CppTokenizer::isOperatorChar(const QChar &ch)
699{
700 switch (ch.unicode()) {
701 case '+':
702 case '-':
703 case '/':
704 case '*':
705 case '[':
706 case ']':
707 case '=':
708 case '%':
709 case '!':
710 case '&':
711 case '|':
712 case '>':
713 case '<':
714 case '^':
715 return true;
716 default:
717 return false;
718 }
719}
720
721bool CppTokenizer::currentWordEquals(QChar *wordStart, QChar *wordEnd, const QString& text)
722{
723 QString currentWord(wordStart, wordEnd-wordStart);
724 return currentWord == text;
725}
726