cpptokenizer.cpp source code [RedPanda-CPP/RedPandaIDE/parser/cpptokenizer.cpp]

1	/*
2	* Copyright (C) 2020-2022 Roy Qu (royqh1979@gmail.com)
3	*
4	* This program is free software: you can redistribute it and/or modify
5	* it under the terms of the GNU General Public License as published by
6	* the Free Software Foundation, either version 3 of the License, or
7	* (at your option) any later version.
8	*
9	* This program is distributed in the hope that it will be useful,
10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12	* GNU General Public License for more details.
13	*
14	* You should have received a copy of the GNU General Public License
15	* along with this program. If not, see <https://www.gnu.org/licenses/>.
16	*/
17	#include "cpptokenizer.h"
18
19	#include <QFile>
20	#include <QTextStream>
21
22	CppTokenizer::CppTokenizer()
23	{
24
25	}
26
27	void CppTokenizer::reset()
28	{
29	mTokenList.clear();
30	mBuffer.clear();
31	mBufferStr.clear();
32	}
33
34	void CppTokenizer::tokenize(const QStringList &buffer)
35	{
36	reset();
37
38	mBuffer = buffer;
39	if (mBuffer.isEmpty())
40	return;
41	mBufferStr = mBuffer [`0`];
42	for (int i=`1`;i<mBuffer.size();i++) {
43	mBufferStr +=`'\n'`;
44	mBufferStr +=mBuffer [i];
45	}
46	mStart = mBufferStr.data();
47	mCurrent = mStart;
48	mLineCount = mStart;
49	QString s = "";
50	bool bSkipBlocks = false;
51	mCurrentLine = `1`;
52	while (true) {
53	mLastToken = s;
54	s = getNextToken(true, true, bSkipBlocks);
55	simplify(s);
56	if (s.isEmpty())
57	break;
58	else
59	addToken(s,mCurrentLine);
60	}
61	}
62
63	void CppTokenizer::dumpTokens(const QString &fileName)
64	{
65	QFile file(fileName);
66
67	if (file.open(QIODevice::WriteOnly \| QIODevice::Truncate)) {
68	QTextStream stream(&file);
69	foreach (const PToken& token,mTokenList) {
70	stream <<QString ("%1,%2").arg(token ->line).arg(token ->text)
71	#if QT_VERSION >= QT_VERSION_CHECK(5,15,0)
72	<<Qt::endl;
73	#else
74	<<endl;
75	#endif
76	}
77	}
78	}
79
80	const CppTokenizer::TokenList &CppTokenizer::tokens()
81	{
82	return mTokenList;
83	}
84
85	CppTokenizer::PToken CppTokenizer::operator[](int i)
86	{
87	return mTokenList [i];
88	}
89
90	int CppTokenizer::tokenCount()
91	{
92	return mTokenList.count();
93	}
94
95	void CppTokenizer::addToken(const QString &sText, int iLine)
96	{
97	PToken token = std::make_shared<Token>();
98	token ->text = sText;
99	token ->line = iLine;
100	mTokenList.append(token);
101	}
102
103	void CppTokenizer::countLines()
104	{
105	while ((*mLineCount != `0`) && (mLineCount < mCurrent)) {
106	if (*mLineCount == `'\n'`)
107	mCurrentLine ++;
108	mLineCount++;
109	}
110	}
111
112	QString CppTokenizer::getArguments()
113	{
114	QChar* offset = mCurrent;
115	skipPair(`'('`, `')'`);
116	QString result(offset,mCurrent-offset);
117	simplifyArgs(result);
118	if ((mCurrent == `'.'`) \|\| ((mCurrent == `'-'`) && (*(mCurrent + `1`) == `'>'`))) {
119	// skip '.' and '->'
120	while ( !( *mCurrent == `0`
121	\|\| *mCurrent == `'('`
122	\|\| *mCurrent == `';'`
123	\|\| *mCurrent == `'{'`
124	\|\| *mCurrent == `'}'`
125	\|\| *mCurrent == `')'`
126	\|\| isLineChar(*mCurrent)
127	\|\| isSpaceChar(*mCurrent)) )
128	mCurrent++;
129	}
130	skipToNextToken();
131	return result;
132	}
133
134	QString CppTokenizer::getForInit()
135	{
136	QChar* startOffset = mCurrent;
137
138	// Step into the init statement
139	mCurrent++;
140
141	// Process until ; or end of file
142	while (true) {
143	QString s = getNextToken(true, true, false);
144	simplify(s);
145	if (!s.isEmpty())
146	addToken(s,mCurrentLine);
147	if ( (s == "") \|\| (s == ";") \|\| (s ==":"))
148	break;
149	// : is used in for-each loop
150	}
151
152	// Skip to end of for loop
153	mCurrent = startOffset;
154	skipPair(`'('`, `')'`);
155	return "";
156	}
157
158	QString CppTokenizer::getNextToken(bool / bSkipParenthesis /, bool bSkipArray, bool bSkipBlock)
159	{
160	QString result;
161	bool done = false;
162	while (true) {
163	skipToNextToken();
164	if (*mCurrent == `0`)
165	break;
166	if (isPreprocessor()) {
167	countLines();
168	result = getPreprocessor(); // don't count preprocessor lines
169	if (result.startsWith("#include")) { // if we find
170	int delimPos = result.lastIndexOf(`':'`);
171	if (delimPos >= `0`) {
172	bool ok;
173	mCurrentLine = result.midRef(delimPos+`1`).toInt(&ok)-`1`; // fCurrLine is 0 based
174	}
175	}
176	done = (result != "");
177	} else if (isForInit()) {
178	countLines();
179	result = getForInit();
180	done = (result != "");
181	} else if (isArguments()) {
182	countLines();
183	result = getArguments();
184	done = (result != "");
185	} else if (isWord()) {
186	countLines();
187	result = getWord(false, bSkipArray, bSkipBlock);
188	done = (result != "");
189	} else if (isNumber()) {
190	countLines();
191	result = getNumber();
192	done = (result != "");
193	} else {
194	switch((*mCurrent).unicode()) {
195	case `0`:
196	done = true;
197	break;
198	case `'/'`:
199	advance();
200	break;
201	case `':'`:
202	if (*(mCurrent + `1`) == `':'`) {
203	countLines();
204	mCurrent+=`2`;
205	// Append next token to this one
206	result = "::"+getWord(true, bSkipArray, bSkipBlock);
207	done = true;
208	} else {
209	countLines();
210	result = *mCurrent;
211	advance();
212	done = true;
213	}
214	break;
215	case `'{'`:
216	case `'}'`:
217	case `';'`:
218	case `','`: //just return the brace or the ';'
219	countLines();
220	result = *mCurrent;
221	advance();
222	done = true;
223	break;
224	case `'>'`: // keep stream operators
225	if (*(mCurrent + `1`) == `'>'`) {
226	countLines();
227	result = ">>";
228	advance();
229	done = true;
230	} else
231	advance();
232	break;
233	case `'<'`:
234	if (*(mCurrent + `1`) == `'<'`) {
235	countLines();
236	result = "<<";
237	advance();
238	done = true;
239	} else
240	advance();
241	break;
242	default:
243	advance();
244	}
245	}
246	if (done)
247	break;
248	}
249	return result;
250	}
251
252	QString CppTokenizer::getNumber()
253	{
254	QChar* offset = mCurrent;
255
256	if (isDigitChar(*mCurrent)) {
257	while (isDigitChar(mCurrent) \|\| isHexChar(mCurrent)) {
258	advance();
259	}
260	}
261
262	QString result;
263	if (offset != mCurrent) {
264	result = QString (offset,mCurrent-offset);
265	if (mCurrent ==`'.'`) // keep '.' for decimal*
266	result += *mCurrent;
267	}
268	return result;
269	}
270
271	QString CppTokenizer::getPreprocessor()
272	{
273	QChar *offset = mCurrent;
274	skipToEOL();
275	return QString (offset, mCurrent-offset);
276	}
277
278	QString CppTokenizer::getWord(bool bSkipParenthesis, bool bSkipArray, bool bSkipBlock)
279	{
280	bool bFoundTemplate = false;
281	// bIsSmartPointer:=False;
282
283	// Skip spaces
284	skipToNextToken();
285
286	// Get next word...
287	QChar* offset = mCurrent;
288
289	mCurrent++;
290	// Copy the word ahead of us
291	while (isIdentChar(mCurrent) \|\| isDigitChar(mCurrent))
292	mCurrent++;
293
294	QString currentWord;
295	if (offset != mCurrent) {
296	currentWord = QString (offset,mCurrent-offset);
297	}
298	// Append the operator characters and argument list to the operator word
299	if ((currentWord == "operator") \|\|
300	(currentWord == "operator*") \|\|
301	(currentWord == "operator&")) {
302	// Spaces between 'operator' and the operator itself are allowed
303	while (isSpaceChar(*mCurrent))
304	mCurrent++;
305	// Find end of operator
306	while (isOperatorChar(*mCurrent))
307	mCurrent++;
308	currentWord = QString (offset,mCurrent-offset);
309	} else if (currentWord == "template") {
310	bFoundTemplate = true;
311	}
312
313
314	QString result;
315	// We found a word...
316	if (!currentWord.isEmpty()) {
317	result = currentWord;
318	// Skip whitespace
319	skipToNextToken();
320
321	// Skip template contents, but keep template variable types
322	if (*mCurrent == `'<'`) {
323	offset = mCurrent; //we don't skip
324	skipTemplateArgs();
325
326	if (!bFoundTemplate) {
327	result += QString (offset, mCurrent-offset);
328	skipToNextToken();
329	}
330	} else if (bSkipArray && (*mCurrent == `'['`)) {
331	// Append array stuff
332	while(true) {
333	offset = mCurrent;
334	skipPair(`'['`, `']'`);
335	result += QString (offset,mCurrent-offset);
336	simplifyArgs(result);
337	skipToNextToken();
338	if (mCurrent !=`'['`) //maybe multi-dimension array*
339	break;
340	}
341	} else if (bSkipBlock && (*mCurrent == `'{'`)) {
342	skipPair(`'{'`, `'}'`);
343	skipToNextToken();
344	}
345
346	// Keep parent/child operators
347	if (*mCurrent == `'.'`) {
348	result +=*mCurrent;
349	mCurrent++;
350	} else if ((mCurrent == `'-'`) && ((mCurrent + `1`) == `'>'`)) {
351	result +=QString (mCurrent,`2`);
352	mCurrent+=`2`;
353	} else if ((mCurrent == `':'`) && ((mCurrent + `1`) == `':'`)) {
354	if (result != "using") {
355	result +=QString (mCurrent,`2`);
356	mCurrent+=`2`;
357	// Append next token to this one
358	QString s = getWord(bSkipParenthesis, bSkipArray, bSkipBlock);
359	result += s;
360	}
361	}
362	}
363	return result;
364	}
365
366	bool CppTokenizer::isArguments()
367	{
368	return *mCurrent == `'('`;
369	}
370
371	bool CppTokenizer::isForInit()
372	{
373	return (*mCurrent == `'('`) && (mLastToken == "for");
374	}
375
376	bool CppTokenizer::isNumber()
377	{
378	return isDigitChar(*mCurrent);
379	}
380
381	bool CppTokenizer::isPreprocessor()
382	{
383	return *mCurrent ==`'#'`;
384	}
385
386	bool CppTokenizer::isWord()
387	{
388	bool result = isLetterChar(*mCurrent);
389	if (result && (*(mCurrent+`1`) == `'"'`))
390	result = false;
391	return result;
392	}
393
394	void CppTokenizer::simplify(QString &output)
395	{
396	//remove \n \r;
397	QString temp;
398	for (const QChar& ch:output) {
399	if (!isLineChar(ch))
400	temp +=ch;
401	}
402	output = temp.trimmed();
403	}
404
405	void CppTokenizer::simplifyArgs(QString &output)
406	{
407	QString temp;
408	QString lastSpace = "";
409	bool parentheseStart = true;
410	foreach (const QChar& ch,output.trimmed()) {
411	if (isSpaceChar(ch)) {
412	if (!parentheseStart)
413	lastSpace +=ch;
414	} else if (ch ==`','`){
415	temp +=ch;
416	lastSpace = "";
417	parentheseStart = false;
418	} else if (ch ==`'('`) {
419	temp +=ch;
420	lastSpace = "";
421	parentheseStart=true;
422	} else if (ch ==`')'`) {
423	temp +=ch;
424	lastSpace = "";
425	parentheseStart = false;
426	} else {
427	parentheseStart=false;
428	if (!lastSpace.isEmpty()) {
429	temp +=" ";
430	}
431	lastSpace = "";
432	temp +=ch;
433	}
434	}
435	output = temp;
436	}
437
438	void CppTokenizer::skipAssignment()
439	{
440	while (true) {
441	switch ((*mCurrent).unicode()) {
442	case `'('`: skipPair(`'('`, `')'`);
443	break;
444	case `'"'`: skipDoubleQuotes();
445	break;
446	case `'\''`: skipSingleQuote();
447	break;
448	case `'{'`: skipPair(`'{'`, `'}'`); // support struct initializers
449	break;
450	case `'/'`:
451	mCurrent++;
452	break;
453	default:
454	if ((mCurrent == `'R'`) && ((mCurrent+`1`) == `'"'`))
455	skipRawString();
456	else
457	mCurrent++;
458	}
459	if (*mCurrent == `','`
460	\|\| *mCurrent ==`';'`
461	\|\| *mCurrent ==`')'`
462	\|\| *mCurrent ==`'}'`
463	\|\| *mCurrent ==`0`)
464	break;
465	}
466	}
467
468	void CppTokenizer::skipDoubleQuotes()
469	{
470	mCurrent++;
471	while (!(mCurrent ==`'"'` \|\| mCurrent == `0`)) {
472	if (*mCurrent == `'\\'`)
473	mCurrent+=`2`; // skip escaped char
474	else
475	mCurrent++;
476	}
477	if (*mCurrent !=`0`) {
478	mCurrent++;
479	}
480	}
481
482	void CppTokenizer::skipPair(const QChar &cStart, const QChar cEnd, const QSet<QChar>& failChars)
483	{
484	mCurrent++;
485	while (*mCurrent != `0`) {
486	if ((*mCurrent == `'('`) && !failChars.contains(`'('`)) {
487	skipPair(`'('`, `')'`, failChars);
488	} else if ((*mCurrent == `'['`) && !failChars.contains(`'['`)) {
489	skipPair(`'['`, `']'`, failChars);
490	} else if ((*mCurrent == `'{'`) && !failChars.contains(`'{'`)) {
491	skipPair(`'{'`, `'}'`, failChars);
492	} else if (*mCurrent == cStart) {
493	skipPair(cStart, cEnd, failChars);
494	} else if (*mCurrent == cEnd) {
495	mCurrent++; // skip over end
496	break;
497	} else if ((mCurrent == `'R'`) && ((mCurrent+`1`) == `'"'`)) {
498	if (cStart != `'\''` && cStart !=`'\"'`)
499	skipRawString(); // don't do it inside AnsiString!
500	else
501	mCurrent++;
502	} else if (*mCurrent == `'"'`) {
503	if (cStart != `'\''` && cStart !=`'\"'`)
504	skipDoubleQuotes(); // don't do it inside AnsiString!
505	else
506	mCurrent++;
507	} else if (*mCurrent == `'\''`) {
508	if (cStart != `'\''` && cStart !=`'\"'`)
509	skipSingleQuote(); // don't do it inside AnsiString!
510	else
511	mCurrent++;
512	} else if (failChars.contains(*mCurrent)) {
513	break;
514	} else {
515	mCurrent++;
516	}
517	}
518	}
519
520	void CppTokenizer::skipRawString()
521	{
522	mCurrent++; //skip R
523	bool noEscape = false;
524	while(true) {
525	mCurrent++;
526	switch(mCurrent->unicode()) {
527	case `'('`:
528	noEscape = true;
529	break;
530	case `')'`:
531	noEscape = false;
532	break;
533	}
534	if (*mCurrent == `0`)
535	break;
536	if ((*mCurrent == `'"'`) && !noEscape)
537	break;
538	}
539	if (*mCurrent !=`0`)
540	mCurrent++;
541	}
542
543	void CppTokenizer::skipSingleQuote()
544	{
545	mCurrent++;
546	while (!(mCurrent ==`'\''` \|\| mCurrent == `0`)) {
547	if (*mCurrent == `'\\'`)
548	mCurrent+=`2`; // skip escaped char
549	else
550	mCurrent++;
551	}
552	if (*mCurrent !=`0`) {
553	mCurrent++;
554	}
555	}
556
557	void CppTokenizer::skipSplitLine()
558	{
559	mCurrent++; // skip '\'
560	while ( isLineChar(mCurrent)) // skip newline*
561	mCurrent++;
562	}
563
564	void CppTokenizer::skipTemplateArgs()
565	{
566	if (*mCurrent != `'<'`)
567	return;
568	QChar* start = mCurrent;
569
570	QSet<QChar> failSet;
571	failSet.insert(`'{'`);
572	failSet.insert(`'}'`);
573	failSet.insert(`';'`);
574	skipPair(`'<'`, `'>'`, failSet);
575
576	// if we failed, return to where we came from
577	if (start!=mCurrent && *(mCurrent - `1`) != `'>'`)
578	mCurrent = start;
579	}
580
581	void CppTokenizer::skipToEOL()
582	{
583	while (true) {
584	while (!isLineChar(mCurrent) && (mCurrent !=`0`)) {
585	mCurrent++;
586	}
587	if (*mCurrent ==`0`)
588	return;
589
590	bool splitLine = (*(mCurrent - `1`) == `'\\'`);
591
592	while (isLineChar(*mCurrent))
593	mCurrent++;
594
595	if (!splitLine \|\| *mCurrent ==`0`)
596	break;
597	}
598	}
599
600	void CppTokenizer::skipToNextToken()
601	{
602	while (isSpaceChar(mCurrent) \|\| isLineChar(mCurrent))
603	advance();
604	}
605
606	bool CppTokenizer::isIdentChar(const QChar &ch)
607	{
608	return ch ==`'_'` \|\| ch.isLetter() ;
609	}
610
611	void CppTokenizer::advance()
612	{
613	switch(mCurrent->unicode()) {
614	case `'\"'`: skipDoubleQuotes();
615	break;
616	case `'\''`: skipSingleQuote();
617	break;
618	case `'/'`:
619	if (*(mCurrent + `1`) == `'='`)
620	skipAssignment();
621	else
622	mCurrent++;
623	break;
624	case `'='`: {
625	if (mTokenList.size()>`2`
626	&& mTokenList [mTokenList.size()-`2`]->text == "using") {
627	addToken("=",mCurrentLine);
628	mCurrent++;
629	} else
630	skipAssignment();
631	break;
632	}
633	case `'&'`:
634	case `'*'`:
635	case `'!'`:
636	case `'\|'`:
637	case `'+'`:
638	case `'-'`:
639	case `'~'`:
640	if (*(mCurrent + `1`) == `'='`)
641	skipAssignment();
642	else
643	mCurrent++;
644	break;
645	case `'\\'`:
646	if (isLineChar(*(mCurrent + `1`)))
647	skipSplitLine();
648	else
649	mCurrent++;
650	break;
651	default:
652	if ((mCurrent == `'R'`) && ((mCurrent+`1`) == `'"'`))
653	skipRawString();
654	else
655	mCurrent++;
656	}
657	}
658
659	bool CppTokenizer::isLetterChar(const QChar &ch)
660	{
661	// return (ch>= 'A' && ch<='Z')
662	// \|\| (ch>='a' && ch<='z')
663	return isIdentChar(ch)
664	\|\| ch == `'_'`
665	\|\| ch == `'*'`
666	\|\| ch == `'&'`
667	\|\| ch == `'~'`;
668	}
669
670	bool CppTokenizer::isHexChar(const QChar &ch)
671	{
672	return (ch >= `'A'` && ch <=`'F'`)
673	\|\| (ch >=`'a'` && ch <=`'f'`)
674	\|\| ch == `'x'`
675	\|\| ch == `'L'`;
676	}
677
678	bool CppTokenizer::isDigitChar(const QChar &ch)
679	{
680	return (ch >=`'0'` && ch <=`'9'`);
681	}
682
683	bool CppTokenizer::isSpaceChar(const QChar &ch)
684	{
685	return (ch == `' '` \|\| ch == `'\t'`);
686	}
687
688	bool CppTokenizer::isLineChar(const QChar &ch)
689	{
690	return (ch ==`'\n'` \|\| ch ==`'\r'`);
691	}
692
693	bool CppTokenizer::isBlankChar(const QChar &ch)
694	{
695	return (ch <=`32`);
696	}
697
698	bool CppTokenizer::isOperatorChar(const QChar &ch)
699	{
700	switch (ch.unicode()) {
701	case `'+'`:
702	case `'-'`:
703	case `'/'`:
704	case `'*'`:
705	case `'['`:
706	case `']'`:
707	case `'='`:
708	case `'%'`:
709	case `'!'`:
710	case `'&'`:
711	case `'\|'`:
712	case `'>'`:
713	case `'<'`:
714	case `'^'`:
715	return true;
716	default:
717	return false;
718	}
719	}
720
721	bool CppTokenizer::currentWordEquals(QChar wordStart, QChar wordEnd, const QString& text)
722	{
723	QString currentWord(wordStart, wordEnd-wordStart);
724	return currentWord == text;
725	}
726

Browse the source code of RedPanda-CPP/RedPandaIDE/parser/cpptokenizer.cpp