1// SExp - A S-Expression Parser for C++
2// Copyright (C) 2006 Matthias Braun <matze@braunis.de>
3// 2015 Ingo Ruhnke <grumbel@gmail.com>
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU General Public License for more details.
14//
15// You should have received a copy of the GNU General Public License
16// along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18#include "sexp/lexer.hpp"
19
20#include <assert.h>
21#include <string.h>
22#include <sstream>
23#include <stdexcept>
24#include <stdio.h>
25
26namespace sexp {
27
28Lexer::Lexer(std::istream& newstream, bool use_arrays) :
29 m_stream(newstream),
30 m_use_arrays(use_arrays),
31 m_eof(false),
32 m_linenumber(0),
33 m_bufend(),
34 m_bufpos(),
35 m_c(),
36 m_token_string()
37{
38 // trigger a refill of the buffer
39 m_bufpos = nullptr;
40 m_bufend = nullptr;
41 next_char();
42}
43
44Lexer::~Lexer()
45{
46}
47
48void
49Lexer::next_char()
50{
51 if (m_bufpos >= m_bufend || (m_bufpos == nullptr && m_bufend == nullptr) /* Initial refill trigger */) {
52 if (m_eof) {
53 m_c = EOF;
54 return;
55 }
56 m_stream.read(m_buffer, BUFFER_SIZE);
57 std::streamsize bytes_read = m_stream.gcount();
58
59 m_bufpos = m_buffer;
60 m_bufend = m_buffer + bytes_read;
61
62 // the following is a hack that appends an additional ' ' at the end of
63 // the file to avoid problems when parsing symbols/elements and a sudden
64 // EOF. This is faster than relying on unget and IMO also nicer.
65 if (bytes_read == 0 || m_stream.eof()) {
66 m_eof = true;
67 *m_bufend = ' ';
68 ++m_bufend;
69 }
70 }
71
72 if (m_bufpos == nullptr) {
73 return;
74 }
75
76 m_c = *m_bufpos++;
77 if (m_c == '\n') {
78 ++m_linenumber;
79 }
80}
81
82void
83Lexer::add_char()
84{
85 m_token_string += static_cast<char>(m_c);
86 next_char();
87}
88
89Lexer::TokenType
90Lexer::get_next_token()
91{
92 static const char* delims = "\"();";
93
94 while(isspace(m_c)) {
95 next_char();
96 }
97
98 m_token_string.clear();
99
100 switch(m_c)
101 {
102 case ';': // comment
103 while(m_c != '\n') {
104 next_char();
105 }
106 return get_next_token(); // and again
107
108 case '(':
109 next_char();
110 if (m_use_arrays)
111 {
112 return TOKEN_ARRAY_START;
113 }
114 else
115 {
116 return TOKEN_OPEN_PAREN;
117 }
118 case ')':
119 next_char();
120 return TOKEN_CLOSE_PAREN;
121
122 case '"': { // string
123 int startline = m_linenumber;
124 while(1) {
125 next_char();
126 switch(m_c) {
127 case '"':
128 next_char();
129 goto string_finished;
130 case '\r':
131 continue;
132 case '\n':
133 break;
134 case '\\':
135 next_char();
136 switch(m_c) {
137 case 'n':
138 m_c = '\n';
139 break;
140 case 't':
141 m_c = '\t';
142 break;
143 }
144 break;
145 case EOF: {
146 std::stringstream msg;
147 msg << "Parse error in line " << startline << ": "
148 << "EOF while parsing string.";
149 throw std::runtime_error(msg.str());
150 }
151 default:
152 break;
153 }
154 m_token_string += static_cast<char>(m_c);
155 }
156 string_finished:
157 return TOKEN_STRING;
158 }
159 case '#': // constant
160 next_char();
161
162 if (m_c == '(')
163 {
164 next_char();
165 return TOKEN_ARRAY_START;
166 }
167 else
168 {
169 while(isalnum(m_c) || m_c == '_') {
170 add_char();
171 }
172
173 if (m_token_string == "t")
174 {
175 return TOKEN_TRUE;
176 }
177 else if (m_token_string == "f")
178 {
179 return TOKEN_FALSE;
180 }
181 else
182 {
183 // we only handle #t and #f constants at the moment...
184 std::stringstream msg;
185 msg << "Parse Error in line " << m_linenumber << ": "
186 << "Unknown constant '" << m_token_string << "'.";
187 throw std::runtime_error(msg.str());
188 }
189 }
190
191 case EOF:
192 return TOKEN_EOF;
193
194 default:
195 {
196 enum {
197 STATE_INIT,
198 STATE_SYMBOL,
199 STATE_MAYBE_DOT,
200 STATE_MAYBE_INTEGER_SIGN,
201 STATE_MAYBE_INTEGER_PART,
202 STATE_MAYBE_FRACTIONAL_START,
203 STATE_MAYBE_FRACTIONAL_PART,
204 STATE_MAYBE_EXPONENT_SIGN,
205 STATE_MAYBE_EXPONENT_START,
206 STATE_MAYBE_EXPONENT_PART,
207 } state = STATE_INIT;
208
209 bool has_integer_part = false;
210 bool has_fractional_part = false;
211 do
212 {
213 switch(state)
214 {
215 case STATE_INIT:
216 if (isdigit(m_c)) {
217 has_integer_part = true;
218 state = STATE_MAYBE_INTEGER_PART;
219 } else if (m_c == '-' || m_c == '+') {
220 state = STATE_MAYBE_INTEGER_SIGN;
221 } else if (m_c == '.') {
222 state = STATE_MAYBE_DOT;
223 } else {
224 state = STATE_SYMBOL;
225 }
226 break;
227
228 case STATE_SYMBOL:
229 break;
230
231 case STATE_MAYBE_DOT:
232 if (isdigit(m_c)) {
233 state = STATE_MAYBE_FRACTIONAL_START;
234 } else {
235 state = STATE_SYMBOL;
236 }
237 break;
238
239 case STATE_MAYBE_INTEGER_SIGN:
240 if (isdigit(m_c)) {
241 has_integer_part = true;
242 state = STATE_MAYBE_INTEGER_PART;
243 } else if (m_c == '.') {
244 state = STATE_MAYBE_FRACTIONAL_START;
245 }
246 break;
247
248 case STATE_MAYBE_INTEGER_PART:
249 if (isdigit(m_c)) {
250 // loop
251 } else if (m_c == '.') {
252 state = STATE_MAYBE_FRACTIONAL_START;
253 } else if (m_c == 'e' || m_c == 'E') {
254 state = STATE_MAYBE_EXPONENT_SIGN;
255 } else {
256 state = STATE_SYMBOL;
257 }
258 break;
259
260 case STATE_MAYBE_FRACTIONAL_START:
261 if (isdigit(m_c)) {
262 has_fractional_part = true;
263 state = STATE_MAYBE_FRACTIONAL_PART;
264 } else {
265 state = STATE_SYMBOL;
266 }
267 break;
268
269 case STATE_MAYBE_FRACTIONAL_PART:
270 if (isdigit(m_c)) {
271 // loop
272 } else if ((has_integer_part || has_fractional_part) &&
273 (m_c == 'e' || m_c == 'E')) {
274 state = STATE_MAYBE_EXPONENT_SIGN;
275 } else {
276 state = STATE_SYMBOL;
277 }
278 break;
279
280 case STATE_MAYBE_EXPONENT_SIGN:
281 if (m_c == '-' || m_c == '+') {
282 state = STATE_MAYBE_EXPONENT_START;
283 } else if (isdigit(m_c)) {
284 state = STATE_MAYBE_EXPONENT_PART;
285 } else {
286 state = STATE_SYMBOL;
287 }
288 break;
289
290 case STATE_MAYBE_EXPONENT_START:
291 if (isdigit(m_c)) {
292 state = STATE_MAYBE_EXPONENT_PART;
293 } else {
294 state = STATE_SYMBOL;
295 }
296 break;
297
298 case STATE_MAYBE_EXPONENT_PART:
299 if (isdigit(m_c)) {
300 // loop
301 } else {
302 state = STATE_SYMBOL;
303 }
304 break;
305 }
306
307 add_char();
308 }
309 while(!isspace(m_c) && !strchr(delims, m_c));
310
311 switch(state)
312 {
313 case STATE_INIT:
314 assert(false && "never reached");
315 return TOKEN_EOF;
316
317 case STATE_SYMBOL:
318 return TOKEN_SYMBOL;
319
320 case STATE_MAYBE_DOT:
321 return TOKEN_DOT;
322
323 case STATE_MAYBE_INTEGER_SIGN:
324 return TOKEN_SYMBOL;
325
326 case STATE_MAYBE_INTEGER_PART:
327 return TOKEN_INTEGER;
328
329 case STATE_MAYBE_FRACTIONAL_START:
330 if (has_integer_part) {
331 return TOKEN_REAL;
332 } else {
333 return TOKEN_SYMBOL;
334 }
335
336 case STATE_MAYBE_FRACTIONAL_PART:
337 return TOKEN_REAL;
338
339 case STATE_MAYBE_EXPONENT_SIGN:
340 case STATE_MAYBE_EXPONENT_START:
341 return TOKEN_SYMBOL;
342
343 case STATE_MAYBE_EXPONENT_PART:
344 return TOKEN_REAL;
345 }
346 }
347 assert(false && "never reached");
348 return TOKEN_EOF;
349 }
350}
351
352} // namespace sexp
353
354/* EOF */
355