1 | // SExp - A S-Expression Parser for C++ |
2 | // Copyright (C) 2006 Matthias Braun <matze@braunis.de> |
3 | // 2015 Ingo Ruhnke <grumbel@gmail.com> |
4 | // |
5 | // This program is free software: you can redistribute it and/or modify |
6 | // it under the terms of the GNU General Public License as published by |
7 | // the Free Software Foundation, either version 3 of the License, or |
8 | // (at your option) any later version. |
9 | // |
10 | // This program is distributed in the hope that it will be useful, |
11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | // GNU General Public License for more details. |
14 | // |
15 | // You should have received a copy of the GNU General Public License |
16 | // along with this program. If not, see <http://www.gnu.org/licenses/>. |
17 | |
18 | #include "sexp/lexer.hpp" |
19 | |
20 | #include <assert.h> |
21 | #include <string.h> |
22 | #include <sstream> |
23 | #include <stdexcept> |
24 | #include <stdio.h> |
25 | |
26 | namespace sexp { |
27 | |
28 | Lexer::Lexer(std::istream& newstream, bool use_arrays) : |
29 | m_stream(newstream), |
30 | m_use_arrays(use_arrays), |
31 | m_eof(false), |
32 | m_linenumber(0), |
33 | m_bufend(), |
34 | m_bufpos(), |
35 | m_c(), |
36 | m_token_string() |
37 | { |
38 | // trigger a refill of the buffer |
39 | m_bufpos = nullptr; |
40 | m_bufend = nullptr; |
41 | next_char(); |
42 | } |
43 | |
44 | Lexer::~Lexer() |
45 | { |
46 | } |
47 | |
48 | void |
49 | Lexer::next_char() |
50 | { |
51 | if (m_bufpos >= m_bufend || (m_bufpos == nullptr && m_bufend == nullptr) /* Initial refill trigger */) { |
52 | if (m_eof) { |
53 | m_c = EOF; |
54 | return; |
55 | } |
56 | m_stream.read(m_buffer, BUFFER_SIZE); |
57 | std::streamsize bytes_read = m_stream.gcount(); |
58 | |
59 | m_bufpos = m_buffer; |
60 | m_bufend = m_buffer + bytes_read; |
61 | |
62 | // the following is a hack that appends an additional ' ' at the end of |
63 | // the file to avoid problems when parsing symbols/elements and a sudden |
64 | // EOF. This is faster than relying on unget and IMO also nicer. |
65 | if (bytes_read == 0 || m_stream.eof()) { |
66 | m_eof = true; |
67 | *m_bufend = ' '; |
68 | ++m_bufend; |
69 | } |
70 | } |
71 | |
72 | if (m_bufpos == nullptr) { |
73 | return; |
74 | } |
75 | |
76 | m_c = *m_bufpos++; |
77 | if (m_c == '\n') { |
78 | ++m_linenumber; |
79 | } |
80 | } |
81 | |
82 | void |
83 | Lexer::add_char() |
84 | { |
85 | m_token_string += static_cast<char>(m_c); |
86 | next_char(); |
87 | } |
88 | |
89 | Lexer::TokenType |
90 | Lexer::get_next_token() |
91 | { |
92 | static const char* delims = "\"();" ; |
93 | |
94 | while(isspace(m_c)) { |
95 | next_char(); |
96 | } |
97 | |
98 | m_token_string.clear(); |
99 | |
100 | switch(m_c) |
101 | { |
102 | case ';': // comment |
103 | while(m_c != '\n') { |
104 | next_char(); |
105 | } |
106 | return get_next_token(); // and again |
107 | |
108 | case '(': |
109 | next_char(); |
110 | if (m_use_arrays) |
111 | { |
112 | return TOKEN_ARRAY_START; |
113 | } |
114 | else |
115 | { |
116 | return TOKEN_OPEN_PAREN; |
117 | } |
118 | case ')': |
119 | next_char(); |
120 | return TOKEN_CLOSE_PAREN; |
121 | |
122 | case '"': { // string |
123 | int startline = m_linenumber; |
124 | while(1) { |
125 | next_char(); |
126 | switch(m_c) { |
127 | case '"': |
128 | next_char(); |
129 | goto string_finished; |
130 | case '\r': |
131 | continue; |
132 | case '\n': |
133 | break; |
134 | case '\\': |
135 | next_char(); |
136 | switch(m_c) { |
137 | case 'n': |
138 | m_c = '\n'; |
139 | break; |
140 | case 't': |
141 | m_c = '\t'; |
142 | break; |
143 | } |
144 | break; |
145 | case EOF: { |
146 | std::stringstream msg; |
147 | msg << "Parse error in line " << startline << ": " |
148 | << "EOF while parsing string." ; |
149 | throw std::runtime_error(msg.str()); |
150 | } |
151 | default: |
152 | break; |
153 | } |
154 | m_token_string += static_cast<char>(m_c); |
155 | } |
156 | string_finished: |
157 | return TOKEN_STRING; |
158 | } |
159 | case '#': // constant |
160 | next_char(); |
161 | |
162 | if (m_c == '(') |
163 | { |
164 | next_char(); |
165 | return TOKEN_ARRAY_START; |
166 | } |
167 | else |
168 | { |
169 | while(isalnum(m_c) || m_c == '_') { |
170 | add_char(); |
171 | } |
172 | |
173 | if (m_token_string == "t" ) |
174 | { |
175 | return TOKEN_TRUE; |
176 | } |
177 | else if (m_token_string == "f" ) |
178 | { |
179 | return TOKEN_FALSE; |
180 | } |
181 | else |
182 | { |
183 | // we only handle #t and #f constants at the moment... |
184 | std::stringstream msg; |
185 | msg << "Parse Error in line " << m_linenumber << ": " |
186 | << "Unknown constant '" << m_token_string << "'." ; |
187 | throw std::runtime_error(msg.str()); |
188 | } |
189 | } |
190 | |
191 | case EOF: |
192 | return TOKEN_EOF; |
193 | |
194 | default: |
195 | { |
196 | enum { |
197 | STATE_INIT, |
198 | STATE_SYMBOL, |
199 | STATE_MAYBE_DOT, |
200 | STATE_MAYBE_INTEGER_SIGN, |
201 | STATE_MAYBE_INTEGER_PART, |
202 | STATE_MAYBE_FRACTIONAL_START, |
203 | STATE_MAYBE_FRACTIONAL_PART, |
204 | STATE_MAYBE_EXPONENT_SIGN, |
205 | STATE_MAYBE_EXPONENT_START, |
206 | STATE_MAYBE_EXPONENT_PART, |
207 | } state = STATE_INIT; |
208 | |
209 | bool has_integer_part = false; |
210 | bool has_fractional_part = false; |
211 | do |
212 | { |
213 | switch(state) |
214 | { |
215 | case STATE_INIT: |
216 | if (isdigit(m_c)) { |
217 | has_integer_part = true; |
218 | state = STATE_MAYBE_INTEGER_PART; |
219 | } else if (m_c == '-' || m_c == '+') { |
220 | state = STATE_MAYBE_INTEGER_SIGN; |
221 | } else if (m_c == '.') { |
222 | state = STATE_MAYBE_DOT; |
223 | } else { |
224 | state = STATE_SYMBOL; |
225 | } |
226 | break; |
227 | |
228 | case STATE_SYMBOL: |
229 | break; |
230 | |
231 | case STATE_MAYBE_DOT: |
232 | if (isdigit(m_c)) { |
233 | state = STATE_MAYBE_FRACTIONAL_START; |
234 | } else { |
235 | state = STATE_SYMBOL; |
236 | } |
237 | break; |
238 | |
239 | case STATE_MAYBE_INTEGER_SIGN: |
240 | if (isdigit(m_c)) { |
241 | has_integer_part = true; |
242 | state = STATE_MAYBE_INTEGER_PART; |
243 | } else if (m_c == '.') { |
244 | state = STATE_MAYBE_FRACTIONAL_START; |
245 | } |
246 | break; |
247 | |
248 | case STATE_MAYBE_INTEGER_PART: |
249 | if (isdigit(m_c)) { |
250 | // loop |
251 | } else if (m_c == '.') { |
252 | state = STATE_MAYBE_FRACTIONAL_START; |
253 | } else if (m_c == 'e' || m_c == 'E') { |
254 | state = STATE_MAYBE_EXPONENT_SIGN; |
255 | } else { |
256 | state = STATE_SYMBOL; |
257 | } |
258 | break; |
259 | |
260 | case STATE_MAYBE_FRACTIONAL_START: |
261 | if (isdigit(m_c)) { |
262 | has_fractional_part = true; |
263 | state = STATE_MAYBE_FRACTIONAL_PART; |
264 | } else { |
265 | state = STATE_SYMBOL; |
266 | } |
267 | break; |
268 | |
269 | case STATE_MAYBE_FRACTIONAL_PART: |
270 | if (isdigit(m_c)) { |
271 | // loop |
272 | } else if ((has_integer_part || has_fractional_part) && |
273 | (m_c == 'e' || m_c == 'E')) { |
274 | state = STATE_MAYBE_EXPONENT_SIGN; |
275 | } else { |
276 | state = STATE_SYMBOL; |
277 | } |
278 | break; |
279 | |
280 | case STATE_MAYBE_EXPONENT_SIGN: |
281 | if (m_c == '-' || m_c == '+') { |
282 | state = STATE_MAYBE_EXPONENT_START; |
283 | } else if (isdigit(m_c)) { |
284 | state = STATE_MAYBE_EXPONENT_PART; |
285 | } else { |
286 | state = STATE_SYMBOL; |
287 | } |
288 | break; |
289 | |
290 | case STATE_MAYBE_EXPONENT_START: |
291 | if (isdigit(m_c)) { |
292 | state = STATE_MAYBE_EXPONENT_PART; |
293 | } else { |
294 | state = STATE_SYMBOL; |
295 | } |
296 | break; |
297 | |
298 | case STATE_MAYBE_EXPONENT_PART: |
299 | if (isdigit(m_c)) { |
300 | // loop |
301 | } else { |
302 | state = STATE_SYMBOL; |
303 | } |
304 | break; |
305 | } |
306 | |
307 | add_char(); |
308 | } |
309 | while(!isspace(m_c) && !strchr(delims, m_c)); |
310 | |
311 | switch(state) |
312 | { |
313 | case STATE_INIT: |
314 | assert(false && "never reached" ); |
315 | return TOKEN_EOF; |
316 | |
317 | case STATE_SYMBOL: |
318 | return TOKEN_SYMBOL; |
319 | |
320 | case STATE_MAYBE_DOT: |
321 | return TOKEN_DOT; |
322 | |
323 | case STATE_MAYBE_INTEGER_SIGN: |
324 | return TOKEN_SYMBOL; |
325 | |
326 | case STATE_MAYBE_INTEGER_PART: |
327 | return TOKEN_INTEGER; |
328 | |
329 | case STATE_MAYBE_FRACTIONAL_START: |
330 | if (has_integer_part) { |
331 | return TOKEN_REAL; |
332 | } else { |
333 | return TOKEN_SYMBOL; |
334 | } |
335 | |
336 | case STATE_MAYBE_FRACTIONAL_PART: |
337 | return TOKEN_REAL; |
338 | |
339 | case STATE_MAYBE_EXPONENT_SIGN: |
340 | case STATE_MAYBE_EXPONENT_START: |
341 | return TOKEN_SYMBOL; |
342 | |
343 | case STATE_MAYBE_EXPONENT_PART: |
344 | return TOKEN_REAL; |
345 | } |
346 | } |
347 | assert(false && "never reached" ); |
348 | return TOKEN_EOF; |
349 | } |
350 | } |
351 | |
352 | } // namespace sexp |
353 | |
354 | /* EOF */ |
355 | |