| 1 | // SExp - A S-Expression Parser for C++ |
| 2 | // Copyright (C) 2006 Matthias Braun <matze@braunis.de> |
| 3 | // 2015 Ingo Ruhnke <grumbel@gmail.com> |
| 4 | // |
| 5 | // This program is free software: you can redistribute it and/or modify |
| 6 | // it under the terms of the GNU General Public License as published by |
| 7 | // the Free Software Foundation, either version 3 of the License, or |
| 8 | // (at your option) any later version. |
| 9 | // |
| 10 | // This program is distributed in the hope that it will be useful, |
| 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 13 | // GNU General Public License for more details. |
| 14 | // |
| 15 | // You should have received a copy of the GNU General Public License |
| 16 | // along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 17 | |
| 18 | #include "sexp/lexer.hpp" |
| 19 | |
| 20 | #include <assert.h> |
| 21 | #include <string.h> |
| 22 | #include <sstream> |
| 23 | #include <stdexcept> |
| 24 | #include <stdio.h> |
| 25 | |
| 26 | namespace sexp { |
| 27 | |
| 28 | Lexer::Lexer(std::istream& newstream, bool use_arrays) : |
| 29 | m_stream(newstream), |
| 30 | m_use_arrays(use_arrays), |
| 31 | m_eof(false), |
| 32 | m_linenumber(0), |
| 33 | m_bufend(), |
| 34 | m_bufpos(), |
| 35 | m_c(), |
| 36 | m_token_string() |
| 37 | { |
| 38 | // trigger a refill of the buffer |
| 39 | m_bufpos = nullptr; |
| 40 | m_bufend = nullptr; |
| 41 | next_char(); |
| 42 | } |
| 43 | |
| 44 | Lexer::~Lexer() |
| 45 | { |
| 46 | } |
| 47 | |
| 48 | void |
| 49 | Lexer::next_char() |
| 50 | { |
| 51 | if (m_bufpos >= m_bufend || (m_bufpos == nullptr && m_bufend == nullptr) /* Initial refill trigger */) { |
| 52 | if (m_eof) { |
| 53 | m_c = EOF; |
| 54 | return; |
| 55 | } |
| 56 | m_stream.read(m_buffer, BUFFER_SIZE); |
| 57 | std::streamsize bytes_read = m_stream.gcount(); |
| 58 | |
| 59 | m_bufpos = m_buffer; |
| 60 | m_bufend = m_buffer + bytes_read; |
| 61 | |
| 62 | // the following is a hack that appends an additional ' ' at the end of |
| 63 | // the file to avoid problems when parsing symbols/elements and a sudden |
| 64 | // EOF. This is faster than relying on unget and IMO also nicer. |
| 65 | if (bytes_read == 0 || m_stream.eof()) { |
| 66 | m_eof = true; |
| 67 | *m_bufend = ' '; |
| 68 | ++m_bufend; |
| 69 | } |
| 70 | } |
| 71 | |
| 72 | if (m_bufpos == nullptr) { |
| 73 | return; |
| 74 | } |
| 75 | |
| 76 | m_c = *m_bufpos++; |
| 77 | if (m_c == '\n') { |
| 78 | ++m_linenumber; |
| 79 | } |
| 80 | } |
| 81 | |
| 82 | void |
| 83 | Lexer::add_char() |
| 84 | { |
| 85 | m_token_string += static_cast<char>(m_c); |
| 86 | next_char(); |
| 87 | } |
| 88 | |
| 89 | Lexer::TokenType |
| 90 | Lexer::get_next_token() |
| 91 | { |
| 92 | static const char* delims = "\"();" ; |
| 93 | |
| 94 | while(isspace(m_c)) { |
| 95 | next_char(); |
| 96 | } |
| 97 | |
| 98 | m_token_string.clear(); |
| 99 | |
| 100 | switch(m_c) |
| 101 | { |
| 102 | case ';': // comment |
| 103 | while(m_c != '\n') { |
| 104 | next_char(); |
| 105 | } |
| 106 | return get_next_token(); // and again |
| 107 | |
| 108 | case '(': |
| 109 | next_char(); |
| 110 | if (m_use_arrays) |
| 111 | { |
| 112 | return TOKEN_ARRAY_START; |
| 113 | } |
| 114 | else |
| 115 | { |
| 116 | return TOKEN_OPEN_PAREN; |
| 117 | } |
| 118 | case ')': |
| 119 | next_char(); |
| 120 | return TOKEN_CLOSE_PAREN; |
| 121 | |
| 122 | case '"': { // string |
| 123 | int startline = m_linenumber; |
| 124 | while(1) { |
| 125 | next_char(); |
| 126 | switch(m_c) { |
| 127 | case '"': |
| 128 | next_char(); |
| 129 | goto string_finished; |
| 130 | case '\r': |
| 131 | continue; |
| 132 | case '\n': |
| 133 | break; |
| 134 | case '\\': |
| 135 | next_char(); |
| 136 | switch(m_c) { |
| 137 | case 'n': |
| 138 | m_c = '\n'; |
| 139 | break; |
| 140 | case 't': |
| 141 | m_c = '\t'; |
| 142 | break; |
| 143 | } |
| 144 | break; |
| 145 | case EOF: { |
| 146 | std::stringstream msg; |
| 147 | msg << "Parse error in line " << startline << ": " |
| 148 | << "EOF while parsing string." ; |
| 149 | throw std::runtime_error(msg.str()); |
| 150 | } |
| 151 | default: |
| 152 | break; |
| 153 | } |
| 154 | m_token_string += static_cast<char>(m_c); |
| 155 | } |
| 156 | string_finished: |
| 157 | return TOKEN_STRING; |
| 158 | } |
| 159 | case '#': // constant |
| 160 | next_char(); |
| 161 | |
| 162 | if (m_c == '(') |
| 163 | { |
| 164 | next_char(); |
| 165 | return TOKEN_ARRAY_START; |
| 166 | } |
| 167 | else |
| 168 | { |
| 169 | while(isalnum(m_c) || m_c == '_') { |
| 170 | add_char(); |
| 171 | } |
| 172 | |
| 173 | if (m_token_string == "t" ) |
| 174 | { |
| 175 | return TOKEN_TRUE; |
| 176 | } |
| 177 | else if (m_token_string == "f" ) |
| 178 | { |
| 179 | return TOKEN_FALSE; |
| 180 | } |
| 181 | else |
| 182 | { |
| 183 | // we only handle #t and #f constants at the moment... |
| 184 | std::stringstream msg; |
| 185 | msg << "Parse Error in line " << m_linenumber << ": " |
| 186 | << "Unknown constant '" << m_token_string << "'." ; |
| 187 | throw std::runtime_error(msg.str()); |
| 188 | } |
| 189 | } |
| 190 | |
| 191 | case EOF: |
| 192 | return TOKEN_EOF; |
| 193 | |
| 194 | default: |
| 195 | { |
| 196 | enum { |
| 197 | STATE_INIT, |
| 198 | STATE_SYMBOL, |
| 199 | STATE_MAYBE_DOT, |
| 200 | STATE_MAYBE_INTEGER_SIGN, |
| 201 | STATE_MAYBE_INTEGER_PART, |
| 202 | STATE_MAYBE_FRACTIONAL_START, |
| 203 | STATE_MAYBE_FRACTIONAL_PART, |
| 204 | STATE_MAYBE_EXPONENT_SIGN, |
| 205 | STATE_MAYBE_EXPONENT_START, |
| 206 | STATE_MAYBE_EXPONENT_PART, |
| 207 | } state = STATE_INIT; |
| 208 | |
| 209 | bool has_integer_part = false; |
| 210 | bool has_fractional_part = false; |
| 211 | do |
| 212 | { |
| 213 | switch(state) |
| 214 | { |
| 215 | case STATE_INIT: |
| 216 | if (isdigit(m_c)) { |
| 217 | has_integer_part = true; |
| 218 | state = STATE_MAYBE_INTEGER_PART; |
| 219 | } else if (m_c == '-' || m_c == '+') { |
| 220 | state = STATE_MAYBE_INTEGER_SIGN; |
| 221 | } else if (m_c == '.') { |
| 222 | state = STATE_MAYBE_DOT; |
| 223 | } else { |
| 224 | state = STATE_SYMBOL; |
| 225 | } |
| 226 | break; |
| 227 | |
| 228 | case STATE_SYMBOL: |
| 229 | break; |
| 230 | |
| 231 | case STATE_MAYBE_DOT: |
| 232 | if (isdigit(m_c)) { |
| 233 | state = STATE_MAYBE_FRACTIONAL_START; |
| 234 | } else { |
| 235 | state = STATE_SYMBOL; |
| 236 | } |
| 237 | break; |
| 238 | |
| 239 | case STATE_MAYBE_INTEGER_SIGN: |
| 240 | if (isdigit(m_c)) { |
| 241 | has_integer_part = true; |
| 242 | state = STATE_MAYBE_INTEGER_PART; |
| 243 | } else if (m_c == '.') { |
| 244 | state = STATE_MAYBE_FRACTIONAL_START; |
| 245 | } |
| 246 | break; |
| 247 | |
| 248 | case STATE_MAYBE_INTEGER_PART: |
| 249 | if (isdigit(m_c)) { |
| 250 | // loop |
| 251 | } else if (m_c == '.') { |
| 252 | state = STATE_MAYBE_FRACTIONAL_START; |
| 253 | } else if (m_c == 'e' || m_c == 'E') { |
| 254 | state = STATE_MAYBE_EXPONENT_SIGN; |
| 255 | } else { |
| 256 | state = STATE_SYMBOL; |
| 257 | } |
| 258 | break; |
| 259 | |
| 260 | case STATE_MAYBE_FRACTIONAL_START: |
| 261 | if (isdigit(m_c)) { |
| 262 | has_fractional_part = true; |
| 263 | state = STATE_MAYBE_FRACTIONAL_PART; |
| 264 | } else { |
| 265 | state = STATE_SYMBOL; |
| 266 | } |
| 267 | break; |
| 268 | |
| 269 | case STATE_MAYBE_FRACTIONAL_PART: |
| 270 | if (isdigit(m_c)) { |
| 271 | // loop |
| 272 | } else if ((has_integer_part || has_fractional_part) && |
| 273 | (m_c == 'e' || m_c == 'E')) { |
| 274 | state = STATE_MAYBE_EXPONENT_SIGN; |
| 275 | } else { |
| 276 | state = STATE_SYMBOL; |
| 277 | } |
| 278 | break; |
| 279 | |
| 280 | case STATE_MAYBE_EXPONENT_SIGN: |
| 281 | if (m_c == '-' || m_c == '+') { |
| 282 | state = STATE_MAYBE_EXPONENT_START; |
| 283 | } else if (isdigit(m_c)) { |
| 284 | state = STATE_MAYBE_EXPONENT_PART; |
| 285 | } else { |
| 286 | state = STATE_SYMBOL; |
| 287 | } |
| 288 | break; |
| 289 | |
| 290 | case STATE_MAYBE_EXPONENT_START: |
| 291 | if (isdigit(m_c)) { |
| 292 | state = STATE_MAYBE_EXPONENT_PART; |
| 293 | } else { |
| 294 | state = STATE_SYMBOL; |
| 295 | } |
| 296 | break; |
| 297 | |
| 298 | case STATE_MAYBE_EXPONENT_PART: |
| 299 | if (isdigit(m_c)) { |
| 300 | // loop |
| 301 | } else { |
| 302 | state = STATE_SYMBOL; |
| 303 | } |
| 304 | break; |
| 305 | } |
| 306 | |
| 307 | add_char(); |
| 308 | } |
| 309 | while(!isspace(m_c) && !strchr(delims, m_c)); |
| 310 | |
| 311 | switch(state) |
| 312 | { |
| 313 | case STATE_INIT: |
| 314 | assert(false && "never reached" ); |
| 315 | return TOKEN_EOF; |
| 316 | |
| 317 | case STATE_SYMBOL: |
| 318 | return TOKEN_SYMBOL; |
| 319 | |
| 320 | case STATE_MAYBE_DOT: |
| 321 | return TOKEN_DOT; |
| 322 | |
| 323 | case STATE_MAYBE_INTEGER_SIGN: |
| 324 | return TOKEN_SYMBOL; |
| 325 | |
| 326 | case STATE_MAYBE_INTEGER_PART: |
| 327 | return TOKEN_INTEGER; |
| 328 | |
| 329 | case STATE_MAYBE_FRACTIONAL_START: |
| 330 | if (has_integer_part) { |
| 331 | return TOKEN_REAL; |
| 332 | } else { |
| 333 | return TOKEN_SYMBOL; |
| 334 | } |
| 335 | |
| 336 | case STATE_MAYBE_FRACTIONAL_PART: |
| 337 | return TOKEN_REAL; |
| 338 | |
| 339 | case STATE_MAYBE_EXPONENT_SIGN: |
| 340 | case STATE_MAYBE_EXPONENT_START: |
| 341 | return TOKEN_SYMBOL; |
| 342 | |
| 343 | case STATE_MAYBE_EXPONENT_PART: |
| 344 | return TOKEN_REAL; |
| 345 | } |
| 346 | } |
| 347 | assert(false && "never reached" ); |
| 348 | return TOKEN_EOF; |
| 349 | } |
| 350 | } |
| 351 | |
| 352 | } // namespace sexp |
| 353 | |
| 354 | /* EOF */ |
| 355 | |