| 1 | // Copyright 2009-2021 Intel Corporation |
| 2 | // SPDX-License-Identifier: Apache-2.0 |
| 3 | |
#include "tokenstream.h"
#include "../math/math.h"

#include <climits>
#include <cstdlib>
| 6 | |
| 7 | namespace embree |
| 8 | { |
| 9 | /* shorthands for common sets of characters */ |
| 10 | const std::string TokenStream::alpha = "abcdefghijklmnopqrstuvwxyz" ; |
| 11 | const std::string TokenStream::ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" ; |
| 12 | const std::string TokenStream::numbers = "0123456789" ; |
| 13 | const std::string TokenStream::separators = "\n\t\r " ; |
| 14 | const std::string TokenStream::stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\" ; |
| 15 | |
/* Fills a 256-entry lookup table for fast categorization of characters:
   afterwards map[c] is true exactly for those byte values c that occur
   in chrs, and false for all others. */
static void createCharMap(bool map[256], const std::string& chrs) {
  /* start from an all-false table */
  for (size_t code = 0; code != 256; ++code)
    map[code] = false;

  /* index through uint8_t so that characters with the high bit set
     land in the upper half of the table */
  for (std::string::const_iterator it = chrs.begin(); it != chrs.end(); ++it)
    map[static_cast<uint8_t>(*it)] = true;
}
| 21 | |
| 22 | /* build full tokenizer that takes list of valid characters and keywords */ |
| 23 | TokenStream::TokenStream(const Ref<Stream<int> >& cin, //< stream to read from |
| 24 | const std::string& alpha, //< valid characters for identifiers |
| 25 | const std::string& seps, //< characters that act as separators |
| 26 | const std::vector<std::string>& symbols) //< symbols |
| 27 | : cin(cin), symbols(symbols) |
| 28 | { |
| 29 | createCharMap(isAlphaMap,alpha); |
| 30 | createCharMap(isSepMap,seps); |
| 31 | createCharMap(isStringCharMap,stringChars); |
| 32 | } |
| 33 | |
| 34 | bool TokenStream::decDigits(std::string& str_o) |
| 35 | { |
| 36 | bool ok = false; |
| 37 | std::string str; |
| 38 | if (cin->peek() == '+' || cin->peek() == '-') str += (char)cin->get(); |
| 39 | while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); } |
| 40 | if (ok) str_o += str; |
| 41 | else cin->unget(str.size()); |
| 42 | return ok; |
| 43 | } |
| 44 | |
| 45 | bool TokenStream::decDigits1(std::string& str_o) |
| 46 | { |
| 47 | bool ok = false; |
| 48 | std::string str; |
| 49 | while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); } |
| 50 | if (ok) str_o += str; else cin->unget(str.size()); |
| 51 | return ok; |
| 52 | } |
| 53 | |
| 54 | bool TokenStream::trySymbol(const std::string& symbol) |
| 55 | { |
| 56 | size_t pos = 0; |
| 57 | while (pos < symbol.size()) { |
| 58 | if (symbol[pos] != cin->peek()) { cin->unget(pos); return false; } |
| 59 | cin->drop(); pos++; |
| 60 | } |
| 61 | return true; |
| 62 | } |
| 63 | |
| 64 | bool TokenStream::trySymbols(Token& token, const ParseLocation& loc) |
| 65 | { |
| 66 | for (size_t i=0; i<symbols.size(); i++) { |
| 67 | if (!trySymbol(symbols[i])) continue; |
| 68 | token = Token(symbols[i],Token::TY_SYMBOL,loc); |
| 69 | return true; |
| 70 | } |
| 71 | return false; |
| 72 | } |
| 73 | |
| 74 | bool TokenStream::tryFloat(Token& token, const ParseLocation& loc) |
| 75 | { |
| 76 | bool ok = false; |
| 77 | std::string str; |
| 78 | if (trySymbol("nan" )) { |
| 79 | token = Token(float(nan)); |
| 80 | return true; |
| 81 | } |
| 82 | if (trySymbol("+inf" )) { |
| 83 | token = Token(float(pos_inf)); |
| 84 | return true; |
| 85 | } |
| 86 | if (trySymbol("-inf" )) { |
| 87 | token = Token(float(neg_inf)); |
| 88 | return true; |
| 89 | } |
| 90 | |
| 91 | if (decDigits(str)) |
| 92 | { |
| 93 | if (cin->peek() == '.') { |
| 94 | str += (char)cin->get(); |
| 95 | decDigits(str); |
| 96 | if (cin->peek() == 'e' || cin->peek() == 'E') { |
| 97 | str += (char)cin->get(); |
| 98 | if (decDigits(str)) ok = true; // 1.[2]E2 |
| 99 | } |
| 100 | else ok = true; // 1.[2] |
| 101 | } |
| 102 | else if (cin->peek() == 'e' || cin->peek() == 'E') { |
| 103 | str += (char)cin->get(); |
| 104 | if (decDigits(str)) ok = true; // 1E2 |
| 105 | } |
| 106 | } |
| 107 | else |
| 108 | { |
| 109 | if (cin->peek() == '.') { |
| 110 | str += (char)cin->get(); |
| 111 | if (decDigits(str)) { |
| 112 | if (cin->peek() == 'e' || cin->peek() == 'E') { |
| 113 | str += (char)cin->get(); |
| 114 | if (decDigits(str)) ok = true; // .3E2 |
| 115 | } |
| 116 | else ok = true; // .3 |
| 117 | } |
| 118 | } |
| 119 | } |
| 120 | if (ok) { |
| 121 | token = Token((float)atof(str.c_str()),loc); |
| 122 | } |
| 123 | else cin->unget(str.size()); |
| 124 | return ok; |
| 125 | } |
| 126 | |
| 127 | bool TokenStream::tryInt(Token& token, const ParseLocation& loc) { |
| 128 | std::string str; |
| 129 | if (decDigits(str)) { |
| 130 | token = Token(atoi(str.c_str()),loc); |
| 131 | return true; |
| 132 | } |
| 133 | return false; |
| 134 | } |
| 135 | |
| 136 | bool TokenStream::tryString(Token& token, const ParseLocation& loc) |
| 137 | { |
| 138 | std::string str; |
| 139 | if (cin->peek() != '\"') return false; |
| 140 | cin->drop(); |
| 141 | while (cin->peek() != '\"') { |
| 142 | const int c = cin->get(); |
| 143 | if (!isStringChar(c)) THROW_RUNTIME_ERROR("invalid string character " +std::string(1,c)+" at " +loc.str()); |
| 144 | str += (char)c; |
| 145 | } |
| 146 | cin->drop(); |
| 147 | token = Token(str,Token::TY_STRING,loc); |
| 148 | return true; |
| 149 | } |
| 150 | |
| 151 | bool TokenStream::tryIdentifier(Token& token, const ParseLocation& loc) |
| 152 | { |
| 153 | std::string str; |
| 154 | if (!isAlpha(cin->peek())) return false; |
| 155 | str += (char)cin->get(); |
| 156 | while (isAlphaNum(cin->peek())) str += (char)cin->get(); |
| 157 | token = Token(str,Token::TY_IDENTIFIER,loc); |
| 158 | return true; |
| 159 | } |
| 160 | |
| 161 | void TokenStream::skipSeparators() |
| 162 | { |
| 163 | /* skip separators */ |
| 164 | while (cin->peek() != EOF && isSeparator(cin->peek())) |
| 165 | cin->drop(); |
| 166 | } |
| 167 | |
| 168 | Token TokenStream::next() |
| 169 | { |
| 170 | Token token; |
| 171 | skipSeparators(); |
| 172 | ParseLocation loc = cin->loc(); |
| 173 | if (trySymbols (token,loc)) return token; /**< try to parse a symbol */ |
| 174 | if (tryFloat (token,loc)) return token; /**< try to parse float */ |
| 175 | if (tryInt (token,loc)) return token; /**< try to parse integer */ |
| 176 | if (tryString (token,loc)) return token; /**< try to parse string */ |
| 177 | if (tryIdentifier(token,loc)) return token; /**< try to parse identifier */ |
| 178 | if (cin->peek() == EOF ) return Token(loc); /**< return EOF token */ |
| 179 | return Token((char)cin->get(),loc); /**< return invalid character token */ |
| 180 | } |
| 181 | } |
| 182 | |