| 1 | // Licensed to the .NET Foundation under one or more agreements. |
| 2 | // The .NET Foundation licenses this file to you under the MIT license. |
| 3 | // See the LICENSE file in the project root for more information. |
| 4 | // ============================================================ |
| 5 | // |
| 6 | // StringLexer.inl |
| 7 | // |
| 8 | |
| 9 | |
| 10 | // |
| 11 | // Implements the inlined methods of StringLexer class |
| 12 | // |
| 13 | // ============================================================ |
| 14 | |
| 15 | #ifndef __BINDER__STRING_LEXER_INL__ |
| 16 | #define __BINDER__STRING_LEXER_INL__ |
| 17 | |
| 18 | StringLexer::StringLexer() |
| 19 | { |
| 20 | m_wcCurrentChar = INVALID_CHARACTER; |
| 21 | m_fCurrentCharIsEscaped = FALSE; |
| 22 | } |
| 23 | |
| 24 | StringLexer::~StringLexer() |
| 25 | { |
| 26 | // Nothing to do here |
| 27 | } |
| 28 | |
| 29 | void StringLexer::Init(SString &inputString, BOOL fSupportEscaping) |
| 30 | { |
| 31 | m_cursor = inputString.Begin(); |
| 32 | m_end = inputString.End(); |
| 33 | m_fSupportEscaping = fSupportEscaping; |
| 34 | m_fReadRawCharacter = FALSE; |
| 35 | } |
| 36 | |
| 37 | BOOL StringLexer::IsWhitespace(WCHAR wcChar) |
| 38 | { |
| 39 | return ((wcChar == L'\n') || (wcChar == L'\r') || (wcChar == L' ') || (wcChar == L'\t')); |
| 40 | } |
| 41 | |
| 42 | BOOL StringLexer::IsEOS(WCHAR wcChar) |
| 43 | { |
| 44 | return (wcChar == 0); |
| 45 | } |
| 46 | |
| 47 | BOOL StringLexer::IsQuoteCharacter(WCHAR wcChar) |
| 48 | { |
| 49 | return ((wcChar == L'\'') || (wcChar == L'"')); |
| 50 | } |
| 51 | |
| 52 | WCHAR StringLexer::PopCharacter(BOOL *pfIsEscaped) |
| 53 | { |
| 54 | WCHAR wcCurrentChar = m_wcCurrentChar; |
| 55 | BINDER_LOG_ENTER(L"StringLexer::PopCharacter" ); |
| 56 | |
| 57 | if (wcCurrentChar != INVALID_CHARACTER) |
| 58 | { |
| 59 | BINDER_LOG(L"HAVE wcCurrentChar" ); |
| 60 | m_wcCurrentChar = INVALID_CHARACTER; |
| 61 | *pfIsEscaped = m_fCurrentCharIsEscaped; |
| 62 | } |
| 63 | else |
| 64 | { |
| 65 | BINDER_LOG(L"GET wcCurrentChar" ); |
| 66 | wcCurrentChar = GetNextCharacter(pfIsEscaped); |
| 67 | } |
| 68 | |
| 69 | #ifdef BINDER_DEBUG_LOG |
| 70 | PathString info; |
| 71 | |
| 72 | info.Printf(L"wcCurrentChar=%p" , (void *) wcCurrentChar); |
| 73 | BINDER_LOG((WCHAR *) info.GetUnicode()); |
| 74 | #endif |
| 75 | |
| 76 | BINDER_LOG_LEAVE(L"StringLexer::PopCharacter" ); |
| 77 | return wcCurrentChar; |
| 78 | } |
| 79 | |
| 80 | void StringLexer::PushCharacter(WCHAR wcCurrentChar, |
| 81 | BOOL fIsEscaped) |
| 82 | { |
| 83 | BINDER_LOG_ENTER(L"StringLexer::PushCharacter" ); |
| 84 | |
| 85 | #ifdef BINDER_DEBUG_LOG |
| 86 | PathString info; |
| 87 | |
| 88 | info.Printf(L"wcCurrentChar=%p, fIsEscaped=%d" , (void *) wcCurrentChar, fIsEscaped); |
| 89 | BINDER_LOG((WCHAR *) info.GetUnicode()); |
| 90 | #endif |
| 91 | |
| 92 | _ASSERTE(m_wcCurrentChar == INVALID_CHARACTER); |
| 93 | |
| 94 | m_wcCurrentChar = wcCurrentChar; |
| 95 | m_fCurrentCharIsEscaped = fIsEscaped; |
| 96 | |
| 97 | BINDER_LOG_LEAVE(L"StringLexer::PushCharacter" ); |
| 98 | } |
| 99 | |
| 100 | WCHAR StringLexer::GetRawCharacter() |
| 101 | { |
| 102 | WCHAR wcCurrentChar = 0; |
| 103 | |
| 104 | if (m_cursor <= m_end) |
| 105 | { |
| 106 | wcCurrentChar = m_cursor[0]; |
| 107 | m_fReadRawCharacter = TRUE; |
| 108 | m_cursor++; |
| 109 | } |
| 110 | else |
| 111 | { |
| 112 | m_fReadRawCharacter = FALSE; |
| 113 | } |
| 114 | |
| 115 | return wcCurrentChar; |
| 116 | } |
| 117 | |
| 118 | void StringLexer::PushRawCharacter() |
| 119 | { |
| 120 | if (m_fReadRawCharacter) |
| 121 | { |
| 122 | m_cursor--; |
| 123 | m_fReadRawCharacter = FALSE; |
| 124 | } |
| 125 | } |
| 126 | |
| 127 | WCHAR StringLexer::DecodeUTF16Character() |
| 128 | { |
| 129 | // See http://www.ietf.org/rfc/rfc2781.txt for details on UTF-16 encoding. |
| 130 | |
| 131 | WCHAR wcCurrentChar = 0; |
| 132 | SIZE_T nCharacters = m_end - m_cursor + 1; |
| 133 | WCHAR wcChar1 = GetRawCharacter(); |
| 134 | |
| 135 | if (wcChar1 < 0xd800) |
| 136 | { |
| 137 | wcCurrentChar = wcChar1; |
| 138 | } |
| 139 | else |
| 140 | { |
| 141 | // StringLexer is not designed to handle UTF-16 characters beyond the Basic Multilingual Plane, |
| 142 | // since it stores all characters in 16-bit WCHARs. |
| 143 | // However, since the vast majority of the time, we (Microsoft) produce the manifests, |
| 144 | // this is likely a non-scenario, as the other Unicode planes would never be used in practice. |
| 145 | |
| 146 | if (wcChar1 <= 0xdbff) // 0xd800 - 0xdbff indicates the first WCHAR of a surrogate pair |
| 147 | { |
| 148 | if (nCharacters >= 2) |
| 149 | { |
| 150 | GetRawCharacter(); // Skip the second WCHAR of the surrogate pair |
| 151 | } |
| 152 | } |
| 153 | // Otherwise, the character is either in the 0xdc00 - 0xdfff range, indicating the second WCHAR of a surrogate pair, |
| 154 | // or in the 0xE000 - 0xFFFF range, which has within it ranges of invalid characters, and which we conservatively treat |
| 155 | // as invalid. |
| 156 | |
| 157 | wcCurrentChar = INVALID_CHARACTER; |
| 158 | } |
| 159 | |
| 160 | return wcCurrentChar; |
| 161 | } |
| 162 | |
| 163 | |
| 164 | WCHAR StringLexer::GetNextCharacter(BOOL *pfIsEscaped) |
| 165 | { |
| 166 | *pfIsEscaped = FALSE; |
| 167 | |
| 168 | WCHAR wcCurrentChar = GetRawCharacter(); // DecodeUTF16Character() |
| 169 | if (wcCurrentChar == L'\\') |
| 170 | { |
| 171 | WCHAR wcTempChar = GetRawCharacter(); // DecodeUTF16Character() |
| 172 | |
| 173 | if (m_fSupportEscaping) |
| 174 | { |
| 175 | // Handle standard escapes |
| 176 | switch (wcTempChar) |
| 177 | { |
| 178 | case L'"': |
| 179 | case L'\'': |
| 180 | case L',': |
| 181 | case L'\\': |
| 182 | case L'/': |
| 183 | case L'=': |
| 184 | break; |
| 185 | case L't': |
| 186 | wcTempChar = 9; |
| 187 | break; |
| 188 | case L'n': |
| 189 | wcTempChar = 10; |
| 190 | break; |
| 191 | case L'r': |
| 192 | wcTempChar = 13; |
| 193 | break; |
| 194 | case L'u': |
| 195 | wcTempChar = ParseUnicode(); |
| 196 | break; |
| 197 | default: |
| 198 | return INVALID_CHARACTER; |
| 199 | } |
| 200 | |
| 201 | *pfIsEscaped = TRUE; |
| 202 | wcCurrentChar = wcTempChar; |
| 203 | } |
| 204 | else |
| 205 | { |
| 206 | // Do not handle escapes except for quotes |
| 207 | switch (wcTempChar) |
| 208 | { |
| 209 | case L'"': |
| 210 | case L'\'': |
| 211 | *pfIsEscaped = TRUE; |
| 212 | wcCurrentChar = wcTempChar; |
| 213 | break; |
| 214 | default: |
| 215 | PushRawCharacter(); |
| 216 | break; |
| 217 | } |
| 218 | } |
| 219 | } |
| 220 | |
| 221 | return wcCurrentChar; |
| 222 | } |
| 223 | |
| 224 | WCHAR StringLexer::ParseUnicode() |
| 225 | { |
| 226 | int nCharacters = 0; |
| 227 | WCHAR wcUnicodeChar = 0; |
| 228 | |
| 229 | for(;;) |
| 230 | { |
| 231 | WCHAR wcCurrentChar = DecodeUTF16Character(); |
| 232 | nCharacters++; |
| 233 | |
| 234 | if (wcCurrentChar == L';') |
| 235 | { |
| 236 | break; |
| 237 | } |
| 238 | else if ((wcCurrentChar == INVALID_CHARACTER) || (nCharacters >= 9)) |
| 239 | { |
| 240 | return INVALID_CHARACTER; |
| 241 | } |
| 242 | |
| 243 | wcUnicodeChar <<= 4; |
| 244 | |
| 245 | if ((wcCurrentChar >= L'0') && (wcCurrentChar <= L'9')) |
| 246 | { |
| 247 | wcUnicodeChar += (wcCurrentChar - L'0'); |
| 248 | } |
| 249 | else if ((wcCurrentChar >= L'a') && (wcCurrentChar <= L'f')) |
| 250 | { |
| 251 | wcUnicodeChar += (wcCurrentChar - L'a') + 10; |
| 252 | } |
| 253 | else if ((wcCurrentChar >= L'A') && (wcCurrentChar <= L'F')) |
| 254 | { |
| 255 | wcUnicodeChar += (wcCurrentChar - L'A') + 10; |
| 256 | } |
| 257 | else |
| 258 | { |
| 259 | return INVALID_CHARACTER; |
| 260 | } |
| 261 | } |
| 262 | |
| 263 | return wcUnicodeChar; |
| 264 | } |
| 265 | |
| 266 | #endif |
| 267 | |