1 | // Licensed to the .NET Foundation under one or more agreements. |
2 | // The .NET Foundation licenses this file to you under the MIT license. |
3 | // See the LICENSE file in the project root for more information. |
4 | // ============================================================ |
5 | // |
6 | // StringLexer.inl |
7 | // |
8 | |
9 | |
10 | // |
11 | // Implements the inlined methods of StringLexer class |
12 | // |
13 | // ============================================================ |
14 | |
15 | #ifndef __BINDER__STRING_LEXER_INL__ |
16 | #define __BINDER__STRING_LEXER_INL__ |
17 | |
18 | StringLexer::StringLexer() |
19 | { |
20 | m_wcCurrentChar = INVALID_CHARACTER; |
21 | m_fCurrentCharIsEscaped = FALSE; |
22 | } |
23 | |
24 | StringLexer::~StringLexer() |
25 | { |
26 | // Nothing to do here |
27 | } |
28 | |
29 | void StringLexer::Init(SString &inputString, BOOL fSupportEscaping) |
30 | { |
31 | m_cursor = inputString.Begin(); |
32 | m_end = inputString.End(); |
33 | m_fSupportEscaping = fSupportEscaping; |
34 | m_fReadRawCharacter = FALSE; |
35 | } |
36 | |
37 | BOOL StringLexer::IsWhitespace(WCHAR wcChar) |
38 | { |
39 | return ((wcChar == L'\n') || (wcChar == L'\r') || (wcChar == L' ') || (wcChar == L'\t')); |
40 | } |
41 | |
42 | BOOL StringLexer::IsEOS(WCHAR wcChar) |
43 | { |
44 | return (wcChar == 0); |
45 | } |
46 | |
47 | BOOL StringLexer::IsQuoteCharacter(WCHAR wcChar) |
48 | { |
49 | return ((wcChar == L'\'') || (wcChar == L'"')); |
50 | } |
51 | |
52 | WCHAR StringLexer::PopCharacter(BOOL *pfIsEscaped) |
53 | { |
54 | WCHAR wcCurrentChar = m_wcCurrentChar; |
55 | BINDER_LOG_ENTER(L"StringLexer::PopCharacter" ); |
56 | |
57 | if (wcCurrentChar != INVALID_CHARACTER) |
58 | { |
59 | BINDER_LOG(L"HAVE wcCurrentChar" ); |
60 | m_wcCurrentChar = INVALID_CHARACTER; |
61 | *pfIsEscaped = m_fCurrentCharIsEscaped; |
62 | } |
63 | else |
64 | { |
65 | BINDER_LOG(L"GET wcCurrentChar" ); |
66 | wcCurrentChar = GetNextCharacter(pfIsEscaped); |
67 | } |
68 | |
69 | #ifdef BINDER_DEBUG_LOG |
70 | PathString info; |
71 | |
72 | info.Printf(L"wcCurrentChar=%p" , (void *) wcCurrentChar); |
73 | BINDER_LOG((WCHAR *) info.GetUnicode()); |
74 | #endif |
75 | |
76 | BINDER_LOG_LEAVE(L"StringLexer::PopCharacter" ); |
77 | return wcCurrentChar; |
78 | } |
79 | |
80 | void StringLexer::PushCharacter(WCHAR wcCurrentChar, |
81 | BOOL fIsEscaped) |
82 | { |
83 | BINDER_LOG_ENTER(L"StringLexer::PushCharacter" ); |
84 | |
85 | #ifdef BINDER_DEBUG_LOG |
86 | PathString info; |
87 | |
88 | info.Printf(L"wcCurrentChar=%p, fIsEscaped=%d" , (void *) wcCurrentChar, fIsEscaped); |
89 | BINDER_LOG((WCHAR *) info.GetUnicode()); |
90 | #endif |
91 | |
92 | _ASSERTE(m_wcCurrentChar == INVALID_CHARACTER); |
93 | |
94 | m_wcCurrentChar = wcCurrentChar; |
95 | m_fCurrentCharIsEscaped = fIsEscaped; |
96 | |
97 | BINDER_LOG_LEAVE(L"StringLexer::PushCharacter" ); |
98 | } |
99 | |
100 | WCHAR StringLexer::GetRawCharacter() |
101 | { |
102 | WCHAR wcCurrentChar = 0; |
103 | |
104 | if (m_cursor <= m_end) |
105 | { |
106 | wcCurrentChar = m_cursor[0]; |
107 | m_fReadRawCharacter = TRUE; |
108 | m_cursor++; |
109 | } |
110 | else |
111 | { |
112 | m_fReadRawCharacter = FALSE; |
113 | } |
114 | |
115 | return wcCurrentChar; |
116 | } |
117 | |
118 | void StringLexer::PushRawCharacter() |
119 | { |
120 | if (m_fReadRawCharacter) |
121 | { |
122 | m_cursor--; |
123 | m_fReadRawCharacter = FALSE; |
124 | } |
125 | } |
126 | |
127 | WCHAR StringLexer::DecodeUTF16Character() |
128 | { |
129 | // See http://www.ietf.org/rfc/rfc2781.txt for details on UTF-16 encoding. |
130 | |
131 | WCHAR wcCurrentChar = 0; |
132 | SIZE_T nCharacters = m_end - m_cursor + 1; |
133 | WCHAR wcChar1 = GetRawCharacter(); |
134 | |
135 | if (wcChar1 < 0xd800) |
136 | { |
137 | wcCurrentChar = wcChar1; |
138 | } |
139 | else |
140 | { |
141 | // StringLexer is not designed to handle UTF-16 characters beyond the Basic Multilingual Plane, |
142 | // since it stores all characters in 16-bit WCHARs. |
143 | // However, since the vast majority of the time, we (Microsoft) produce the manifests, |
144 | // this is likely a non-scenario, as the other Unicode planes would never be used in practice. |
145 | |
146 | if (wcChar1 <= 0xdbff) // 0xd800 - 0xdbff indicates the first WCHAR of a surrogate pair |
147 | { |
148 | if (nCharacters >= 2) |
149 | { |
150 | GetRawCharacter(); // Skip the second WCHAR of the surrogate pair |
151 | } |
152 | } |
153 | // Otherwise, the character is either in the 0xdc00 - 0xdfff range, indicating the second WCHAR of a surrogate pair, |
154 | // or in the 0xE000 - 0xFFFF range, which has within it ranges of invalid characters, and which we conservatively treat |
155 | // as invalid. |
156 | |
157 | wcCurrentChar = INVALID_CHARACTER; |
158 | } |
159 | |
160 | return wcCurrentChar; |
161 | } |
162 | |
163 | |
164 | WCHAR StringLexer::GetNextCharacter(BOOL *pfIsEscaped) |
165 | { |
166 | *pfIsEscaped = FALSE; |
167 | |
168 | WCHAR wcCurrentChar = GetRawCharacter(); // DecodeUTF16Character() |
169 | if (wcCurrentChar == L'\\') |
170 | { |
171 | WCHAR wcTempChar = GetRawCharacter(); // DecodeUTF16Character() |
172 | |
173 | if (m_fSupportEscaping) |
174 | { |
175 | // Handle standard escapes |
176 | switch (wcTempChar) |
177 | { |
178 | case L'"': |
179 | case L'\'': |
180 | case L',': |
181 | case L'\\': |
182 | case L'/': |
183 | case L'=': |
184 | break; |
185 | case L't': |
186 | wcTempChar = 9; |
187 | break; |
188 | case L'n': |
189 | wcTempChar = 10; |
190 | break; |
191 | case L'r': |
192 | wcTempChar = 13; |
193 | break; |
194 | case L'u': |
195 | wcTempChar = ParseUnicode(); |
196 | break; |
197 | default: |
198 | return INVALID_CHARACTER; |
199 | } |
200 | |
201 | *pfIsEscaped = TRUE; |
202 | wcCurrentChar = wcTempChar; |
203 | } |
204 | else |
205 | { |
206 | // Do not handle escapes except for quotes |
207 | switch (wcTempChar) |
208 | { |
209 | case L'"': |
210 | case L'\'': |
211 | *pfIsEscaped = TRUE; |
212 | wcCurrentChar = wcTempChar; |
213 | break; |
214 | default: |
215 | PushRawCharacter(); |
216 | break; |
217 | } |
218 | } |
219 | } |
220 | |
221 | return wcCurrentChar; |
222 | } |
223 | |
224 | WCHAR StringLexer::ParseUnicode() |
225 | { |
226 | int nCharacters = 0; |
227 | WCHAR wcUnicodeChar = 0; |
228 | |
229 | for(;;) |
230 | { |
231 | WCHAR wcCurrentChar = DecodeUTF16Character(); |
232 | nCharacters++; |
233 | |
234 | if (wcCurrentChar == L';') |
235 | { |
236 | break; |
237 | } |
238 | else if ((wcCurrentChar == INVALID_CHARACTER) || (nCharacters >= 9)) |
239 | { |
240 | return INVALID_CHARACTER; |
241 | } |
242 | |
243 | wcUnicodeChar <<= 4; |
244 | |
245 | if ((wcCurrentChar >= L'0') && (wcCurrentChar <= L'9')) |
246 | { |
247 | wcUnicodeChar += (wcCurrentChar - L'0'); |
248 | } |
249 | else if ((wcCurrentChar >= L'a') && (wcCurrentChar <= L'f')) |
250 | { |
251 | wcUnicodeChar += (wcCurrentChar - L'a') + 10; |
252 | } |
253 | else if ((wcCurrentChar >= L'A') && (wcCurrentChar <= L'F')) |
254 | { |
255 | wcUnicodeChar += (wcCurrentChar - L'A') + 10; |
256 | } |
257 | else |
258 | { |
259 | return INVALID_CHARACTER; |
260 | } |
261 | } |
262 | |
263 | return wcUnicodeChar; |
264 | } |
265 | |
266 | #endif |
267 | |