1// Licensed to the .NET Foundation under one or more agreements.
2// The .NET Foundation licenses this file to you under the MIT license.
3// See the LICENSE file in the project root for more information.
4// ============================================================
5//
6// StringLexer.inl
7//
8
9
10//
11// Implements the inlined methods of StringLexer class
12//
13// ============================================================
14
15#ifndef __BINDER__STRING_LEXER_INL__
16#define __BINDER__STRING_LEXER_INL__
17
18StringLexer::StringLexer()
19{
20 m_wcCurrentChar = INVALID_CHARACTER;
21 m_fCurrentCharIsEscaped = FALSE;
22}
23
24StringLexer::~StringLexer()
25{
26 // Nothing to do here
27}
28
29void StringLexer::Init(SString &inputString, BOOL fSupportEscaping)
30{
31 m_cursor = inputString.Begin();
32 m_end = inputString.End();
33 m_fSupportEscaping = fSupportEscaping;
34 m_fReadRawCharacter = FALSE;
35}
36
37BOOL StringLexer::IsWhitespace(WCHAR wcChar)
38{
39 return ((wcChar == L'\n') || (wcChar == L'\r') || (wcChar == L' ') || (wcChar == L'\t'));
40}
41
42BOOL StringLexer::IsEOS(WCHAR wcChar)
43{
44 return (wcChar == 0);
45}
46
47BOOL StringLexer::IsQuoteCharacter(WCHAR wcChar)
48{
49 return ((wcChar == L'\'') || (wcChar == L'"'));
50}
51
52WCHAR StringLexer::PopCharacter(BOOL *pfIsEscaped)
53{
54 WCHAR wcCurrentChar = m_wcCurrentChar;
55 BINDER_LOG_ENTER(L"StringLexer::PopCharacter");
56
57 if (wcCurrentChar != INVALID_CHARACTER)
58 {
59 BINDER_LOG(L"HAVE wcCurrentChar");
60 m_wcCurrentChar = INVALID_CHARACTER;
61 *pfIsEscaped = m_fCurrentCharIsEscaped;
62 }
63 else
64 {
65 BINDER_LOG(L"GET wcCurrentChar");
66 wcCurrentChar = GetNextCharacter(pfIsEscaped);
67 }
68
69#ifdef BINDER_DEBUG_LOG
70 PathString info;
71
72 info.Printf(L"wcCurrentChar=%p", (void *) wcCurrentChar);
73 BINDER_LOG((WCHAR *) info.GetUnicode());
74#endif
75
76 BINDER_LOG_LEAVE(L"StringLexer::PopCharacter");
77 return wcCurrentChar;
78}
79
80void StringLexer::PushCharacter(WCHAR wcCurrentChar,
81 BOOL fIsEscaped)
82{
83 BINDER_LOG_ENTER(L"StringLexer::PushCharacter");
84
85#ifdef BINDER_DEBUG_LOG
86 PathString info;
87
88 info.Printf(L"wcCurrentChar=%p, fIsEscaped=%d", (void *) wcCurrentChar, fIsEscaped);
89 BINDER_LOG((WCHAR *) info.GetUnicode());
90#endif
91
92 _ASSERTE(m_wcCurrentChar == INVALID_CHARACTER);
93
94 m_wcCurrentChar = wcCurrentChar;
95 m_fCurrentCharIsEscaped = fIsEscaped;
96
97 BINDER_LOG_LEAVE(L"StringLexer::PushCharacter");
98}
99
100WCHAR StringLexer::GetRawCharacter()
101{
102 WCHAR wcCurrentChar = 0;
103
104 if (m_cursor <= m_end)
105 {
106 wcCurrentChar = m_cursor[0];
107 m_fReadRawCharacter = TRUE;
108 m_cursor++;
109 }
110 else
111 {
112 m_fReadRawCharacter = FALSE;
113 }
114
115 return wcCurrentChar;
116}
117
118void StringLexer::PushRawCharacter()
119{
120 if (m_fReadRawCharacter)
121 {
122 m_cursor--;
123 m_fReadRawCharacter = FALSE;
124 }
125}
126
127WCHAR StringLexer::DecodeUTF16Character()
128{
129 // See http://www.ietf.org/rfc/rfc2781.txt for details on UTF-16 encoding.
130
131 WCHAR wcCurrentChar = 0;
132 SIZE_T nCharacters = m_end - m_cursor + 1;
133 WCHAR wcChar1 = GetRawCharacter();
134
135 if (wcChar1 < 0xd800)
136 {
137 wcCurrentChar = wcChar1;
138 }
139 else
140 {
141 // StringLexer is not designed to handle UTF-16 characters beyond the Basic Multilingual Plane,
142 // since it stores all characters in 16-bit WCHARs.
143 // However, since the vast majority of the time, we (Microsoft) produce the manifests,
144 // this is likely a non-scenario, as the other Unicode planes would never be used in practice.
145
146 if (wcChar1 <= 0xdbff) // 0xd800 - 0xdbff indicates the first WCHAR of a surrogate pair
147 {
148 if (nCharacters >= 2)
149 {
150 GetRawCharacter(); // Skip the second WCHAR of the surrogate pair
151 }
152 }
153 // Otherwise, the character is either in the 0xdc00 - 0xdfff range, indicating the second WCHAR of a surrogate pair,
154 // or in the 0xE000 - 0xFFFF range, which has within it ranges of invalid characters, and which we conservatively treat
155 // as invalid.
156
157 wcCurrentChar = INVALID_CHARACTER;
158 }
159
160 return wcCurrentChar;
161}
162
163
164WCHAR StringLexer::GetNextCharacter(BOOL *pfIsEscaped)
165{
166 *pfIsEscaped = FALSE;
167
168 WCHAR wcCurrentChar = GetRawCharacter(); // DecodeUTF16Character()
169 if (wcCurrentChar == L'\\')
170 {
171 WCHAR wcTempChar = GetRawCharacter(); // DecodeUTF16Character()
172
173 if (m_fSupportEscaping)
174 {
175 // Handle standard escapes
176 switch (wcTempChar)
177 {
178 case L'"':
179 case L'\'':
180 case L',':
181 case L'\\':
182 case L'/':
183 case L'=':
184 break;
185 case L't':
186 wcTempChar = 9;
187 break;
188 case L'n':
189 wcTempChar = 10;
190 break;
191 case L'r':
192 wcTempChar = 13;
193 break;
194 case L'u':
195 wcTempChar = ParseUnicode();
196 break;
197 default:
198 return INVALID_CHARACTER;
199 }
200
201 *pfIsEscaped = TRUE;
202 wcCurrentChar = wcTempChar;
203 }
204 else
205 {
206 // Do not handle escapes except for quotes
207 switch (wcTempChar)
208 {
209 case L'"':
210 case L'\'':
211 *pfIsEscaped = TRUE;
212 wcCurrentChar = wcTempChar;
213 break;
214 default:
215 PushRawCharacter();
216 break;
217 }
218 }
219 }
220
221 return wcCurrentChar;
222}
223
224WCHAR StringLexer::ParseUnicode()
225{
226 int nCharacters = 0;
227 WCHAR wcUnicodeChar = 0;
228
229 for(;;)
230 {
231 WCHAR wcCurrentChar = DecodeUTF16Character();
232 nCharacters++;
233
234 if (wcCurrentChar == L';')
235 {
236 break;
237 }
238 else if ((wcCurrentChar == INVALID_CHARACTER) || (nCharacters >= 9))
239 {
240 return INVALID_CHARACTER;
241 }
242
243 wcUnicodeChar <<= 4;
244
245 if ((wcCurrentChar >= L'0') && (wcCurrentChar <= L'9'))
246 {
247 wcUnicodeChar += (wcCurrentChar - L'0');
248 }
249 else if ((wcCurrentChar >= L'a') && (wcCurrentChar <= L'f'))
250 {
251 wcUnicodeChar += (wcCurrentChar - L'a') + 10;
252 }
253 else if ((wcCurrentChar >= L'A') && (wcCurrentChar <= L'F'))
254 {
255 wcUnicodeChar += (wcCurrentChar - L'A') + 10;
256 }
257 else
258 {
259 return INVALID_CHARACTER;
260 }
261 }
262
263 return wcUnicodeChar;
264}
265
266#endif
267