stringlexer.inl source code [CoreCLR/binder/inc/stringlexer.inl]

1	// Licensed to the .NET Foundation under one or more agreements.
2	// The .NET Foundation licenses this file to you under the MIT license.
3	// See the LICENSE file in the project root for more information.
4	// ============================================================
5	//
6	// StringLexer.inl
7	//
8
9
10	//
11	// Implements the inlined methods of StringLexer class
12	//
13	// ============================================================
14
15	#ifndef __BINDER__STRING_LEXER_INL__
16	#define __BINDER__STRING_LEXER_INL__
17
18	StringLexer::StringLexer()
19	{
20	m_wcCurrentChar = INVALID_CHARACTER;
21	m_fCurrentCharIsEscaped = FALSE;
22	}
23
24	StringLexer::~StringLexer()
25	{
26	// Nothing to do here
27	}
28
29	void StringLexer::Init(SString &inputString, BOOL fSupportEscaping)
30	{
31	m_cursor = inputString.Begin();
32	m_end = inputString.End();
33	m_fSupportEscaping = fSupportEscaping;
34	m_fReadRawCharacter = FALSE;
35	}
36
37	BOOL StringLexer::IsWhitespace(WCHAR wcChar)
38	{
39	return ((wcChar == L`'\n'`) \|\| (wcChar == L`'\r'`) \|\| (wcChar == L`' '`) \|\| (wcChar == L`'\t'`));
40	}
41
42	BOOL StringLexer::IsEOS(WCHAR wcChar)
43	{
44	return (wcChar == `0`);
45	}
46
47	BOOL StringLexer::IsQuoteCharacter(WCHAR wcChar)
48	{
49	return ((wcChar == L`'\''`) \|\| (wcChar == L`'"'`));
50	}
51
52	WCHAR StringLexer::PopCharacter(BOOL *pfIsEscaped)
53	{
54	WCHAR wcCurrentChar = m_wcCurrentChar;
55	BINDER_LOG_ENTER(L"StringLexer::PopCharacter");
56
57	if (wcCurrentChar != INVALID_CHARACTER)
58	{
59	BINDER_LOG(L"HAVE wcCurrentChar");
60	m_wcCurrentChar = INVALID_CHARACTER;
61	*pfIsEscaped = m_fCurrentCharIsEscaped;
62	}
63	else
64	{
65	BINDER_LOG(L"GET wcCurrentChar");
66	wcCurrentChar = GetNextCharacter(pfIsEscaped);
67	}
68
69	#ifdef BINDER_DEBUG_LOG
70	PathString info;
71
72	info.Printf(L"wcCurrentChar=%p", (void *) wcCurrentChar);
73	BINDER_LOG((WCHAR *) info.GetUnicode());
74	#endif
75
76	BINDER_LOG_LEAVE(L"StringLexer::PopCharacter");
77	return wcCurrentChar;
78	}
79
80	void StringLexer::PushCharacter(WCHAR wcCurrentChar,
81	BOOL fIsEscaped)
82	{
83	BINDER_LOG_ENTER(L"StringLexer::PushCharacter");
84
85	#ifdef BINDER_DEBUG_LOG
86	PathString info;
87
88	info.Printf(L"wcCurrentChar=%p, fIsEscaped=%d", (void *) wcCurrentChar, fIsEscaped);
89	BINDER_LOG((WCHAR *) info.GetUnicode());
90	#endif
91
92	_ASSERTE(m_wcCurrentChar == INVALID_CHARACTER);
93
94	m_wcCurrentChar = wcCurrentChar;
95	m_fCurrentCharIsEscaped = fIsEscaped;
96
97	BINDER_LOG_LEAVE(L"StringLexer::PushCharacter");
98	}
99
100	WCHAR StringLexer::GetRawCharacter()
101	{
102	WCHAR wcCurrentChar = `0`;
103
104	if (m_cursor <= m_end)
105	{
106	wcCurrentChar = m_cursor [`0`];
107	m_fReadRawCharacter = TRUE;
108	m_cursor ++;
109	}
110	else
111	{
112	m_fReadRawCharacter = FALSE;
113	}
114
115	return wcCurrentChar;
116	}
117
118	void StringLexer::PushRawCharacter()
119	{
120	if (m_fReadRawCharacter)
121	{
122	m_cursor --;
123	m_fReadRawCharacter = FALSE;
124	}
125	}
126
127	WCHAR StringLexer::DecodeUTF16Character()
128	{
129	// See http://www.ietf.org/rfc/rfc2781.txt for details on UTF-16 encoding.
130
131	WCHAR wcCurrentChar = `0`;
132	SIZE_T nCharacters = m_end - m_cursor + `1`;
133	WCHAR wcChar1 = GetRawCharacter();
134
135	if (wcChar1 < `0xd800`)
136	{
137	wcCurrentChar = wcChar1;
138	}
139	else
140	{
141	// StringLexer is not designed to handle UTF-16 characters beyond the Basic Multilingual Plane,
142	// since it stores all characters in 16-bit WCHARs.
143	// However, since the vast majority of the time, we (Microsoft) produce the manifests,
144	// this is likely a non-scenario, as the other Unicode planes would never be used in practice.
145
146	if (wcChar1 <= `0xdbff`) // 0xd800 - 0xdbff indicates the first WCHAR of a surrogate pair
147	{
148	if (nCharacters >= `2`)
149	{
150	GetRawCharacter(); // Skip the second WCHAR of the surrogate pair
151	}
152	}
153	// Otherwise, the character is either in the 0xdc00 - 0xdfff range, indicating the second WCHAR of a surrogate pair,
154	// or in the 0xE000 - 0xFFFF range, which has within it ranges of invalid characters, and which we conservatively treat
155	// as invalid.
156
157	wcCurrentChar = INVALID_CHARACTER;
158	}
159
160	return wcCurrentChar;
161	}
162
163
164	WCHAR StringLexer::GetNextCharacter(BOOL *pfIsEscaped)
165	{
166	*pfIsEscaped = FALSE;
167
168	WCHAR wcCurrentChar = GetRawCharacter(); // DecodeUTF16Character()
169	if (wcCurrentChar == L`'\\'`)
170	{
171	WCHAR wcTempChar = GetRawCharacter(); // DecodeUTF16Character()
172
173	if (m_fSupportEscaping)
174	{
175	// Handle standard escapes
176	switch (wcTempChar)
177	{
178	case L`'"'`:
179	case L`'\''`:
180	case L`','`:
181	case L`'\\'`:
182	case L`'/'`:
183	case L`'='`:
184	break;
185	case L`'t'`:
186	wcTempChar = `9`;
187	break;
188	case L`'n'`:
189	wcTempChar = `10`;
190	break;
191	case L`'r'`:
192	wcTempChar = `13`;
193	break;
194	case L`'u'`:
195	wcTempChar = ParseUnicode();
196	break;
197	default:
198	return INVALID_CHARACTER;
199	}
200
201	*pfIsEscaped = TRUE;
202	wcCurrentChar = wcTempChar;
203	}
204	else
205	{
206	// Do not handle escapes except for quotes
207	switch (wcTempChar)
208	{
209	case L`'"'`:
210	case L`'\''`:
211	*pfIsEscaped = TRUE;
212	wcCurrentChar = wcTempChar;
213	break;
214	default:
215	PushRawCharacter();
216	break;
217	}
218	}
219	}
220
221	return wcCurrentChar;
222	}
223
224	WCHAR StringLexer::ParseUnicode()
225	{
226	int nCharacters = `0`;
227	WCHAR wcUnicodeChar = `0`;
228
229	for(;;)
230	{
231	WCHAR wcCurrentChar = DecodeUTF16Character();
232	nCharacters++;
233
234	if (wcCurrentChar == L`';'`)
235	{
236	break;
237	}
238	else if ((wcCurrentChar == INVALID_CHARACTER) \|\| (nCharacters >= `9`))
239	{
240	return INVALID_CHARACTER;
241	}
242
243	wcUnicodeChar <<= `4`;
244
245	if ((wcCurrentChar >= L`'0'`) && (wcCurrentChar <= L`'9'`))
246	{
247	wcUnicodeChar += (wcCurrentChar - L`'0'`);
248	}
249	else if ((wcCurrentChar >= L`'a'`) && (wcCurrentChar <= L`'f'`))
250	{
251	wcUnicodeChar += (wcCurrentChar - L`'a'`) + `10`;
252	}
253	else if ((wcCurrentChar >= L`'A'`) && (wcCurrentChar <= L`'F'`))
254	{
255	wcUnicodeChar += (wcCurrentChar - L`'A'`) + `10`;
256	}
257	else
258	{
259	return INVALID_CHARACTER;
260	}
261	}
262
263	return wcUnicodeChar;
264	}
265
266	#endif
267

Browse the source code of CoreCLR/binder/inc/stringlexer.inl