scansup.c source code [PostgreSQL/src/backend/parser/scansup.c]

/*-------------------------------------------------------------------------
 *
 * scansup.c
 *	  support routines for the lex/flex scanner, used by both the normal
 *	  backend as well as the bootstrap backend
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/parser/scansup.c
 *
 *-------------------------------------------------------------------------
 */
16	#include "postgres.h"
17
18	#include <ctype.h>
19
20	#include "parser/scansup.h"
21	#include "mb/pg_wchar.h"
22
23
/* ----------------
 *		scanstr
 *
 * If the string passed in has escaped codes, map the escape codes to actual
 * chars.
 *
 * Handled forms:
 *	 ''			  doubled single quote -> one single quote
 *	 \b \f \n \r \t  the corresponding control character
 *	 \ooo		  one to three octal digits -> the byte with that value
 *				  (the value is narrowed to char)
 *	 \x			  any other character x -> x itself
 *
 * A NULL or empty input yields a freshly allocated empty string.
 *
 * The string returned is palloc'd and should eventually be pfree'd by the
 * caller!
 * ----------------
 */

char *
scanstr(const char *s)
{
	char	   *newStr;
	int			len,
				i,
				j;

	if (s == NULL || s[0] == '\0')
		return pstrdup("");

	len = strlen(s);

	newStr = palloc(len + 1);	/* string cannot get longer */

	for (i = 0, j = 0; i < len; i++)
	{
		if (s[i] == '\'')
		{
			/*
			 * Note: if scanner is working right, unescaped quotes can only
			 * appear in pairs, so there should be another character.
			 */
			i++;
			/* The bootstrap parser is not as smart, so check here. */
			Assert(s[i] == '\'');
			newStr[j] = s[i];
		}
		else if (s[i] == '\\')
		{
			/* Step onto the character that follows the backslash. */
			i++;
			switch (s[i])
			{
				case 'b':
					newStr[j] = '\b';
					break;
				case 'f':
					newStr[j] = '\f';
					break;
				case 'n':
					newStr[j] = '\n';
					break;
				case 'r':
					newStr[j] = '\r';
					break;
				case 't':
					newStr[j] = '\t';
					break;
				case '0':
				case '1':
				case '2':
				case '3':
				case '4':
				case '5':
				case '6':
				case '7':
					{
						int			k;
						long		octVal = 0;

						/*
						 * Accumulate at most three octal digits.  The count
						 * is tested first so no byte past the third digit is
						 * examined.
						 */
						for (k = 0;
							 k < 3 && s[i + k] >= '0' && s[i + k] <= '7';
							 k++)
							octVal = (octVal << 3) + (s[i + k] - '0');

						/*
						 * Leave i on the last digit consumed; the loop's own
						 * i++ then moves past it.
						 */
						i += k - 1;
						newStr[j] = ((char) octVal);
					}
					break;
				default:
					/* Unrecognized escape: keep the escaped character. */
					newStr[j] = s[i];
					break;
			}					/* switch */
		}						/* s[i] == '\\' */
		else
			newStr[j] = s[i];
		j++;
	}
	newStr[j] = '\0';
	return newStr;
}
115
116
/*
 * downcase_truncate_identifier() --- do appropriate downcasing and
 * truncation of an unquoted identifier.  Optionally warn of truncation.
 *
 * ident: the identifier bytes to process
 * len:   number of bytes of ident to process
 * warn:  passed through to control whether a truncation notice is issued
 *
 * Returns a palloc'd string containing the adjusted identifier.
 *
 * Note: in some usages the passed string is not null-terminated.
 *
 * Note: the API of this function is designed to allow for downcasing
 * transformations that increase the string length, but we don't yet
 * support that.  If you want to implement it, you'll need to fix
 * SplitIdentifierString() in utils/adt/varlena.c.
 */
char *
downcase_truncate_identifier(const char *ident, int len, bool warn)
{
	/* Same as downcase_identifier() with truncation always enabled. */
	return downcase_identifier(ident, len, warn, true);
}
135
136	/*
137	* a workhorse for downcase_truncate_identifier
138	*/
139	char *
140	downcase_identifier(const char ident, int* len, bool warn, bool truncate)
141	{
142	char *result;
143	int i;
144	bool enc_is_single_byte;
145
146	result = palloc(len + `1`);
147	enc_is_single_byte = pg_database_encoding_max_length() == `1`;
148
149	/*
150	* SQL99 specifies Unicode-aware case normalization, which we don't yet
151	* have the infrastructure for. Instead we use tolower() to provide a
152	* locale-aware translation. However, there are some locales where this
153	* is not right either (eg, Turkish may do strange things with 'i' and
154	* 'I'). Our current compromise is to use tolower() for characters with
155	* the high bit set, as long as they aren't part of a multi-byte
156	* character, and use an ASCII-only downcasing for 7-bit characters.
157	*/
158	for (i = `0`; i < len; i++)
159	{
160	unsigned char ch = (unsigned char) ident[i];
161
162	if (ch >= `'A'` && ch <= `'Z'`)
163	ch += `'a'` - `'A'`;
164	else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
165	ch = tolower(ch);
166	result[i] = (char) ch;
167	}
168	result[i] = `'\0'`;
169
170	if (i >= NAMEDATALEN && truncate)
171	truncate_identifier(result, i, warn);
172
173	return result;
174	}
175
176
177	/*
178	* truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.
179	*
180	* The given string is modified in-place, if necessary. A warning is
181	* issued if requested.
182	*
183	* We require the caller to pass in the string length since this saves a
184	* strlen() call in some common usages.
185	*/
186	void
187	truncate_identifier(char ident, int* len, bool warn)
188	{
189	if (len >= NAMEDATALEN)
190	{
191	len = pg_mbcliplen(ident, len, NAMEDATALEN - `1`);
192	if (warn)
193	{
194	/*
195	* We avoid using %.*s here because it can misbehave if the data
196	* is not valid in what libc thinks is the prevailing encoding.
197	*/
198	char buf[NAMEDATALEN];
199
200	memcpy(buf, ident, len);
201	buf[len] = `'\0'`;
202	ereport(NOTICE,
203	(errcode(ERRCODE_NAME_TOO_LONG),
204	errmsg("identifier \"%s\" will be truncated to \"%s\"",
205	ident, buf)));
206	}
207	ident[len] = `'\0'`;
208	}
209	}
210
/*
 * scanner_isspace() --- return true if flex scanner considers char whitespace
 *
 * This should be used instead of the potentially locale-dependent isspace()
 * function when it's important to match the lexer's behavior.
 *
 * The accepted set is exactly: space, tab, newline, carriage return and
 * form feed.  Note that vertical tab is not included.
 *
 * In principle we might need similar functions for isalnum etc, but for the
 * moment only isspace seems needed.
 */
bool
scanner_isspace(char ch)
{
	/* This must match scan.l's list of {space} characters */
	return (ch == ' ' ||
			ch == '\t' ||
			ch == '\n' ||
			ch == '\r' ||
			ch == '\f');
}
232

Browse the source code of PostgreSQL/src/backend/parser/scansup.c