1/*-------------------------------------------------------------------------
2 *
3 * scansup.c
 *	  support routines for the lex/flex scanner, used by both the normal
 *	  backend and the bootstrap backend
6 *
7 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 *
11 * IDENTIFICATION
12 * src/backend/parser/scansup.c
13 *
14 *-------------------------------------------------------------------------
15 */
16#include "postgres.h"
17
18#include <ctype.h>
19
20#include "parser/scansup.h"
21#include "mb/pg_wchar.h"
22
23
/* ----------------
 *		scanstr
 *
 * Build a de-escaped copy of the given string: a doubled single quote
 * collapses to one quote, and a backslash sequence (\b, \f, \n, \r, \t,
 * an octal number of one to three digits, or backslash followed by any
 * other character) is replaced by the single character it denotes.
 *
 * A NULL or empty input yields a freshly allocated empty string.
 *
 * The result is always palloc'd; the caller should eventually pfree it.
 * ----------------
 */
char *
scanstr(const char *s)
{
	char	   *out;
	int			srclen;
	int			rd;
	int			wr = 0;

	if (s == NULL || *s == '\0')
		return pstrdup("");

	srclen = strlen(s);

	/* De-escaping never lengthens the string, so this is always enough. */
	out = palloc(srclen + 1);

	for (rd = 0; rd < srclen; rd++)
	{
		char		c = s[rd];

		if (c == '\'')
		{
			/*
			 * A correctly working scanner only lets unescaped quotes
			 * through in pairs, so a second quote must follow.  The
			 * bootstrap parser is less careful, hence the assertion.
			 */
			rd++;
			Assert(s[rd] == '\'');
			c = s[rd];
		}
		else if (c == '\\')
		{
			char		esc;

			rd++;
			esc = s[rd];

			switch (esc)
			{
				case 'b':
					c = '\b';
					break;
				case 'f':
					c = '\f';
					break;
				case 'n':
					c = '\n';
					break;
				case 'r':
					c = '\r';
					break;
				case 't':
					c = '\t';
					break;
				default:
					if (esc >= '0' && esc <= '7')
					{
						/* Accumulate up to three octal digits. */
						long		value = 0;
						int			ndigits = 0;

						while (ndigits < 3 &&
							   s[rd + ndigits] >= '0' &&
							   s[rd + ndigits] <= '7')
						{
							value = value * 8 + (s[rd + ndigits] - '0');
							ndigits++;
						}

						/* Leave rd on the last digit consumed. */
						rd += ndigits - 1;
						c = (char) value;
					}
					else
					{
						/* Any other escaped character stands for itself. */
						c = esc;
					}
					break;
			}
		}

		out[wr++] = c;
	}

	out[wr] = '\0';
	return out;
}
115
116
/*
 * downcase_truncate_identifier() --- downcase an unquoted identifier and
 * truncate it if necessary, optionally warning about the truncation.
 *
 * This simply calls downcase_identifier() with truncation enabled and
 * returns its result, a palloc'd string holding the adjusted identifier.
 *
 * Note: in some usages the passed string is not null-terminated, which is
 * why the length is passed explicitly.
 *
 * Note: the API of this function is designed to allow for downcasing
 * transformations that increase the string length, but we don't yet
 * support that.  If you want to implement it, you'll need to fix
 * SplitIdentifierString() in utils/adt/varlena.c.
 */
char *
downcase_truncate_identifier(const char *ident, int len, bool warn)
{
	const bool	do_truncate = true;

	return downcase_identifier(ident, len, warn, do_truncate);
}
135
136/*
137 * a workhorse for downcase_truncate_identifier
138 */
139char *
140downcase_identifier(const char *ident, int len, bool warn, bool truncate)
141{
142 char *result;
143 int i;
144 bool enc_is_single_byte;
145
146 result = palloc(len + 1);
147 enc_is_single_byte = pg_database_encoding_max_length() == 1;
148
149 /*
150 * SQL99 specifies Unicode-aware case normalization, which we don't yet
151 * have the infrastructure for. Instead we use tolower() to provide a
152 * locale-aware translation. However, there are some locales where this
153 * is not right either (eg, Turkish may do strange things with 'i' and
154 * 'I'). Our current compromise is to use tolower() for characters with
155 * the high bit set, as long as they aren't part of a multi-byte
156 * character, and use an ASCII-only downcasing for 7-bit characters.
157 */
158 for (i = 0; i < len; i++)
159 {
160 unsigned char ch = (unsigned char) ident[i];
161
162 if (ch >= 'A' && ch <= 'Z')
163 ch += 'a' - 'A';
164 else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
165 ch = tolower(ch);
166 result[i] = (char) ch;
167 }
168 result[i] = '\0';
169
170 if (i >= NAMEDATALEN && truncate)
171 truncate_identifier(result, i, warn);
172
173 return result;
174}
175
176
177/*
178 * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.
179 *
180 * The given string is modified in-place, if necessary. A warning is
181 * issued if requested.
182 *
183 * We require the caller to pass in the string length since this saves a
184 * strlen() call in some common usages.
185 */
186void
187truncate_identifier(char *ident, int len, bool warn)
188{
189 if (len >= NAMEDATALEN)
190 {
191 len = pg_mbcliplen(ident, len, NAMEDATALEN - 1);
192 if (warn)
193 {
194 /*
195 * We avoid using %.*s here because it can misbehave if the data
196 * is not valid in what libc thinks is the prevailing encoding.
197 */
198 char buf[NAMEDATALEN];
199
200 memcpy(buf, ident, len);
201 buf[len] = '\0';
202 ereport(NOTICE,
203 (errcode(ERRCODE_NAME_TOO_LONG),
204 errmsg("identifier \"%s\" will be truncated to \"%s\"",
205 ident, buf)));
206 }
207 ident[len] = '\0';
208 }
209}
210
/*
 * scanner_isspace() --- return true if flex scanner considers char whitespace
 *
 * Use this in place of isspace(), whose answer may depend on the locale,
 * whenever it is important to agree with the lexer.
 *
 * Similar replacements for isalnum and friends might be needed in
 * principle, but so far only isspace has been required.
 */
bool
scanner_isspace(char ch)
{
	/* This must match scan.l's list of {space} characters */
	switch (ch)
	{
		case ' ':
		case '\t':
		case '\n':
		case '\r':
		case '\f':
			return true;
		default:
			return false;
	}
}
232