1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * ts_locale.c |
4 | * locale compatibility layer for tsearch |
5 | * |
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
7 | * |
8 | * |
9 | * IDENTIFICATION |
10 | * src/backend/tsearch/ts_locale.c |
11 | * |
12 | *------------------------------------------------------------------------- |
13 | */ |
14 | #include "postgres.h" |
15 | |
16 | #include "catalog/pg_collation.h" |
17 | #include "storage/fd.h" |
18 | #include "tsearch/ts_locale.h" |
19 | #include "tsearch/ts_public.h" |
20 | |
21 | static void tsearch_readline_callback(void *arg); |
22 | |
23 | |
24 | /* |
25 | * The reason these functions use a 3-wchar_t output buffer, not 2 as you |
26 | * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be |
27 | * getting from char2wchar() is UTF16 not UTF32. A single input character |
28 | * may therefore produce a surrogate pair rather than just one wchar_t; |
29 | * we also need room for a trailing null. When we do get a surrogate pair, |
30 | * we pass just the first code to iswdigit() etc, so that these functions will |
31 | * always return false for characters outside the Basic Multilingual Plane. |
32 | */ |
33 | #define WC_BUF_LEN 3 |
34 | |
35 | int |
36 | t_isdigit(const char *ptr) |
37 | { |
38 | int clen = pg_mblen(ptr); |
39 | wchar_t character[WC_BUF_LEN]; |
40 | Oid collation = DEFAULT_COLLATION_OID; /* TODO */ |
41 | pg_locale_t mylocale = 0; /* TODO */ |
42 | |
43 | if (clen == 1 || lc_ctype_is_c(collation)) |
44 | return isdigit(TOUCHAR(ptr)); |
45 | |
46 | char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); |
47 | |
48 | return iswdigit((wint_t) character[0]); |
49 | } |
50 | |
51 | int |
52 | t_isspace(const char *ptr) |
53 | { |
54 | int clen = pg_mblen(ptr); |
55 | wchar_t character[WC_BUF_LEN]; |
56 | Oid collation = DEFAULT_COLLATION_OID; /* TODO */ |
57 | pg_locale_t mylocale = 0; /* TODO */ |
58 | |
59 | if (clen == 1 || lc_ctype_is_c(collation)) |
60 | return isspace(TOUCHAR(ptr)); |
61 | |
62 | char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); |
63 | |
64 | return iswspace((wint_t) character[0]); |
65 | } |
66 | |
67 | int |
68 | t_isalpha(const char *ptr) |
69 | { |
70 | int clen = pg_mblen(ptr); |
71 | wchar_t character[WC_BUF_LEN]; |
72 | Oid collation = DEFAULT_COLLATION_OID; /* TODO */ |
73 | pg_locale_t mylocale = 0; /* TODO */ |
74 | |
75 | if (clen == 1 || lc_ctype_is_c(collation)) |
76 | return isalpha(TOUCHAR(ptr)); |
77 | |
78 | char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); |
79 | |
80 | return iswalpha((wint_t) character[0]); |
81 | } |
82 | |
83 | int |
84 | t_isprint(const char *ptr) |
85 | { |
86 | int clen = pg_mblen(ptr); |
87 | wchar_t character[WC_BUF_LEN]; |
88 | Oid collation = DEFAULT_COLLATION_OID; /* TODO */ |
89 | pg_locale_t mylocale = 0; /* TODO */ |
90 | |
91 | if (clen == 1 || lc_ctype_is_c(collation)) |
92 | return isprint(TOUCHAR(ptr)); |
93 | |
94 | char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); |
95 | |
96 | return iswprint((wint_t) character[0]); |
97 | } |
98 | |
99 | |
100 | /* |
101 | * Set up to read a file using tsearch_readline(). This facility is |
102 | * better than just reading the file directly because it provides error |
103 | * context pointing to the specific line where a problem is detected. |
104 | * |
105 | * Expected usage is: |
106 | * |
107 | * tsearch_readline_state trst; |
108 | * |
109 | * if (!tsearch_readline_begin(&trst, filename)) |
110 | * ereport(ERROR, |
111 | * (errcode(ERRCODE_CONFIG_FILE_ERROR), |
112 | * errmsg("could not open stop-word file \"%s\": %m", |
113 | * filename))); |
114 | * while ((line = tsearch_readline(&trst)) != NULL) |
115 | * process line; |
116 | * tsearch_readline_end(&trst); |
117 | * |
118 | * Note that the caller supplies the ereport() for file open failure; |
119 | * this is so that a custom message can be provided. The filename string |
120 | * passed to tsearch_readline_begin() must remain valid through |
121 | * tsearch_readline_end(). |
122 | */ |
123 | bool |
124 | tsearch_readline_begin(tsearch_readline_state *stp, |
125 | const char *filename) |
126 | { |
127 | if ((stp->fp = AllocateFile(filename, "r" )) == NULL) |
128 | return false; |
129 | stp->filename = filename; |
130 | stp->lineno = 0; |
131 | stp->curline = NULL; |
132 | /* Setup error traceback support for ereport() */ |
133 | stp->cb.callback = tsearch_readline_callback; |
134 | stp->cb.arg = (void *) stp; |
135 | stp->cb.previous = error_context_stack; |
136 | error_context_stack = &stp->cb; |
137 | return true; |
138 | } |
139 | |
140 | /* |
141 | * Read the next line from a tsearch data file (expected to be in UTF-8), and |
142 | * convert it to database encoding if needed. The returned string is palloc'd. |
143 | * NULL return means EOF. |
144 | */ |
145 | char * |
146 | tsearch_readline(tsearch_readline_state *stp) |
147 | { |
148 | char *result; |
149 | |
150 | stp->lineno++; |
151 | stp->curline = NULL; |
152 | result = t_readline(stp->fp); |
153 | stp->curline = result; |
154 | return result; |
155 | } |
156 | |
157 | /* |
158 | * Close down after reading a file with tsearch_readline() |
159 | */ |
160 | void |
161 | tsearch_readline_end(tsearch_readline_state *stp) |
162 | { |
163 | FreeFile(stp->fp); |
164 | /* Pop the error context stack */ |
165 | error_context_stack = stp->cb.previous; |
166 | } |
167 | |
168 | /* |
169 | * Error context callback for errors occurring while reading a tsearch |
170 | * configuration file. |
171 | */ |
172 | static void |
173 | tsearch_readline_callback(void *arg) |
174 | { |
175 | tsearch_readline_state *stp = (tsearch_readline_state *) arg; |
176 | |
177 | /* |
178 | * We can't include the text of the config line for errors that occur |
179 | * during t_readline() itself. This is only partly a consequence of our |
180 | * arms-length use of that routine: the major cause of such errors is |
181 | * encoding violations, and we daren't try to print error messages |
182 | * containing badly-encoded data. |
183 | */ |
184 | if (stp->curline) |
185 | errcontext("line %d of configuration file \"%s\": \"%s\"" , |
186 | stp->lineno, |
187 | stp->filename, |
188 | stp->curline); |
189 | else |
190 | errcontext("line %d of configuration file \"%s\"" , |
191 | stp->lineno, |
192 | stp->filename); |
193 | } |
194 | |
195 | |
196 | /* |
197 | * Read the next line from a tsearch data file (expected to be in UTF-8), and |
198 | * convert it to database encoding if needed. The returned string is palloc'd. |
199 | * NULL return means EOF. |
200 | * |
201 | * Note: direct use of this function is now deprecated. Go through |
202 | * tsearch_readline() to provide better error reporting. |
203 | */ |
204 | char * |
205 | t_readline(FILE *fp) |
206 | { |
207 | int len; |
208 | char *recoded; |
209 | char buf[4096]; /* lines must not be longer than this */ |
210 | |
211 | if (fgets(buf, sizeof(buf), fp) == NULL) |
212 | return NULL; |
213 | |
214 | len = strlen(buf); |
215 | |
216 | /* Make sure the input is valid UTF-8 */ |
217 | (void) pg_verify_mbstr(PG_UTF8, buf, len, false); |
218 | |
219 | /* And convert */ |
220 | recoded = pg_any_to_server(buf, len, PG_UTF8); |
221 | if (recoded == buf) |
222 | { |
223 | /* |
224 | * conversion didn't pstrdup, so we must. We can use the length of the |
225 | * original string, because no conversion was done. |
226 | */ |
227 | recoded = pnstrdup(recoded, len); |
228 | } |
229 | |
230 | return recoded; |
231 | } |
232 | |
/*
 * lowerstr --- lower-case a null-terminated string
 *
 * Convenience wrapper: measures the string with strlen() and hands it to
 * lowerstr_with_len().  The result is palloc'd.
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
243 | |
244 | /* |
245 | * lowerstr_with_len --- fold string to lower case |
246 | * |
247 | * Input string need not be null-terminated. |
248 | * |
249 | * Returned string is palloc'd |
250 | */ |
251 | char * |
252 | lowerstr_with_len(const char *str, int len) |
253 | { |
254 | char *out; |
255 | Oid collation = DEFAULT_COLLATION_OID; /* TODO */ |
256 | pg_locale_t mylocale = 0; /* TODO */ |
257 | |
258 | if (len == 0) |
259 | return pstrdup("" ); |
260 | |
261 | /* |
262 | * Use wide char code only when max encoding length > 1 and ctype != C. |
263 | * Some operating systems fail with multi-byte encodings and a C locale. |
264 | * Also, for a C locale there is no need to process as multibyte. From |
265 | * backend/utils/adt/oracle_compat.c Teodor |
266 | */ |
267 | if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation)) |
268 | { |
269 | wchar_t *wstr, |
270 | *wptr; |
271 | int wlen; |
272 | |
273 | /* |
274 | * alloc number of wchar_t for worst case, len contains number of |
275 | * bytes >= number of characters and alloc 1 wchar_t for 0, because |
276 | * wchar2char wants zero-terminated string |
277 | */ |
278 | wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1)); |
279 | |
280 | wlen = char2wchar(wstr, len + 1, str, len, mylocale); |
281 | Assert(wlen <= len); |
282 | |
283 | while (*wptr) |
284 | { |
285 | *wptr = towlower((wint_t) *wptr); |
286 | wptr++; |
287 | } |
288 | |
289 | /* |
290 | * Alloc result string for worst case + '\0' |
291 | */ |
292 | len = pg_database_encoding_max_length() * wlen + 1; |
293 | out = (char *) palloc(len); |
294 | |
295 | wlen = wchar2char(out, wstr, len, mylocale); |
296 | |
297 | pfree(wstr); |
298 | |
299 | if (wlen < 0) |
300 | ereport(ERROR, |
301 | (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), |
302 | errmsg("conversion from wchar_t to server encoding failed: %m" ))); |
303 | Assert(wlen < len); |
304 | } |
305 | else |
306 | { |
307 | const char *ptr = str; |
308 | char *outptr; |
309 | |
310 | outptr = out = (char *) palloc(sizeof(char) * (len + 1)); |
311 | while ((ptr - str) < len && *ptr) |
312 | { |
313 | *outptr++ = tolower(TOUCHAR(ptr)); |
314 | ptr++; |
315 | } |
316 | *outptr = '\0'; |
317 | } |
318 | |
319 | return out; |
320 | } |
321 | |