1/*-------------------------------------------------------------------------
2 *
3 * ts_locale.c
4 * locale compatibility layer for tsearch
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/tsearch/ts_locale.c
11 *
12 *-------------------------------------------------------------------------
13 */
14#include "postgres.h"
15
16#include "catalog/pg_collation.h"
17#include "storage/fd.h"
18#include "tsearch/ts_locale.h"
19#include "tsearch/ts_public.h"
20
/* Error context callback installed by tsearch_readline_begin() */
static void tsearch_readline_callback(void *arg);
22
23
/*
 * The t_isXXX() functions below convert one character into a 3-wchar_t
 * output buffer, not 2 as you might expect.  The reason is that on Windows
 * "wchar_t" is 16 bits and what we'll be getting from char2wchar() is UTF16
 * not UTF32.  A single input character may therefore produce a surrogate
 * pair rather than just one wchar_t; we also need room for a trailing null.
 * When we do get a surrogate pair, we pass just the first code to iswdigit()
 * etc, so that these functions will always return false for characters
 * outside the Basic Multilingual Plane.
 */
#define WC_BUF_LEN 3
34
35int
36t_isdigit(const char *ptr)
37{
38 int clen = pg_mblen(ptr);
39 wchar_t character[WC_BUF_LEN];
40 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
41 pg_locale_t mylocale = 0; /* TODO */
42
43 if (clen == 1 || lc_ctype_is_c(collation))
44 return isdigit(TOUCHAR(ptr));
45
46 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
47
48 return iswdigit((wint_t) character[0]);
49}
50
51int
52t_isspace(const char *ptr)
53{
54 int clen = pg_mblen(ptr);
55 wchar_t character[WC_BUF_LEN];
56 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
57 pg_locale_t mylocale = 0; /* TODO */
58
59 if (clen == 1 || lc_ctype_is_c(collation))
60 return isspace(TOUCHAR(ptr));
61
62 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
63
64 return iswspace((wint_t) character[0]);
65}
66
67int
68t_isalpha(const char *ptr)
69{
70 int clen = pg_mblen(ptr);
71 wchar_t character[WC_BUF_LEN];
72 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
73 pg_locale_t mylocale = 0; /* TODO */
74
75 if (clen == 1 || lc_ctype_is_c(collation))
76 return isalpha(TOUCHAR(ptr));
77
78 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
79
80 return iswalpha((wint_t) character[0]);
81}
82
83int
84t_isprint(const char *ptr)
85{
86 int clen = pg_mblen(ptr);
87 wchar_t character[WC_BUF_LEN];
88 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
89 pg_locale_t mylocale = 0; /* TODO */
90
91 if (clen == 1 || lc_ctype_is_c(collation))
92 return isprint(TOUCHAR(ptr));
93
94 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
95
96 return iswprint((wint_t) character[0]);
97}
98
99
100/*
101 * Set up to read a file using tsearch_readline(). This facility is
102 * better than just reading the file directly because it provides error
103 * context pointing to the specific line where a problem is detected.
104 *
105 * Expected usage is:
106 *
107 * tsearch_readline_state trst;
108 *
109 * if (!tsearch_readline_begin(&trst, filename))
110 * ereport(ERROR,
111 * (errcode(ERRCODE_CONFIG_FILE_ERROR),
112 * errmsg("could not open stop-word file \"%s\": %m",
113 * filename)));
114 * while ((line = tsearch_readline(&trst)) != NULL)
115 * process line;
116 * tsearch_readline_end(&trst);
117 *
118 * Note that the caller supplies the ereport() for file open failure;
119 * this is so that a custom message can be provided. The filename string
120 * passed to tsearch_readline_begin() must remain valid through
121 * tsearch_readline_end().
122 */
123bool
124tsearch_readline_begin(tsearch_readline_state *stp,
125 const char *filename)
126{
127 if ((stp->fp = AllocateFile(filename, "r")) == NULL)
128 return false;
129 stp->filename = filename;
130 stp->lineno = 0;
131 stp->curline = NULL;
132 /* Setup error traceback support for ereport() */
133 stp->cb.callback = tsearch_readline_callback;
134 stp->cb.arg = (void *) stp;
135 stp->cb.previous = error_context_stack;
136 error_context_stack = &stp->cb;
137 return true;
138}
139
140/*
141 * Read the next line from a tsearch data file (expected to be in UTF-8), and
142 * convert it to database encoding if needed. The returned string is palloc'd.
143 * NULL return means EOF.
144 */
145char *
146tsearch_readline(tsearch_readline_state *stp)
147{
148 char *result;
149
150 stp->lineno++;
151 stp->curline = NULL;
152 result = t_readline(stp->fp);
153 stp->curline = result;
154 return result;
155}
156
157/*
158 * Close down after reading a file with tsearch_readline()
159 */
160void
161tsearch_readline_end(tsearch_readline_state *stp)
162{
163 FreeFile(stp->fp);
164 /* Pop the error context stack */
165 error_context_stack = stp->cb.previous;
166}
167
168/*
169 * Error context callback for errors occurring while reading a tsearch
170 * configuration file.
171 */
172static void
173tsearch_readline_callback(void *arg)
174{
175 tsearch_readline_state *stp = (tsearch_readline_state *) arg;
176
177 /*
178 * We can't include the text of the config line for errors that occur
179 * during t_readline() itself. This is only partly a consequence of our
180 * arms-length use of that routine: the major cause of such errors is
181 * encoding violations, and we daren't try to print error messages
182 * containing badly-encoded data.
183 */
184 if (stp->curline)
185 errcontext("line %d of configuration file \"%s\": \"%s\"",
186 stp->lineno,
187 stp->filename,
188 stp->curline);
189 else
190 errcontext("line %d of configuration file \"%s\"",
191 stp->lineno,
192 stp->filename);
193}
194
195
196/*
197 * Read the next line from a tsearch data file (expected to be in UTF-8), and
198 * convert it to database encoding if needed. The returned string is palloc'd.
199 * NULL return means EOF.
200 *
201 * Note: direct use of this function is now deprecated. Go through
202 * tsearch_readline() to provide better error reporting.
203 */
204char *
205t_readline(FILE *fp)
206{
207 int len;
208 char *recoded;
209 char buf[4096]; /* lines must not be longer than this */
210
211 if (fgets(buf, sizeof(buf), fp) == NULL)
212 return NULL;
213
214 len = strlen(buf);
215
216 /* Make sure the input is valid UTF-8 */
217 (void) pg_verify_mbstr(PG_UTF8, buf, len, false);
218
219 /* And convert */
220 recoded = pg_any_to_server(buf, len, PG_UTF8);
221 if (recoded == buf)
222 {
223 /*
224 * conversion didn't pstrdup, so we must. We can use the length of the
225 * original string, because no conversion was done.
226 */
227 recoded = pnstrdup(recoded, len);
228 }
229
230 return recoded;
231}
232
/*
 * lowerstr --- fold null-terminated string to lower case
 *
 * Measures the string with strlen() and hands it to lowerstr_with_len().
 *
 * Returned string is palloc'd
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
243
244/*
245 * lowerstr_with_len --- fold string to lower case
246 *
247 * Input string need not be null-terminated.
248 *
249 * Returned string is palloc'd
250 */
251char *
252lowerstr_with_len(const char *str, int len)
253{
254 char *out;
255 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
256 pg_locale_t mylocale = 0; /* TODO */
257
258 if (len == 0)
259 return pstrdup("");
260
261 /*
262 * Use wide char code only when max encoding length > 1 and ctype != C.
263 * Some operating systems fail with multi-byte encodings and a C locale.
264 * Also, for a C locale there is no need to process as multibyte. From
265 * backend/utils/adt/oracle_compat.c Teodor
266 */
267 if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation))
268 {
269 wchar_t *wstr,
270 *wptr;
271 int wlen;
272
273 /*
274 * alloc number of wchar_t for worst case, len contains number of
275 * bytes >= number of characters and alloc 1 wchar_t for 0, because
276 * wchar2char wants zero-terminated string
277 */
278 wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
279
280 wlen = char2wchar(wstr, len + 1, str, len, mylocale);
281 Assert(wlen <= len);
282
283 while (*wptr)
284 {
285 *wptr = towlower((wint_t) *wptr);
286 wptr++;
287 }
288
289 /*
290 * Alloc result string for worst case + '\0'
291 */
292 len = pg_database_encoding_max_length() * wlen + 1;
293 out = (char *) palloc(len);
294
295 wlen = wchar2char(out, wstr, len, mylocale);
296
297 pfree(wstr);
298
299 if (wlen < 0)
300 ereport(ERROR,
301 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
302 errmsg("conversion from wchar_t to server encoding failed: %m")));
303 Assert(wlen < len);
304 }
305 else
306 {
307 const char *ptr = str;
308 char *outptr;
309
310 outptr = out = (char *) palloc(sizeof(char) * (len + 1));
311 while ((ptr - str) < len && *ptr)
312 {
313 *outptr++ = tolower(TOUCHAR(ptr));
314 ptr++;
315 }
316 *outptr = '\0';
317 }
318
319 return out;
320}
321