ts_locale.c source code [PostgreSQL/src/backend/tsearch/ts_locale.c]

/*-------------------------------------------------------------------------
 *
 * ts_locale.c
 *		locale compatibility layer for tsearch
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/backend/tsearch/ts_locale.c
 *
 *-------------------------------------------------------------------------
 */
14	#include "postgres.h"
15
16	#include "catalog/pg_collation.h"
17	#include "storage/fd.h"
18	#include "tsearch/ts_locale.h"
19	#include "tsearch/ts_public.h"
20
21	static void tsearch_readline_callback(void *arg);
22
23
24	/*
25	* The reason these functions use a 3-wchar_t output buffer, not 2 as you
26	* might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
27	* getting from char2wchar() is UTF16 not UTF32. A single input character
28	* may therefore produce a surrogate pair rather than just one wchar_t;
29	* we also need room for a trailing null. When we do get a surrogate pair,
30	* we pass just the first code to iswdigit() etc, so that these functions will
31	* always return false for characters outside the Basic Multilingual Plane.
32	*/
33	#define WC_BUF_LEN 3
34
35	int
36	t_isdigit(const char *ptr)
37	{
38	int clen = pg_mblen(ptr);
39	wchar_t character[WC_BUF_LEN];
40	Oid collation = DEFAULT_COLLATION_OID; / TODO /
41	pg_locale_t mylocale = `0`; / TODO /
42
43	if (clen == `1` \|\| lc_ctype_is_c(collation))
44	return isdigit(TOUCHAR(ptr));
45
46	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
47
48	return iswdigit((wint_t) character[`0`]);
49	}
50
51	int
52	t_isspace(const char *ptr)
53	{
54	int clen = pg_mblen(ptr);
55	wchar_t character[WC_BUF_LEN];
56	Oid collation = DEFAULT_COLLATION_OID; / TODO /
57	pg_locale_t mylocale = `0`; / TODO /
58
59	if (clen == `1` \|\| lc_ctype_is_c(collation))
60	return isspace(TOUCHAR(ptr));
61
62	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
63
64	return iswspace((wint_t) character[`0`]);
65	}
66
67	int
68	t_isalpha(const char *ptr)
69	{
70	int clen = pg_mblen(ptr);
71	wchar_t character[WC_BUF_LEN];
72	Oid collation = DEFAULT_COLLATION_OID; / TODO /
73	pg_locale_t mylocale = `0`; / TODO /
74
75	if (clen == `1` \|\| lc_ctype_is_c(collation))
76	return isalpha(TOUCHAR(ptr));
77
78	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
79
80	return iswalpha((wint_t) character[`0`]);
81	}
82
83	int
84	t_isprint(const char *ptr)
85	{
86	int clen = pg_mblen(ptr);
87	wchar_t character[WC_BUF_LEN];
88	Oid collation = DEFAULT_COLLATION_OID; / TODO /
89	pg_locale_t mylocale = `0`; / TODO /
90
91	if (clen == `1` \|\| lc_ctype_is_c(collation))
92	return isprint(TOUCHAR(ptr));
93
94	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
95
96	return iswprint((wint_t) character[`0`]);
97	}
98
99
100	/*
101	* Set up to read a file using tsearch_readline(). This facility is
102	* better than just reading the file directly because it provides error
103	* context pointing to the specific line where a problem is detected.
104	*
105	* Expected usage is:
106	*
107	* tsearch_readline_state trst;
108	*
109	* if (!tsearch_readline_begin(&trst, filename))
110	* ereport(ERROR,
111	* (errcode(ERRCODE_CONFIG_FILE_ERROR),
112	* errmsg("could not open stop-word file \"%s\": %m",
113	* filename)));
114	* while ((line = tsearch_readline(&trst)) != NULL)
115	* process line;
116	* tsearch_readline_end(&trst);
117	*
118	* Note that the caller supplies the ereport() for file open failure;
119	* this is so that a custom message can be provided. The filename string
120	* passed to tsearch_readline_begin() must remain valid through
121	* tsearch_readline_end().
122	*/
123	bool
124	tsearch_readline_begin(tsearch_readline_state *stp,
125	const char *filename)
126	{
127	if ((stp->fp = AllocateFile(filename, "r")) == NULL)
128	return false;
129	stp->filename = filename;
130	stp->lineno = `0`;
131	stp->curline = NULL;
132	/ Setup error traceback support for ereport() /
133	stp->cb.callback = tsearch_readline_callback;
134	stp->cb.arg = (void *) stp;
135	stp->cb.previous = error_context_stack;
136	error_context_stack = &stp->cb;
137	return true;
138	}
139
140	/*
141	* Read the next line from a tsearch data file (expected to be in UTF-8), and
142	* convert it to database encoding if needed. The returned string is palloc'd.
143	* NULL return means EOF.
144	*/
145	char *
146	tsearch_readline(tsearch_readline_state *stp)
147	{
148	char *result;
149
150	stp->lineno++;
151	stp->curline = NULL;
152	result = t_readline(stp->fp);
153	stp->curline = result;
154	return result;
155	}
156
157	/*
158	* Close down after reading a file with tsearch_readline()
159	*/
160	void
161	tsearch_readline_end(tsearch_readline_state *stp)
162	{
163	FreeFile(stp->fp);
164	/ Pop the error context stack /
165	error_context_stack = stp->cb.previous;
166	}
167
168	/*
169	* Error context callback for errors occurring while reading a tsearch
170	* configuration file.
171	*/
172	static void
173	tsearch_readline_callback(void *arg)
174	{
175	tsearch_readline_state stp = (tsearch_readline_state ) arg;
176
177	/*
178	* We can't include the text of the config line for errors that occur
179	* during t_readline() itself. This is only partly a consequence of our
180	* arms-length use of that routine: the major cause of such errors is
181	* encoding violations, and we daren't try to print error messages
182	* containing badly-encoded data.
183	*/
184	if (stp->curline)
185	errcontext("line %d of configuration file \"%s\": \"%s\"",
186	stp->lineno,
187	stp->filename,
188	stp->curline);
189	else
190	errcontext("line %d of configuration file \"%s\"",
191	stp->lineno,
192	stp->filename);
193	}
194
195
196	/*
197	* Read the next line from a tsearch data file (expected to be in UTF-8), and
198	* convert it to database encoding if needed. The returned string is palloc'd.
199	* NULL return means EOF.
200	*
201	* Note: direct use of this function is now deprecated. Go through
202	* tsearch_readline() to provide better error reporting.
203	*/
204	char *
205	t_readline(FILE *fp)
206	{
207	int len;
208	char *recoded;
209	char buf[`4096`]; / lines must not be longer than this /
210
211	if (fgets(buf, sizeof(buf), fp) == NULL)
212	return NULL;
213
214	len = strlen(buf);
215
216	/ Make sure the input is valid UTF-8 /
217	(void) pg_verify_mbstr(PG_UTF8, buf, len, false);
218
219	/ And convert /
220	recoded = pg_any_to_server(buf, len, PG_UTF8);
221	if (recoded == buf)
222	{
223	/*
224	* conversion didn't pstrdup, so we must. We can use the length of the
225	* original string, because no conversion was done.
226	*/
227	recoded = pnstrdup(recoded, len);
228	}
229
230	return recoded;
231	}
232
/*
 * lowerstr --- produce a lower-cased copy of a null-terminated string
 *
 * Convenience wrapper that measures the string and hands it to
 * lowerstr_with_len().  The result is palloc'd.
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
243
244	/*
245	* lowerstr_with_len --- fold string to lower case
246	*
247	* Input string need not be null-terminated.
248	*
249	* Returned string is palloc'd
250	*/
251	char *
252	lowerstr_with_len(const char str, int* len)
253	{
254	char *out;
255	Oid collation = DEFAULT_COLLATION_OID; / TODO /
256	pg_locale_t mylocale = `0`; / TODO /
257
258	if (len == `0`)
259	return pstrdup("");
260
261	/*
262	* Use wide char code only when max encoding length > 1 and ctype != C.
263	* Some operating systems fail with multi-byte encodings and a C locale.
264	* Also, for a C locale there is no need to process as multibyte. From
265	* backend/utils/adt/oracle_compat.c Teodor
266	*/
267	if (pg_database_encoding_max_length() > `1` && !lc_ctype_is_c(collation))
268	{
269	wchar_t *wstr,
270	*wptr;
271	int wlen;
272
273	/*
274	* alloc number of wchar_t for worst case, len contains number of
275	* bytes >= number of characters and alloc 1 wchar_t for 0, because
276	* wchar2char wants zero-terminated string
277	*/
278	wptr = wstr = (wchar_t ) palloc(sizeof(wchar_t) (len + `1`));
279
280	wlen = char2wchar(wstr, len + `1`, str, len, mylocale);
281	Assert(wlen <= len);
282
283	while (*wptr)
284	{
285	wptr = towlower((wint_t) wptr);
286	wptr++;
287	}
288
289	/*
290	* Alloc result string for worst case + '\0'
291	*/
292	len = pg_database_encoding_max_length() * wlen + `1`;
293	out = (char *) palloc(len);
294
295	wlen = wchar2char(out, wstr, len, mylocale);
296
297	pfree(wstr);
298
299	if (wlen < `0`)
300	ereport(ERROR,
301	(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
302	errmsg("conversion from wchar_t to server encoding failed: %m")));
303	Assert(wlen < len);
304	}
305	else
306	{
307	const char *ptr = str;
308	char *outptr;
309
310	outptr = out = (char ) palloc(sizeof(char) (len + `1`));
311	while ((ptr - str) < len && *ptr)
312	{
313	*outptr++ = tolower(TOUCHAR(ptr));
314	ptr++;
315	}
316	*outptr = `'\0'`;
317	}
318
319	return out;
320	}
321

Browse the source code of PostgreSQL/src/backend/tsearch/ts_locale.c