unicode_norm.c source code [PostgreSQL/src/common/unicode_norm.c]

1	/-------------------------------------------------------------------------*
2	* unicode_norm.c
3	* Normalize a Unicode string to NFKC form
4	*
5	* This implements Unicode normalization, per the documentation at
6	* http://www.unicode.org/reports/tr15/.
7	*
8	* Portions Copyright (c) 2017-2019, PostgreSQL Global Development Group
9	*
10	* IDENTIFICATION
11	* src/common/unicode_norm.c
12	*
13	*-------------------------------------------------------------------------
14	*/
15	#ifndef FRONTEND
16	#include "postgres.h"
17	#else
18	#include "postgres_fe.h"
19	#endif
20
21	#include "common/unicode_norm.h"
22	#include "common/unicode_norm_table.h"
23
24	#ifndef FRONTEND
25	#define ALLOC(size) palloc(size)
26	#define FREE(size) pfree(size)
27	#else
28	#define ALLOC(size) malloc(size)
29	#define FREE(size) free(size)
30	#endif
31
32	/ Constants for calculations with Hangul characters /
33	#define SBASE 0xAC00 /* U+AC00 */
34	#define LBASE 0x1100 /* U+1100 */
35	#define VBASE 0x1161 /* U+1161 */
36	#define TBASE 0x11A7 /* U+11A7 */
37	#define LCOUNT 19
38	#define VCOUNT 21
39	#define TCOUNT 28
40	#define NCOUNT VCOUNT * TCOUNT
41	#define SCOUNT LCOUNT * NCOUNT
42
43	/ comparison routine for bsearch() of decomposition lookup table. /
44	static int
45	conv_compare(const void p1, const* void *p2)
46	{
47	uint32 v1,
48	v2;
49
50	v1 = (const* uint32 *) p1;
51	v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
52	return (v1 > v2) ? `1` : ((v1 == v2) ? `0` : -`1`);
53	}
54
55	/*
56	* Get the entry corresponding to code in the decomposition lookup table.
57	*/
58	static pg_unicode_decomposition *
59	get_code_entry(pg_wchar code)
60	{
61	return bsearch(&(code),
62	UnicodeDecompMain,
63	lengthof(UnicodeDecompMain),
64	sizeof(pg_unicode_decomposition),
65	conv_compare);
66	}
67
68	/*
69	* Given a decomposition entry looked up earlier, get the decomposed
70	* characters.
71	*
72	* Note: the returned pointer can point to statically allocated buffer, and
73	* is only valid until next call to this function!
74	*/
75	static const pg_wchar *
76	get_code_decomposition(pg_unicode_decomposition entry, int* *dec_size)
77	{
78	static pg_wchar x;
79
80	if (DECOMPOSITION_IS_INLINE(entry))
81	{
82	Assert(DECOMPOSITION_SIZE(entry) == `1`);
83	x = (pg_wchar) entry->dec_index;
84	*dec_size = `1`;
85	return &x;
86	}
87	else
88	{
89	*dec_size = DECOMPOSITION_SIZE(entry);
90	return &UnicodeDecomp_codepoints[entry->dec_index];
91	}
92	}
93
94	/*
95	* Calculate how many characters a given character will decompose to.
96	*
97	* This needs to recurse, if the character decomposes into characters that
98	* are, in turn, decomposable.
99	*/
100	static int
101	get_decomposed_size(pg_wchar code)
102	{
103	pg_unicode_decomposition *entry;
104	int size = `0`;
105	int i;
106	const uint32 *decomp;
107	int dec_size;
108
109	/*
110	* Fast path for Hangul characters not stored in tables to save memory as
111	* decomposition is algorithmic. See
112	* http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
113	* the matter.
114	*/
115	if (code >= SBASE && code < SBASE + SCOUNT)
116	{
117	uint32 tindex,
118	sindex;
119
120	sindex = code - SBASE;
121	tindex = sindex % TCOUNT;
122
123	if (tindex != `0`)
124	return `3`;
125	return `2`;
126	}
127
128	entry = get_code_entry(code);
129
130	/*
131	* Just count current code if no other decompositions. A NULL entry is
132	* equivalent to a character with class 0 and no decompositions.
133	*/
134	if (entry == NULL \|\| DECOMPOSITION_SIZE(entry) == `0`)
135	return `1`;
136
137	/*
138	* If this entry has other decomposition codes look at them as well. First
139	* get its decomposition in the list of tables available.
140	*/
141	decomp = get_code_decomposition(entry, &dec_size);
142	for (i = `0`; i < dec_size; i++)
143	{
144	uint32 lcode = decomp[i];
145
146	size += get_decomposed_size(lcode);
147	}
148
149	return size;
150	}
151
152	/*
153	* Recompose a set of characters. For hangul characters, the calculation
154	* is algorithmic. For others, an inverse lookup at the decomposition
155	* table is necessary. Returns true if a recomposition can be done, and
156	* false otherwise.
157	*/
158	static bool
159	recompose_code(uint32 start, uint32 code, uint32 *result)
160	{
161	/*
162	* Handle Hangul characters algorithmically, per the Unicode spec.
163	*
164	* Check if two current characters are L and V.
165	*/
166	if (start >= LBASE && start < LBASE + LCOUNT &&
167	code >= VBASE && code < VBASE + VCOUNT)
168	{
169	/ make syllable of form LV /
170	uint32 lindex = start - LBASE;
171	uint32 vindex = code - VBASE;
172
173	result = SBASE + (lindex VCOUNT + vindex) * TCOUNT;
174	return true;
175	}
176	/ Check if two current characters are LV and T /
177	else if (start >= SBASE && start < (SBASE + SCOUNT) &&
178	((start - SBASE) % TCOUNT) == `0` &&
179	code >= TBASE && code < (TBASE + TCOUNT))
180	{
181	/ make syllable of from LVT /
182	uint32 tindex = code - TBASE;
183
184	*result = start + tindex;
185	return true;
186	}
187	else
188	{
189	int i;
190
191	/*
192	* Do an inverse lookup of the decomposition tables to see if anything
193	* matches. The comparison just needs to be a perfect match on the
194	* sub-table of size two, because the start character has already been
195	* recomposed partially.
196	*/
197	for (i = `0`; i < lengthof(UnicodeDecompMain); i++)
198	{
199	const pg_unicode_decomposition *entry = &UnicodeDecompMain[i];
200
201	if (DECOMPOSITION_SIZE(entry) != `2`)
202	continue;
203
204	if (DECOMPOSITION_NO_COMPOSE(entry))
205	continue;
206
207	if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
208	code == UnicodeDecomp_codepoints[entry->dec_index + `1`])
209	{
210	*result = entry->codepoint;
211	return true;
212	}
213	}
214	}
215
216	return false;
217	}
218
219	/*
220	* Decompose the given code into the array given by caller. The
221	* decomposition begins at the position given by caller, saving one
222	* lookup on the decomposition table. The current position needs to be
223	* updated here to let the caller know from where to continue filling
224	* in the array result.
225	*/
226	static void
227	decompose_code(pg_wchar code, pg_wchar *result, int* *current)
228	{
229	pg_unicode_decomposition *entry;
230	int i;
231	const uint32 *decomp;
232	int dec_size;
233
234	/*
235	* Fast path for Hangul characters not stored in tables to save memory as
236	* decomposition is algorithmic. See
237	* http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
238	* the matter.
239	*/
240	if (code >= SBASE && code < SBASE + SCOUNT)
241	{
242	uint32 l,
243	v,
244	tindex,
245	sindex;
246	pg_wchar res = result;
247
248	sindex = code - SBASE;
249	l = LBASE + sindex / (VCOUNT * TCOUNT);
250	v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
251	tindex = sindex % TCOUNT;
252
253	res[*current] = l;
254	(*current)++;
255	res[*current] = v;
256	(*current)++;
257
258	if (tindex != `0`)
259	{
260	res[*current] = TBASE + tindex;
261	(*current)++;
262	}
263
264	return;
265	}
266
267	entry = get_code_entry(code);
268
269	/*
270	* Just fill in with the current decomposition if there are no
271	* decomposition codes to recurse to. A NULL entry is equivalent to a
272	* character with class 0 and no decompositions, so just leave also in
273	* this case.
274	*/
275	if (entry == NULL \|\| DECOMPOSITION_SIZE(entry) == `0`)
276	{
277	pg_wchar res = result;
278
279	res[*current] = code;
280	(*current)++;
281	return;
282	}
283
284	/*
285	* If this entry has other decomposition codes look at them as well.
286	*/
287	decomp = get_code_decomposition(entry, &dec_size);
288	for (i = `0`; i < dec_size; i++)
289	{
290	pg_wchar lcode = (pg_wchar) decomp[i];
291
292	/ Leave if no more decompositions /
293	decompose_code(lcode, result, current);
294	}
295	}
296
297	/*
298	* unicode_normalize_kc - Normalize a Unicode string to NFKC form.
299	*
300	* The input is a 0-terminated array of codepoints.
301	*
302	* In frontend, returns a 0-terminated array of codepoints, allocated with
303	* malloc. Or NULL if we run out of memory. In frontend, the returned
304	* string is palloc'd instead, and OOM is reported with ereport().
305	*/
306	pg_wchar *
307	unicode_normalize_kc(const pg_wchar *input)
308	{
309	pg_wchar *decomp_chars;
310	pg_wchar *recomp_chars;
311	int decomp_size,
312	current_size;
313	int count;
314	const pg_wchar *p;
315
316	/ variables for recomposition /
317	int last_class;
318	int starter_pos;
319	int target_pos;
320	uint32 starter_ch;
321
322	/ First, do character decomposition /
323
324	/*
325	* Calculate how many characters long the decomposed version will be.
326	*/
327	decomp_size = `0`;
328	for (p = input; *p; p++)
329	decomp_size += get_decomposed_size(*p);
330
331	decomp_chars = (pg_wchar ) ALLOC((decomp_size + `1`) sizeof(pg_wchar));
332	if (decomp_chars == NULL)
333	return NULL;
334
335	/*
336	* Now fill in each entry recursively. This needs a second pass on the
337	* decomposition table.
338	*/
339	current_size = `0`;
340	for (p = input; *p; p++)
341	decompose_code(*p, &decomp_chars, &current_size);
342	decomp_chars[decomp_size] = `'\0'`;
343	Assert(decomp_size == current_size);
344
345	/*
346	* Now apply canonical ordering.
347	*/
348	for (count = `1`; count < decomp_size; count++)
349	{
350	pg_wchar prev = decomp_chars[count - `1`];
351	pg_wchar next = decomp_chars[count];
352	pg_wchar tmp;
353	pg_unicode_decomposition *prevEntry = get_code_entry(prev);
354	pg_unicode_decomposition *nextEntry = get_code_entry(next);
355
356	/*
357	* If no entries are found, the character used is either an Hangul
358	* character or a character with a class of 0 and no decompositions,
359	* so move to next result.
360	*/
361	if (prevEntry == NULL \|\| nextEntry == NULL)
362	continue;
363
364	/*
365	* Per Unicode (http://unicode.org/reports/tr15/tr15-18.html) annex 4,
366	* a sequence of two adjacent characters in a string is an
367	* exchangeable pair if the combining class (from the Unicode
368	* Character Database) for the first character is greater than the
369	* combining class for the second, and the second is not a starter. A
370	* character is a starter if its combining class is 0.
371	*/
372	if (nextEntry->comb_class == `0x0` \|\| prevEntry->comb_class == `0x0`)
373	continue;
374
375	if (prevEntry->comb_class <= nextEntry->comb_class)
376	continue;
377
378	/ exchange can happen /
379	tmp = decomp_chars[count - `1`];
380	decomp_chars[count - `1`] = decomp_chars[count];
381	decomp_chars[count] = tmp;
382
383	/ backtrack to check again /
384	if (count > `1`)
385	count -= `2`;
386	}
387
388	/*
389	* The last phase of NFKC is the recomposition of the reordered Unicode
390	* string using combining classes. The recomposed string cannot be longer
391	* than the decomposed one, so make the allocation of the output string
392	* based on that assumption.
393	*/
394	recomp_chars = (pg_wchar ) ALLOC((decomp_size + `1`) sizeof(pg_wchar));
395	if (!recomp_chars)
396	{
397	FREE(decomp_chars);
398	return NULL;
399	}
400
401	last_class = -`1`; / this eliminates a special check /
402	starter_pos = `0`;
403	target_pos = `1`;
404	starter_ch = recomp_chars[`0`] = decomp_chars[`0`];
405
406	for (count = `1`; count < decomp_size; count++)
407	{
408	pg_wchar ch = decomp_chars[count];
409	pg_unicode_decomposition *ch_entry = get_code_entry(ch);
410	int ch_class = (ch_entry == NULL) ? `0` : ch_entry->comb_class;
411	pg_wchar composite;
412
413	if (last_class < ch_class &&
414	recompose_code(starter_ch, ch, &composite))
415	{
416	recomp_chars[starter_pos] = composite;
417	starter_ch = composite;
418	}
419	else if (ch_class == `0`)
420	{
421	starter_pos = target_pos;
422	starter_ch = ch;
423	last_class = -`1`;
424	recomp_chars[target_pos++] = ch;
425	}
426	else
427	{
428	last_class = ch_class;
429	recomp_chars[target_pos++] = ch;
430	}
431	}
432	recomp_chars[target_pos] = (pg_wchar) `'\0'`;
433
434	FREE(decomp_chars);
435
436	return recomp_chars;
437	}
438

Browse the source code of PostgreSQL/src/common/unicode_norm.c