1/*-------------------------------------------------------------------------
2 * unicode_norm.c
3 * Normalize a Unicode string to NFKC form
4 *
5 * This implements Unicode normalization, per the documentation at
6 * http://www.unicode.org/reports/tr15/.
7 *
8 * Portions Copyright (c) 2017-2019, PostgreSQL Global Development Group
9 *
10 * IDENTIFICATION
11 * src/common/unicode_norm.c
12 *
13 *-------------------------------------------------------------------------
14 */
15#ifndef FRONTEND
16#include "postgres.h"
17#else
18#include "postgres_fe.h"
19#endif
20
21#include "common/unicode_norm.h"
22#include "common/unicode_norm_table.h"
23
24#ifndef FRONTEND
25#define ALLOC(size) palloc(size)
26#define FREE(size) pfree(size)
27#else
28#define ALLOC(size) malloc(size)
29#define FREE(size) free(size)
30#endif
31
32/* Constants for calculations with Hangul characters */
33#define SBASE 0xAC00 /* U+AC00 */
34#define LBASE 0x1100 /* U+1100 */
35#define VBASE 0x1161 /* U+1161 */
36#define TBASE 0x11A7 /* U+11A7 */
37#define LCOUNT 19
38#define VCOUNT 21
39#define TCOUNT 28
40#define NCOUNT VCOUNT * TCOUNT
41#define SCOUNT LCOUNT * NCOUNT
42
43/* comparison routine for bsearch() of decomposition lookup table. */
44static int
45conv_compare(const void *p1, const void *p2)
46{
47 uint32 v1,
48 v2;
49
50 v1 = *(const uint32 *) p1;
51 v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
52 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
53}
54
55/*
56 * Get the entry corresponding to code in the decomposition lookup table.
57 */
58static pg_unicode_decomposition *
59get_code_entry(pg_wchar code)
60{
61 return bsearch(&(code),
62 UnicodeDecompMain,
63 lengthof(UnicodeDecompMain),
64 sizeof(pg_unicode_decomposition),
65 conv_compare);
66}
67
68/*
69 * Given a decomposition entry looked up earlier, get the decomposed
70 * characters.
71 *
72 * Note: the returned pointer can point to statically allocated buffer, and
73 * is only valid until next call to this function!
74 */
75static const pg_wchar *
76get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
77{
78 static pg_wchar x;
79
80 if (DECOMPOSITION_IS_INLINE(entry))
81 {
82 Assert(DECOMPOSITION_SIZE(entry) == 1);
83 x = (pg_wchar) entry->dec_index;
84 *dec_size = 1;
85 return &x;
86 }
87 else
88 {
89 *dec_size = DECOMPOSITION_SIZE(entry);
90 return &UnicodeDecomp_codepoints[entry->dec_index];
91 }
92}
93
94/*
95 * Calculate how many characters a given character will decompose to.
96 *
97 * This needs to recurse, if the character decomposes into characters that
98 * are, in turn, decomposable.
99 */
100static int
101get_decomposed_size(pg_wchar code)
102{
103 pg_unicode_decomposition *entry;
104 int size = 0;
105 int i;
106 const uint32 *decomp;
107 int dec_size;
108
109 /*
110 * Fast path for Hangul characters not stored in tables to save memory as
111 * decomposition is algorithmic. See
112 * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
113 * the matter.
114 */
115 if (code >= SBASE && code < SBASE + SCOUNT)
116 {
117 uint32 tindex,
118 sindex;
119
120 sindex = code - SBASE;
121 tindex = sindex % TCOUNT;
122
123 if (tindex != 0)
124 return 3;
125 return 2;
126 }
127
128 entry = get_code_entry(code);
129
130 /*
131 * Just count current code if no other decompositions. A NULL entry is
132 * equivalent to a character with class 0 and no decompositions.
133 */
134 if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
135 return 1;
136
137 /*
138 * If this entry has other decomposition codes look at them as well. First
139 * get its decomposition in the list of tables available.
140 */
141 decomp = get_code_decomposition(entry, &dec_size);
142 for (i = 0; i < dec_size; i++)
143 {
144 uint32 lcode = decomp[i];
145
146 size += get_decomposed_size(lcode);
147 }
148
149 return size;
150}
151
152/*
153 * Recompose a set of characters. For hangul characters, the calculation
154 * is algorithmic. For others, an inverse lookup at the decomposition
155 * table is necessary. Returns true if a recomposition can be done, and
156 * false otherwise.
157 */
158static bool
159recompose_code(uint32 start, uint32 code, uint32 *result)
160{
161 /*
162 * Handle Hangul characters algorithmically, per the Unicode spec.
163 *
164 * Check if two current characters are L and V.
165 */
166 if (start >= LBASE && start < LBASE + LCOUNT &&
167 code >= VBASE && code < VBASE + VCOUNT)
168 {
169 /* make syllable of form LV */
170 uint32 lindex = start - LBASE;
171 uint32 vindex = code - VBASE;
172
173 *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
174 return true;
175 }
176 /* Check if two current characters are LV and T */
177 else if (start >= SBASE && start < (SBASE + SCOUNT) &&
178 ((start - SBASE) % TCOUNT) == 0 &&
179 code >= TBASE && code < (TBASE + TCOUNT))
180 {
181 /* make syllable of from LVT */
182 uint32 tindex = code - TBASE;
183
184 *result = start + tindex;
185 return true;
186 }
187 else
188 {
189 int i;
190
191 /*
192 * Do an inverse lookup of the decomposition tables to see if anything
193 * matches. The comparison just needs to be a perfect match on the
194 * sub-table of size two, because the start character has already been
195 * recomposed partially.
196 */
197 for (i = 0; i < lengthof(UnicodeDecompMain); i++)
198 {
199 const pg_unicode_decomposition *entry = &UnicodeDecompMain[i];
200
201 if (DECOMPOSITION_SIZE(entry) != 2)
202 continue;
203
204 if (DECOMPOSITION_NO_COMPOSE(entry))
205 continue;
206
207 if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
208 code == UnicodeDecomp_codepoints[entry->dec_index + 1])
209 {
210 *result = entry->codepoint;
211 return true;
212 }
213 }
214 }
215
216 return false;
217}
218
219/*
220 * Decompose the given code into the array given by caller. The
221 * decomposition begins at the position given by caller, saving one
222 * lookup on the decomposition table. The current position needs to be
223 * updated here to let the caller know from where to continue filling
224 * in the array result.
225 */
226static void
227decompose_code(pg_wchar code, pg_wchar **result, int *current)
228{
229 pg_unicode_decomposition *entry;
230 int i;
231 const uint32 *decomp;
232 int dec_size;
233
234 /*
235 * Fast path for Hangul characters not stored in tables to save memory as
236 * decomposition is algorithmic. See
237 * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
238 * the matter.
239 */
240 if (code >= SBASE && code < SBASE + SCOUNT)
241 {
242 uint32 l,
243 v,
244 tindex,
245 sindex;
246 pg_wchar *res = *result;
247
248 sindex = code - SBASE;
249 l = LBASE + sindex / (VCOUNT * TCOUNT);
250 v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
251 tindex = sindex % TCOUNT;
252
253 res[*current] = l;
254 (*current)++;
255 res[*current] = v;
256 (*current)++;
257
258 if (tindex != 0)
259 {
260 res[*current] = TBASE + tindex;
261 (*current)++;
262 }
263
264 return;
265 }
266
267 entry = get_code_entry(code);
268
269 /*
270 * Just fill in with the current decomposition if there are no
271 * decomposition codes to recurse to. A NULL entry is equivalent to a
272 * character with class 0 and no decompositions, so just leave also in
273 * this case.
274 */
275 if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
276 {
277 pg_wchar *res = *result;
278
279 res[*current] = code;
280 (*current)++;
281 return;
282 }
283
284 /*
285 * If this entry has other decomposition codes look at them as well.
286 */
287 decomp = get_code_decomposition(entry, &dec_size);
288 for (i = 0; i < dec_size; i++)
289 {
290 pg_wchar lcode = (pg_wchar) decomp[i];
291
292 /* Leave if no more decompositions */
293 decompose_code(lcode, result, current);
294 }
295}
296
297/*
298 * unicode_normalize_kc - Normalize a Unicode string to NFKC form.
299 *
300 * The input is a 0-terminated array of codepoints.
301 *
302 * In frontend, returns a 0-terminated array of codepoints, allocated with
303 * malloc. Or NULL if we run out of memory. In frontend, the returned
304 * string is palloc'd instead, and OOM is reported with ereport().
305 */
306pg_wchar *
307unicode_normalize_kc(const pg_wchar *input)
308{
309 pg_wchar *decomp_chars;
310 pg_wchar *recomp_chars;
311 int decomp_size,
312 current_size;
313 int count;
314 const pg_wchar *p;
315
316 /* variables for recomposition */
317 int last_class;
318 int starter_pos;
319 int target_pos;
320 uint32 starter_ch;
321
322 /* First, do character decomposition */
323
324 /*
325 * Calculate how many characters long the decomposed version will be.
326 */
327 decomp_size = 0;
328 for (p = input; *p; p++)
329 decomp_size += get_decomposed_size(*p);
330
331 decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
332 if (decomp_chars == NULL)
333 return NULL;
334
335 /*
336 * Now fill in each entry recursively. This needs a second pass on the
337 * decomposition table.
338 */
339 current_size = 0;
340 for (p = input; *p; p++)
341 decompose_code(*p, &decomp_chars, &current_size);
342 decomp_chars[decomp_size] = '\0';
343 Assert(decomp_size == current_size);
344
345 /*
346 * Now apply canonical ordering.
347 */
348 for (count = 1; count < decomp_size; count++)
349 {
350 pg_wchar prev = decomp_chars[count - 1];
351 pg_wchar next = decomp_chars[count];
352 pg_wchar tmp;
353 pg_unicode_decomposition *prevEntry = get_code_entry(prev);
354 pg_unicode_decomposition *nextEntry = get_code_entry(next);
355
356 /*
357 * If no entries are found, the character used is either an Hangul
358 * character or a character with a class of 0 and no decompositions,
359 * so move to next result.
360 */
361 if (prevEntry == NULL || nextEntry == NULL)
362 continue;
363
364 /*
365 * Per Unicode (http://unicode.org/reports/tr15/tr15-18.html) annex 4,
366 * a sequence of two adjacent characters in a string is an
367 * exchangeable pair if the combining class (from the Unicode
368 * Character Database) for the first character is greater than the
369 * combining class for the second, and the second is not a starter. A
370 * character is a starter if its combining class is 0.
371 */
372 if (nextEntry->comb_class == 0x0 || prevEntry->comb_class == 0x0)
373 continue;
374
375 if (prevEntry->comb_class <= nextEntry->comb_class)
376 continue;
377
378 /* exchange can happen */
379 tmp = decomp_chars[count - 1];
380 decomp_chars[count - 1] = decomp_chars[count];
381 decomp_chars[count] = tmp;
382
383 /* backtrack to check again */
384 if (count > 1)
385 count -= 2;
386 }
387
388 /*
389 * The last phase of NFKC is the recomposition of the reordered Unicode
390 * string using combining classes. The recomposed string cannot be longer
391 * than the decomposed one, so make the allocation of the output string
392 * based on that assumption.
393 */
394 recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
395 if (!recomp_chars)
396 {
397 FREE(decomp_chars);
398 return NULL;
399 }
400
401 last_class = -1; /* this eliminates a special check */
402 starter_pos = 0;
403 target_pos = 1;
404 starter_ch = recomp_chars[0] = decomp_chars[0];
405
406 for (count = 1; count < decomp_size; count++)
407 {
408 pg_wchar ch = decomp_chars[count];
409 pg_unicode_decomposition *ch_entry = get_code_entry(ch);
410 int ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class;
411 pg_wchar composite;
412
413 if (last_class < ch_class &&
414 recompose_code(starter_ch, ch, &composite))
415 {
416 recomp_chars[starter_pos] = composite;
417 starter_ch = composite;
418 }
419 else if (ch_class == 0)
420 {
421 starter_pos = target_pos;
422 starter_ch = ch;
423 last_class = -1;
424 recomp_chars[target_pos++] = ch;
425 }
426 else
427 {
428 last_class = ch_class;
429 recomp_chars[target_pos++] = ch;
430 }
431 }
432 recomp_chars[target_pos] = (pg_wchar) '\0';
433
434 FREE(decomp_chars);
435
436 return recomp_chars;
437}
438