1 | /*------------------------------------------------------------------------- |
2 | * unicode_norm.c |
3 | * Normalize a Unicode string to NFKC form |
4 | * |
5 | * This implements Unicode normalization, per the documentation at |
6 | * http://www.unicode.org/reports/tr15/. |
7 | * |
8 | * Portions Copyright (c) 2017-2019, PostgreSQL Global Development Group |
9 | * |
10 | * IDENTIFICATION |
11 | * src/common/unicode_norm.c |
12 | * |
13 | *------------------------------------------------------------------------- |
14 | */ |
15 | #ifndef FRONTEND |
16 | #include "postgres.h" |
17 | #else |
18 | #include "postgres_fe.h" |
19 | #endif |
20 | |
21 | #include "common/unicode_norm.h" |
22 | #include "common/unicode_norm_table.h" |
23 | |
24 | #ifndef FRONTEND |
25 | #define ALLOC(size) palloc(size) |
26 | #define FREE(size) pfree(size) |
27 | #else |
28 | #define ALLOC(size) malloc(size) |
29 | #define FREE(size) free(size) |
30 | #endif |
31 | |
32 | /* Constants for calculations with Hangul characters */ |
33 | #define SBASE 0xAC00 /* U+AC00 */ |
34 | #define LBASE 0x1100 /* U+1100 */ |
35 | #define VBASE 0x1161 /* U+1161 */ |
36 | #define TBASE 0x11A7 /* U+11A7 */ |
37 | #define LCOUNT 19 |
38 | #define VCOUNT 21 |
39 | #define TCOUNT 28 |
40 | #define NCOUNT VCOUNT * TCOUNT |
41 | #define SCOUNT LCOUNT * NCOUNT |
42 | |
43 | /* comparison routine for bsearch() of decomposition lookup table. */ |
44 | static int |
45 | conv_compare(const void *p1, const void *p2) |
46 | { |
47 | uint32 v1, |
48 | v2; |
49 | |
50 | v1 = *(const uint32 *) p1; |
51 | v2 = ((const pg_unicode_decomposition *) p2)->codepoint; |
52 | return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); |
53 | } |
54 | |
55 | /* |
56 | * Get the entry corresponding to code in the decomposition lookup table. |
57 | */ |
58 | static pg_unicode_decomposition * |
59 | get_code_entry(pg_wchar code) |
60 | { |
61 | return bsearch(&(code), |
62 | UnicodeDecompMain, |
63 | lengthof(UnicodeDecompMain), |
64 | sizeof(pg_unicode_decomposition), |
65 | conv_compare); |
66 | } |
67 | |
68 | /* |
69 | * Given a decomposition entry looked up earlier, get the decomposed |
70 | * characters. |
71 | * |
72 | * Note: the returned pointer can point to statically allocated buffer, and |
73 | * is only valid until next call to this function! |
74 | */ |
75 | static const pg_wchar * |
76 | get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size) |
77 | { |
78 | static pg_wchar x; |
79 | |
80 | if (DECOMPOSITION_IS_INLINE(entry)) |
81 | { |
82 | Assert(DECOMPOSITION_SIZE(entry) == 1); |
83 | x = (pg_wchar) entry->dec_index; |
84 | *dec_size = 1; |
85 | return &x; |
86 | } |
87 | else |
88 | { |
89 | *dec_size = DECOMPOSITION_SIZE(entry); |
90 | return &UnicodeDecomp_codepoints[entry->dec_index]; |
91 | } |
92 | } |
93 | |
94 | /* |
95 | * Calculate how many characters a given character will decompose to. |
96 | * |
97 | * This needs to recurse, if the character decomposes into characters that |
98 | * are, in turn, decomposable. |
99 | */ |
100 | static int |
101 | get_decomposed_size(pg_wchar code) |
102 | { |
103 | pg_unicode_decomposition *entry; |
104 | int size = 0; |
105 | int i; |
106 | const uint32 *decomp; |
107 | int dec_size; |
108 | |
109 | /* |
110 | * Fast path for Hangul characters not stored in tables to save memory as |
111 | * decomposition is algorithmic. See |
112 | * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on |
113 | * the matter. |
114 | */ |
115 | if (code >= SBASE && code < SBASE + SCOUNT) |
116 | { |
117 | uint32 tindex, |
118 | sindex; |
119 | |
120 | sindex = code - SBASE; |
121 | tindex = sindex % TCOUNT; |
122 | |
123 | if (tindex != 0) |
124 | return 3; |
125 | return 2; |
126 | } |
127 | |
128 | entry = get_code_entry(code); |
129 | |
130 | /* |
131 | * Just count current code if no other decompositions. A NULL entry is |
132 | * equivalent to a character with class 0 and no decompositions. |
133 | */ |
134 | if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0) |
135 | return 1; |
136 | |
137 | /* |
138 | * If this entry has other decomposition codes look at them as well. First |
139 | * get its decomposition in the list of tables available. |
140 | */ |
141 | decomp = get_code_decomposition(entry, &dec_size); |
142 | for (i = 0; i < dec_size; i++) |
143 | { |
144 | uint32 lcode = decomp[i]; |
145 | |
146 | size += get_decomposed_size(lcode); |
147 | } |
148 | |
149 | return size; |
150 | } |
151 | |
152 | /* |
153 | * Recompose a set of characters. For hangul characters, the calculation |
154 | * is algorithmic. For others, an inverse lookup at the decomposition |
155 | * table is necessary. Returns true if a recomposition can be done, and |
156 | * false otherwise. |
157 | */ |
158 | static bool |
159 | recompose_code(uint32 start, uint32 code, uint32 *result) |
160 | { |
161 | /* |
162 | * Handle Hangul characters algorithmically, per the Unicode spec. |
163 | * |
164 | * Check if two current characters are L and V. |
165 | */ |
166 | if (start >= LBASE && start < LBASE + LCOUNT && |
167 | code >= VBASE && code < VBASE + VCOUNT) |
168 | { |
169 | /* make syllable of form LV */ |
170 | uint32 lindex = start - LBASE; |
171 | uint32 vindex = code - VBASE; |
172 | |
173 | *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT; |
174 | return true; |
175 | } |
176 | /* Check if two current characters are LV and T */ |
177 | else if (start >= SBASE && start < (SBASE + SCOUNT) && |
178 | ((start - SBASE) % TCOUNT) == 0 && |
179 | code >= TBASE && code < (TBASE + TCOUNT)) |
180 | { |
181 | /* make syllable of from LVT */ |
182 | uint32 tindex = code - TBASE; |
183 | |
184 | *result = start + tindex; |
185 | return true; |
186 | } |
187 | else |
188 | { |
189 | int i; |
190 | |
191 | /* |
192 | * Do an inverse lookup of the decomposition tables to see if anything |
193 | * matches. The comparison just needs to be a perfect match on the |
194 | * sub-table of size two, because the start character has already been |
195 | * recomposed partially. |
196 | */ |
197 | for (i = 0; i < lengthof(UnicodeDecompMain); i++) |
198 | { |
199 | const pg_unicode_decomposition *entry = &UnicodeDecompMain[i]; |
200 | |
201 | if (DECOMPOSITION_SIZE(entry) != 2) |
202 | continue; |
203 | |
204 | if (DECOMPOSITION_NO_COMPOSE(entry)) |
205 | continue; |
206 | |
207 | if (start == UnicodeDecomp_codepoints[entry->dec_index] && |
208 | code == UnicodeDecomp_codepoints[entry->dec_index + 1]) |
209 | { |
210 | *result = entry->codepoint; |
211 | return true; |
212 | } |
213 | } |
214 | } |
215 | |
216 | return false; |
217 | } |
218 | |
219 | /* |
220 | * Decompose the given code into the array given by caller. The |
221 | * decomposition begins at the position given by caller, saving one |
222 | * lookup on the decomposition table. The current position needs to be |
223 | * updated here to let the caller know from where to continue filling |
224 | * in the array result. |
225 | */ |
226 | static void |
227 | decompose_code(pg_wchar code, pg_wchar **result, int *current) |
228 | { |
229 | pg_unicode_decomposition *entry; |
230 | int i; |
231 | const uint32 *decomp; |
232 | int dec_size; |
233 | |
234 | /* |
235 | * Fast path for Hangul characters not stored in tables to save memory as |
236 | * decomposition is algorithmic. See |
237 | * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on |
238 | * the matter. |
239 | */ |
240 | if (code >= SBASE && code < SBASE + SCOUNT) |
241 | { |
242 | uint32 l, |
243 | v, |
244 | tindex, |
245 | sindex; |
246 | pg_wchar *res = *result; |
247 | |
248 | sindex = code - SBASE; |
249 | l = LBASE + sindex / (VCOUNT * TCOUNT); |
250 | v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT; |
251 | tindex = sindex % TCOUNT; |
252 | |
253 | res[*current] = l; |
254 | (*current)++; |
255 | res[*current] = v; |
256 | (*current)++; |
257 | |
258 | if (tindex != 0) |
259 | { |
260 | res[*current] = TBASE + tindex; |
261 | (*current)++; |
262 | } |
263 | |
264 | return; |
265 | } |
266 | |
267 | entry = get_code_entry(code); |
268 | |
269 | /* |
270 | * Just fill in with the current decomposition if there are no |
271 | * decomposition codes to recurse to. A NULL entry is equivalent to a |
272 | * character with class 0 and no decompositions, so just leave also in |
273 | * this case. |
274 | */ |
275 | if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0) |
276 | { |
277 | pg_wchar *res = *result; |
278 | |
279 | res[*current] = code; |
280 | (*current)++; |
281 | return; |
282 | } |
283 | |
284 | /* |
285 | * If this entry has other decomposition codes look at them as well. |
286 | */ |
287 | decomp = get_code_decomposition(entry, &dec_size); |
288 | for (i = 0; i < dec_size; i++) |
289 | { |
290 | pg_wchar lcode = (pg_wchar) decomp[i]; |
291 | |
292 | /* Leave if no more decompositions */ |
293 | decompose_code(lcode, result, current); |
294 | } |
295 | } |
296 | |
297 | /* |
298 | * unicode_normalize_kc - Normalize a Unicode string to NFKC form. |
299 | * |
300 | * The input is a 0-terminated array of codepoints. |
301 | * |
302 | * In frontend, returns a 0-terminated array of codepoints, allocated with |
303 | * malloc. Or NULL if we run out of memory. In frontend, the returned |
304 | * string is palloc'd instead, and OOM is reported with ereport(). |
305 | */ |
306 | pg_wchar * |
307 | unicode_normalize_kc(const pg_wchar *input) |
308 | { |
309 | pg_wchar *decomp_chars; |
310 | pg_wchar *recomp_chars; |
311 | int decomp_size, |
312 | current_size; |
313 | int count; |
314 | const pg_wchar *p; |
315 | |
316 | /* variables for recomposition */ |
317 | int last_class; |
318 | int starter_pos; |
319 | int target_pos; |
320 | uint32 starter_ch; |
321 | |
322 | /* First, do character decomposition */ |
323 | |
324 | /* |
325 | * Calculate how many characters long the decomposed version will be. |
326 | */ |
327 | decomp_size = 0; |
328 | for (p = input; *p; p++) |
329 | decomp_size += get_decomposed_size(*p); |
330 | |
331 | decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar)); |
332 | if (decomp_chars == NULL) |
333 | return NULL; |
334 | |
335 | /* |
336 | * Now fill in each entry recursively. This needs a second pass on the |
337 | * decomposition table. |
338 | */ |
339 | current_size = 0; |
340 | for (p = input; *p; p++) |
341 | decompose_code(*p, &decomp_chars, ¤t_size); |
342 | decomp_chars[decomp_size] = '\0'; |
343 | Assert(decomp_size == current_size); |
344 | |
345 | /* |
346 | * Now apply canonical ordering. |
347 | */ |
348 | for (count = 1; count < decomp_size; count++) |
349 | { |
350 | pg_wchar prev = decomp_chars[count - 1]; |
351 | pg_wchar next = decomp_chars[count]; |
352 | pg_wchar tmp; |
353 | pg_unicode_decomposition *prevEntry = get_code_entry(prev); |
354 | pg_unicode_decomposition *nextEntry = get_code_entry(next); |
355 | |
356 | /* |
357 | * If no entries are found, the character used is either an Hangul |
358 | * character or a character with a class of 0 and no decompositions, |
359 | * so move to next result. |
360 | */ |
361 | if (prevEntry == NULL || nextEntry == NULL) |
362 | continue; |
363 | |
364 | /* |
365 | * Per Unicode (http://unicode.org/reports/tr15/tr15-18.html) annex 4, |
366 | * a sequence of two adjacent characters in a string is an |
367 | * exchangeable pair if the combining class (from the Unicode |
368 | * Character Database) for the first character is greater than the |
369 | * combining class for the second, and the second is not a starter. A |
370 | * character is a starter if its combining class is 0. |
371 | */ |
372 | if (nextEntry->comb_class == 0x0 || prevEntry->comb_class == 0x0) |
373 | continue; |
374 | |
375 | if (prevEntry->comb_class <= nextEntry->comb_class) |
376 | continue; |
377 | |
378 | /* exchange can happen */ |
379 | tmp = decomp_chars[count - 1]; |
380 | decomp_chars[count - 1] = decomp_chars[count]; |
381 | decomp_chars[count] = tmp; |
382 | |
383 | /* backtrack to check again */ |
384 | if (count > 1) |
385 | count -= 2; |
386 | } |
387 | |
388 | /* |
389 | * The last phase of NFKC is the recomposition of the reordered Unicode |
390 | * string using combining classes. The recomposed string cannot be longer |
391 | * than the decomposed one, so make the allocation of the output string |
392 | * based on that assumption. |
393 | */ |
394 | recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar)); |
395 | if (!recomp_chars) |
396 | { |
397 | FREE(decomp_chars); |
398 | return NULL; |
399 | } |
400 | |
401 | last_class = -1; /* this eliminates a special check */ |
402 | starter_pos = 0; |
403 | target_pos = 1; |
404 | starter_ch = recomp_chars[0] = decomp_chars[0]; |
405 | |
406 | for (count = 1; count < decomp_size; count++) |
407 | { |
408 | pg_wchar ch = decomp_chars[count]; |
409 | pg_unicode_decomposition *ch_entry = get_code_entry(ch); |
410 | int ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class; |
411 | pg_wchar composite; |
412 | |
413 | if (last_class < ch_class && |
414 | recompose_code(starter_ch, ch, &composite)) |
415 | { |
416 | recomp_chars[starter_pos] = composite; |
417 | starter_ch = composite; |
418 | } |
419 | else if (ch_class == 0) |
420 | { |
421 | starter_pos = target_pos; |
422 | starter_ch = ch; |
423 | last_class = -1; |
424 | recomp_chars[target_pos++] = ch; |
425 | } |
426 | else |
427 | { |
428 | last_class = ch_class; |
429 | recomp_chars[target_pos++] = ch; |
430 | } |
431 | } |
432 | recomp_chars[target_pos] = (pg_wchar) '\0'; |
433 | |
434 | FREE(decomp_chars); |
435 | |
436 | return recomp_chars; |
437 | } |
438 | |