1 | /* |
2 | * Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net> |
3 | * |
4 | * Permission to use, copy, modify, and/or distribute this software for any |
5 | * purpose with or without fee is hereby granted, provided that the above |
6 | * copyright notice and this permission notice appear in all copies. |
7 | * |
8 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
9 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
10 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
11 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
12 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
13 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
14 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
15 | */ |
16 | |
17 | #include <stdio.h> |
18 | #include <stdlib.h> |
19 | #include <stdint.h> |
20 | #include "ucdn.h" |
21 | |
/*
 * Per-code-point property record.  get_ucd_record() returns a pointer to one
 * of these and each ucdn_get_*() accessor exposes a single field of it.
 * NOTE(review): the records presumably live in ucdn_db.h, so the field order
 * and types likely have to match that header - confirm before changing.
 */
typedef struct {
    unsigned char category;         /* returned by ucdn_get_general_category() */
    unsigned char combining;        /* returned by ucdn_get_combining_class() */
    unsigned char bidi_class;       /* returned by ucdn_get_bidi_class() */
    unsigned char east_asian_width; /* returned by ucdn_get_east_asian_width() */
    unsigned char script;           /* returned by ucdn_get_script() */
    unsigned char linebreak_class;  /* returned by ucdn_get_linebreak_class() */
} UCDRecord;
30 | |
/*
 * One entry of the mirroring table searched by ucdn_mirror().
 * Both members are 16-bit wide at most, so only code points that fit in an
 * unsigned short can be represented.
 */
typedef struct {
    unsigned short from; /* search key: the code point being looked up */
    unsigned short to;   /* value ucdn_mirror() returns for `from` */
} MirrorPair;
34 | |
/*
 * One entry of the paired-bracket table searched by search_bp().
 */
typedef struct {
    unsigned short from; /* search key: the bracket being looked up */
    unsigned short to;   /* value ucdn_paired_bracket() returns for `from` */
    unsigned char type;  /* value ucdn_paired_bracket_type() returns for `from` */
} BracketPair;
39 | |
/*
 * Maps a contiguous run of code points onto a run of composition indices.
 * compare_reindex() treats start .. start + count (inclusive) as one range,
 * and get_comp_index() yields index + (code - start) for a code in the range.
 */
typedef struct {
    unsigned int start; /* first code point of the range */
    short count;        /* distance from start to the last code point matched */
    short index;        /* composition index assigned to `start` */
} Reindex;
44 | |
45 | #include "ucdn_db.h" |
46 | |
/*
 * Constants required for algorithmic Hangul (de)composition.
 *
 * The *BASE values are subtracted from a code point to obtain its syllable
 * (S), leading (L), vowel (V) or trailing (T) index.  Trailing index 0 stands
 * for "no trailing part": a syllable whose index is a multiple of TCOUNT
 * decomposes into L,V rather than LV,T.
 */
#define SBASE 0xAC00
#define LBASE 0x1100
#define VBASE 0x1161
#define TBASE 0x11A7

/* number of distinct L, V and T indices */
#define LCOUNT 19
#define VCOUNT 21
#define TCOUNT 28

/* derived sizes: syllables per leading part, and syllables in total (11172) */
#define NCOUNT (VCOUNT * TCOUNT)
#define SCOUNT (LCOUNT * NCOUNT)
57 | |
58 | static const UCDRecord *get_ucd_record(uint32_t code) |
59 | { |
60 | int index, offset; |
61 | |
62 | if (code >= 0x110000) |
63 | index = 0; |
64 | else { |
65 | index = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1; |
66 | offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1); |
67 | index = index1[index + offset] << SHIFT2; |
68 | offset = code & ((1<<SHIFT2) - 1); |
69 | index = index2[index + offset]; |
70 | } |
71 | |
72 | return &ucd_records[index]; |
73 | } |
74 | |
75 | static const unsigned short *get_decomp_record(uint32_t code) |
76 | { |
77 | int index, offset; |
78 | |
79 | if (code >= 0x110000) |
80 | index = 0; |
81 | else { |
82 | index = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)] |
83 | << DECOMP_SHIFT1; |
84 | offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1); |
85 | index = decomp_index1[index + offset] << DECOMP_SHIFT2; |
86 | offset = code & ((1<<DECOMP_SHIFT2) - 1); |
87 | index = decomp_index2[index + offset]; |
88 | } |
89 | |
90 | return &decomp_data[index]; |
91 | } |
92 | |
93 | static int compare_reindex(const void *a, const void *b) |
94 | { |
95 | Reindex *ra = (Reindex *)a; |
96 | Reindex *rb = (Reindex *)b; |
97 | |
98 | if (ra->start < rb->start) |
99 | return -1; |
100 | else if (ra->start > (rb->start + rb->count)) |
101 | return 1; |
102 | else |
103 | return 0; |
104 | } |
105 | |
106 | static int get_comp_index(uint32_t code, const Reindex *idx, size_t len) |
107 | { |
108 | Reindex *res; |
109 | Reindex r = {0, 0, 0}; |
110 | r.start = code; |
111 | res = (Reindex *) bsearch(&r, idx, len, sizeof(Reindex), compare_reindex); |
112 | |
113 | if (res != NULL) |
114 | return res->index + (code - res->start); |
115 | else |
116 | return -1; |
117 | } |
118 | |
119 | static int compare_mp(const void *a, const void *b) |
120 | { |
121 | MirrorPair *mpa = (MirrorPair *)a; |
122 | MirrorPair *mpb = (MirrorPair *)b; |
123 | return mpa->from - mpb->from; |
124 | } |
125 | |
126 | static int compare_bp(const void *a, const void *b) |
127 | { |
128 | BracketPair *bpa = (BracketPair *)a; |
129 | BracketPair *bpb = (BracketPair *)b; |
130 | return bpa->from - bpb->from; |
131 | } |
132 | |
133 | static BracketPair *search_bp(uint32_t code) |
134 | { |
135 | BracketPair bp = {0,0,2}; |
136 | BracketPair *res; |
137 | |
138 | bp.from = code; |
139 | res = (BracketPair *) bsearch(&bp, bracket_pairs, BIDI_BRACKET_LEN, |
140 | sizeof(BracketPair), compare_bp); |
141 | return res; |
142 | } |
143 | |
144 | static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b) |
145 | { |
146 | int si = code - SBASE; |
147 | |
148 | if (si < 0 || si >= SCOUNT) |
149 | return 0; |
150 | |
151 | if (si % TCOUNT) { |
152 | /* LV,T */ |
153 | *a = SBASE + (si / TCOUNT) * TCOUNT; |
154 | *b = TBASE + (si % TCOUNT); |
155 | return 3; |
156 | } else { |
157 | /* L,V */ |
158 | *a = LBASE + (si / NCOUNT); |
159 | *b = VBASE + (si % NCOUNT) / TCOUNT; |
160 | return 2; |
161 | } |
162 | } |
163 | |
164 | static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b) |
165 | { |
166 | if (a >= SBASE && a < (SBASE + SCOUNT) && b >= TBASE && b < (TBASE + TCOUNT)) { |
167 | /* LV,T */ |
168 | *code = a + (b - TBASE); |
169 | return 3; |
170 | } else if (a >= LBASE && a < (LBASE + LCOUNT) && b >= VBASE && b < (VBASE + VCOUNT)) { |
171 | /* L,V */ |
172 | int li = a - LBASE; |
173 | int vi = b - VBASE; |
174 | *code = SBASE + li * NCOUNT + vi * TCOUNT; |
175 | return 2; |
176 | } else { |
177 | return 0; |
178 | } |
179 | } |
180 | |
/*
 * Decode one code point from a UTF-16 sequence and advance the cursor.
 *
 * code_ptr: in/out pointer to the current unit; moved forward by one unit
 *           for a single-unit value and by two for a surrogate pair.
 *
 * Returns the decoded code point.  A unit that is not a lead surrogate is
 * returned unchanged.  The unit following a lead surrogate is combined with
 * it without being validated.
 */
static uint32_t decode_utf16(const unsigned short **code_ptr)
{
    const unsigned short *code = *code_ptr;

    /* lead surrogates are 0xD800..0xDBFF; 0xDC00 already is a trail unit */
    if (code[0] < 0xd800 || code[0] > 0xdbff) {
        *code_ptr += 1;
        return (uint32_t)code[0];
    }

    *code_ptr += 2;
    return 0x10000 + ((uint32_t)code[1] - 0xdc00) +
        (((uint32_t)code[0] - 0xd800) << 10);
}
194 | |
/*
 * Return the UNIDATA_VERSION string.
 * NOTE(review): presumably the Unicode version the bundled tables were
 * generated from - confirm against the header that defines the macro.
 */
const char *ucdn_get_unicode_version(void)
{
    return UNIDATA_VERSION;
}
199 | |
200 | int ucdn_get_combining_class(uint32_t code) |
201 | { |
202 | return get_ucd_record(code)->combining; |
203 | } |
204 | |
205 | int ucdn_get_east_asian_width(uint32_t code) |
206 | { |
207 | return get_ucd_record(code)->east_asian_width; |
208 | } |
209 | |
210 | int ucdn_get_general_category(uint32_t code) |
211 | { |
212 | return get_ucd_record(code)->category; |
213 | } |
214 | |
215 | int ucdn_get_bidi_class(uint32_t code) |
216 | { |
217 | return get_ucd_record(code)->bidi_class; |
218 | } |
219 | |
/*
 * Return 1 when ucdn_mirror() maps `code` to a different code point,
 * 0 when it maps `code` to itself.
 */
int ucdn_get_mirrored(uint32_t code)
{
    uint32_t counterpart = ucdn_mirror(code);

    return counterpart != code;
}
224 | |
225 | int ucdn_get_script(uint32_t code) |
226 | { |
227 | return get_ucd_record(code)->script; |
228 | } |
229 | |
230 | int ucdn_get_linebreak_class(uint32_t code) |
231 | { |
232 | return get_ucd_record(code)->linebreak_class; |
233 | } |
234 | |
235 | int ucdn_get_resolved_linebreak_class(uint32_t code) |
236 | { |
237 | const UCDRecord *record = get_ucd_record(code); |
238 | |
239 | switch (record->linebreak_class) |
240 | { |
241 | case UCDN_LINEBREAK_CLASS_AI: |
242 | case UCDN_LINEBREAK_CLASS_SG: |
243 | case UCDN_LINEBREAK_CLASS_XX: |
244 | return UCDN_LINEBREAK_CLASS_AL; |
245 | |
246 | case UCDN_LINEBREAK_CLASS_SA: |
247 | if (record->category == UCDN_GENERAL_CATEGORY_MC || |
248 | record->category == UCDN_GENERAL_CATEGORY_MN) |
249 | return UCDN_LINEBREAK_CLASS_CM; |
250 | return UCDN_LINEBREAK_CLASS_AL; |
251 | |
252 | case UCDN_LINEBREAK_CLASS_CJ: |
253 | return UCDN_LINEBREAK_CLASS_NS; |
254 | |
255 | case UCDN_LINEBREAK_CLASS_CB: |
256 | return UCDN_LINEBREAK_CLASS_B2; |
257 | |
258 | case UCDN_LINEBREAK_CLASS_NL: |
259 | return UCDN_LINEBREAK_CLASS_BK; |
260 | |
261 | default: |
262 | return record->linebreak_class; |
263 | } |
264 | } |
265 | |
266 | uint32_t ucdn_mirror(uint32_t code) |
267 | { |
268 | MirrorPair mp = {0}; |
269 | MirrorPair *res; |
270 | |
271 | mp.from = code; |
272 | res = (MirrorPair *) bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN, |
273 | sizeof(MirrorPair), compare_mp); |
274 | |
275 | if (res == NULL) |
276 | return code; |
277 | else |
278 | return res->to; |
279 | } |
280 | |
281 | uint32_t ucdn_paired_bracket(uint32_t code) |
282 | { |
283 | BracketPair *res = search_bp(code); |
284 | if (res == NULL) |
285 | return code; |
286 | else |
287 | return res->to; |
288 | } |
289 | |
290 | int ucdn_paired_bracket_type(uint32_t code) |
291 | { |
292 | BracketPair *res = search_bp(code); |
293 | if (res == NULL) |
294 | return UCDN_BIDI_PAIRED_BRACKET_TYPE_NONE; |
295 | else |
296 | return res->type; |
297 | } |
298 | |
/*
 * Decompose `code` into at most two code points.
 *
 * code: code point to decompose.
 * a, b: receive the result; *b is set to 0 when the decomposition has a
 *       single element.  Both are left untouched when 0 is returned.
 *
 * Hangul syllables are handled algorithmically first.  Otherwise the
 * decomposition record is consulted: the high byte of its first unit is the
 * length, and the record is only used when the low byte is zero.
 * NOTE(review): the low byte presumably tags the kind of decomposition, with
 * zero meaning a canonical one - confirm against the table generator.
 *
 * Returns 1 on success and 0 when there is nothing to decompose.
 */
int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
{
    const unsigned short *cursor;
    unsigned short header;
    int len;

    if (hangul_pair_decompose(code, a, b) != 0)
        return 1;

    cursor = get_decomp_record(code);
    header = cursor[0];
    len = header >> 8;

    if (len == 0 || (header & 0xff) != 0)
        return 0;

    /* step past the header to the UTF-16 encoded payload */
    cursor++;
    *a = decode_utf16(&cursor);
    *b = (len > 1) ? decode_utf16(&cursor) : 0;

    return 1;
}
322 | |
323 | int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b) |
324 | { |
325 | int l, r, index, indexi, offset; |
326 | |
327 | if (hangul_pair_compose(code, a, b)) |
328 | return 1; |
329 | |
330 | l = get_comp_index(a, nfc_first, sizeof(nfc_first) / sizeof(Reindex)); |
331 | r = get_comp_index(b, nfc_last, sizeof(nfc_last) / sizeof(Reindex)); |
332 | |
333 | if (l < 0 || r < 0) |
334 | return 0; |
335 | |
336 | indexi = l * TOTAL_LAST + r; |
337 | index = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1; |
338 | offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1); |
339 | index = comp_index1[index + offset] << COMP_SHIFT2; |
340 | offset = indexi & ((1<<COMP_SHIFT2) - 1); |
341 | *code = comp_data[index + offset]; |
342 | |
343 | return *code != 0; |
344 | } |
345 | |
/*
 * Write the full stored decomposition of `code` into `decomposed`.
 *
 * code:       code point to decompose.
 * decomposed: output buffer; the caller must supply room for as many
 *             entries as the function returns.  No bound is checked here.
 *
 * The length is the high byte of the record's first unit; unlike
 * ucdn_decompose(), the low byte is not inspected.
 *
 * Returns the number of code points written, 0 when there is no
 * decomposition.
 */
int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
{
    const unsigned short *cursor = get_decomp_record(code);
    int count = cursor[0] >> 8;
    uint32_t *out = decomposed;
    int remaining;

    /* skip the header; with count == 0 the loop below writes nothing */
    cursor++;
    for (remaining = count; remaining > 0; remaining--)
        *out++ = decode_utf16(&cursor);

    return count;
}
361 | |