1 | /* |
2 | * Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net> |
3 | * |
4 | * Permission to use, copy, modify, and/or distribute this software for any |
5 | * purpose with or without fee is hereby granted, provided that the above |
6 | * copyright notice and this permission notice appear in all copies. |
7 | * |
8 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
9 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
10 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
11 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
12 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
13 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
14 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
15 | */ |
16 | |
17 | #include "mupdf/fitz.h" |
18 | #include "mupdf/ucdn.h" |
19 | |
20 | #include <stdio.h> |
21 | #include <stdlib.h> |
22 | |
/*
 * Per-code-point property record. get_ucd_record() returns a pointer to one
 * of these and the ucdn_get_* accessors each read a single field.
 * NOTE(review): the generated ucd_records table presumably initialises these
 * positionally, so field order and types must not change - confirm against
 * ucdn_db.h.
 */
typedef struct {
	unsigned char category;		/* read by ucdn_get_general_category() */
	unsigned char combining;	/* read by ucdn_get_combining_class() */
	unsigned char bidi_class;	/* read by ucdn_get_bidi_class() */
	unsigned char east_asian_width;	/* read by ucdn_get_east_asian_width() */
	unsigned char script;		/* read by ucdn_get_script() */
	unsigned char linebreak_class;	/* read by ucdn_get_linebreak_class() */
} UCDRecord;
31 | |
/*
 * One entry of the mirror_pairs table searched by ucdn_mirror(): `from` is
 * the lookup key and `to` is the value returned for it. Both are stored as
 * unsigned short.
 */
typedef struct {
	unsigned short from;	/* key compared by compare_mp() */
	unsigned short to;	/* code returned by ucdn_mirror() on a match */
} MirrorPair;
35 | |
/*
 * One entry of the bracket_pairs table searched by search_bp(): `from` is
 * the lookup key, `to` is returned by ucdn_paired_bracket() and `type` is
 * returned by ucdn_paired_bracket_type().
 */
typedef struct {
	unsigned short from;	/* key compared by compare_bp() */
	unsigned short to;	/* paired code for `from` */
	unsigned char type;	/* bracket type value for `from` */
} BracketPair;
40 | |
/*
 * A run of consecutive code points mapped onto consecutive indices.
 * compare_reindex() treats a code as inside the run when
 * start <= code <= start + count, and get_comp_index() maps such a code to
 * index + (code - start).
 */
typedef struct {
	unsigned int start;	/* first code point of the run */
	short count;		/* extra code points after `start` in the run */
	short index;		/* index assigned to `start` */
} Reindex;
45 | |
46 | #include "ucdn_db.h" |
47 | |
/*
 * Constants required for Hangul (de)composition.
 *
 * Syllables occupy [SBASE, SBASE + SCOUNT). A syllable index splits into an
 * L part (index / NCOUNT), a V part ((index % NCOUNT) / TCOUNT) and a T part
 * (index % TCOUNT); a T part of 0 means the syllable has no trailing element.
 */
#define LCOUNT 19			/* number of L values */
#define VCOUNT 21			/* number of V values */
#define TCOUNT 28			/* number of T values, including "none" */
#define NCOUNT (VCOUNT * TCOUNT)	/* syllables per L value */
#define SCOUNT (LCOUNT * NCOUNT)	/* total syllables: 11172 */
#define SBASE 0xAC00			/* first syllable */
#define LBASE 0x1100			/* first L value */
#define VBASE 0x1161			/* first V value */
#define TBASE 0x11A7			/* one below the first real T value */
58 | |
59 | static const UCDRecord *get_ucd_record(uint32_t code) |
60 | { |
61 | int index, offset; |
62 | |
63 | if (code >= 0x110000) |
64 | index = 0; |
65 | else { |
66 | index = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1; |
67 | offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1); |
68 | index = index1[index + offset] << SHIFT2; |
69 | offset = code & ((1<<SHIFT2) - 1); |
70 | index = index2[index + offset]; |
71 | } |
72 | |
73 | return &ucd_records[index]; |
74 | } |
75 | |
76 | static const unsigned short *get_decomp_record(uint32_t code) |
77 | { |
78 | int index, offset; |
79 | |
80 | if (code >= 0x110000) |
81 | index = 0; |
82 | else { |
83 | index = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)] |
84 | << DECOMP_SHIFT1; |
85 | offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1); |
86 | index = decomp_index1[index + offset] << DECOMP_SHIFT2; |
87 | offset = code & ((1<<DECOMP_SHIFT2) - 1); |
88 | index = decomp_index2[index + offset]; |
89 | } |
90 | |
91 | return &decomp_data[index]; |
92 | } |
93 | |
94 | static int compare_reindex(const void *a, const void *b) |
95 | { |
96 | Reindex *ra = (Reindex *)a; |
97 | Reindex *rb = (Reindex *)b; |
98 | |
99 | if (ra->start < rb->start) |
100 | return -1; |
101 | else if (ra->start > (rb->start + rb->count)) |
102 | return 1; |
103 | else |
104 | return 0; |
105 | } |
106 | |
107 | static int get_comp_index(uint32_t code, const Reindex *idx, size_t len) |
108 | { |
109 | Reindex *res; |
110 | Reindex r = {0, 0, 0}; |
111 | r.start = code; |
112 | res = (Reindex *) bsearch(&r, idx, len, sizeof(Reindex), compare_reindex); |
113 | |
114 | if (res != NULL) |
115 | return res->index + (code - res->start); |
116 | else |
117 | return -1; |
118 | } |
119 | |
120 | static int compare_mp(const void *a, const void *b) |
121 | { |
122 | MirrorPair *mpa = (MirrorPair *)a; |
123 | MirrorPair *mpb = (MirrorPair *)b; |
124 | return mpa->from - mpb->from; |
125 | } |
126 | |
127 | static int compare_bp(const void *a, const void *b) |
128 | { |
129 | BracketPair *bpa = (BracketPair *)a; |
130 | BracketPair *bpb = (BracketPair *)b; |
131 | return bpa->from - bpb->from; |
132 | } |
133 | |
134 | static BracketPair *search_bp(uint32_t code) |
135 | { |
136 | BracketPair bp = {0,0,2}; |
137 | BracketPair *res; |
138 | |
139 | bp.from = code; |
140 | res = (BracketPair *) bsearch(&bp, bracket_pairs, BIDI_BRACKET_LEN, |
141 | sizeof(BracketPair), compare_bp); |
142 | return res; |
143 | } |
144 | |
145 | static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b) |
146 | { |
147 | int si = code - SBASE; |
148 | |
149 | if (si < 0 || si >= SCOUNT) |
150 | return 0; |
151 | |
152 | if (si % TCOUNT) { |
153 | /* LV,T */ |
154 | *a = SBASE + (si / TCOUNT) * TCOUNT; |
155 | *b = TBASE + (si % TCOUNT); |
156 | return 3; |
157 | } else { |
158 | /* L,V */ |
159 | *a = LBASE + (si / NCOUNT); |
160 | *b = VBASE + (si % NCOUNT) / TCOUNT; |
161 | return 2; |
162 | } |
163 | } |
164 | |
165 | static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b) |
166 | { |
167 | if (a >= SBASE && a < (SBASE + SCOUNT) && b >= TBASE && b < (TBASE + TCOUNT)) { |
168 | /* LV,T */ |
169 | *code = a + (b - TBASE); |
170 | return 3; |
171 | } else if (a >= LBASE && a < (LBASE + LCOUNT) && b >= VBASE && b < (VBASE + VCOUNT)) { |
172 | /* L,V */ |
173 | int li = a - LBASE; |
174 | int vi = b - VBASE; |
175 | *code = SBASE + li * NCOUNT + vi * TCOUNT; |
176 | return 2; |
177 | } else { |
178 | return 0; |
179 | } |
180 | } |
181 | |
/*
 * Decode one code point from a UTF-16 sequence and advance the cursor.
 *
 * code_ptr: in/out pointer to the current 16-bit unit; moved forward by one
 *           unit for a single-unit code point and by two for a surrogate pair
 *
 * Returns the decoded code point. A unit in the lead-surrogate range
 * 0xD800..0xDBFF is combined with the following unit; every other unit,
 * including any low surrogate 0xDC00..0xDFFF met on its own, is returned
 * unchanged. The second unit of a pair is not validated.
 */
static uint32_t decode_utf16(const unsigned short **code_ptr)
{
	const unsigned short *code = *code_ptr;

	/* 0xDC00 is a low surrogate, not a lead: the lead range ends at 0xDBFF */
	if (code[0] < 0xd800 || code[0] > 0xdbff) {
		*code_ptr += 1;
		return (uint32_t)code[0];
	} else {
		*code_ptr += 2;
		return 0x10000 + ((uint32_t)code[1] - 0xdc00) +
			(((uint32_t)code[0] - 0xd800) << 10);
	}
}
195 | |
196 | const char *ucdn_get_unicode_version(void) |
197 | { |
198 | return UNIDATA_VERSION; |
199 | } |
200 | |
201 | int ucdn_get_combining_class(uint32_t code) |
202 | { |
203 | return get_ucd_record(code)->combining; |
204 | } |
205 | |
206 | int ucdn_get_east_asian_width(uint32_t code) |
207 | { |
208 | return get_ucd_record(code)->east_asian_width; |
209 | } |
210 | |
211 | int ucdn_get_general_category(uint32_t code) |
212 | { |
213 | return get_ucd_record(code)->category; |
214 | } |
215 | |
216 | int ucdn_get_bidi_class(uint32_t code) |
217 | { |
218 | return get_ucd_record(code)->bidi_class; |
219 | } |
220 | |
/*
 * Report whether `code` has a mirror: returns 1 when ucdn_mirror() maps it
 * to a different code point, 0 when ucdn_mirror() returns it unchanged.
 */
int ucdn_get_mirrored(uint32_t code)
{
	uint32_t mirrored = ucdn_mirror(code);

	return mirrored != code;
}
225 | |
226 | int ucdn_get_script(uint32_t code) |
227 | { |
228 | return get_ucd_record(code)->script; |
229 | } |
230 | |
231 | int ucdn_get_linebreak_class(uint32_t code) |
232 | { |
233 | return get_ucd_record(code)->linebreak_class; |
234 | } |
235 | |
236 | int ucdn_get_resolved_linebreak_class(uint32_t code) |
237 | { |
238 | const UCDRecord *record = get_ucd_record(code); |
239 | |
240 | switch (record->linebreak_class) |
241 | { |
242 | case UCDN_LINEBREAK_CLASS_AI: |
243 | case UCDN_LINEBREAK_CLASS_SG: |
244 | case UCDN_LINEBREAK_CLASS_XX: |
245 | return UCDN_LINEBREAK_CLASS_AL; |
246 | |
247 | case UCDN_LINEBREAK_CLASS_SA: |
248 | if (record->category == UCDN_GENERAL_CATEGORY_MC || |
249 | record->category == UCDN_GENERAL_CATEGORY_MN) |
250 | return UCDN_LINEBREAK_CLASS_CM; |
251 | return UCDN_LINEBREAK_CLASS_AL; |
252 | |
253 | case UCDN_LINEBREAK_CLASS_CJ: |
254 | return UCDN_LINEBREAK_CLASS_NS; |
255 | |
256 | case UCDN_LINEBREAK_CLASS_CB: |
257 | return UCDN_LINEBREAK_CLASS_B2; |
258 | |
259 | case UCDN_LINEBREAK_CLASS_NL: |
260 | return UCDN_LINEBREAK_CLASS_BK; |
261 | |
262 | default: |
263 | return record->linebreak_class; |
264 | } |
265 | } |
266 | |
267 | uint32_t ucdn_mirror(uint32_t code) |
268 | { |
269 | MirrorPair mp = {0}; |
270 | MirrorPair *res; |
271 | |
272 | mp.from = code; |
273 | res = (MirrorPair *) bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN, |
274 | sizeof(MirrorPair), compare_mp); |
275 | |
276 | if (res == NULL) |
277 | return code; |
278 | else |
279 | return res->to; |
280 | } |
281 | |
282 | uint32_t ucdn_paired_bracket(uint32_t code) |
283 | { |
284 | BracketPair *res = search_bp(code); |
285 | if (res == NULL) |
286 | return code; |
287 | else |
288 | return res->to; |
289 | } |
290 | |
291 | int ucdn_paired_bracket_type(uint32_t code) |
292 | { |
293 | BracketPair *res = search_bp(code); |
294 | if (res == NULL) |
295 | return UCDN_BIDI_PAIRED_BRACKET_TYPE_NONE; |
296 | else |
297 | return res->type; |
298 | } |
299 | |
/*
 * Decompose `code` into a pair of code points.
 *
 * code: code point to decompose
 * a, b: receive the result; untouched when 0 is returned. When the stored
 *       decomposition has a single entry, *b is set to 0.
 *
 * Returns 1 on success and 0 when there is nothing to report. Hangul
 * syllables are handled arithmetically first; everything else comes from the
 * decomposition record, whose first unit holds the entry count in its high
 * byte. A record is skipped when that count is 0 or when the low byte of the
 * first unit is non-zero.
 * NOTE(review): the low byte presumably tags compatibility-only mappings
 * (ucdn_compat_decompose() ignores it) - confirm against the table generator.
 */
int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
{
	const unsigned short *rec;
	int count;

	if (hangul_pair_decompose(code, a, b) != 0)
		return 1;

	rec = get_decomp_record(code);
	count = rec[0] >> 8;

	if (count == 0 || (rec[0] & 0xff) != 0)
		return 0;

	/* step over the header unit to the UTF-16 payload */
	rec++;
	*a = decode_utf16(&rec);
	*b = (count > 1) ? decode_utf16(&rec) : 0;

	return 1;
}
323 | |
324 | int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b) |
325 | { |
326 | int l, r, index, indexi, offset; |
327 | |
328 | if (hangul_pair_compose(code, a, b)) |
329 | return 1; |
330 | |
331 | l = get_comp_index(a, nfc_first, sizeof(nfc_first) / sizeof(Reindex)); |
332 | r = get_comp_index(b, nfc_last, sizeof(nfc_last) / sizeof(Reindex)); |
333 | |
334 | if (l < 0 || r < 0) |
335 | return 0; |
336 | |
337 | indexi = l * TOTAL_LAST + r; |
338 | index = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1; |
339 | offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1); |
340 | index = comp_index1[index + offset] << COMP_SHIFT2; |
341 | offset = indexi & ((1<<COMP_SHIFT2) - 1); |
342 | *code = comp_data[index + offset]; |
343 | |
344 | return *code != 0; |
345 | } |
346 | |
/*
 * Write the stored decomposition of `code` into `decomposed`.
 *
 * code:       code point to decompose
 * decomposed: output array; the caller must provide room for every entry,
 *             as no bound is checked here. The count comes from the high
 *             byte of the record's first unit.
 *
 * Returns the number of code points written, or 0 (writing nothing) when the
 * record is empty. Unlike ucdn_decompose(), the low byte of the first unit
 * is not examined and Hangul syllables get no special handling.
 */
int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
{
	const unsigned short *rec = get_decomp_record(code);
	int count = rec[0] >> 8;
	int written = 0;

	if (count == 0)
		return 0;

	/* step over the header unit to the UTF-16 payload */
	rec++;
	while (written < count) {
		decomposed[written] = decode_utf16(&rec);
		written++;
	}

	return count;
}
362 | |