1 | /* |
2 | * Bidirectional text processing. |
3 | * |
4 | * Processes unicode text by arranging the characters into an order suitable |
5 | * for display. E.g. Hebrew text will be arranged from right-to-left and |
6 | * any English within the text will remain in the left-to-right order. |
7 | * Characters such as parenthesis will be substituted for their mirrored |
8 | * equivalents if they are part of text which must be reversed. |
9 | * |
10 | * This is an implementation of the unicode Bidirectional Algorithm which |
11 | * can be found here: http://www.unicode.org/reports/tr9/ and is based |
12 | * on the reference implementation of the algorithm found on that page. |
13 | * |
14 | * For a nice overview of how it works, read this... |
15 | * http://www.w3.org/TR/REC-html40/struct/dirlang.html |
16 | * |
17 | * Extracted from the SmartOffice code, where it was modified by Ian |
18 | * Beveridge. |
19 | * |
20 | * Copyright (C) Picsel, 2004. All Rights Reserved. |
21 | */ |
22 | |
23 | /* |
24 | * Original copyright notice from unicode reference implementation. |
25 | * ---------------------------------------------------------------- |
26 | * Written by: Asmus Freytag |
27 | * C++ and Windows dependencies removed, and |
28 | * command line interface added by: Rick McGowan |
29 | * |
30 | * Copyright (C) 1999, ASMUS, Inc. All Rights Reserved |
31 | */ |
32 | |
33 | /* |
34 | * Includes... |
35 | */ |
36 | |
37 | #include "mupdf/fitz.h" |
38 | #include "mupdf/ucdn.h" |
39 | #include "bidi-imp.h" /* standard bidi code interface */ |
40 | #include <assert.h> |
41 | |
42 | /* |
43 | * Macros... |
44 | */ |
45 | |
46 | #define ODD(x) ((x) & 1) |
47 | |
48 | #define REPLACEABLE_TYPE(t) ( \ |
49 | ((t)==BDI_ES) || ((t)==BDI_ET) || ((t)==BDI_CS) || \ |
50 | ((t)==BDI_NSM) || ((t)==BDI_PDF) || ((t)==BDI_BN) || \ |
51 | ((t)==BDI_S) || ((t)==BDI_WS) || ((t)==BDI_N) ) |
52 | |
53 | #ifdef DEBUG_BIDI_VERBOSE |
54 | #define DBUGVF(params) do { fz_warn params; } while (0) |
55 | #else |
56 | #define DBUGVF(params) do {} while (0) |
57 | #endif |
58 | |
59 | #ifdef DEBUG_BIDI_OUTLINE |
60 | #define DBUGH(params) do { fz_warn params; } while (0) |
61 | #else |
62 | #define DBUGH(params) do {} while (0) |
63 | #endif |
64 | |
65 | #define UNICODE_EOS 0 |
66 | #define UNICODE_DIGIT_ZERO 0x0030 |
67 | #define UNICODE_DIGIT_NINE 0x0039 |
68 | #define UNICODE_SUPERSCRIPT_TWO 0x00B2 |
69 | #define UNICODE_SUPERSCRIPT_THREE 0x00B3 |
70 | #define UNICODE_SUPERSCRIPT_ONE 0x00B9 |
71 | #define UNICODE_RTL_START 0x0590 |
72 | #define UNICODE_RTL_END 0x07BF |
73 | #define UNICODE_ARABIC_INDIC_DIGIT_ZERO 0x0660 |
74 | #define UNICODE_ARABIC_INDIC_DIGIT_NINE 0x0669 |
75 | #define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_ZERO 0x06F0 |
76 | #define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_NINE 0x06F9 |
77 | #define UNICODE_ZERO_WIDTH_NON_JOINER 0x200C |
78 | #define UNICODE_SUPERSCRIPT_ZERO 0x2070 |
79 | #define UNICODE_SUPERSCRIPT_FOUR 0x2074 |
80 | #define UNICODE_SUPERSCRIPT_NINE 0x2079 |
81 | #define UNICODE_SUBSCRIPT_ZERO 0x2080 |
82 | #define UNICODE_SUBSCRIPT_NINE 0x2089 |
83 | #define UNICODE_CIRCLED_DIGIT_ONE 0x2460 |
84 | #define UNICODE_NUMBER_TWENTY_FULL_STOP 0x249B |
85 | #define UNICODE_CIRCLED_DIGIT_ZERO 0x24EA |
86 | #define UNICODE_FULLWIDTH_DIGIT_ZERO 0xFF10 |
87 | #define UNICODE_FULLWIDTH_DIGIT_NINE 0xFF19 |
88 | |
89 | #ifndef TRUE |
90 | #define TRUE (1) |
91 | #endif |
92 | #ifndef FALSE |
93 | #define FALSE (0) |
94 | #endif |
95 | |
96 | /* |
97 | * Enumerations... |
98 | */ |
99 | |
100 | #ifdef DEBUG_BIDI_VERBOSE |
101 | /* display support: */ |
102 | static const char char_from_types[] = |
103 | { |
104 | ' ', /* ON */ |
105 | '>', /* L */ |
106 | '<', /* R */ |
107 | '9', /* AN */ |
108 | '1', /* EN */ |
109 | 'a', /* AL */ |
110 | '@', /* NSM */ |
111 | '.', /* CS */ |
112 | ',', /* ES */ |
113 | '$', /* ET */ |
114 | ':', /* BN */ |
115 | 'X', /* S */ |
116 | '_', /* WS */ |
117 | 'B', /* B */ |
118 | '+', /* RLO */ |
119 | '+', /* RLE */ |
120 | '+', /* LRO */ |
121 | '+', /* LRE */ |
122 | '-', /* PDF */ |
123 | '=' /* LS */ |
124 | }; |
125 | #endif |
126 | |
127 | /* |
128 | * Functions and static functions... |
129 | */ |
130 | |
131 | /* UCDN uses a different ordering than Bidi does. We cannot |
132 | * change to the UCDN ordering, as the bidi-std.c code relies |
133 | * on the exact ordering (at least that N = ON = 0). We |
134 | * therefore map between the two using this small table. It |
135 | * also takes care of fudging LRI, RLI, FSI and PDI, that this |
136 | * code does not currently support. */ |
137 | static const uint8_t ucdn_to_bidi[] = |
138 | { |
139 | BDI_L, /* UCDN_BIDI_CLASS_L = 0 */ |
140 | BDI_LRE, /* UCDN_BIDI_CLASS_LRE = 1 */ |
141 | BDI_LRO, /* UCDN_BIDI_CLASS_LRO = 2 */ |
142 | BDI_R, /* UCDN_BIDI_CLASS_R = 3 */ |
143 | BDI_AL, /* UCDN_BIDI_CLASS_AL = 4 */ |
144 | BDI_RLE, /* UCDN_BIDI_CLASS_RLE = 5 */ |
145 | BDI_RLO, /* UCDN_BIDI_CLASS_RLO = 6 */ |
146 | BDI_PDF, /* UCDN_BIDI_CLASS_PDF = 7 */ |
147 | BDI_EN, /* UCDN_BIDI_CLASS_EN = 8 */ |
148 | BDI_ES, /* UCDN_BIDI_CLASS_ES = 9 */ |
149 | BDI_ET, /* UCDN_BIDI_CLASS_ET = 10 */ |
150 | BDI_AN, /* UCDN_BIDI_CLASS_AN = 11 */ |
151 | BDI_CS, /* UCDN_BIDI_CLASS_CS = 12 */ |
152 | BDI_NSM, /* UCDN_BIDI_CLASS_NSM = 13 */ |
153 | BDI_BN, /* UCDN_BIDI_CLASS_BN = 14 */ |
154 | BDI_B, /* UCDN_BIDI_CLASS_B = 15 */ |
155 | BDI_S, /* UCDN_BIDI_CLASS_S = 16 */ |
156 | BDI_WS, /* UCDN_BIDI_CLASS_WS = 17 */ |
157 | BDI_ON, /* UCDN_BIDI_CLASS_ON = 18 */ |
158 | BDI_LRE, /* UCDN_BIDI_CLASS_LRI = 19 */ |
159 | BDI_RLE, /* UCDN_BIDI_CLASS_RLI = 20 */ |
160 | BDI_N, /* UCDN_BIDI_CLASS_FSI = 21 */ |
161 | BDI_N, /* UCDN_BIDI_CLASS_PDI = 22 */ |
162 | }; |
163 | |
164 | #define class_from_ch_ws(ch) (ucdn_to_bidi[ucdn_get_bidi_class(ch)]) |
165 | |
166 | /* Return a direction for white-space on the second pass of the algorithm. */ |
167 | static fz_bidi_chartype class_from_ch_n(uint32_t ch) |
168 | { |
169 | fz_bidi_chartype from_ch_ws = class_from_ch_ws(ch); |
170 | if (from_ch_ws == BDI_S || from_ch_ws == BDI_WS) |
171 | return BDI_N; |
172 | return from_ch_ws; |
173 | } |
174 | |
175 | /* Split fragments into single scripts (or punctuation + single script) */ |
176 | static void |
177 | split_at_script(const uint32_t *fragment, |
178 | size_t fragment_len, |
179 | int level, |
180 | void *arg, |
181 | fz_bidi_fragment_fn *callback) |
182 | { |
183 | int script = UCDN_SCRIPT_COMMON; |
184 | size_t script_start, i; |
185 | |
186 | script_start = 0; |
187 | for (i = 0; i < fragment_len; i++) |
188 | { |
189 | int s = ucdn_get_script(fragment[i]); |
190 | if (s == UCDN_SCRIPT_COMMON || s == UCDN_SCRIPT_INHERITED) |
191 | { |
192 | /* Punctuation etc. This is fine. */ |
193 | } |
194 | else if (s == script) |
195 | { |
196 | /* Same script. Still fine. */ |
197 | } |
198 | else if (script == UCDN_SCRIPT_COMMON || script == UCDN_SCRIPT_INHERITED) |
199 | { |
200 | /* First non punctuation thing. Set the script. */ |
201 | script = s; |
202 | } |
203 | else |
204 | { |
205 | /* Change of script. Break the fragment. */ |
206 | (*callback)(&fragment[script_start], i - script_start, level, script, arg); |
207 | script_start = i; |
208 | script = s; |
209 | } |
210 | } |
211 | if (script_start != fragment_len) |
212 | { |
213 | (*callback)(&fragment[script_start], fragment_len - script_start, level, script, arg); |
214 | } |
215 | } |
216 | |
217 | /* Determines the character classes for all following |
218 | * passes of the algorithm. A character class is basically the type of Bidi |
219 | * behaviour that the character exhibits. |
220 | */ |
221 | static void |
222 | classify_characters(const uint32_t *text, |
223 | fz_bidi_chartype *types, |
224 | size_t len, |
225 | fz_bidi_flags flags) |
226 | { |
227 | size_t i; |
228 | |
229 | if ((flags & FZ_BIDI_CLASSIFY_WHITE_SPACE)!=0) |
230 | { |
231 | for (i = 0; i < len; i++) |
232 | { |
233 | types[i] = class_from_ch_ws(text[i]); |
234 | } |
235 | } |
236 | else |
237 | { |
238 | #ifdef DEBUG_BIDI_VERBOSE |
239 | fprintf(stderr, "Text: " ); |
240 | for (i = 0; i < len; i++) |
241 | { |
242 | /* So that we can actually sort of read the debug string, any |
243 | * non-ascii characters are replaced with a 1-digit hash |
244 | * value from 0-9, making non-english characters appear |
245 | * as numbers |
246 | */ |
247 | fprintf(stderr, "%c" , (text[i] <= 127 && text[i] >= 32) ? |
248 | text[i] : text[i] % 9 + '0'); |
249 | } |
250 | fprintf(stderr, "\nTypes: " ); |
251 | #endif |
252 | for (i = 0; i < len; i++) |
253 | { |
254 | types[i] = class_from_ch_n(text[i]); |
255 | #ifdef DEBUG_BIDI_VERBOSE |
256 | fprintf(stderr, "%c" , char_from_types[(int)types[i]]); |
257 | #endif |
258 | } |
259 | #ifdef DEBUG_BIDI_VERBOSE |
260 | fprintf(stderr, "\n" ); |
261 | #endif |
262 | } |
263 | } |
264 | |
265 | /* Determines the base level of the text. |
266 | * Implements rule P2 of the Unicode Bidi Algorithm. |
267 | * Note: Ignores explicit embeddings |
268 | */ |
269 | static fz_bidi_level base_level_from_text(fz_bidi_chartype *types, size_t len) |
270 | { |
271 | size_t i; |
272 | |
273 | for (i = 0; i < len; i++) |
274 | { |
275 | switch (types[i]) |
276 | { |
277 | /* strong left */ |
278 | case BDI_L: |
279 | return FZ_BIDI_LTR; |
280 | |
281 | /* strong right */ |
282 | case BDI_R: |
283 | case BDI_AL: |
284 | return FZ_BIDI_RTL; |
285 | } |
286 | } |
287 | return FZ_BIDI_LTR; |
288 | } |
289 | |
290 | static fz_bidi_direction direction_from_type(fz_bidi_chartype type) |
291 | { |
292 | switch (type) |
293 | { |
294 | case BDI_L: |
295 | case BDI_EN: |
296 | return FZ_BIDI_LTR; |
297 | |
298 | case BDI_R: |
299 | case BDI_AL: |
300 | return FZ_BIDI_RTL; |
301 | |
302 | default: |
303 | return FZ_BIDI_NEUTRAL; |
304 | } |
305 | } |
306 | |
307 | static void |
308 | classify_quoted_blocks(const uint32_t *text, |
309 | fz_bidi_chartype *types, |
310 | size_t len) |
311 | { |
312 | size_t i; |
313 | int inQuote = FALSE; |
314 | int pdfNeeded = FALSE; |
315 | int ltrFound = FALSE; |
316 | int rtlFound = FALSE; |
317 | |
318 | /* Only do anything special here if there is mixed content |
319 | * (LTR *and* RTL) in the text. |
320 | */ |
321 | for (i = 0; i < len; i++) |
322 | { |
323 | switch (direction_from_type(types[i])) |
324 | { |
325 | case FZ_BIDI_LTR: |
326 | ltrFound = TRUE; |
327 | break; |
328 | |
329 | case FZ_BIDI_RTL: |
330 | rtlFound = TRUE; |
331 | break; |
332 | |
333 | default: |
334 | break; |
335 | } |
336 | } |
337 | |
338 | /* Only make any changes if *both* LTR and RTL characters exist |
339 | * in this text. |
340 | */ |
341 | if (!ltrFound || !rtlFound) |
342 | { |
343 | return; |
344 | } |
345 | |
346 | for (i = 0; i < len; i++) |
347 | { |
348 | if (text[i]=='"') |
349 | { |
350 | /* If we're already in a quote then terminate it, |
351 | * else start a new block. |
352 | */ |
353 | if (inQuote) |
354 | { |
355 | inQuote = FALSE; |
356 | if (pdfNeeded) |
357 | { |
358 | pdfNeeded = FALSE; |
359 | types[i] = BDI_PDF; |
360 | } |
361 | } |
362 | else |
363 | { |
364 | size_t j; |
365 | int done = FALSE; |
366 | |
367 | inQuote = TRUE; |
368 | |
369 | /* Find the first strong right or left type and |
370 | * use that to determine whether we should classify |
371 | * the quote as LRE or RLE. Or neither, if we |
372 | * hit another quote before any strongly-directional |
373 | * character. |
374 | */ |
375 | for (j = i + 1; !done && (j < len) && text[j] != '"'; ++j) |
376 | { |
377 | switch(types[j]) |
378 | { |
379 | case BDI_RLE: |
380 | case BDI_LRE: |
381 | done = TRUE; |
382 | break; |
383 | |
384 | case BDI_L: |
385 | case BDI_EN: |
386 | types[i] = BDI_LRE; |
387 | pdfNeeded = TRUE; |
388 | done = TRUE; |
389 | break; |
390 | |
391 | case BDI_R: |
392 | case BDI_AL: |
393 | types[i] = BDI_RLE; |
394 | pdfNeeded = TRUE; |
395 | done = TRUE; |
396 | break; |
397 | |
398 | default: |
399 | break; |
400 | } |
401 | } |
402 | } |
403 | } |
404 | } |
405 | } |
406 | |
407 | /* Creates a buffer with an embedding level for every character in the |
408 | * given text. Also determines the base level and returns it in |
409 | * *baseDir if *baseDir does not initially contain a valid direction. |
410 | */ |
411 | static fz_bidi_level * |
412 | create_levels(fz_context *ctx, |
413 | const uint32_t *text, |
414 | size_t len, |
415 | fz_bidi_direction *baseDir, |
416 | int resolveWhiteSpace, |
417 | int flags) |
418 | { |
419 | fz_bidi_level *levels, *plevels; |
420 | fz_bidi_chartype *types = NULL; |
421 | fz_bidi_chartype *ptypes; |
422 | fz_bidi_level baseLevel; |
423 | const uint32_t *ptext; |
424 | size_t plen, remaining; |
425 | |
426 | levels = fz_malloc(ctx, len * sizeof(*levels)); |
427 | |
428 | fz_var(types); |
429 | |
430 | fz_try(ctx) |
431 | { |
432 | types = fz_malloc(ctx, len * sizeof(fz_bidi_chartype)); |
433 | |
434 | classify_characters(text, types, len, flags); |
435 | |
436 | if (*baseDir != FZ_BIDI_LTR && *baseDir != FZ_BIDI_RTL) |
437 | { |
438 | /* Derive the base level from the text and |
439 | * update *baseDir in case the caller wants to know. |
440 | */ |
441 | baseLevel = base_level_from_text(types, len); |
442 | *baseDir = ODD(baseLevel)==1 ? FZ_BIDI_RTL : FZ_BIDI_LTR; |
443 | } |
444 | else |
445 | { |
446 | baseLevel = (fz_bidi_level)*baseDir; |
447 | } |
448 | |
449 | { |
450 | /* Replace tab with base direction, i.e. make tab appear as |
451 | * 'strong left' if the base direction is left-to-right and |
452 | * 'strong right' if base direction is right-to-left. This |
453 | * allows Layout to implicitly treat tabs as 'segment separators'. |
454 | */ |
455 | size_t i; |
456 | |
457 | for (i = 0u; i < len; i++) |
458 | { |
459 | if (text[i]=='\t') |
460 | { |
461 | types[i] = (*baseDir == FZ_BIDI_RTL) ? BDI_R : BDI_L; |
462 | } |
463 | } |
464 | } |
465 | |
466 | /* Look for quotation marks. Classify them as RLE or LRE |
467 | * or leave them alone, depending on what follows them. |
468 | */ |
469 | classify_quoted_blocks(text, types, len); |
470 | |
471 | /* Work one paragraph at a time. */ |
472 | plevels = levels; |
473 | ptypes = types; |
474 | ptext = text; |
475 | remaining = len; |
476 | while (remaining) |
477 | { |
478 | plen = fz_bidi_resolve_paragraphs(ptypes, remaining); |
479 | |
480 | /* Work out the levels and character types... */ |
481 | (void)fz_bidi_resolve_explicit(baseLevel, BDI_N, ptypes, plevels, plen, 0); |
482 | fz_bidi_resolve_weak(ctx, baseLevel, ptypes, plevels, plen); |
483 | fz_bidi_resolve_neutrals(baseLevel, ptypes, plevels, plen); |
484 | fz_bidi_resolve_implicit(ptypes, plevels, plen); |
485 | |
486 | classify_characters(ptext, ptypes, plen, FZ_BIDI_CLASSIFY_WHITE_SPACE); |
487 | |
488 | if (resolveWhiteSpace) |
489 | { |
490 | /* resolve whitespace */ |
491 | fz_bidi_resolve_whitespace(baseLevel, ptypes, plevels, plen); |
492 | } |
493 | |
494 | plevels += plen; |
495 | ptypes += plen; |
496 | ptext += plen; |
497 | remaining -= plen; |
498 | } |
499 | |
500 | /* The levels buffer now has odd and even numbers indicating |
501 | * rtl or ltr characters, respectively. |
502 | */ |
503 | #ifdef DEBUG_BIDI_VERBOSE |
504 | fprintf(stderr, "Levels: " ); |
505 | { |
506 | size_t i; |
507 | for (i = 0; i < len; i++) |
508 | { |
509 | fprintf(stderr, "%d" , levels[i]>9?0:levels[i]); |
510 | } |
511 | fprintf(stderr, "\n" ); |
512 | } |
513 | #endif |
514 | } |
515 | fz_always(ctx) |
516 | { |
517 | fz_free(ctx, types); |
518 | } |
519 | fz_catch(ctx) |
520 | { |
521 | fz_free(ctx, levels); |
522 | fz_rethrow(ctx); |
523 | } |
524 | return levels; |
525 | } |
526 | |
527 | /* Partitions the given character sequence into one or more unidirectional |
528 | * fragments and invokes the given callback function for each fragment. |
529 | */ |
530 | void fz_bidi_fragment_text(fz_context *ctx, |
531 | const uint32_t *text, |
532 | size_t textlen, |
533 | fz_bidi_direction *baseDir, |
534 | fz_bidi_fragment_fn *callback, |
535 | void *arg, |
536 | int flags) |
537 | { |
538 | size_t startOfFragment; |
539 | size_t i; |
540 | fz_bidi_level *levels; |
541 | |
542 | if (text == NULL || callback == NULL || textlen == 0) |
543 | return; |
544 | |
545 | DBUGH(("fz_bidi_fragment_text('%S', len = %d)\n" , text, textlen)); |
546 | |
547 | levels = create_levels(ctx, text, textlen, baseDir, FALSE, flags); |
548 | |
549 | /* We now have an array with an embedding level |
550 | * for each character in text. |
551 | */ |
552 | assert(levels != NULL); |
553 | |
554 | fz_try(ctx) |
555 | { |
556 | startOfFragment = 0; |
557 | for (i = 1; i < textlen; i++) |
558 | { |
559 | if (levels[i] != levels[i-1]) |
560 | { |
561 | /* We've gone past the end of the fragment. |
562 | * Create a text object for it, then start |
563 | * a new fragment. |
564 | */ |
565 | split_at_script(&text[startOfFragment], |
566 | i - startOfFragment, |
567 | levels[startOfFragment], |
568 | arg, |
569 | callback); |
570 | startOfFragment = i; |
571 | } |
572 | } |
573 | /* Now i == textlen. Deal with the final (or maybe only) fragment. */ |
574 | /* otherwise create 1 fragment */ |
575 | split_at_script(&text[startOfFragment], |
576 | i - startOfFragment, |
577 | levels[startOfFragment], |
578 | arg, |
579 | callback); |
580 | } |
581 | fz_always(ctx) |
582 | { |
583 | fz_free(ctx, levels); |
584 | } |
585 | fz_catch(ctx) |
586 | { |
587 | fz_rethrow(ctx); |
588 | } |
589 | } |
590 | |