1/*
2 * Bidirectional text processing.
3 *
4 * Processes unicode text by arranging the characters into an order suitable
5 * for display. E.g. Hebrew text will be arranged from right-to-left and
6 * any English within the text will remain in the left-to-right order.
7 * Characters such as parenthesis will be substituted for their mirrored
8 * equivalents if they are part of text which must be reversed.
9 *
10 * This is an implementation of the unicode Bidirectional Algorithm which
11 * can be found here: http://www.unicode.org/reports/tr9/ and is based
12 * on the reference implementation of the algorithm found on that page.
13 *
14 * For a nice overview of how it works, read this...
15 * http://www.w3.org/TR/REC-html40/struct/dirlang.html
16 *
17 * Extracted from the SmartOffice code, where it was modified by Ian
18 * Beveridge.
19 *
20 * Copyright (C) Picsel, 2004. All Rights Reserved.
21 */
22
23/*
24 * Original copyright notice from unicode reference implementation.
25 * ----------------------------------------------------------------
26 * Written by: Asmus Freytag
27 * C++ and Windows dependencies removed, and
28 * command line interface added by: Rick McGowan
29 *
30 * Copyright (C) 1999, ASMUS, Inc. All Rights Reserved
31 */
32
33/*
34 * Includes...
35 */
36
37#include "mupdf/fitz.h"
38#include "mupdf/ucdn.h"
39#include "bidi-imp.h" /* standard bidi code interface */
40#include <assert.h>
41
42/*
43 * Macros...
44 */
45
46#define ODD(x) ((x) & 1)
47
48#define REPLACEABLE_TYPE(t) ( \
49 ((t)==BDI_ES) || ((t)==BDI_ET) || ((t)==BDI_CS) || \
50 ((t)==BDI_NSM) || ((t)==BDI_PDF) || ((t)==BDI_BN) || \
51 ((t)==BDI_S) || ((t)==BDI_WS) || ((t)==BDI_N) )
52
53#ifdef DEBUG_BIDI_VERBOSE
54#define DBUGVF(params) do { fz_warn params; } while (0)
55#else
56#define DBUGVF(params) do {} while (0)
57#endif
58
59#ifdef DEBUG_BIDI_OUTLINE
60#define DBUGH(params) do { fz_warn params; } while (0)
61#else
62#define DBUGH(params) do {} while (0)
63#endif
64
65#define UNICODE_EOS 0
66#define UNICODE_DIGIT_ZERO 0x0030
67#define UNICODE_DIGIT_NINE 0x0039
68#define UNICODE_SUPERSCRIPT_TWO 0x00B2
69#define UNICODE_SUPERSCRIPT_THREE 0x00B3
70#define UNICODE_SUPERSCRIPT_ONE 0x00B9
71#define UNICODE_RTL_START 0x0590
72#define UNICODE_RTL_END 0x07BF
73#define UNICODE_ARABIC_INDIC_DIGIT_ZERO 0x0660
74#define UNICODE_ARABIC_INDIC_DIGIT_NINE 0x0669
75#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_ZERO 0x06F0
76#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_NINE 0x06F9
77#define UNICODE_ZERO_WIDTH_NON_JOINER 0x200C
78#define UNICODE_SUPERSCRIPT_ZERO 0x2070
79#define UNICODE_SUPERSCRIPT_FOUR 0x2074
80#define UNICODE_SUPERSCRIPT_NINE 0x2079
81#define UNICODE_SUBSCRIPT_ZERO 0x2080
82#define UNICODE_SUBSCRIPT_NINE 0x2089
83#define UNICODE_CIRCLED_DIGIT_ONE 0x2460
84#define UNICODE_NUMBER_TWENTY_FULL_STOP 0x249B
85#define UNICODE_CIRCLED_DIGIT_ZERO 0x24EA
86#define UNICODE_FULLWIDTH_DIGIT_ZERO 0xFF10
87#define UNICODE_FULLWIDTH_DIGIT_NINE 0xFF19
88
89#ifndef TRUE
90#define TRUE (1)
91#endif
92#ifndef FALSE
93#define FALSE (0)
94#endif
95
96/*
97 * Enumerations...
98 */
99
100#ifdef DEBUG_BIDI_VERBOSE
101/* display support: */
102static const char char_from_types[] =
103{
104 ' ', /* ON */
105 '>', /* L */
106 '<', /* R */
107 '9', /* AN */
108 '1', /* EN */
109 'a', /* AL */
110 '@', /* NSM */
111 '.', /* CS */
112 ',', /* ES */
113 '$', /* ET */
114 ':', /* BN */
115 'X', /* S */
116 '_', /* WS */
117 'B', /* B */
118 '+', /* RLO */
119 '+', /* RLE */
120 '+', /* LRO */
121 '+', /* LRE */
122 '-', /* PDF */
123 '=' /* LS */
124};
125#endif
126
127/*
128 * Functions and static functions...
129 */
130
131/* UCDN uses a different ordering than Bidi does. We cannot
132 * change to the UCDN ordering, as the bidi-std.c code relies
133 * on the exact ordering (at least that N = ON = 0). We
134 * therefore map between the two using this small table. It
135 * also takes care of fudging LRI, RLI, FSI and PDI, that this
136 * code does not currently support. */
137static const uint8_t ucdn_to_bidi[] =
138{
139 BDI_L, /* UCDN_BIDI_CLASS_L = 0 */
140 BDI_LRE, /* UCDN_BIDI_CLASS_LRE = 1 */
141 BDI_LRO, /* UCDN_BIDI_CLASS_LRO = 2 */
142 BDI_R, /* UCDN_BIDI_CLASS_R = 3 */
143 BDI_AL, /* UCDN_BIDI_CLASS_AL = 4 */
144 BDI_RLE, /* UCDN_BIDI_CLASS_RLE = 5 */
145 BDI_RLO, /* UCDN_BIDI_CLASS_RLO = 6 */
146 BDI_PDF, /* UCDN_BIDI_CLASS_PDF = 7 */
147 BDI_EN, /* UCDN_BIDI_CLASS_EN = 8 */
148 BDI_ES, /* UCDN_BIDI_CLASS_ES = 9 */
149 BDI_ET, /* UCDN_BIDI_CLASS_ET = 10 */
150 BDI_AN, /* UCDN_BIDI_CLASS_AN = 11 */
151 BDI_CS, /* UCDN_BIDI_CLASS_CS = 12 */
152 BDI_NSM, /* UCDN_BIDI_CLASS_NSM = 13 */
153 BDI_BN, /* UCDN_BIDI_CLASS_BN = 14 */
154 BDI_B, /* UCDN_BIDI_CLASS_B = 15 */
155 BDI_S, /* UCDN_BIDI_CLASS_S = 16 */
156 BDI_WS, /* UCDN_BIDI_CLASS_WS = 17 */
157 BDI_ON, /* UCDN_BIDI_CLASS_ON = 18 */
158 BDI_LRE, /* UCDN_BIDI_CLASS_LRI = 19 */
159 BDI_RLE, /* UCDN_BIDI_CLASS_RLI = 20 */
160 BDI_N, /* UCDN_BIDI_CLASS_FSI = 21 */
161 BDI_N, /* UCDN_BIDI_CLASS_PDI = 22 */
162};
163
164#define class_from_ch_ws(ch) (ucdn_to_bidi[ucdn_get_bidi_class(ch)])
165
166/* Return a direction for white-space on the second pass of the algorithm. */
167static fz_bidi_chartype class_from_ch_n(uint32_t ch)
168{
169 fz_bidi_chartype from_ch_ws = class_from_ch_ws(ch);
170 if (from_ch_ws == BDI_S || from_ch_ws == BDI_WS)
171 return BDI_N;
172 return from_ch_ws;
173}
174
175/* Split fragments into single scripts (or punctuation + single script) */
176static void
177split_at_script(const uint32_t *fragment,
178 size_t fragment_len,
179 int level,
180 void *arg,
181 fz_bidi_fragment_fn *callback)
182{
183 int script = UCDN_SCRIPT_COMMON;
184 size_t script_start, i;
185
186 script_start = 0;
187 for (i = 0; i < fragment_len; i++)
188 {
189 int s = ucdn_get_script(fragment[i]);
190 if (s == UCDN_SCRIPT_COMMON || s == UCDN_SCRIPT_INHERITED)
191 {
192 /* Punctuation etc. This is fine. */
193 }
194 else if (s == script)
195 {
196 /* Same script. Still fine. */
197 }
198 else if (script == UCDN_SCRIPT_COMMON || script == UCDN_SCRIPT_INHERITED)
199 {
200 /* First non punctuation thing. Set the script. */
201 script = s;
202 }
203 else
204 {
205 /* Change of script. Break the fragment. */
206 (*callback)(&fragment[script_start], i - script_start, level, script, arg);
207 script_start = i;
208 script = s;
209 }
210 }
211 if (script_start != fragment_len)
212 {
213 (*callback)(&fragment[script_start], fragment_len - script_start, level, script, arg);
214 }
215}
216
217/* Determines the character classes for all following
218 * passes of the algorithm. A character class is basically the type of Bidi
219 * behaviour that the character exhibits.
220 */
221static void
222classify_characters(const uint32_t *text,
223 fz_bidi_chartype *types,
224 size_t len,
225 fz_bidi_flags flags)
226{
227 size_t i;
228
229 if ((flags & FZ_BIDI_CLASSIFY_WHITE_SPACE)!=0)
230 {
231 for (i = 0; i < len; i++)
232 {
233 types[i] = class_from_ch_ws(text[i]);
234 }
235 }
236 else
237 {
238#ifdef DEBUG_BIDI_VERBOSE
239 fprintf(stderr, "Text: ");
240 for (i = 0; i < len; i++)
241 {
242 /* So that we can actually sort of read the debug string, any
243 * non-ascii characters are replaced with a 1-digit hash
244 * value from 0-9, making non-english characters appear
245 * as numbers
246 */
247 fprintf(stderr, "%c", (text[i] <= 127 && text[i] >= 32) ?
248 text[i] : text[i] % 9 + '0');
249 }
250 fprintf(stderr, "\nTypes: ");
251#endif
252 for (i = 0; i < len; i++)
253 {
254 types[i] = class_from_ch_n(text[i]);
255#ifdef DEBUG_BIDI_VERBOSE
256 fprintf(stderr, "%c", char_from_types[(int)types[i]]);
257#endif
258 }
259#ifdef DEBUG_BIDI_VERBOSE
260 fprintf(stderr, "\n");
261#endif
262 }
263}
264
265/* Determines the base level of the text.
266 * Implements rule P2 of the Unicode Bidi Algorithm.
267 * Note: Ignores explicit embeddings
268 */
269static fz_bidi_level base_level_from_text(fz_bidi_chartype *types, size_t len)
270{
271 size_t i;
272
273 for (i = 0; i < len; i++)
274 {
275 switch (types[i])
276 {
277 /* strong left */
278 case BDI_L:
279 return FZ_BIDI_LTR;
280
281 /* strong right */
282 case BDI_R:
283 case BDI_AL:
284 return FZ_BIDI_RTL;
285 }
286 }
287 return FZ_BIDI_LTR;
288}
289
290static fz_bidi_direction direction_from_type(fz_bidi_chartype type)
291{
292 switch (type)
293 {
294 case BDI_L:
295 case BDI_EN:
296 return FZ_BIDI_LTR;
297
298 case BDI_R:
299 case BDI_AL:
300 return FZ_BIDI_RTL;
301
302 default:
303 return FZ_BIDI_NEUTRAL;
304 }
305}
306
307static void
308classify_quoted_blocks(const uint32_t *text,
309 fz_bidi_chartype *types,
310 size_t len)
311{
312 size_t i;
313 int inQuote = FALSE;
314 int pdfNeeded = FALSE;
315 int ltrFound = FALSE;
316 int rtlFound = FALSE;
317
318 /* Only do anything special here if there is mixed content
319 * (LTR *and* RTL) in the text.
320 */
321 for (i = 0; i < len; i++)
322 {
323 switch (direction_from_type(types[i]))
324 {
325 case FZ_BIDI_LTR:
326 ltrFound = TRUE;
327 break;
328
329 case FZ_BIDI_RTL:
330 rtlFound = TRUE;
331 break;
332
333 default:
334 break;
335 }
336 }
337
338 /* Only make any changes if *both* LTR and RTL characters exist
339 * in this text.
340 */
341 if (!ltrFound || !rtlFound)
342 {
343 return;
344 }
345
346 for (i = 0; i < len; i++)
347 {
348 if (text[i]=='"')
349 {
350 /* If we're already in a quote then terminate it,
351 * else start a new block.
352 */
353 if (inQuote)
354 {
355 inQuote = FALSE;
356 if (pdfNeeded)
357 {
358 pdfNeeded = FALSE;
359 types[i] = BDI_PDF;
360 }
361 }
362 else
363 {
364 size_t j;
365 int done = FALSE;
366
367 inQuote = TRUE;
368
369 /* Find the first strong right or left type and
370 * use that to determine whether we should classify
371 * the quote as LRE or RLE. Or neither, if we
372 * hit another quote before any strongly-directional
373 * character.
374 */
375 for (j = i + 1; !done && (j < len) && text[j] != '"'; ++j)
376 {
377 switch(types[j])
378 {
379 case BDI_RLE:
380 case BDI_LRE:
381 done = TRUE;
382 break;
383
384 case BDI_L:
385 case BDI_EN:
386 types[i] = BDI_LRE;
387 pdfNeeded = TRUE;
388 done = TRUE;
389 break;
390
391 case BDI_R:
392 case BDI_AL:
393 types[i] = BDI_RLE;
394 pdfNeeded = TRUE;
395 done = TRUE;
396 break;
397
398 default:
399 break;
400 }
401 }
402 }
403 }
404 }
405}
406
407/* Creates a buffer with an embedding level for every character in the
408 * given text. Also determines the base level and returns it in
409 * *baseDir if *baseDir does not initially contain a valid direction.
410 */
411static fz_bidi_level *
412create_levels(fz_context *ctx,
413 const uint32_t *text,
414 size_t len,
415 fz_bidi_direction *baseDir,
416 int resolveWhiteSpace,
417 int flags)
418{
419 fz_bidi_level *levels, *plevels;
420 fz_bidi_chartype *types = NULL;
421 fz_bidi_chartype *ptypes;
422 fz_bidi_level baseLevel;
423 const uint32_t *ptext;
424 size_t plen, remaining;
425
426 levels = fz_malloc(ctx, len * sizeof(*levels));
427
428 fz_var(types);
429
430 fz_try(ctx)
431 {
432 types = fz_malloc(ctx, len * sizeof(fz_bidi_chartype));
433
434 classify_characters(text, types, len, flags);
435
436 if (*baseDir != FZ_BIDI_LTR && *baseDir != FZ_BIDI_RTL)
437 {
438 /* Derive the base level from the text and
439 * update *baseDir in case the caller wants to know.
440 */
441 baseLevel = base_level_from_text(types, len);
442 *baseDir = ODD(baseLevel)==1 ? FZ_BIDI_RTL : FZ_BIDI_LTR;
443 }
444 else
445 {
446 baseLevel = (fz_bidi_level)*baseDir;
447 }
448
449 {
450 /* Replace tab with base direction, i.e. make tab appear as
451 * 'strong left' if the base direction is left-to-right and
452 * 'strong right' if base direction is right-to-left. This
453 * allows Layout to implicitly treat tabs as 'segment separators'.
454 */
455 size_t i;
456
457 for (i = 0u; i < len; i++)
458 {
459 if (text[i]=='\t')
460 {
461 types[i] = (*baseDir == FZ_BIDI_RTL) ? BDI_R : BDI_L;
462 }
463 }
464 }
465
466 /* Look for quotation marks. Classify them as RLE or LRE
467 * or leave them alone, depending on what follows them.
468 */
469 classify_quoted_blocks(text, types, len);
470
471 /* Work one paragraph at a time. */
472 plevels = levels;
473 ptypes = types;
474 ptext = text;
475 remaining = len;
476 while (remaining)
477 {
478 plen = fz_bidi_resolve_paragraphs(ptypes, remaining);
479
480 /* Work out the levels and character types... */
481 (void)fz_bidi_resolve_explicit(baseLevel, BDI_N, ptypes, plevels, plen, 0);
482 fz_bidi_resolve_weak(ctx, baseLevel, ptypes, plevels, plen);
483 fz_bidi_resolve_neutrals(baseLevel, ptypes, plevels, plen);
484 fz_bidi_resolve_implicit(ptypes, plevels, plen);
485
486 classify_characters(ptext, ptypes, plen, FZ_BIDI_CLASSIFY_WHITE_SPACE);
487
488 if (resolveWhiteSpace)
489 {
490 /* resolve whitespace */
491 fz_bidi_resolve_whitespace(baseLevel, ptypes, plevels, plen);
492 }
493
494 plevels += plen;
495 ptypes += plen;
496 ptext += plen;
497 remaining -= plen;
498 }
499
500 /* The levels buffer now has odd and even numbers indicating
501 * rtl or ltr characters, respectively.
502 */
503#ifdef DEBUG_BIDI_VERBOSE
504 fprintf(stderr, "Levels: ");
505 {
506 size_t i;
507 for (i = 0; i < len; i++)
508 {
509 fprintf(stderr, "%d", levels[i]>9?0:levels[i]);
510 }
511 fprintf(stderr, "\n");
512 }
513#endif
514 }
515 fz_always(ctx)
516 {
517 fz_free(ctx, types);
518 }
519 fz_catch(ctx)
520 {
521 fz_free(ctx, levels);
522 fz_rethrow(ctx);
523 }
524 return levels;
525}
526
527/* Partitions the given character sequence into one or more unidirectional
528 * fragments and invokes the given callback function for each fragment.
529 */
530void fz_bidi_fragment_text(fz_context *ctx,
531 const uint32_t *text,
532 size_t textlen,
533 fz_bidi_direction *baseDir,
534 fz_bidi_fragment_fn *callback,
535 void *arg,
536 int flags)
537{
538 size_t startOfFragment;
539 size_t i;
540 fz_bidi_level *levels;
541
542 if (text == NULL || callback == NULL || textlen == 0)
543 return;
544
545 DBUGH(("fz_bidi_fragment_text('%S', len = %d)\n", text, textlen));
546
547 levels = create_levels(ctx, text, textlen, baseDir, FALSE, flags);
548
549 /* We now have an array with an embedding level
550 * for each character in text.
551 */
552 assert(levels != NULL);
553
554 fz_try(ctx)
555 {
556 startOfFragment = 0;
557 for (i = 1; i < textlen; i++)
558 {
559 if (levels[i] != levels[i-1])
560 {
561 /* We've gone past the end of the fragment.
562 * Create a text object for it, then start
563 * a new fragment.
564 */
565 split_at_script(&text[startOfFragment],
566 i - startOfFragment,
567 levels[startOfFragment],
568 arg,
569 callback);
570 startOfFragment = i;
571 }
572 }
573 /* Now i == textlen. Deal with the final (or maybe only) fragment. */
574 /* otherwise create 1 fragment */
575 split_at_script(&text[startOfFragment],
576 i - startOfFragment,
577 levels[startOfFragment],
578 arg,
579 callback);
580 }
581 fz_always(ctx)
582 {
583 fz_free(ctx, levels);
584 }
585 fz_catch(ctx)
586 {
587 fz_rethrow(ctx);
588 }
589}
590