1// Extracted from Bidi.cpp - version 26
2
3// Reference implementation for Unicode Bidirectional Algorithm
4
5// Bidi include file
6#include "mupdf/fitz.h"
7#include "bidi-imp.h"
8
9#include <assert.h>
10
11#ifndef TRUE
12#define TRUE (1)
13#endif
14#ifndef FALSE
15#define FALSE (0)
16#endif
17
18/*------------------------------------------------------------------------
19 File: Bidi.Cpp
20
21 Description
22 -----------
23
24 Sample Implementation of the Unicode Bidirectional Algorithm as it
25 was revised by Revision 5 of the Unicode Technical Report # 9
26 (1999-8-17)
27
28 Verified for changes to the algorithm up through Unicode 5.2.0 (2009).
29
30 This implementation is organized into several passes, each implemen-
31 ting one or more of the rules of the Unicode Bidi Algorithm. The
32 resolution of Weak Types and of Neutrals each use a state table
33 approach.
34
35 Both a printf based interface and a Windows DlgProc are provided for
36 interactive testing.
37
38 A stress harness comparing this implementation (v24) to a Java based
39 implementation was used by Doug Felt to verify that the two
40 implementations produce identical results for all strings up to six
41 bidi classes and stochastic strings up to length 20.
42
43 Version 26 was verified by the author against the Unicode 5.2.0
44 file BidiTest.txt, which provides an exhaustive text of strings of
45 length 4 or less, but covers some important cases where the language
46 in UAX#9 had been clarified.
47
48 To see this code running in an actual Windows program,
49 download the free Unibook utility from http://unicode.org/unibook
50 The bidi demo is executed from the tools menu. It is build from
51 this source file.
52
53 Build Notes
54 -----------
55
56 To compile the sample implementation please set the #define
57 directives above so the correct headers get included. Not all the
58 files are needed for all purposes. For the commandline version
59 only bidi.h and bidi.cpp are needed.
60
61 The Win32 version is provided as a dialog procedure. To use as a
62 standalone program compile with the lbmain.cpp file. If all you
63 need is the ability to run the code "as is", you can instead download
64 the unibook utility from http://uincode.org/unibook/ which contains
65 the bidi demo compiled from this source file.
66
67 This code uses an extension to C++ that gives variables declared in
68 a for() statement function the same scope as the for() statement.
69 If your compiler does not support this extension, you may need to
70 move the declaration, e.g. int ich = 0; in front of the for statement.
71
72 Implementation Note
73 -------------------
74
75 NOTE: The Unicode Bidirectional Algorithm removes all explicit
76 formatting codes in rule X9, but states that this can be
77 simulated by conformant implementations. This implementation
78 attempts to demonstrate such a simulation
79
80 To demonstrate this, the current implementation does the
81 following:
82
83 in resolveExplicit()
84 - change LRE, LRO, RLE, RLO, PDF to BN
85 - assign nested levels to BN
86
87 in resolveWeak and resolveNeutrals
88 - assign L and R to BN's where they exist in place of
89 sor and eor by changing the last BN in front of a
90 level change to a strong type
91 - skip over BN's for the purpose of determining actions
92 - include BN in the count of deferred runs
93 which will resolve some of them to EN, AN and N
94
95 in resolveWhiteSpace
96 - set the level of any surviving BN to the base level,
97 or the level of the preceding character
98 - include LRE,LRO, RLE, RLO, PDF and BN in the count
99 whitespace to be reset
100
101 This will result in the same order for non-BN characters as
102 if the BN characters had been removed.
103
104 The clean() function can be used to remove boundary marks for
105 verification purposes.
106
107 Notation
108 --------
109 Pointer variables generally start with the letter p
110 Counter variables generally start with the letter c
111 Index variables generally start with the letter i
112 Boolean variables generally start with the letter f
113
114 The enumerated bidirectional types have the same name as in the
115 description for the Unicode Bidirectional Algorithm
116
117 Using this code outside a demo context
118 --------------------------------------
119
120 The way the functions are broken down in this demo code is based
121 on the needs of the demo to show the evolution in internal state
122 as the algorithm proceeds. This obscures how the algorithm would
123 be used in practice. These are the steps:
124
125 1. Allocate a pair of arrays large enough to hold bidi class
126 and calculated levels (one for each input character)
127
128 2. Provide your own function to assign directional types (bidi
129 classes) corresponding to each character in the input,
130 conflating ON, WS, S to N. Unlike the classify function in this
131 demo, the input would be actual Unicode characters.
132
133 3. Process the first paragraph by calling BidiParagraph. That
134 function changes B into BN and returns a length including the
135 paragraph separator. (The iteration over multiple paragraphs
136 is left as exercise for the reader).
137
138 4. Assign directional types again, but now assign specific types
139 to whitespace characters.
140
141 5. Instead of reordering the input in place it is often desirable
142 to calculate an array of offsets indicating the reordering.
143 To that end, allocate such an array here and use it instead
144 of the input array in the next step.
145
146 6. Resolve and reorder the lines by calling BidiLines. That
147 function 'breaks' lines on LS characters. Provide an optional
148 array of flags indicating the location of other line breaks as
149 needed.
150
151 Update History
152 --------------
153 Version 24 is the initial published and verified version of this
154 reference implementation. Version 25 and its updates fix various
155 minor issues with the scaffolding used for demonstrating the
156 algorithm using pseudo-alphabets from the command line or dialog
157 box. No changes to the implementation of the actual bidi algorithm
158 are made in any of the minor updates to version 25. Version 26
159 also makes no change to the actual algorithm but was verified
160 against the official BidiTest.txt file for Unicode 5.2.0.
161
162 - updated pseudo-alphabet
163
164 - Last Revised 12-10-99 (25)
165
166 - enable demo mode for release builds - no other changes
167
168 - Last Revised 12-10-00 (25a)
169
170 - fix regression in pseudo alphabet use for Windows UI
171
172 - Last Revised 02-01-01 (25b)
173
174 - fixed a few comments, renamed a variable
175
176 - Last Revised 03-04-01 (25c)
177
178 - make base level settable, enable mirror by default,
179 fix dialog size
180
181 - Last Revised 06-02-01 (25e)
182
183 - fixed some comments
184
185 - Last Revised 09-29-01 (25f)
186
187 - fixed classification for LS,RLM,LRM in pseudo alphabet,
188 focus issues in UI, regression fix to commandline from 25(e)
189 fix DEMO switch
190
191 - Last Revised 11-07-01 (25g)
192
193 - fixed classification for plus/minus in pseudo alphabet
194 to track changes made in Unicode 4.0.1
195
196 - Last Revised 12-03-04 (25h)
197
198 - now compiles as dialog-only program for WINDOWS_UI==1
199 using new bidimain.cpp
200
201 - Last Revised 12-02-07 (25i)
202
203 - cleaned up whitespace and indenting in the source,
204 fixed two comments (table headers)
205
206 - Last Revised 15-03-07 (25j)
207
208 - named enumerations
209
210 - Last Revised 30-05-07 (25k)
211
212 - added usage notes, minor edits to comments, indentation, etc
213 throughout. Added the bidiParagraph function. Checked against
214 changes in the Unicode Bidi Algorithm for Unicode 5.2.0. No
215 changes needed to this implementation to match the values in
216 the BidiTest.txt file in the Unicode Character Database.
217 Minor fixes to dialog/windows proc, updated preprocessor directives.
218
219 - Last Revised 03-08-09 (26)
220
221 Credits:
222 -------
223 Written by: Asmus Freytag
224 Command line interface by: Rick McGowan
225 Verification (v24): Doug Felt
226
227 Disclaimer and legal rights:
228 ---------------------------
229 Copyright (C) 1999-2009, ASMUS, Inc. All Rights Reserved.
230 Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
231
232 THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
233 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
234 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
235 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
236 BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
237 OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
238 WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
239 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE SOFTWARE.
240
241 The file bid.rc is included in the software covered by the above.
242------------------------------------------------------------------------*/
243
244/* === HELPER FUNCTIONS AND DECLARATIONS ================================= */
245
246#define odd(x) ((x) & 1)
247
248/*----------------------------------------------------------------------
249 The following array maps character codes to types for the purpose of
250 this sample implementation. The legend string gives a human readable
251 explanation of the pseudo alphabet.
252
253 For simplicity, characters entered by buttons are given a 1:1 mapping
254 between their type and pseudo character value. Pseudo characters that
255 can be typed from the keyboard are explained in the legend string.
256
257 Use the Unicode Character Database for the real values in real use.
258 ---------------------------------------------------------------------*/
259
260enum
261{
262 chLS = 0x15
263};
264
265#if 0
266static const fz_bidi_chartype types_from_char[] =
267{
268// 0 1 2 3 4 5 6 7 8 9 a b c d e f
269BDI_BN, BDI_BN, BDI_BN, BDI_BN, BDI_L, BDI_R, BDI_BN, BDI_BN, BDI_BN, BDI_S, BDI_B, BDI_S, BDI_WS, BDI_B, BDI_BN, BDI_BN, /*00-0f*/
270BDI_LRO,BDI_LRE,BDI_PDF,BDI_RLO,BDI_RLE,BDI_WS, BDI_L, BDI_R, BDI_BN, BDI_BN, BDI_BN, BDI_BN, BDI_B, BDI_B, BDI_B, BDI_S, /*10-1f*/
271BDI_WS, BDI_ON, BDI_ON, BDI_ET, BDI_ET, BDI_ET, BDI_ON, BDI_ON, BDI_ON, BDI_ON, BDI_ON, BDI_ES, BDI_CS, BDI_ES, BDI_CS, BDI_ES, /*20-2f*/
272BDI_EN, BDI_EN, BDI_EN, BDI_EN, BDI_EN, BDI_EN, BDI_AN, BDI_AN, BDI_AN, BDI_AN, BDI_CS, BDI_ON, BDI_ON, BDI_ON, BDI_ON, BDI_ON, /*30-3f*/
273BDI_ON, BDI_AL, BDI_AL, BDI_AL, BDI_AL, BDI_AL, BDI_AL, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, /*40-4f*/
274BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_LRE,BDI_ON, BDI_RLE,BDI_PDF,BDI_S, /*50-5f*/
275BDI_NSM,BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, /*60-6f*/
276BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_LRO,BDI_B, BDI_RLO,BDI_BN, BDI_ON, /*70-7f*/
277};
278#endif
279
280/***************************************
281 Reverse, human readable reference:
282
283 LRM: 0x4
284 RLM: 0x5
285 L: 0x16,a-z
286 LRE: 0x11,[
287 LRO: 0x10,{
288 R: 0x17,G-Z
289 AL: A-F
290 RLE: 0x14,]
291 RLO: 0x13,}
292 PDF: 0x12,^
293 EN: 0-5
294 ES: /,+,[hyphen]
295 ET: #,$,%
296 AN: 6-9
297 CS: [comma],.,:
298 NSM: `
299 BN: 0x0-0x8,0xe,0xf,0x18-0x1b,~
300 B: 0xa,0xd,0x1c-0x1e,|
301 S: 0x9,0xb,0x1f,_
302 WS: 0xc,0x15,[space]
303 ON: !,",&,',(,),*,;,<,=,>,?,@,\,0x7f
304****************************************/
305
306// === HELPER FUNCTIONS ================================================
307
308#ifdef BIDI_LINE_AT_A_TIME
309// reverse cch characters
310static
311void reverse(uint32_t *psz, int cch)
312{
313 uint32_t ch_temp;
314 int ich;
315
316 for (ich = 0; ich < --cch; ich++)
317 {
318 ch_temp = psz[ich];
319 psz[ich] = psz[cch];
320 psz[cch] = ch_temp;
321 }
322}
323#endif
324
325// Set a run of cval values at locations all prior to, but not including
326// iStart, to the new value nval.
327static
328void set_deferred_run(fz_bidi_chartype *pval, size_t cval, size_t iStart, fz_bidi_chartype nval)
329{
330 size_t i;
331
332 for (i = iStart; i > iStart - cval; )
333 {
334 pval[--i] = nval;
335 }
336}
337
338static
339void set_deferred_level_run(fz_bidi_level *pval, size_t cval, size_t iStart, fz_bidi_level nval)
340{
341 size_t i;
342
343 for (i = iStart; i > iStart - cval; )
344 {
345 pval[--i] = nval;
346 }
347}
348
349// === ASSIGNING BIDI CLASSES ============================================
350
351// === THE PARAGRAPH LEVEL ===============================================
352
353/*------------------------------------------------------------------------
354 Function: resolve_paragraphs
355
356 Resolves the input strings into blocks over which the algorithm
357 is then applied.
358
359 Implements Rule P1 of the Unicode Bidi Algorithm
360
361 Input: Text string
362 Character count
363
364 Output: revised character count
365
366 Note: This is a very simplistic function. In effect it restricts
367 the action of the algorithm to the first paragraph in the input
368 where a paragraph ends at the end of the first block separator
369 or at the end of the input text.
370
371------------------------------------------------------------------------*/
372int fz_bidi_resolve_paragraphs(fz_bidi_chartype *types, int cch)
373{
374 int ich;
375
376 // skip characters not of type B
377 for(ich = 0; ich < cch && types[ich] != BDI_B; ich++)
378 ;
379 // stop after first B, make it a BN for use in the next steps
380 if (ich < cch && types[ich] == BDI_B)
381 types[ich++] = BDI_BN;
382 return ich;
383}
384
385#if 0
386/*------------------------------------------------------------------------
387 Function: base_level
388
389 Determines the base level
390
391 Implements rule P2 of the Unicode Bidi Algorithm.
392
393 Input: Array of directional classes
394 Character count
395
396 Note: Ignores explicit embeddings
397------------------------------------------------------------------------*/
398static int base_level(const fz_bidi_chartype *pcls, int cch)
399{
400 int ich;
401
402 for (ich = 0; ich < cch; ich++)
403 {
404 switch (pcls[ich])
405 {
406 // strong left
407 case BDI_L:
408 return 0;
409
410 // strong right
411 case BDI_R:
412 case BDI_AL:
413 return 1;
414 }
415 }
416 return 0;
417}
418#endif
419
420//====== RESOLVE EXPLICIT ================================================
421
422static fz_bidi_level greater_even(fz_bidi_level i)
423{
424 return odd(i) ? i + 1 : i + 2;
425}
426
427static fz_bidi_level greater_odd(fz_bidi_level i)
428{
429 return odd(i) ? i + 2 : i + 1;
430}
431
432static fz_bidi_chartype embedding_direction(fz_bidi_chartype level)
433{
434 return odd(level) ? BDI_R : BDI_L;
435}
436
437/*------------------------------------------------------------------------
438 Function: resolveExplicit
439
440 Recursively resolves explicit embedding levels and overrides.
441 Implements rules X1-X9, of the Unicode Bidirectional Algorithm.
442
443 Input: Base embedding level and direction
444 Character count
445
446 Output: Array of embedding levels
447 Caller must allocate (one level per input character)
448
449 In/Out: Array of direction classes
450
451 Note: The function uses two simple counters to keep track of
452 matching explicit codes and PDF. Use the default argument for
453 the outermost call. The nesting counter counts the recursion
454 depth and not the embedding level.
455------------------------------------------------------------------------*/
456size_t fz_bidi_resolve_explicit(fz_bidi_level level, fz_bidi_chartype dir, fz_bidi_chartype *pcls, fz_bidi_level *plevel, size_t cch,
457 fz_bidi_level n_nest)
458{
459 size_t ich;
460
461 // always called with a valid nesting level
462 // nesting levels are != embedding levels
463 int nLastValid = n_nest;
464
465 // check input values
466 assert(n_nest >= 0 && level >= 0 && level <= BIDI_LEVEL_MAX);
467
468 // process the text
469 for (ich = 0; ich < cch; ich++)
470 {
471 fz_bidi_chartype cls = pcls[ich];
472 switch (cls)
473 {
474 case BDI_LRO:
475 case BDI_LRE:
476 n_nest++;
477 if (greater_even(level) <= BIDI_LEVEL_MAX)
478 {
479 plevel[ich] = greater_even(level);
480 pcls[ich] = BDI_BN;
481 ich += fz_bidi_resolve_explicit(plevel[ich], (cls == BDI_LRE ? BDI_N : BDI_L),
482 &pcls[ich+1], &plevel[ich+1],
483 cch - (ich+1), n_nest);
484 n_nest--;
485 continue;
486 }
487 cls = pcls[ich] = BDI_BN;
488 break;
489
490 case BDI_RLO:
491 case BDI_RLE:
492 n_nest++;
493 if (greater_odd(level) <= BIDI_LEVEL_MAX)
494 {
495 plevel[ich] = greater_odd(level);
496 pcls[ich] = BDI_BN;
497 ich += fz_bidi_resolve_explicit(plevel[ich], (cls == BDI_RLE ? BDI_N : BDI_R),
498 &pcls[ich+1], &plevel[ich+1],
499 cch - (ich+1), n_nest);
500 n_nest--;
501 continue;
502 }
503 cls = pcls[ich] = BDI_BN;
504 break;
505
506 case BDI_PDF:
507 cls = pcls[ich] = BDI_BN;
508 if (n_nest)
509 {
510 if (nLastValid < n_nest)
511 {
512 n_nest--;
513 }
514 else
515 {
516 cch = ich; // break the loop, but complete body
517 }
518 }
519 break;
520 }
521
522 // Apply the override
523 if (dir != BDI_N)
524 {
525 cls = dir;
526 }
527 plevel[ich] = level;
528 if (pcls[ich] != BDI_BN)
529 pcls[ich] = cls;
530 }
531
532 return ich;
533}
534
535// === RESOLVE WEAK TYPES ================================================
536
537enum bidi_state // possible states
538{
539 xa, // arabic letter
540 xr, // right letter
541 xl, // left letter
542
543 ao, // arabic lett. foll by ON
544 ro, // right lett. foll by ON
545 lo, // left lett. foll by ON
546
547 rt, // ET following R
548 lt, // ET following L
549
550 cn, // EN, AN following AL
551 ra, // arabic number foll R
552 re, // european number foll R
553 la, // arabic number foll L
554 le, // european number foll L
555
556 ac, // CS following cn
557 rc, // CS following ra
558 rs, // CS,ES following re
559 lc, // CS following la
560 ls, // CS,ES following le
561
562 ret, // ET following re
563 let // ET following le
564} ;
565
566const unsigned char state_weak[][10] =
567{
568 // N, L, R, AN, EN, AL,NSM, CS, ES, ET,
569/*xa*/ { ao, xl, xr, cn, cn, xa, xa, ao, ao, ao }, /* arabic letter */
570/*xr*/ { ro, xl, xr, ra, re, xa, xr, ro, ro, rt }, /* right letter */
571/*xl*/ { lo, xl, xr, la, le, xa, xl, lo, lo, lt }, /* left letter */
572
573/*ao*/ { ao, xl, xr, cn, cn, xa, ao, ao, ao, ao }, /* arabic lett. foll by ON*/
574/*ro*/ { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* right lett. foll by ON */
575/*lo*/ { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* left lett. foll by ON */
576
577/*rt*/ { ro, xl, xr, ra, re, xa, rt, ro, ro, rt }, /* ET following R */
578/*lt*/ { lo, xl, xr, la, le, xa, lt, lo, lo, lt }, /* ET following L */
579
580/*cn*/ { ao, xl, xr, cn, cn, xa, cn, ac, ao, ao }, /* EN, AN following AL */
581/*ra*/ { ro, xl, xr, ra, re, xa, ra, rc, ro, rt }, /* arabic number foll R */
582/*re*/ { ro, xl, xr, ra, re, xa, re, rs, rs,ret }, /* european number foll R */
583/*la*/ { lo, xl, xr, la, le, xa, la, lc, lo, lt }, /* arabic number foll L */
584/*le*/ { lo, xl, xr, la, le, xa, le, ls, ls,let }, /* european number foll L */
585
586/*ac*/ { ao, xl, xr, cn, cn, xa, ao, ao, ao, ao }, /* CS following cn */
587/*rc*/ { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* CS following ra */
588/*rs*/ { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* CS,ES following re */
589/*lc*/ { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* CS following la */
590/*ls*/ { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* CS,ES following le */
591
592/*ret*/ { ro, xl, xr, ra, re, xa,ret, ro, ro,ret }, /* ET following re */
593/*let*/ { lo, xl, xr, la, le, xa,let, lo, lo,let } /* ET following le */
594
595};
596
597enum bidi_action // possible actions
598{
599 // primitives
600 IX = 0x100, // increment
601 XX = 0xF, // no-op
602
603 // actions
604 xxx = (XX << 4) + XX, // no-op
605 xIx = IX + xxx, // increment run
606 xxN = (XX << 4) + BDI_ON, // set current to N
607 xxE = (XX << 4) + BDI_EN, // set current to EN
608 xxA = (XX << 4) + BDI_AN, // set current to AN
609 xxR = (XX << 4) + BDI_R, // set current to R
610 xxL = (XX << 4) + BDI_L, // set current to L
611 Nxx = (BDI_ON << 4) + 0xF, // set run to neutral
612 Axx = (BDI_AN << 4) + 0xF, // set run to AN
613 ExE = (BDI_EN << 4) + BDI_EN, // set run to EN, set current to EN
614 NIx = (BDI_ON << 4) + 0xF + IX, // set run to N, increment
615 NxN = (BDI_ON << 4) + BDI_ON, // set run to N, set current to N
616 NxR = (BDI_ON << 4) + BDI_R, // set run to N, set current to R
617 NxE = (BDI_ON << 4) + BDI_EN, // set run to N, set current to EN
618
619 AxA = (BDI_AN << 4) + BDI_AN, // set run to AN, set current to AN
620 NxL = (BDI_ON << 4) + BDI_L, // set run to N, set current to L
621 LxL = (BDI_L << 4) + BDI_L // set run to L, set current to L
622};
623
624typedef uint16_t fz_bidi_action;
625
626const fz_bidi_action action_weak[][10] =
627{
628 // N,.. L, R, AN, EN, AL, NSM, CS,..ES, ET,
629/*xa*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxR, xxN, xxN, xxN }, /* arabic letter */
630/*xr*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxR, xxN, xxN, xIx }, /* right letter */
631/*xl*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxL, xxN, xxN, xIx }, /* left letter */
632
633/*ao*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxN, xxN, xxN, xxN }, /* arabic lett. foll by ON */
634/*ro*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxN, xxN, xxN, xIx }, /* right lett. foll by ON */
635/*lo*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxN, xxN, xxN, xIx }, /* left lett. foll by ON */
636
637/*rt*/ { Nxx, Nxx, Nxx, Nxx, ExE, NxR, xIx, NxN, NxN, xIx }, /* ET following R */
638/*lt*/ { Nxx, Nxx, Nxx, Nxx, LxL, NxR, xIx, NxN, NxN, xIx }, /* ET following L */
639
640/*cn*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxA, xIx, xxN, xxN }, /* EN, AN following AL */
641/*ra*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxA, xIx, xxN, xIx }, /* arabic number foll R */
642/*re*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxE, xIx, xIx, xxE }, /* european number foll R */
643/*la*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxA, xIx, xxN, xIx }, /* arabic number foll L */
644/*le*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxL, xIx, xIx, xxL }, /* european number foll L */
645
646/*ac*/ { Nxx, Nxx, Nxx, Axx, AxA, NxR, NxN, NxN, NxN, NxN }, /* CS following cn */
647/*rc*/ { Nxx, Nxx, Nxx, Axx, NxE, NxR, NxN, NxN, NxN, NIx }, /* CS following ra */
648/*rs*/ { Nxx, Nxx, Nxx, Nxx, ExE, NxR, NxN, NxN, NxN, NIx }, /* CS,ES following re */
649/*lc*/ { Nxx, Nxx, Nxx, Axx, NxL, NxR, NxN, NxN, NxN, NIx }, /* CS following la */
650/*ls*/ { Nxx, Nxx, Nxx, Nxx, LxL, NxR, NxN, NxN, NxN, NIx }, /* CS,ES following le */
651
652/*ret*/{ xxx, xxx, xxx, xxx, xxE, xxR, xxE, xxN, xxN, xxE }, /* ET following re */
653/*let*/{ xxx, xxx, xxx, xxx, xxL, xxR, xxL, xxN, xxN, xxL } /* ET following le */
654};
655
656static
657fz_bidi_chartype get_deferred_type(fz_bidi_action action)
658{
659 return (action >> 4) & 0xF;
660}
661
662static
663fz_bidi_chartype get_resolved_type(fz_bidi_action action)
664{
665 return action & 0xF;
666}
667
668/* Note on action table:
669
670 States can be of two kinds:
671 - Immediate Resolution State, where each input token
672 is resolved as soon as it is seen. These states have
673 only single action codes (xxN) or the no-op (xxx)
674 for static input tokens.
675 - Deferred Resolution State, where input tokens either
676 either extend the run (xIx) or resolve its Type (e.g. Nxx).
677
678 Input classes are of three kinds
679 - Static Input Token, where the class of the token remains
680 unchanged on output (AN, L, N, R)
681 - Replaced Input Token, where the class of the token is
682 always replaced on output (AL, BDI_BN, NSM, CS, ES, ET)
683 - Conditional Input Token, where the class of the token is
684 changed on output in some but not all cases (EN)
685
686 Where tokens are subject to change, a double action
687 (e.g. NxA, or NxN) is _required_ after deferred states,
688 resolving both the deferred state and changing the current token.
689
690 These properties of the table are verified by assertions below.
691 This code is needed only during debugging and maintenance
692*/
693
694/*------------------------------------------------------------------------
695 Function: resolveWeak
696
697 Resolves the directionality of numeric and other weak character types
698
699 Implements rules X10 and W1-W6 of the Unicode Bidirectional Algorithm.
700
701 Input: Array of embedding levels
702 Character count
703
704 In/Out: Array of directional classes
705
706 Note: On input only these directional classes are expected
707 AL, HL, R, L, ON, BDI_BN, NSM, AN, EN, ES, ET, CS,
708------------------------------------------------------------------------*/
709void fz_bidi_resolve_weak(fz_context *ctx, fz_bidi_level baselevel, fz_bidi_chartype *pcls, fz_bidi_level *plevel, size_t cch)
710{
711 int state = odd(baselevel) ? xr : xl;
712 fz_bidi_chartype cls;
713 size_t ich;
714 fz_bidi_action action;
715 fz_bidi_chartype cls_run;
716 fz_bidi_chartype cls_new;
717
718 fz_bidi_level level = baselevel;
719
720 size_t cch_run = 0;
721
722 for (ich = 0; ich < cch; ich++)
723 {
724 if (pcls[ich] > BDI_BN) {
725 fz_warn(ctx, "error: pcls[%zu] > BN (%d)\n", ich, pcls[ich]);
726 }
727
728 // ignore boundary neutrals
729 if (pcls[ich] == BDI_BN)
730 {
731 // must flatten levels unless at a level change;
732 plevel[ich] = level;
733
734 // lookahead for level changes
735 if (ich + 1 == cch && level != baselevel)
736 {
737 // have to fixup last BN before end of the loop, since
738 // its fix-upped value will be needed below the assert
739 pcls[ich] = embedding_direction(level);
740 }
741 else if (ich + 1 < cch && level != plevel[ich+1] && pcls[ich+1] != BDI_BN)
742 {
743 // fixup LAST BN in front / after a level run to make
744 // it act like the SOR/EOR in rule X10
745 int newlevel = plevel[ich+1];
746 if (level > newlevel) {
747 newlevel = level;
748 }
749 plevel[ich] = newlevel;
750
751 // must match assigned level
752 pcls[ich] = embedding_direction(newlevel);
753 level = plevel[ich+1];
754 }
755 else
756 {
757 // don't interrupt runs
758 if (cch_run)
759 {
760 cch_run++;
761 }
762 continue;
763 }
764 }
765
766 assert(pcls[ich] <= BDI_BN);
767 cls = pcls[ich];
768
769 action = action_weak[state][cls];
770
771 // resolve the directionality for deferred runs
772 cls_run = get_deferred_type(action);
773 if (cls_run != XX)
774 {
775 set_deferred_run(pcls, cch_run, ich, cls_run);
776 cch_run = 0;
777 }
778
779 // resolve the directionality class at the current location
780 cls_new = get_resolved_type(action);
781 if (cls_new != XX)
782 pcls[ich] = cls_new;
783
784 // increment a deferred run
785 if (IX & action)
786 cch_run++;
787
788 state = state_weak[state][cls];
789 }
790
791 // resolve any deferred runs
792 // use the direction of the current level to emulate PDF
793 cls = embedding_direction(level);
794
795 // resolve the directionality for deferred runs
796 cls_run = get_deferred_type(action_weak[state][cls]);
797 if (cls_run != XX)
798 set_deferred_run(pcls, cch_run, ich, cls_run);
799}
800
801// === RESOLVE NEUTRAL TYPES ==============================================
802
803// action values
804enum neutral_action
805{
806 // action to resolve previous input
807 nL = BDI_L, // resolve EN to L
808 En = 3 << 4, // resolve neutrals run to embedding level direction
809 Rn = BDI_R << 4, // resolve neutrals run to strong right
810 Ln = BDI_L << 4, // resolved neutrals run to strong left
811 In = (1<<8), // increment count of deferred neutrals
812 LnL = (1<<4)+BDI_L // set run and EN to L
813};
814
815static fz_bidi_chartype
816get_deferred_neutrals(fz_bidi_action action, fz_bidi_level level)
817{
818 action = (action >> 4) & 0xF;
819 if (action == (En >> 4))
820 return embedding_direction(level);
821 else
822 return action;
823}
824
825static fz_bidi_chartype get_resolved_neutrals(fz_bidi_action action)
826{
827 action = action & 0xF;
828 if (action == In)
829 return 0;
830 else
831 return action;
832}
833
834// state values
835enum neutral_state
836{
837 // new temporary class
838 r, // R and characters resolved to R
839 l, // L and characters resolved to L
840 rn, // N preceded by right
841 ln, // N preceded by left
842 a, // AN preceded by left (the abbrev 'la' is used up above)
843 na // N preceded by a
844} ;
845
846/*------------------------------------------------------------------------
847 Notes:
848
849 By rule W7, whenever a EN is 'dominated' by an L (including start of
850 run with embedding direction = L) it is resolved to, and further treated
851 as L.
852
853 This leads to the need for 'a' and 'na' states.
854------------------------------------------------------------------------*/
855
856const int action_neutrals[][5] =
857{
858// N, L, R, AN, EN, = cls
859 // state =
860 {In, 0, 0, 0, 0}, // r right
861 {In, 0, 0, 0, BDI_L}, // l left
862
863 {In, En, Rn, Rn, Rn}, // rn N preceded by right
864 {In, Ln, En, En, LnL}, // ln N preceded by left
865
866 {In, 0, 0, 0, BDI_L}, // a AN preceded by left
867 {In, En, Rn, Rn, En} // na N preceded by a
868} ;
869
870const int state_neutrals[][5] =
871{
872// N, L, R, AN, EN = cls
873 // state =
874 {rn, l, r, r, r}, // r right
875 {ln, l, r, a, l}, // l left
876
877 {rn, l, r, r, r}, // rn N preceded by right
878 {ln, l, r, a, l}, // ln N preceded by left
879
880 {na, l, r, a, l}, // a AN preceded by left
881 {na, l, r, a, l} // na N preceded by la
882} ;
883
884/*------------------------------------------------------------------------
885 Function: resolveNeutrals
886
887 Resolves the directionality of neutral character types.
888
889 Implements rules W7, N1 and N2 of the Unicode Bidi Algorithm.
890
891 Input: Array of embedding levels
892 Character count
893 Baselevel
894
895 In/Out: Array of directional classes
896
897 Note: On input only these directional classes are expected
898 R, L, N, AN, EN and BN
899
900 W8 resolves a number of ENs to L
901------------------------------------------------------------------------*/
902void fz_bidi_resolve_neutrals(fz_bidi_level baselevel, fz_bidi_chartype *pcls, const fz_bidi_level *plevel, size_t cch)
903{
904 // the state at the start of text depends on the base level
905 int state = odd(baselevel) ? r : l;
906 fz_bidi_chartype cls;
907 size_t ich;
908 fz_bidi_chartype cls_run;
909
910 size_t cch_run = 0;
911 fz_bidi_level level = baselevel;
912
913 for (ich = 0; ich < cch; ich++)
914 {
915 int action;
916 fz_bidi_chartype cls_new;
917
918 // ignore boundary neutrals
919 if (pcls[ich] == BDI_BN)
920 {
921 // include in the count for a deferred run
922 if (cch_run)
923 cch_run++;
924
925 // skip any further processing
926 continue;
927 }
928
929 assert(pcls[ich] < 5); // "Only N, L, R, AN, EN are allowed"
930 cls = pcls[ich];
931
932 action = action_neutrals[state][cls];
933
934 // resolve the directionality for deferred runs
935 cls_run = get_deferred_neutrals(action, level);
936 if (cls_run != BDI_N)
937 {
938 set_deferred_run(pcls, cch_run, ich, cls_run);
939 cch_run = 0;
940 }
941
942 // resolve the directionality class at the current location
943 cls_new = get_resolved_neutrals(action);
944 if (cls_new != BDI_N)
945 pcls[ich] = cls_new;
946
947 if (In & action)
948 cch_run++;
949
950 state = state_neutrals[state][cls];
951 level = plevel[ich];
952 }
953
954 // resolve any deferred runs
955 cls = embedding_direction(level); // eor has type of current level
956
957 // resolve the directionality for deferred runs
958 cls_run = get_deferred_neutrals(action_neutrals[state][cls], level);
959 if (cls_run != BDI_N)
960 set_deferred_run(pcls, cch_run, ich, cls_run);
961}
962
963// === RESOLVE IMPLICITLY =================================================
964
965/*------------------------------------------------------------------------
966 Function: resolveImplicit
967
968 Recursively resolves implicit embedding levels.
969 Implements rules I1 and I2 of the Unicode Bidirectional Algorithm.
970
971 Input: Array of direction classes
972 Character count
973 Base level
974
975 In/Out: Array of embedding levels
976
977 Note: levels may exceed 15 on output.
978 Accepted subset of direction classes
979 R, L, AN, EN
980------------------------------------------------------------------------*/
981const fz_bidi_level add_level[][4] =
982{
983 // L, R, AN, EN = cls
984 // level =
985/* even */ { 0, 1, 2, 2 }, // EVEN
986/* odd */ { 1, 0, 1, 1 } // ODD
987
988};
989
990void fz_bidi_resolve_implicit(const fz_bidi_chartype *pcls, fz_bidi_level *plevel, size_t cch)
991{
992 size_t ich;
993
994 for (ich = 0; ich < cch; ich++)
995 {
996 // cannot resolve bn here, since some bn were resolved to strong
997 // types in resolveWeak. To remove these we need the original
998 // types, which are available again in resolveWhiteSpace
999 if (pcls[ich] == BDI_BN)
1000 {
1001 continue;
1002 }
1003 assert(pcls[ich] > 0); // "No Neutrals allowed to survive here."
1004 assert(pcls[ich] < 5); // "Out of range."
1005 plevel[ich] += add_level[odd(plevel[ich])][pcls[ich] - 1];
1006 }
1007}
1008
1009#if 0
1010// === REORDER ===========================================================
1011/*------------------------------------------------------------------------
1012 Function: resolve_lines
1013
1014 Breaks a paragraph into lines
1015
1016 Input: Character count
1017 Array of line break flags
1018 In/Out: Array of characters
1019
1020 Returns the count of characters on the first line
1021
1022 Note: This function only breaks lines at hard line breaks. Other
1023 line breaks can be passed in. If pbrk[n] is true, then a break
1024 occurs after the character in psz_input[n]. Breaks before the first
1025 character are not allowed.
1026------------------------------------------------------------------------*/
1027static int resolve_lines(uint32_t *psz_input, int *pbrk, int cch)
1028{
1029 int ich;
1030
1031 // skip characters not of type LS
1032 for(ich = 0; ich < cch; ich++)
1033 {
1034 if (psz_input[ich] == chLS || (pbrk && pbrk[ich]))
1035 {
1036 ich++;
1037 break;
1038 }
1039 }
1040
1041 return ich;
1042}
1043#endif
1044
1045/*------------------------------------------------------------------------
1046 Function: fz_bidi_resolve_whitespace
1047
1048 Resolves levels for WS and S
1049 Implements rule L1 of the Unicode bidi Algorithm.
1050
1051 Input: Base embedding level
1052 Character count
1053 Array of direction classes (for one line of text)
1054
1055 In/Out: Array of embedding levels (for one line of text)
1056
1057 Note: this should be applied a line at a time. The default driver
1058 code supplied in this file assumes a single line of text; for
1059 a real implementation, cch and the initial pointer values
1060 would have to be adjusted.
1061------------------------------------------------------------------------*/
1062void fz_bidi_resolve_whitespace(fz_bidi_level baselevel, const fz_bidi_chartype *pcls, fz_bidi_level *plevel,
1063 size_t cch)
1064{
1065 size_t cchrun = 0;
1066 fz_bidi_level oldlevel = baselevel;
1067 size_t ich;
1068
1069 for (ich = 0; ich < cch; ich++)
1070 {
1071 switch(pcls[ich])
1072 {
1073 default:
1074 cchrun = 0; // any other character breaks the run
1075 break;
1076 case BDI_WS:
1077 cchrun++;
1078 break;
1079
1080 case BDI_RLE:
1081 case BDI_LRE:
1082 case BDI_LRO:
1083 case BDI_RLO:
1084 case BDI_PDF:
1085 case BDI_BN:
1086 plevel[ich] = oldlevel;
1087 cchrun++;
1088 break;
1089
1090 case BDI_S:
1091 case BDI_B:
1092 // reset levels for WS before eot
1093 set_deferred_level_run(plevel, cchrun, ich, baselevel);
1094 cchrun = 0;
1095 plevel[ich] = baselevel;
1096 break;
1097 }
1098 oldlevel = plevel[ich];
1099 }
1100 // reset level before eot
1101 set_deferred_level_run(plevel, cchrun, ich, baselevel);
1102}
1103
1104#ifdef BIDI_LINE_AT_A_TIME
1105/*------------------------------------------------------------------------
1106 Functions: reorder/reorderLevel
1107
1108 Recursively reorders the display string
1109 "From the highest level down, reverse all characters at that level and
1110 higher, down to the lowest odd level"
1111
1112 Implements rule L2 of the Unicode bidi Algorithm.
1113
1114 Input: Array of embedding levels
1115 Character count
1116 Flag enabling reversal (set to false by initial caller)
1117
1118 In/Out: Text to reorder
1119
1120 Note: levels may exceed 15 resp. 61 on input.
1121
1122 Rule L3 - reorder combining marks is not implemented here
1123 Rule L4 - glyph mirroring is implemented as a display option below
1124
1125 Note: this should be applied a line at a time
1126-------------------------------------------------------------------------*/
1127static int reorderLevel(fz_bidi_level level, uint32_t *psz_text, const fz_bidi_level *plevel, int cch,
1128 int f_reverse)
1129{
1130 int ich;
1131
1132 // true as soon as first odd level encountered
1133 f_reverse = f_reverse || odd(level);
1134
1135 for (ich = 0; ich < cch; ich++)
1136 {
1137 if (plevel[ich] < level)
1138 {
1139 break;
1140 }
1141 else if (plevel[ich] > level)
1142 {
1143 ich += reorderLevel(level + 1, psz_text + ich, plevel + ich,
1144 cch - ich, f_reverse) - 1;
1145 }
1146 }
1147 if (f_reverse)
1148 {
1149 reverse(psz_text, ich);
1150 }
1151 return ich;
1152}
1153
1154int Bidi_reorder(fz_bidi_level baselevel, uint32_t *psz_text, const fz_bidi_level *plevel, int cch)
1155{
1156 int ich = 0;
1157
1158 while (ich < cch)
1159 {
1160 ich += reorderLevel(baselevel, psz_text + ich, plevel + ich,
1161 cch - ich, FALSE);
1162 }
1163 return ich;
1164}
1165#endif
1166