| 1 | /* | 
|---|
| 2 | * Bidirectional text processing. | 
|---|
| 3 | * | 
|---|
| 4 | * Processes unicode text by arranging the characters into an order suitable | 
|---|
| 5 | * for display. E.g. Hebrew text will be arranged from right-to-left and | 
|---|
| 6 | * any English within the text will remain in the left-to-right order. | 
|---|
| 7 | * Characters such as parenthesis will be substituted for their mirrored | 
|---|
| 8 | * equivalents if they are part of text which must be reversed. | 
|---|
| 9 | * | 
|---|
| 10 | * This is an implementation of the unicode Bidirectional Algorithm which | 
|---|
| 11 | * can be found here: http://www.unicode.org/reports/tr9/ and is based | 
|---|
| 12 | * on the reference implementation of the algorithm found on that page. | 
|---|
| 13 | * | 
|---|
| 14 | * For a nice overview of how it works, read this... | 
|---|
| 15 | * http://www.w3.org/TR/REC-html40/struct/dirlang.html | 
|---|
| 16 | * | 
|---|
| 17 | * Extracted from the SmartOffice code, where it was modified by Ian | 
|---|
| 18 | * Beveridge. | 
|---|
| 19 | * | 
|---|
| 20 | * Copyright (C) Picsel, 2004. All Rights Reserved. | 
|---|
| 21 | */ | 
|---|
| 22 |  | 
|---|
| 23 | /* | 
|---|
| 24 | * Original copyright notice from unicode reference implementation. | 
|---|
| 25 | * ---------------------------------------------------------------- | 
|---|
| 26 | * Written by: Asmus Freytag | 
|---|
| 27 | *	C++ and Windows dependencies removed, and | 
|---|
| 28 | *	command line interface added by: Rick McGowan | 
|---|
| 29 | * | 
|---|
| 30 | *	Copyright (C) 1999, ASMUS, Inc. All Rights Reserved | 
|---|
| 31 | */ | 
|---|
| 32 |  | 
|---|
| 33 | /* | 
|---|
| 34 | * Includes... | 
|---|
| 35 | */ | 
|---|
| 36 |  | 
|---|
| 37 | #include "mupdf/fitz.h" | 
|---|
| 38 | #include "mupdf/ucdn.h" | 
|---|
| 39 | #include "bidi-imp.h" /* standard bidi code interface */ | 
|---|
| 40 | #include <assert.h> | 
|---|
| 41 |  | 
|---|
| 42 | /* | 
|---|
| 43 | * Macros... | 
|---|
| 44 | */ | 
|---|
| 45 |  | 
|---|
| 46 | #define ODD(x) ((x) & 1) | 
|---|
| 47 |  | 
|---|
| 48 | #define REPLACEABLE_TYPE(t) ( \ | 
|---|
| 49 | ((t)==BDI_ES) || ((t)==BDI_ET) || ((t)==BDI_CS) || \ | 
|---|
| 50 | ((t)==BDI_NSM) || ((t)==BDI_PDF) || ((t)==BDI_BN) || \ | 
|---|
| 51 | ((t)==BDI_S) || ((t)==BDI_WS) || ((t)==BDI_N) ) | 
|---|
| 52 |  | 
|---|
| 53 | #ifdef DEBUG_BIDI_VERBOSE | 
|---|
| 54 | #define DBUGVF(params) do { fz_warn params; } while (0) | 
|---|
| 55 | #else | 
|---|
| 56 | #define DBUGVF(params) do {} while (0) | 
|---|
| 57 | #endif | 
|---|
| 58 |  | 
|---|
| 59 | #ifdef DEBUG_BIDI_OUTLINE | 
|---|
| 60 | #define DBUGH(params) do { fz_warn params; } while (0) | 
|---|
| 61 | #else | 
|---|
| 62 | #define DBUGH(params) do {} while (0) | 
|---|
| 63 | #endif | 
|---|
| 64 |  | 
|---|
| 65 | #define UNICODE_EOS					0 | 
|---|
| 66 | #define UNICODE_DIGIT_ZERO				0x0030 | 
|---|
| 67 | #define UNICODE_DIGIT_NINE				0x0039 | 
|---|
| 68 | #define UNICODE_SUPERSCRIPT_TWO				0x00B2 | 
|---|
| 69 | #define UNICODE_SUPERSCRIPT_THREE			0x00B3 | 
|---|
| 70 | #define UNICODE_SUPERSCRIPT_ONE				0x00B9 | 
|---|
| 71 | #define UNICODE_RTL_START				0x0590 | 
|---|
| 72 | #define UNICODE_RTL_END					0x07BF | 
|---|
| 73 | #define UNICODE_ARABIC_INDIC_DIGIT_ZERO			0x0660 | 
|---|
| 74 | #define UNICODE_ARABIC_INDIC_DIGIT_NINE			0x0669 | 
|---|
| 75 | #define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_ZERO	0x06F0 | 
|---|
| 76 | #define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_NINE	0x06F9 | 
|---|
| 77 | #define UNICODE_ZERO_WIDTH_NON_JOINER			0x200C | 
|---|
| 78 | #define UNICODE_SUPERSCRIPT_ZERO			0x2070 | 
|---|
| 79 | #define UNICODE_SUPERSCRIPT_FOUR			0x2074 | 
|---|
| 80 | #define UNICODE_SUPERSCRIPT_NINE			0x2079 | 
|---|
| 81 | #define UNICODE_SUBSCRIPT_ZERO				0x2080 | 
|---|
| 82 | #define UNICODE_SUBSCRIPT_NINE				0x2089 | 
|---|
| 83 | #define UNICODE_CIRCLED_DIGIT_ONE			0x2460 | 
|---|
| 84 | #define UNICODE_NUMBER_TWENTY_FULL_STOP			0x249B | 
|---|
| 85 | #define UNICODE_CIRCLED_DIGIT_ZERO			0x24EA | 
|---|
| 86 | #define UNICODE_FULLWIDTH_DIGIT_ZERO			0xFF10 | 
|---|
| 87 | #define UNICODE_FULLWIDTH_DIGIT_NINE			0xFF19 | 
|---|
| 88 |  | 
|---|
| 89 | #ifndef TRUE | 
|---|
| 90 | #define TRUE (1) | 
|---|
| 91 | #endif | 
|---|
| 92 | #ifndef FALSE | 
|---|
| 93 | #define FALSE (0) | 
|---|
| 94 | #endif | 
|---|
| 95 |  | 
|---|
| 96 | /* | 
|---|
| 97 | * Enumerations... | 
|---|
| 98 | */ | 
|---|
| 99 |  | 
|---|
| 100 | #ifdef DEBUG_BIDI_VERBOSE | 
|---|
| 101 | /* display support: */ | 
|---|
| 102 | static const char char_from_types[] = | 
|---|
| 103 | { | 
|---|
| 104 | ' ',	/* ON */ | 
|---|
| 105 | '>',	/* L */ | 
|---|
| 106 | '<',	/* R */ | 
|---|
| 107 | '9',	/* AN */ | 
|---|
| 108 | '1',	/* EN */ | 
|---|
| 109 | 'a',	/* AL */ | 
|---|
| 110 | '@',	/* NSM */ | 
|---|
| 111 | '.',	/* CS */ | 
|---|
| 112 | ',',	/* ES */ | 
|---|
| 113 | '$',	/* ET */ | 
|---|
| 114 | ':',	/* BN */ | 
|---|
| 115 | 'X',	/* S */ | 
|---|
| 116 | '_',	/* WS */ | 
|---|
| 117 | 'B',	/* B */ | 
|---|
| 118 | '+',	/* RLO */ | 
|---|
| 119 | '+',	/* RLE */ | 
|---|
| 120 | '+',	/* LRO */ | 
|---|
| 121 | '+',	/* LRE */ | 
|---|
| 122 | '-',	/* PDF */ | 
|---|
| 123 | '='	/* LS */ | 
|---|
| 124 | }; | 
|---|
| 125 | #endif | 
|---|
| 126 |  | 
|---|
| 127 | /* | 
|---|
| 128 | * Functions and static functions... | 
|---|
| 129 | */ | 
|---|
| 130 |  | 
|---|
| 131 | /* UCDN uses a different ordering than Bidi does. We cannot | 
|---|
| 132 | * change to the UCDN ordering, as the bidi-std.c code relies | 
|---|
| 133 | * on the exact ordering (at least that N = ON = 0). We | 
|---|
| 134 | * therefore map between the two using this small table. It | 
|---|
| 135 | * also takes care of fudging LRI, RLI, FSI and PDI, that this | 
|---|
| 136 | * code does not currently support. */ | 
|---|
| 137 | static const uint8_t ucdn_to_bidi[] = | 
|---|
| 138 | { | 
|---|
| 139 | BDI_L,		/* UCDN_BIDI_CLASS_L = 0 */ | 
|---|
| 140 | BDI_LRE,	/* UCDN_BIDI_CLASS_LRE = 1 */ | 
|---|
| 141 | BDI_LRO,	/* UCDN_BIDI_CLASS_LRO = 2 */ | 
|---|
| 142 | BDI_R,		/* UCDN_BIDI_CLASS_R = 3 */ | 
|---|
| 143 | BDI_AL,		/* UCDN_BIDI_CLASS_AL = 4 */ | 
|---|
| 144 | BDI_RLE,	/* UCDN_BIDI_CLASS_RLE = 5 */ | 
|---|
| 145 | BDI_RLO,	/* UCDN_BIDI_CLASS_RLO = 6 */ | 
|---|
| 146 | BDI_PDF,	/* UCDN_BIDI_CLASS_PDF = 7 */ | 
|---|
| 147 | BDI_EN,		/* UCDN_BIDI_CLASS_EN = 8 */ | 
|---|
| 148 | BDI_ES,		/* UCDN_BIDI_CLASS_ES = 9 */ | 
|---|
| 149 | BDI_ET,		/* UCDN_BIDI_CLASS_ET = 10 */ | 
|---|
| 150 | BDI_AN,		/* UCDN_BIDI_CLASS_AN = 11 */ | 
|---|
| 151 | BDI_CS,		/* UCDN_BIDI_CLASS_CS = 12 */ | 
|---|
| 152 | BDI_NSM,	/* UCDN_BIDI_CLASS_NSM = 13 */ | 
|---|
| 153 | BDI_BN,		/* UCDN_BIDI_CLASS_BN = 14 */ | 
|---|
| 154 | BDI_B,		/* UCDN_BIDI_CLASS_B = 15 */ | 
|---|
| 155 | BDI_S,		/* UCDN_BIDI_CLASS_S = 16 */ | 
|---|
| 156 | BDI_WS,		/* UCDN_BIDI_CLASS_WS = 17 */ | 
|---|
| 157 | BDI_ON,		/* UCDN_BIDI_CLASS_ON = 18 */ | 
|---|
| 158 | BDI_LRE,	/* UCDN_BIDI_CLASS_LRI = 19 */ | 
|---|
| 159 | BDI_RLE,	/* UCDN_BIDI_CLASS_RLI = 20 */ | 
|---|
| 160 | BDI_N,		/* UCDN_BIDI_CLASS_FSI = 21 */ | 
|---|
| 161 | BDI_N,		/* UCDN_BIDI_CLASS_PDI = 22 */ | 
|---|
| 162 | }; | 
|---|
| 163 |  | 
|---|
| 164 | #define class_from_ch_ws(ch) (ucdn_to_bidi[ucdn_get_bidi_class(ch)]) | 
|---|
| 165 |  | 
|---|
| 166 | /* Return a direction for white-space on the second pass of the algorithm. */ | 
|---|
| 167 | static fz_bidi_chartype class_from_ch_n(uint32_t ch) | 
|---|
| 168 | { | 
|---|
| 169 | fz_bidi_chartype from_ch_ws = class_from_ch_ws(ch); | 
|---|
| 170 | if (from_ch_ws == BDI_S || from_ch_ws == BDI_WS) | 
|---|
| 171 | return BDI_N; | 
|---|
| 172 | return from_ch_ws; | 
|---|
| 173 | } | 
|---|
| 174 |  | 
|---|
| 175 | /* Split fragments into single scripts (or punctuation + single script) */ | 
|---|
| 176 | static void | 
|---|
| 177 | split_at_script(const uint32_t *fragment, | 
|---|
| 178 | size_t fragment_len, | 
|---|
| 179 | int level, | 
|---|
| 180 | void *arg, | 
|---|
| 181 | fz_bidi_fragment_fn *callback) | 
|---|
| 182 | { | 
|---|
| 183 | int script = UCDN_SCRIPT_COMMON; | 
|---|
| 184 | size_t script_start, i; | 
|---|
| 185 |  | 
|---|
| 186 | script_start = 0; | 
|---|
| 187 | for (i = 0; i < fragment_len; i++) | 
|---|
| 188 | { | 
|---|
| 189 | int s = ucdn_get_script(fragment[i]); | 
|---|
| 190 | if (s == UCDN_SCRIPT_COMMON || s == UCDN_SCRIPT_INHERITED) | 
|---|
| 191 | { | 
|---|
| 192 | /* Punctuation etc. This is fine. */ | 
|---|
| 193 | } | 
|---|
| 194 | else if (s == script) | 
|---|
| 195 | { | 
|---|
| 196 | /* Same script. Still fine. */ | 
|---|
| 197 | } | 
|---|
| 198 | else if (script == UCDN_SCRIPT_COMMON || script == UCDN_SCRIPT_INHERITED) | 
|---|
| 199 | { | 
|---|
| 200 | /* First non punctuation thing. Set the script. */ | 
|---|
| 201 | script = s; | 
|---|
| 202 | } | 
|---|
| 203 | else | 
|---|
| 204 | { | 
|---|
| 205 | /* Change of script. Break the fragment. */ | 
|---|
| 206 | (*callback)(&fragment[script_start], i - script_start, level, script, arg); | 
|---|
| 207 | script_start = i; | 
|---|
| 208 | script = s; | 
|---|
| 209 | } | 
|---|
| 210 | } | 
|---|
| 211 | if (script_start != fragment_len) | 
|---|
| 212 | { | 
|---|
| 213 | (*callback)(&fragment[script_start], fragment_len - script_start, level, script, arg); | 
|---|
| 214 | } | 
|---|
| 215 | } | 
|---|
| 216 |  | 
|---|
| 217 | /* Determines the character classes for all following | 
|---|
| 218 | * passes of the algorithm. A character class is basically the type of Bidi | 
|---|
| 219 | * behaviour that the character exhibits. | 
|---|
| 220 | */ | 
|---|
| 221 | static void | 
|---|
| 222 | classify_characters(const uint32_t *text, | 
|---|
| 223 | fz_bidi_chartype *types, | 
|---|
| 224 | size_t len, | 
|---|
| 225 | fz_bidi_flags flags) | 
|---|
| 226 | { | 
|---|
| 227 | size_t i; | 
|---|
| 228 |  | 
|---|
| 229 | if ((flags & FZ_BIDI_CLASSIFY_WHITE_SPACE)!=0) | 
|---|
| 230 | { | 
|---|
| 231 | for (i = 0; i < len; i++) | 
|---|
| 232 | { | 
|---|
| 233 | types[i] = class_from_ch_ws(text[i]); | 
|---|
| 234 | } | 
|---|
| 235 | } | 
|---|
| 236 | else | 
|---|
| 237 | { | 
|---|
| 238 | #ifdef DEBUG_BIDI_VERBOSE | 
|---|
| 239 | fprintf(stderr, "Text:  "); | 
|---|
| 240 | for (i = 0; i < len; i++) | 
|---|
| 241 | { | 
|---|
| 242 | /* So that we can actually sort of read the debug string, any | 
|---|
| 243 | * non-ascii characters are replaced with a 1-digit hash | 
|---|
| 244 | * value from 0-9, making non-english characters appear | 
|---|
| 245 | * as numbers | 
|---|
| 246 | */ | 
|---|
| 247 | fprintf(stderr, "%c", (text[i] <= 127 && text[i] >= 32) ? | 
|---|
| 248 | text[i] : text[i] % 9 + '0'); | 
|---|
| 249 | } | 
|---|
| 250 | fprintf(stderr, "\nTypes: "); | 
|---|
| 251 | #endif | 
|---|
| 252 | for (i = 0; i < len; i++) | 
|---|
| 253 | { | 
|---|
| 254 | types[i] = class_from_ch_n(text[i]); | 
|---|
| 255 | #ifdef DEBUG_BIDI_VERBOSE | 
|---|
| 256 | fprintf(stderr, "%c", char_from_types[(int)types[i]]); | 
|---|
| 257 | #endif | 
|---|
| 258 | } | 
|---|
| 259 | #ifdef DEBUG_BIDI_VERBOSE | 
|---|
| 260 | fprintf(stderr, "\n"); | 
|---|
| 261 | #endif | 
|---|
| 262 | } | 
|---|
| 263 | } | 
|---|
| 264 |  | 
|---|
| 265 | /* Determines the base level of the text. | 
|---|
| 266 | * Implements rule P2 of the Unicode Bidi Algorithm. | 
|---|
| 267 | * Note: Ignores explicit embeddings | 
|---|
| 268 | */ | 
|---|
| 269 | static fz_bidi_level base_level_from_text(fz_bidi_chartype *types, size_t len) | 
|---|
| 270 | { | 
|---|
| 271 | size_t i; | 
|---|
| 272 |  | 
|---|
| 273 | for (i = 0; i < len; i++) | 
|---|
| 274 | { | 
|---|
| 275 | switch (types[i]) | 
|---|
| 276 | { | 
|---|
| 277 | /* strong left */ | 
|---|
| 278 | case BDI_L: | 
|---|
| 279 | return FZ_BIDI_LTR; | 
|---|
| 280 |  | 
|---|
| 281 | /* strong right */ | 
|---|
| 282 | case BDI_R: | 
|---|
| 283 | case BDI_AL: | 
|---|
| 284 | return FZ_BIDI_RTL; | 
|---|
| 285 | } | 
|---|
| 286 | } | 
|---|
| 287 | return FZ_BIDI_LTR; | 
|---|
| 288 | } | 
|---|
| 289 |  | 
|---|
| 290 | static fz_bidi_direction direction_from_type(fz_bidi_chartype type) | 
|---|
| 291 | { | 
|---|
| 292 | switch (type) | 
|---|
| 293 | { | 
|---|
| 294 | case BDI_L: | 
|---|
| 295 | case BDI_EN: | 
|---|
| 296 | return FZ_BIDI_LTR; | 
|---|
| 297 |  | 
|---|
| 298 | case BDI_R: | 
|---|
| 299 | case BDI_AL: | 
|---|
| 300 | return FZ_BIDI_RTL; | 
|---|
| 301 |  | 
|---|
| 302 | default: | 
|---|
| 303 | return FZ_BIDI_NEUTRAL; | 
|---|
| 304 | } | 
|---|
| 305 | } | 
|---|
| 306 |  | 
|---|
| 307 | static void | 
|---|
| 308 | classify_quoted_blocks(const uint32_t *text, | 
|---|
| 309 | fz_bidi_chartype *types, | 
|---|
| 310 | size_t len) | 
|---|
| 311 | { | 
|---|
| 312 | size_t i; | 
|---|
| 313 | int inQuote = FALSE; | 
|---|
| 314 | int pdfNeeded = FALSE; | 
|---|
| 315 | int ltrFound = FALSE; | 
|---|
| 316 | int rtlFound = FALSE; | 
|---|
| 317 |  | 
|---|
| 318 | /* Only do anything special here if there is mixed content | 
|---|
| 319 | * (LTR *and* RTL) in the text. | 
|---|
| 320 | */ | 
|---|
| 321 | for (i = 0; i < len; i++) | 
|---|
| 322 | { | 
|---|
| 323 | switch (direction_from_type(types[i])) | 
|---|
| 324 | { | 
|---|
| 325 | case FZ_BIDI_LTR: | 
|---|
| 326 | ltrFound = TRUE; | 
|---|
| 327 | break; | 
|---|
| 328 |  | 
|---|
| 329 | case FZ_BIDI_RTL: | 
|---|
| 330 | rtlFound = TRUE; | 
|---|
| 331 | break; | 
|---|
| 332 |  | 
|---|
| 333 | default: | 
|---|
| 334 | break; | 
|---|
| 335 | } | 
|---|
| 336 | } | 
|---|
| 337 |  | 
|---|
| 338 | /* Only make any changes if *both* LTR and RTL characters exist | 
|---|
| 339 | * in this text. | 
|---|
| 340 | */ | 
|---|
| 341 | if (!ltrFound || !rtlFound) | 
|---|
| 342 | { | 
|---|
| 343 | return; | 
|---|
| 344 | } | 
|---|
| 345 |  | 
|---|
| 346 | for (i = 0; i < len; i++) | 
|---|
| 347 | { | 
|---|
| 348 | if (text[i]=='"') | 
|---|
| 349 | { | 
|---|
| 350 | /* If we're already in a quote then terminate it, | 
|---|
| 351 | * else start a new block. | 
|---|
| 352 | */ | 
|---|
| 353 | if (inQuote) | 
|---|
| 354 | { | 
|---|
| 355 | inQuote = FALSE; | 
|---|
| 356 | if (pdfNeeded) | 
|---|
| 357 | { | 
|---|
| 358 | pdfNeeded = FALSE; | 
|---|
| 359 | types[i] = BDI_PDF; | 
|---|
| 360 | } | 
|---|
| 361 | } | 
|---|
| 362 | else | 
|---|
| 363 | { | 
|---|
| 364 | size_t j; | 
|---|
| 365 | int done = FALSE; | 
|---|
| 366 |  | 
|---|
| 367 | inQuote = TRUE; | 
|---|
| 368 |  | 
|---|
| 369 | /* Find the first strong right or left type and | 
|---|
| 370 | * use that to determine whether we should classify | 
|---|
| 371 | * the quote as LRE or RLE. Or neither, if we | 
|---|
| 372 | * hit another quote before any strongly-directional | 
|---|
| 373 | * character. | 
|---|
| 374 | */ | 
|---|
| 375 | for (j = i + 1; !done && (j < len) && text[j] != '"'; ++j) | 
|---|
| 376 | { | 
|---|
| 377 | switch(types[j]) | 
|---|
| 378 | { | 
|---|
| 379 | case BDI_RLE: | 
|---|
| 380 | case BDI_LRE: | 
|---|
| 381 | done = TRUE; | 
|---|
| 382 | break; | 
|---|
| 383 |  | 
|---|
| 384 | case BDI_L: | 
|---|
| 385 | case BDI_EN: | 
|---|
| 386 | types[i] = BDI_LRE; | 
|---|
| 387 | pdfNeeded = TRUE; | 
|---|
| 388 | done = TRUE; | 
|---|
| 389 | break; | 
|---|
| 390 |  | 
|---|
| 391 | case BDI_R: | 
|---|
| 392 | case BDI_AL: | 
|---|
| 393 | types[i] = BDI_RLE; | 
|---|
| 394 | pdfNeeded = TRUE; | 
|---|
| 395 | done = TRUE; | 
|---|
| 396 | break; | 
|---|
| 397 |  | 
|---|
| 398 | default: | 
|---|
| 399 | break; | 
|---|
| 400 | } | 
|---|
| 401 | } | 
|---|
| 402 | } | 
|---|
| 403 | } | 
|---|
| 404 | } | 
|---|
| 405 | } | 
|---|
| 406 |  | 
|---|
| 407 | /* Creates a buffer with an embedding level for every character in the | 
|---|
| 408 | * given text. Also determines the base level and returns it in | 
|---|
| 409 | * *baseDir if *baseDir does not initially contain a valid direction. | 
|---|
| 410 | */ | 
|---|
| 411 | static fz_bidi_level * | 
|---|
| 412 | create_levels(fz_context *ctx, | 
|---|
| 413 | const uint32_t *text, | 
|---|
| 414 | size_t len, | 
|---|
| 415 | fz_bidi_direction *baseDir, | 
|---|
| 416 | int resolveWhiteSpace, | 
|---|
| 417 | int flags) | 
|---|
| 418 | { | 
|---|
| 419 | fz_bidi_level *levels, *plevels; | 
|---|
| 420 | fz_bidi_chartype *types = NULL; | 
|---|
| 421 | fz_bidi_chartype *ptypes; | 
|---|
| 422 | fz_bidi_level baseLevel; | 
|---|
| 423 | const uint32_t *ptext; | 
|---|
| 424 | size_t plen, remaining; | 
|---|
| 425 |  | 
|---|
| 426 | levels = fz_malloc(ctx, len * sizeof(*levels)); | 
|---|
| 427 |  | 
|---|
| 428 | fz_var(types); | 
|---|
| 429 |  | 
|---|
| 430 | fz_try(ctx) | 
|---|
| 431 | { | 
|---|
| 432 | types = fz_malloc(ctx, len * sizeof(fz_bidi_chartype)); | 
|---|
| 433 |  | 
|---|
| 434 | classify_characters(text, types, len, flags); | 
|---|
| 435 |  | 
|---|
| 436 | if (*baseDir != FZ_BIDI_LTR && *baseDir != FZ_BIDI_RTL) | 
|---|
| 437 | { | 
|---|
| 438 | /* Derive the base level from the text and | 
|---|
| 439 | * update *baseDir in case the caller wants to know. | 
|---|
| 440 | */ | 
|---|
| 441 | baseLevel = base_level_from_text(types, len); | 
|---|
| 442 | *baseDir = ODD(baseLevel)==1 ? FZ_BIDI_RTL : FZ_BIDI_LTR; | 
|---|
| 443 | } | 
|---|
| 444 | else | 
|---|
| 445 | { | 
|---|
| 446 | baseLevel = (fz_bidi_level)*baseDir; | 
|---|
| 447 | } | 
|---|
| 448 |  | 
|---|
| 449 | { | 
|---|
| 450 | /* Replace tab with base direction, i.e. make tab appear as | 
|---|
| 451 | * 'strong left' if the base direction is left-to-right and | 
|---|
| 452 | * 'strong right' if base direction is right-to-left. This | 
|---|
| 453 | * allows Layout to implicitly treat tabs as 'segment separators'. | 
|---|
| 454 | */ | 
|---|
| 455 | size_t i; | 
|---|
| 456 |  | 
|---|
| 457 | for (i = 0u; i < len; i++) | 
|---|
| 458 | { | 
|---|
| 459 | if (text[i]=='\t') | 
|---|
| 460 | { | 
|---|
| 461 | types[i] = (*baseDir == FZ_BIDI_RTL) ? BDI_R : BDI_L; | 
|---|
| 462 | } | 
|---|
| 463 | } | 
|---|
| 464 | } | 
|---|
| 465 |  | 
|---|
| 466 | /* Look for quotation marks. Classify them as RLE or LRE | 
|---|
| 467 | * or leave them alone, depending on what follows them. | 
|---|
| 468 | */ | 
|---|
| 469 | classify_quoted_blocks(text, types, len); | 
|---|
| 470 |  | 
|---|
| 471 | /* Work one paragraph at a time. */ | 
|---|
| 472 | plevels = levels; | 
|---|
| 473 | ptypes = types; | 
|---|
| 474 | ptext = text; | 
|---|
| 475 | remaining = len; | 
|---|
| 476 | while (remaining) | 
|---|
| 477 | { | 
|---|
| 478 | plen = fz_bidi_resolve_paragraphs(ptypes, remaining); | 
|---|
| 479 |  | 
|---|
| 480 | /* Work out the levels and character types... */ | 
|---|
| 481 | (void)fz_bidi_resolve_explicit(baseLevel, BDI_N, ptypes, plevels, plen, 0); | 
|---|
| 482 | fz_bidi_resolve_weak(ctx, baseLevel, ptypes, plevels, plen); | 
|---|
| 483 | fz_bidi_resolve_neutrals(baseLevel, ptypes, plevels, plen); | 
|---|
| 484 | fz_bidi_resolve_implicit(ptypes, plevels, plen); | 
|---|
| 485 |  | 
|---|
| 486 | classify_characters(ptext, ptypes, plen, FZ_BIDI_CLASSIFY_WHITE_SPACE); | 
|---|
| 487 |  | 
|---|
| 488 | if (resolveWhiteSpace) | 
|---|
| 489 | { | 
|---|
| 490 | /* resolve whitespace */ | 
|---|
| 491 | fz_bidi_resolve_whitespace(baseLevel, ptypes, plevels, plen); | 
|---|
| 492 | } | 
|---|
| 493 |  | 
|---|
| 494 | plevels += plen; | 
|---|
| 495 | ptypes += plen; | 
|---|
| 496 | ptext += plen; | 
|---|
| 497 | remaining -= plen; | 
|---|
| 498 | } | 
|---|
| 499 |  | 
|---|
| 500 | /* The levels buffer now has odd and even numbers indicating | 
|---|
| 501 | * rtl or ltr characters, respectively. | 
|---|
| 502 | */ | 
|---|
| 503 | #ifdef DEBUG_BIDI_VERBOSE | 
|---|
| 504 | fprintf(stderr, "Levels: "); | 
|---|
| 505 | { | 
|---|
| 506 | size_t i; | 
|---|
| 507 | for (i = 0; i < len; i++) | 
|---|
| 508 | { | 
|---|
| 509 | fprintf(stderr, "%d", levels[i]>9?0:levels[i]); | 
|---|
| 510 | } | 
|---|
| 511 | fprintf(stderr, "\n"); | 
|---|
| 512 | } | 
|---|
| 513 | #endif | 
|---|
| 514 | } | 
|---|
| 515 | fz_always(ctx) | 
|---|
| 516 | { | 
|---|
| 517 | fz_free(ctx, types); | 
|---|
| 518 | } | 
|---|
| 519 | fz_catch(ctx) | 
|---|
| 520 | { | 
|---|
| 521 | fz_free(ctx, levels); | 
|---|
| 522 | fz_rethrow(ctx); | 
|---|
| 523 | } | 
|---|
| 524 | return levels; | 
|---|
| 525 | } | 
|---|
| 526 |  | 
|---|
| 527 | /* Partitions the given character sequence into one or more unidirectional | 
|---|
| 528 | * fragments and invokes the given callback function for each fragment. | 
|---|
| 529 | */ | 
|---|
| 530 | void fz_bidi_fragment_text(fz_context *ctx, | 
|---|
| 531 | const uint32_t *text, | 
|---|
| 532 | size_t textlen, | 
|---|
| 533 | fz_bidi_direction *baseDir, | 
|---|
| 534 | fz_bidi_fragment_fn *callback, | 
|---|
| 535 | void *arg, | 
|---|
| 536 | int flags) | 
|---|
| 537 | { | 
|---|
| 538 | size_t startOfFragment; | 
|---|
| 539 | size_t i; | 
|---|
| 540 | fz_bidi_level *levels; | 
|---|
| 541 |  | 
|---|
| 542 | if (text == NULL || callback == NULL || textlen == 0) | 
|---|
| 543 | return; | 
|---|
| 544 |  | 
|---|
| 545 | DBUGH(( "fz_bidi_fragment_text('%S', len = %d)\n", text, textlen)); | 
|---|
| 546 |  | 
|---|
| 547 | levels = create_levels(ctx, text, textlen, baseDir, FALSE, flags); | 
|---|
| 548 |  | 
|---|
| 549 | /* We now have an array with an embedding level | 
|---|
| 550 | * for each character in text. | 
|---|
| 551 | */ | 
|---|
| 552 | assert(levels != NULL); | 
|---|
| 553 |  | 
|---|
| 554 | fz_try(ctx) | 
|---|
| 555 | { | 
|---|
| 556 | startOfFragment = 0; | 
|---|
| 557 | for (i = 1; i < textlen; i++) | 
|---|
| 558 | { | 
|---|
| 559 | if (levels[i] != levels[i-1]) | 
|---|
| 560 | { | 
|---|
| 561 | /* We've gone past the end of the fragment. | 
|---|
| 562 | * Create a text object for it, then start | 
|---|
| 563 | * a new fragment. | 
|---|
| 564 | */ | 
|---|
| 565 | split_at_script(&text[startOfFragment], | 
|---|
| 566 | i - startOfFragment, | 
|---|
| 567 | levels[startOfFragment], | 
|---|
| 568 | arg, | 
|---|
| 569 | callback); | 
|---|
| 570 | startOfFragment = i; | 
|---|
| 571 | } | 
|---|
| 572 | } | 
|---|
| 573 | /* Now i == textlen. Deal with the final (or maybe only) fragment. */ | 
|---|
| 574 | /* otherwise create 1 fragment */ | 
|---|
| 575 | split_at_script(&text[startOfFragment], | 
|---|
| 576 | i - startOfFragment, | 
|---|
| 577 | levels[startOfFragment], | 
|---|
| 578 | arg, | 
|---|
| 579 | callback); | 
|---|
| 580 | } | 
|---|
| 581 | fz_always(ctx) | 
|---|
| 582 | { | 
|---|
| 583 | fz_free(ctx, levels); | 
|---|
| 584 | } | 
|---|
| 585 | fz_catch(ctx) | 
|---|
| 586 | { | 
|---|
| 587 | fz_rethrow(ctx); | 
|---|
| 588 | } | 
|---|
| 589 | } | 
|---|
| 590 |  | 
|---|