1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5*
6* Copyright (C) 1999-2016, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9******************************************************************************
10* file name: ubidiimp.h
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 1999aug06
16* created by: Markus W. Scherer, updated by Matitiahu Allouche
17*/
18
19#ifndef UBIDIIMP_H
20#define UBIDIIMP_H
21
22#include "unicode/utypes.h"
23#include "unicode/ubidi.h"
24#include "unicode/uchar.h"
25#include "ubidi_props.h"
26
27/* miscellaneous definitions ---------------------------------------------- */
28
29// ICU-20853=ICU-20935 Solaris #defines CS and ES in sys/regset.h
30#ifdef CS
31# undef CS
32#endif
33#ifdef ES
34# undef ES
35#endif
36
37typedef uint8_t DirProp;
38typedef uint32_t Flags;
39
40/* Comparing the description of the BiDi algorithm with this implementation
41 is easier with the same names for the BiDi types in the code as there.
42 See UCharDirection in uchar.h .
43*/
44enum {
45 L= U_LEFT_TO_RIGHT, /* 0 */
46 R= U_RIGHT_TO_LEFT, /* 1 */
47 EN= U_EUROPEAN_NUMBER, /* 2 */
48 ES= U_EUROPEAN_NUMBER_SEPARATOR, /* 3 */
49 ET= U_EUROPEAN_NUMBER_TERMINATOR, /* 4 */
50 AN= U_ARABIC_NUMBER, /* 5 */
51 CS= U_COMMON_NUMBER_SEPARATOR, /* 6 */
52 B= U_BLOCK_SEPARATOR, /* 7 */
53 S= U_SEGMENT_SEPARATOR, /* 8 */
54 WS= U_WHITE_SPACE_NEUTRAL, /* 9 */
55 ON= U_OTHER_NEUTRAL, /* 10 */
56 LRE=U_LEFT_TO_RIGHT_EMBEDDING, /* 11 */
57 LRO=U_LEFT_TO_RIGHT_OVERRIDE, /* 12 */
58 AL= U_RIGHT_TO_LEFT_ARABIC, /* 13 */
59 RLE=U_RIGHT_TO_LEFT_EMBEDDING, /* 14 */
60 RLO=U_RIGHT_TO_LEFT_OVERRIDE, /* 15 */
61 PDF=U_POP_DIRECTIONAL_FORMAT, /* 16 */
62 NSM=U_DIR_NON_SPACING_MARK, /* 17 */
63 BN= U_BOUNDARY_NEUTRAL, /* 18 */
64 FSI=U_FIRST_STRONG_ISOLATE, /* 19 */
65 LRI=U_LEFT_TO_RIGHT_ISOLATE, /* 20 */
66 RLI=U_RIGHT_TO_LEFT_ISOLATE, /* 21 */
67 PDI=U_POP_DIRECTIONAL_ISOLATE, /* 22 */
68 ENL, /* EN after W7 */ /* 23 */
69 ENR, /* EN not subject to W7 */ /* 24 */
70 dirPropCount
71};
72
73/* Sometimes, bit values are more appropriate
74 to deal with directionality properties.
75 Abbreviations in these macro names refer to names
76 used in the BiDi algorithm.
77*/
78#define DIRPROP_FLAG(dir) (1UL<<(dir))
79#define PURE_DIRPROP(prop) ((prop)&~0xE0) ?????????????????????????
80
81/* special flag for multiple runs from explicit embedding codes */
82#define DIRPROP_FLAG_MULTI_RUNS (1UL<<31)
83
84/* are there any characters that are LTR or RTL? */
85#define MASK_LTR (DIRPROP_FLAG(L)|DIRPROP_FLAG(EN)|DIRPROP_FLAG(ENL)|DIRPROP_FLAG(ENR)|DIRPROP_FLAG(AN)|DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(LRI))
86#define MASK_RTL (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)|DIRPROP_FLAG(RLI))
87#define MASK_R_AL (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL))
88#define MASK_STRONG_EN_AN (DIRPROP_FLAG(L)|DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(EN)|DIRPROP_FLAG(AN))
89
90/* explicit embedding codes */
91#define MASK_EXPLICIT (DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)|DIRPROP_FLAG(PDF))
92
93/* explicit isolate codes */
94#define MASK_ISO (DIRPROP_FLAG(LRI)|DIRPROP_FLAG(RLI)|DIRPROP_FLAG(FSI)|DIRPROP_FLAG(PDI))
95
96#define MASK_BN_EXPLICIT (DIRPROP_FLAG(BN)|MASK_EXPLICIT)
97
98/* paragraph and segment separators */
99#define MASK_B_S (DIRPROP_FLAG(B)|DIRPROP_FLAG(S))
100
101/* all types that are counted as White Space or Neutral in some steps */
102#define MASK_WS (MASK_B_S|DIRPROP_FLAG(WS)|MASK_BN_EXPLICIT|MASK_ISO)
103
104/* types that are neutrals or could becomes neutrals in (Wn) */
105#define MASK_POSSIBLE_N (DIRPROP_FLAG(ON)|DIRPROP_FLAG(CS)|DIRPROP_FLAG(ES)|DIRPROP_FLAG(ET)|MASK_WS)
106
107/*
108 * These types may be changed to "e",
109 * the embedding type (L or R) of the run,
110 * in the BiDi algorithm (N2)
111 */
112#define MASK_EMBEDDING (DIRPROP_FLAG(NSM)|MASK_POSSIBLE_N)
113
114/* the dirProp's L and R are defined to 0 and 1 values in UCharDirection */
115#define GET_LR_FROM_LEVEL(level) ((DirProp)((level)&1))
116
117#define IS_DEFAULT_LEVEL(level) ((level)>=0xfe)
118
119/*
120 * The following bit is used for the directional isolate status.
121 * Stack entries corresponding to isolate sequences are greater than ISOLATE.
122 */
123#define ISOLATE 0x0100
124
125U_CFUNC UBiDiLevel
126ubidi_getParaLevelAtIndex(const UBiDi *pBiDi, int32_t index);
127
128#define GET_PARALEVEL(ubidi, index) \
129 ((UBiDiLevel)(!(ubidi)->defaultParaLevel || (index)<(ubidi)->paras[0].limit ? \
130 (ubidi)->paraLevel : ubidi_getParaLevelAtIndex((ubidi), (index))))
131
132/* number of paras entries allocated initially without malloc */
133#define SIMPLE_PARAS_COUNT 10
134/* number of isolate entries allocated initially without malloc */
135#define SIMPLE_ISOLATES_COUNT 5
136/* number of isolate run entries for paired brackets allocated initially without malloc */
137#define SIMPLE_OPENINGS_COUNT 20
138
139#define CR 0x000D
140#define LF 0x000A
141
142/* Run structure for reordering --------------------------------------------- */
143enum {
144 LRM_BEFORE=1,
145 LRM_AFTER=2,
146 RLM_BEFORE=4,
147 RLM_AFTER=8
148};
149
150typedef struct Para {
151 int32_t limit;
152 int32_t level;
153} Para;
154
155enum { /* flags for Opening.flags */
156 FOUND_L=DIRPROP_FLAG(L),
157 FOUND_R=DIRPROP_FLAG(R)
158};
159
160typedef struct Opening {
161 int32_t position; /* position of opening bracket */
162 int32_t match; /* matching char or -position of closing bracket */
163 int32_t contextPos; /* position of last strong char found before opening */
164 uint16_t flags; /* bits for L or R/AL found within the pair */
165 UBiDiDirection contextDir; /* L or R according to last strong char before opening */
166 uint8_t filler; /* to complete a nice multiple of 4 chars */
167} Opening;
168
169typedef struct IsoRun {
170 int32_t contextPos; /* position of char determining context */
171 uint16_t start; /* index of first opening entry for this run */
172 uint16_t limit; /* index after last opening entry for this run */
173 UBiDiLevel level; /* level of this run */
174 DirProp lastStrong; /* bidi class of last strong char found in this run */
175 DirProp lastBase; /* bidi class of last base char found in this run */
176 UBiDiDirection contextDir; /* L or R to use as context for following openings */
177} IsoRun;
178
179typedef struct BracketData {
180 UBiDi *pBiDi;
181 /* array of opening entries which should be enough in most cases; no malloc() */
182 Opening simpleOpenings[SIMPLE_OPENINGS_COUNT];
183 Opening *openings; /* pointer to current array of entries */
184 int32_t openingsCount; /* number of allocated entries */
185 int32_t isoRunLast; /* index of last used entry */
186 /* array of nested isolated sequence entries; can never excess UBIDI_MAX_EXPLICIT_LEVEL
187 + 1 for index 0, + 1 for before the first isolated sequence */
188 IsoRun isoRuns[UBIDI_MAX_EXPLICIT_LEVEL+2];
189 UBool isNumbersSpecial; /* reordering mode for NUMBERS_SPECIAL */
190} BracketData;
191
192typedef struct Isolate {
193 int32_t startON;
194 int32_t start1;
195 int32_t state;
196 int16_t stateImp;
197} Isolate;
198
199typedef struct Run {
200 int32_t logicalStart, /* first character of the run; b31 indicates even/odd level */
201 visualLimit, /* last visual position of the run +1 */
202 insertRemove; /* if >0, flags for inserting LRM/RLM before/after run,
203 if <0, count of bidi controls within run */
204} Run;
205
206/* in a Run, logicalStart will get this bit set if the run level is odd */
207#define INDEX_ODD_BIT (1UL<<31)
208
209#define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((int32_t)((level)&1)<<31))
210#define ADD_ODD_BIT_FROM_LEVEL(x, level) ((x)|=((int32_t)((level)&1)<<31))
211#define REMOVE_ODD_BIT(x) ((x)&=~INDEX_ODD_BIT)
212
213#define GET_INDEX(x) ((x)&~INDEX_ODD_BIT)
214#define GET_ODD_BIT(x) ((uint32_t)(x)>>31)
215#define IS_ODD_RUN(x) ((UBool)(((x)&INDEX_ODD_BIT)!=0))
216#define IS_EVEN_RUN(x) ((UBool)(((x)&INDEX_ODD_BIT)==0))
217
218U_CFUNC UBool
219ubidi_getRuns(UBiDi *pBiDi, UErrorCode *pErrorCode);
220
221/** BiDi control code points */
222enum {
223 ZWNJ_CHAR=0x200c,
224 ZWJ_CHAR,
225 LRM_CHAR,
226 RLM_CHAR,
227 LRE_CHAR=0x202a,
228 RLE_CHAR,
229 PDF_CHAR,
230 LRO_CHAR,
231 RLO_CHAR,
232 LRI_CHAR=0x2066,
233 RLI_CHAR,
234 FSI_CHAR,
235 PDI_CHAR
236};
237
238#define IS_BIDI_CONTROL_CHAR(c) (((uint32_t)(c)&0xfffffffc)==ZWNJ_CHAR || (uint32_t)((c)-LRE_CHAR)<5 || (uint32_t)((c)-LRI_CHAR)<4)
239
240/* InsertPoints structure for noting where to put BiDi marks ---------------- */
241
242typedef struct Point {
243 int32_t pos; /* position in text */
244 int32_t flag; /* flag for LRM/RLM, before/after */
245} Point;
246
247typedef struct InsertPoints {
248 int32_t capacity; /* number of points allocated */
249 int32_t size; /* number of points used */
250 int32_t confirmed; /* number of points confirmed */
251 UErrorCode errorCode; /* for eventual memory shortage */
252 Point *points; /* pointer to array of points */
253} InsertPoints;
254
255
256/* UBiDi structure ----------------------------------------------------------- */
257
258struct UBiDi {
259 /* pointer to parent paragraph object (pointer to self if this object is
260 * a paragraph object); set to NULL in a newly opened object; set to a
261 * real value after a successful execution of ubidi_setPara or ubidi_setLine
262 */
263 const UBiDi * pParaBiDi;
264
265 /* alias pointer to the current text */
266 const UChar *text;
267
268 /* length of the current text */
269 int32_t originalLength;
270
271 /* if the UBIDI_OPTION_STREAMING option is set, this is the length
272 * of text actually processed by ubidi_setPara, which may be shorter than
273 * the original length.
274 * Otherwise, it is identical to the original length.
275 */
276 int32_t length;
277
278 /* if the UBIDI_OPTION_REMOVE_CONTROLS option is set, and/or
279 * marks are allowed to be inserted in one of the reordering mode, the
280 * length of the result string may be different from the processed length.
281 */
282 int32_t resultLength;
283
284 /* memory sizes in bytes */
285 int32_t dirPropsSize, levelsSize, openingsSize, parasSize, runsSize, isolatesSize;
286
287 /* allocated memory */
288 DirProp *dirPropsMemory;
289 UBiDiLevel *levelsMemory;
290 Opening *openingsMemory;
291 Para *parasMemory;
292 Run *runsMemory;
293 Isolate *isolatesMemory;
294
295 /* indicators for whether memory may be allocated after ubidi_open() */
296 UBool mayAllocateText, mayAllocateRuns;
297
298 /* arrays with one value per text-character */
299 DirProp *dirProps;
300 UBiDiLevel *levels;
301
302 /* are we performing an approximation of the "inverse BiDi" algorithm? */
303 UBool isInverse;
304
305 /* are we using the basic algorithm or its variation? */
306 UBiDiReorderingMode reorderingMode;
307
308 /* UBIDI_REORDER_xxx values must be ordered so that all the regular
309 * logical to visual modes come first, and all inverse BiDi modes
310 * come last.
311 */
312 #define UBIDI_REORDER_LAST_LOGICAL_TO_VISUAL UBIDI_REORDER_NUMBERS_SPECIAL
313
314 /* bitmask for reordering options */
315 uint32_t reorderingOptions;
316
317 /* must block separators receive level 0? */
318 UBool orderParagraphsLTR;
319
320 /* the paragraph level */
321 UBiDiLevel paraLevel;
322 /* original paraLevel when contextual */
323 /* must be one of UBIDI_DEFAULT_xxx or 0 if not contextual */
324 UBiDiLevel defaultParaLevel;
325
326 /* context data */
327 const UChar *prologue;
328 int32_t proLength;
329 const UChar *epilogue;
330 int32_t epiLength;
331
332 /* the following is set in ubidi_setPara, used in processPropertySeq */
333 const struct ImpTabPair * pImpTabPair; /* pointer to levels state table pair */
334
335 /* the overall paragraph or line directionality - see UBiDiDirection */
336 UBiDiDirection direction;
337
338 /* flags is a bit set for which directional properties are in the text */
339 Flags flags;
340
341 /* lastArabicPos is index to the last AL in the text, -1 if none */
342 int32_t lastArabicPos;
343
344 /* characters after trailingWSStart are WS and are */
345 /* implicitly at the paraLevel (rule (L1)) - levels may not reflect that */
346 int32_t trailingWSStart;
347
348 /* fields for paragraph handling */
349 int32_t paraCount; /* set in getDirProps() */
350 /* filled in getDirProps() */
351 Para *paras;
352
353 /* for relatively short text, we only need a tiny array of paras (no malloc()) */
354 Para simpleParas[SIMPLE_PARAS_COUNT];
355
356 /* fields for line reordering */
357 int32_t runCount; /* ==-1: runs not set up yet */
358 Run *runs;
359
360 /* for non-mixed text, we only need a tiny array of runs (no malloc()) */
361 Run simpleRuns[1];
362
363 /* maximum or current nesting depth of isolate sequences */
364 /* Within resolveExplicitLevels() and checkExplicitLevels(), this is the maximal
365 nesting encountered.
366 Within resolveImplicitLevels(), this is the index of the current isolates
367 stack entry. */
368 int32_t isolateCount;
369 Isolate *isolates;
370
371 /* for simple text, have a small stack (no malloc()) */
372 Isolate simpleIsolates[SIMPLE_ISOLATES_COUNT];
373
374 /* for inverse Bidi with insertion of directional marks */
375 InsertPoints insertPoints;
376
377 /* for option UBIDI_OPTION_REMOVE_CONTROLS */
378 int32_t controlCount;
379
380 /* for Bidi class callback */
381 UBiDiClassCallback *fnClassCallback; /* action pointer */
382 const void *coClassCallback; /* context pointer */
383};
384
385#define IS_VALID_PARA(x) ((x) && ((x)->pParaBiDi==(x)))
386#define IS_VALID_PARA_OR_LINE(x) ((x) && ((x)->pParaBiDi==(x) || (((x)->pParaBiDi) && (x)->pParaBiDi->pParaBiDi==(x)->pParaBiDi)))
387
388typedef union {
389 DirProp *dirPropsMemory;
390 UBiDiLevel *levelsMemory;
391 Opening *openingsMemory;
392 Para *parasMemory;
393 Run *runsMemory;
394 Isolate *isolatesMemory;
395} BidiMemoryForAllocation;
396
397/* Macros for initial checks at function entry */
398#define RETURN_IF_NULL_OR_FAILING_ERRCODE(pErrcode, retvalue) UPRV_BLOCK_MACRO_BEGIN { \
399 if((pErrcode)==NULL || U_FAILURE(*pErrcode)) return retvalue; \
400} UPRV_BLOCK_MACRO_END
401#define RETURN_IF_NOT_VALID_PARA(bidi, errcode, retvalue) UPRV_BLOCK_MACRO_BEGIN { \
402 if(!IS_VALID_PARA(bidi)) { \
403 errcode=U_INVALID_STATE_ERROR; \
404 return retvalue; \
405 } \
406} UPRV_BLOCK_MACRO_END
407#define RETURN_IF_NOT_VALID_PARA_OR_LINE(bidi, errcode, retvalue) UPRV_BLOCK_MACRO_BEGIN { \
408 if(!IS_VALID_PARA_OR_LINE(bidi)) { \
409 errcode=U_INVALID_STATE_ERROR; \
410 return retvalue; \
411 } \
412} UPRV_BLOCK_MACRO_END
413#define RETURN_IF_BAD_RANGE(arg, start, limit, errcode, retvalue) UPRV_BLOCK_MACRO_BEGIN { \
414 if((arg)<(start) || (arg)>=(limit)) { \
415 (errcode)=U_ILLEGAL_ARGUMENT_ERROR; \
416 return retvalue; \
417 } \
418} UPRV_BLOCK_MACRO_END
419
420#define RETURN_VOID_IF_NULL_OR_FAILING_ERRCODE(pErrcode) UPRV_BLOCK_MACRO_BEGIN { \
421 if((pErrcode)==NULL || U_FAILURE(*pErrcode)) return; \
422} UPRV_BLOCK_MACRO_END
423#define RETURN_VOID_IF_NOT_VALID_PARA(bidi, errcode) UPRV_BLOCK_MACRO_BEGIN { \
424 if(!IS_VALID_PARA(bidi)) { \
425 errcode=U_INVALID_STATE_ERROR; \
426 return; \
427 } \
428} UPRV_BLOCK_MACRO_END
429#define RETURN_VOID_IF_NOT_VALID_PARA_OR_LINE(bidi, errcode) UPRV_BLOCK_MACRO_BEGIN { \
430 if(!IS_VALID_PARA_OR_LINE(bidi)) { \
431 errcode=U_INVALID_STATE_ERROR; \
432 return; \
433 } \
434} UPRV_BLOCK_MACRO_END
435#define RETURN_VOID_IF_BAD_RANGE(arg, start, limit, errcode) UPRV_BLOCK_MACRO_BEGIN { \
436 if((arg)<(start) || (arg)>=(limit)) { \
437 (errcode)=U_ILLEGAL_ARGUMENT_ERROR; \
438 return; \
439 } \
440} UPRV_BLOCK_MACRO_END
441
442/* helper function to (re)allocate memory if allowed */
443U_CFUNC UBool
444ubidi_getMemory(BidiMemoryForAllocation *pMemory, int32_t *pSize, UBool mayAllocate, int32_t sizeNeeded);
445
446/* helper macros for each allocated array in UBiDi */
447#define getDirPropsMemory(pBiDi, length) \
448 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->dirPropsMemory, &(pBiDi)->dirPropsSize, \
449 (pBiDi)->mayAllocateText, (length))
450
451#define getLevelsMemory(pBiDi, length) \
452 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->levelsMemory, &(pBiDi)->levelsSize, \
453 (pBiDi)->mayAllocateText, (length))
454
455#define getRunsMemory(pBiDi, length) \
456 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->runsMemory, &(pBiDi)->runsSize, \
457 (pBiDi)->mayAllocateRuns, (length)*sizeof(Run))
458
459/* additional macros used by ubidi_open() - always allow allocation */
460#define getInitialDirPropsMemory(pBiDi, length) \
461 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->dirPropsMemory, &(pBiDi)->dirPropsSize, \
462 true, (length))
463
464#define getInitialLevelsMemory(pBiDi, length) \
465 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->levelsMemory, &(pBiDi)->levelsSize, \
466 true, (length))
467
468#define getInitialOpeningsMemory(pBiDi, length) \
469 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->openingsMemory, &(pBiDi)->openingsSize, \
470 true, (length)*sizeof(Opening))
471
472#define getInitialParasMemory(pBiDi, length) \
473 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->parasMemory, &(pBiDi)->parasSize, \
474 true, (length)*sizeof(Para))
475
476#define getInitialRunsMemory(pBiDi, length) \
477 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->runsMemory, &(pBiDi)->runsSize, \
478 true, (length)*sizeof(Run))
479
480#define getInitialIsolatesMemory(pBiDi, length) \
481 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->isolatesMemory, &(pBiDi)->isolatesSize, \
482 true, (length)*sizeof(Isolate))
483
484#endif
485