1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (C) 1999-2016, International Business Machines
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8*
9* File USC_IMPL.C
10*
11* Modification History:
12*
13* Date Name Description
14* 07/08/2002 Eric Mader Creation.
15******************************************************************************
16*/
17
18#include "unicode/uscript.h"
19#include "usc_impl.h"
20#include "cmemory.h"
21
/* Number of slots in the circular stack of open paired characters. */
#define PAREN_STACK_DEPTH 32

/* Reduce a stack index modulo the stack depth. */
#define MOD(sp)                       ((sp) % PAREN_STACK_DEPTH)

/* Add one to a counter, saturating at PAREN_STACK_DEPTH. */
#define LIMIT_INC(sp)                 (((sp) < PAREN_STACK_DEPTH) ? (sp) + 1 : PAREN_STACK_DEPTH)

/* Move a stack index forward by count slots, wrapping around. */
#define INC(sp, count)                (MOD((sp) + (count)))
#define INC1(sp)                      (INC(sp, 1))

/*
 * Move a stack index backward by count slots, wrapping around.
 * PAREN_STACK_DEPTH is added before the subtraction is reduced.
 */
#define DEC(sp, count)                (MOD((sp) + PAREN_STACK_DEPTH - (count)))
#define DEC1(sp)                      (DEC(sp, 1))

/* The stack is empty when nothing is currently counted as pushed. */
#define STACK_IS_EMPTY(scriptRun)     ((scriptRun)->pushCount <= 0)
#define STACK_IS_NOT_EMPTY(scriptRun) ((scriptRun)->pushCount > 0)

/* The entry at the current stack pointer. */
#define TOP(scriptRun)                ((scriptRun)->parenStack[(scriptRun)->parenSP])

/* Forget any pending fixups. */
#define SYNC_FIXUP(scriptRun)         ((scriptRun)->fixupCount = 0)
34
35struct ParenStackEntry
36{
37 int32_t pairIndex;
38 UScriptCode scriptCode;
39};
40
41struct UScriptRun
42{
43 int32_t textLength;
44 const char16_t *textArray;
45
46 int32_t scriptStart;
47 int32_t scriptLimit;
48 UScriptCode scriptCode;
49
50 struct ParenStackEntry parenStack[PAREN_STACK_DEPTH];
51 int32_t parenSP;
52 int32_t pushCount;
53 int32_t fixupCount;
54};
55
/* Forward declaration; highBit() is defined further down in this file. */
static int8_t
highBit(int32_t value);
57
58static const UChar32 pairedChars[] = {
59 0x0028, 0x0029, /* ascii paired punctuation */
60 0x003c, 0x003e,
61 0x005b, 0x005d,
62 0x007b, 0x007d,
63 0x00ab, 0x00bb, /* guillemets */
64 0x2018, 0x2019, /* general punctuation */
65 0x201c, 0x201d,
66 0x2039, 0x203a,
67 0x3008, 0x3009, /* chinese paired punctuation */
68 0x300a, 0x300b,
69 0x300c, 0x300d,
70 0x300e, 0x300f,
71 0x3010, 0x3011,
72 0x3014, 0x3015,
73 0x3016, 0x3017,
74 0x3018, 0x3019,
75 0x301a, 0x301b
76};
77
78static void push(UScriptRun *scriptRun, int32_t pairIndex, UScriptCode scriptCode)
79{
80 scriptRun->pushCount = LIMIT_INC(scriptRun->pushCount);
81 scriptRun->fixupCount = LIMIT_INC(scriptRun->fixupCount);
82
83 scriptRun->parenSP = INC1(scriptRun->parenSP);
84 scriptRun->parenStack[scriptRun->parenSP].pairIndex = pairIndex;
85 scriptRun->parenStack[scriptRun->parenSP].scriptCode = scriptCode;
86}
87
88static void pop(UScriptRun *scriptRun)
89{
90 if (STACK_IS_EMPTY(scriptRun)) {
91 return;
92 }
93
94 if (scriptRun->fixupCount > 0) {
95 scriptRun->fixupCount -= 1;
96 }
97
98 scriptRun->pushCount -= 1;
99 scriptRun->parenSP = DEC1(scriptRun->parenSP);
100
101 /* If the stack is now empty, reset the stack
102 pointers to their initial values.
103 */
104 if (STACK_IS_EMPTY(scriptRun)) {
105 scriptRun->parenSP = -1;
106 }
107}
108
109static void fixup(UScriptRun *scriptRun, UScriptCode scriptCode)
110{
111 int32_t fixupSP = DEC(scriptRun->parenSP, scriptRun->fixupCount);
112
113 while (scriptRun->fixupCount-- > 0) {
114 fixupSP = INC1(fixupSP);
115 scriptRun->parenStack[fixupSP].scriptCode = scriptCode;
116 }
117}
118
/*
 * Find the position of the highest set bit in a positive value.
 *
 * @param value the number to examine
 * @return the zero-based index of the most significant set bit, or -32 if
 *         value is zero or negative
 */
static int8_t
highBit(int32_t value)
{
    int8_t bit = 0;
    int32_t shift;

    if (value <= 0) {
        return -32;
    }

    /*
     * Halve the search window each step (16, 8, 4, 2, 1): whenever the value
     * still has bits at or above the window, drop the low part and credit
     * the window size to the result.
     */
    for (shift = 16; shift > 0; shift >>= 1) {
        if (value >= ((int32_t)1 << shift)) {
            value >>= shift;
            bit = (int8_t)(bit + shift);
        }
    }

    return bit;
}
155
156static int32_t
157getPairIndex(UChar32 ch)
158{
159 int32_t pairedCharCount = UPRV_LENGTHOF(pairedChars);
160 int32_t pairedCharPower = 1 << highBit(pairedCharCount);
161 int32_t pairedCharExtra = pairedCharCount - pairedCharPower;
162
163 int32_t probe = pairedCharPower;
164 int32_t pairIndex = 0;
165
166 if (ch >= pairedChars[pairedCharExtra]) {
167 pairIndex = pairedCharExtra;
168 }
169
170 while (probe > (1 << 0)) {
171 probe >>= 1;
172
173 if (ch >= pairedChars[pairIndex + probe]) {
174 pairIndex += probe;
175 }
176 }
177
178 if (pairedChars[pairIndex] != ch) {
179 pairIndex = -1;
180 }
181
182 return pairIndex;
183}
184
185static UBool
186sameScript(UScriptCode scriptOne, UScriptCode scriptTwo)
187{
188 return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo;
189}
190
191U_CAPI UScriptRun * U_EXPORT2
192uscript_openRun(const char16_t *src, int32_t length, UErrorCode *pErrorCode)
193{
194 UScriptRun *result = nullptr;
195
196 if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) {
197 return nullptr;
198 }
199
200 result = (UScriptRun *)uprv_malloc(sizeof (UScriptRun));
201
202 if (result == nullptr) {
203 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
204 return nullptr;
205 }
206
207 uscript_setRunText(result, src, length, pErrorCode);
208
209 /* Release the UScriptRun if uscript_setRunText() returns an error */
210 if (U_FAILURE(*pErrorCode)) {
211 uprv_free(result);
212 result = nullptr;
213 }
214
215 return result;
216}
217
218U_CAPI void U_EXPORT2
219uscript_closeRun(UScriptRun *scriptRun)
220{
221 if (scriptRun != nullptr) {
222 uprv_free(scriptRun);
223 }
224}
225
226U_CAPI void U_EXPORT2
227uscript_resetRun(UScriptRun *scriptRun)
228{
229 if (scriptRun != nullptr) {
230 scriptRun->scriptStart = 0;
231 scriptRun->scriptLimit = 0;
232 scriptRun->scriptCode = USCRIPT_INVALID_CODE;
233 scriptRun->parenSP = -1;
234 scriptRun->pushCount = 0;
235 scriptRun->fixupCount = 0;
236 }
237}
238
239U_CAPI void U_EXPORT2
240uscript_setRunText(UScriptRun *scriptRun, const char16_t *src, int32_t length, UErrorCode *pErrorCode)
241{
242 if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) {
243 return;
244 }
245
246 if (scriptRun == nullptr || length < 0 || ((src == nullptr) != (length == 0))) {
247 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
248 return;
249 }
250
251 scriptRun->textArray = src;
252 scriptRun->textLength = length;
253
254 uscript_resetRun(scriptRun);
255}
256
257U_CAPI UBool U_EXPORT2
258uscript_nextRun(UScriptRun *scriptRun, int32_t *pRunStart, int32_t *pRunLimit, UScriptCode *pRunScript)
259{
260 UErrorCode error = U_ZERO_ERROR;
261
262 /* if we've fallen off the end of the text, we're done */
263 if (scriptRun == nullptr || scriptRun->scriptLimit >= scriptRun->textLength) {
264 return false;
265 }
266
267 SYNC_FIXUP(scriptRun);
268 scriptRun->scriptCode = USCRIPT_COMMON;
269
270 for (scriptRun->scriptStart = scriptRun->scriptLimit; scriptRun->scriptLimit < scriptRun->textLength; scriptRun->scriptLimit += 1) {
271 char16_t high = scriptRun->textArray[scriptRun->scriptLimit];
272 UChar32 ch = high;
273 UScriptCode sc;
274 int32_t pairIndex;
275
276 /*
277 * if the character is a high surrogate and it's not the last one
278 * in the text, see if it's followed by a low surrogate
279 */
280 if (high >= 0xD800 && high <= 0xDBFF && scriptRun->scriptLimit < scriptRun->textLength - 1) {
281 char16_t low = scriptRun->textArray[scriptRun->scriptLimit + 1];
282
283 /*
284 * if it is followed by a low surrogate,
285 * consume it and form the full character
286 */
287 if (low >= 0xDC00 && low <= 0xDFFF) {
288 ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000;
289 scriptRun->scriptLimit += 1;
290 }
291 }
292
293 sc = uscript_getScript(ch, &error);
294 pairIndex = getPairIndex(ch);
295
296 /*
297 * Paired character handling:
298 *
299 * if it's an open character, push it onto the stack.
300 * if it's a close character, find the matching open on the
301 * stack, and use that script code. Any non-matching open
302 * characters above it on the stack will be poped.
303 */
304 if (pairIndex >= 0) {
305 if ((pairIndex & 1) == 0) {
306 push(scriptRun, pairIndex, scriptRun->scriptCode);
307 } else {
308 int32_t pi = pairIndex & ~1;
309
310 while (STACK_IS_NOT_EMPTY(scriptRun) && TOP(scriptRun).pairIndex != pi) {
311 pop(scriptRun);
312 }
313
314 if (STACK_IS_NOT_EMPTY(scriptRun)) {
315 sc = TOP(scriptRun).scriptCode;
316 }
317 }
318 }
319
320 if (sameScript(scriptRun->scriptCode, sc)) {
321 if (scriptRun->scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) {
322 scriptRun->scriptCode = sc;
323
324 fixup(scriptRun, scriptRun->scriptCode);
325 }
326
327 /*
328 * if this character is a close paired character,
329 * pop the matching open character from the stack
330 */
331 if (pairIndex >= 0 && (pairIndex & 1) != 0) {
332 pop(scriptRun);
333 }
334 } else {
335 /*
336 * if the run broke on a surrogate pair,
337 * end it before the high surrogate
338 */
339 if (ch >= 0x10000) {
340 scriptRun->scriptLimit -= 1;
341 }
342
343 break;
344 }
345 }
346
347
348 if (pRunStart != nullptr) {
349 *pRunStart = scriptRun->scriptStart;
350 }
351
352 if (pRunLimit != nullptr) {
353 *pRunLimit = scriptRun->scriptLimit;
354 }
355
356 if (pRunScript != nullptr) {
357 *pRunScript = scriptRun->scriptCode;
358 }
359
360 return true;
361}
362