1 | // Copyright (C) 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | |
4 | // file: rbbi_cache.cpp |
5 | |
6 | #include "unicode/utypes.h" |
7 | |
8 | #if !UCONFIG_NO_BREAK_ITERATION |
9 | |
10 | #include "unicode/ubrk.h" |
11 | #include "unicode/rbbi.h" |
12 | |
13 | #include "rbbi_cache.h" |
14 | |
15 | #include "brkeng.h" |
16 | #include "cmemory.h" |
17 | #include "rbbidata.h" |
18 | #include "rbbirb.h" |
19 | #include "uassert.h" |
20 | #include "uvectr32.h" |
21 | |
22 | U_NAMESPACE_BEGIN |
23 | |
24 | /* |
25 | * DictionaryCache implementation |
26 | */ |
27 | |
28 | RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) : |
29 | fBI(bi), fBreaks(status), fPositionInCache(-1), |
30 | fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) { |
31 | } |
32 | |
33 | RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() { |
34 | } |
35 | |
36 | void RuleBasedBreakIterator::DictionaryCache::reset() { |
37 | fPositionInCache = -1; |
38 | fStart = 0; |
39 | fLimit = 0; |
40 | fFirstRuleStatusIndex = 0; |
41 | fOtherRuleStatusIndex = 0; |
42 | fBreaks.removeAllElements(); |
43 | } |
44 | |
45 | UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) { |
46 | if (fromPos >= fLimit || fromPos < fStart) { |
47 | fPositionInCache = -1; |
48 | return false; |
49 | } |
50 | |
51 | // Sequential iteration, move from previous boundary to the following |
52 | |
53 | int32_t r = 0; |
54 | if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) { |
55 | ++fPositionInCache; |
56 | if (fPositionInCache >= fBreaks.size()) { |
57 | fPositionInCache = -1; |
58 | return false; |
59 | } |
60 | r = fBreaks.elementAti(fPositionInCache); |
61 | U_ASSERT(r > fromPos); |
62 | *result = r; |
63 | *statusIndex = fOtherRuleStatusIndex; |
64 | return true; |
65 | } |
66 | |
67 | // Random indexing. Linear search for the boundary following the given position. |
68 | |
69 | for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { |
70 | r= fBreaks.elementAti(fPositionInCache); |
71 | if (r > fromPos) { |
72 | *result = r; |
73 | *statusIndex = fOtherRuleStatusIndex; |
74 | return true; |
75 | } |
76 | } |
77 | UPRV_UNREACHABLE_EXIT; |
78 | } |
79 | |
80 | |
81 | UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) { |
82 | if (fromPos <= fStart || fromPos > fLimit) { |
83 | fPositionInCache = -1; |
84 | return false; |
85 | } |
86 | |
87 | if (fromPos == fLimit) { |
88 | fPositionInCache = fBreaks.size() - 1; |
89 | if (fPositionInCache >= 0) { |
90 | U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos); |
91 | } |
92 | } |
93 | |
94 | int32_t r; |
95 | if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) { |
96 | --fPositionInCache; |
97 | r = fBreaks.elementAti(fPositionInCache); |
98 | U_ASSERT(r < fromPos); |
99 | *result = r; |
100 | *statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; |
101 | return true; |
102 | } |
103 | |
104 | if (fPositionInCache == 0) { |
105 | fPositionInCache = -1; |
106 | return false; |
107 | } |
108 | |
109 | for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { |
110 | r = fBreaks.elementAti(fPositionInCache); |
111 | if (r < fromPos) { |
112 | *result = r; |
113 | *statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; |
114 | return true; |
115 | } |
116 | } |
117 | UPRV_UNREACHABLE_EXIT; |
118 | } |
119 | |
120 | void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos, |
121 | int32_t firstRuleStatus, int32_t otherRuleStatus) { |
122 | if ((endPos - startPos) <= 1) { |
123 | return; |
124 | } |
125 | |
126 | reset(); |
127 | fFirstRuleStatusIndex = firstRuleStatus; |
128 | fOtherRuleStatusIndex = otherRuleStatus; |
129 | |
130 | int32_t rangeStart = startPos; |
131 | int32_t rangeEnd = endPos; |
132 | |
133 | uint16_t category; |
134 | int32_t current; |
135 | UErrorCode status = U_ZERO_ERROR; |
136 | int32_t foundBreakCount = 0; |
137 | UText *text = &fBI->fText; |
138 | |
139 | // Loop through the text, looking for ranges of dictionary characters. |
140 | // For each span, find the appropriate break engine, and ask it to find |
141 | // any breaks within the span. |
142 | |
143 | utext_setNativeIndex(text, rangeStart); |
144 | UChar32 c = utext_current32(text); |
145 | category = ucptrie_get(fBI->fData->fTrie, c); |
146 | uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart; |
147 | |
148 | while(U_SUCCESS(status)) { |
149 | while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd |
150 | && (category < dictStart)) { |
151 | utext_next32(text); // TODO: cleaner loop structure. |
152 | c = utext_current32(text); |
153 | category = ucptrie_get(fBI->fData->fTrie, c); |
154 | } |
155 | if (current >= rangeEnd) { |
156 | break; |
157 | } |
158 | |
159 | // We now have a dictionary character. Get the appropriate language object |
160 | // to deal with it. |
161 | const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c); |
162 | |
163 | // Ask the language object if there are any breaks. It will add them to the cache and |
164 | // leave the text pointer on the other side of its range, ready to search for the next one. |
165 | if (lbe != nullptr) { |
166 | foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status); |
167 | } |
168 | |
169 | // Reload the loop variables for the next go-round |
170 | c = utext_current32(text); |
171 | category = ucptrie_get(fBI->fData->fTrie, c); |
172 | } |
173 | |
174 | // If we found breaks, ensure that the first and last entries are |
175 | // the original starting and ending position. And initialize the |
176 | // cache iteration position to the first entry. |
177 | |
178 | // printf("foundBreakCount = %d\n", foundBreakCount); |
179 | if (foundBreakCount > 0) { |
180 | U_ASSERT(foundBreakCount == fBreaks.size()); |
181 | if (startPos < fBreaks.elementAti(0)) { |
182 | // The dictionary did not place a boundary at the start of the segment of text. |
183 | // Add one now. This should not commonly happen, but it would be easy for interactions |
184 | // of the rules for dictionary segments and the break engine implementations to |
185 | // inadvertently cause it. Cover it here, just in case. |
186 | fBreaks.insertElementAt(startPos, 0, status); |
187 | } |
188 | if (endPos > fBreaks.peeki()) { |
189 | fBreaks.push(endPos, status); |
190 | } |
191 | fPositionInCache = 0; |
192 | // Note: Dictionary matching may extend beyond the original limit. |
193 | fStart = fBreaks.elementAti(0); |
194 | fLimit = fBreaks.peeki(); |
195 | } else { |
196 | // there were no language-based breaks, even though the segment contained |
197 | // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache |
198 | // for this range will fail, and the calling code will fall back to the rule based boundaries. |
199 | } |
200 | } |
201 | |
202 | |
203 | /* |
204 | * BreakCache implementation |
205 | */ |
206 | |
207 | RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) : |
208 | fBI(bi), fSideBuffer(status) { |
209 | reset(); |
210 | } |
211 | |
212 | |
213 | RuleBasedBreakIterator::BreakCache::~BreakCache() { |
214 | } |
215 | |
216 | |
217 | void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) { |
218 | fStartBufIdx = 0; |
219 | fEndBufIdx = 0; |
220 | fTextIdx = pos; |
221 | fBufIdx = 0; |
222 | fBoundaries[0] = pos; |
223 | fStatuses[0] = (uint16_t)ruleStatus; |
224 | } |
225 | |
226 | |
227 | int32_t RuleBasedBreakIterator::BreakCache::current() { |
228 | fBI->fPosition = fTextIdx; |
229 | fBI->fRuleStatusIndex = fStatuses[fBufIdx]; |
230 | fBI->fDone = false; |
231 | return fTextIdx; |
232 | } |
233 | |
234 | |
235 | void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) { |
236 | if (U_FAILURE(status)) { |
237 | return; |
238 | } |
239 | if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) { |
240 | // startPos is in the cache. Do a next() from that position. |
241 | // TODO: an awkward set of interactions with bi->fDone |
242 | // seek() does not clear it; it can't because of interactions with populateNear(). |
243 | // next() does not clear it in the fast-path case, where everything matters. Maybe it should. |
244 | // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. |
245 | fBI->fDone = false; |
246 | next(); |
247 | } |
248 | return; |
249 | } |
250 | |
251 | |
252 | void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) { |
253 | if (U_FAILURE(status)) { |
254 | return; |
255 | } |
256 | if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) { |
257 | if (startPos == fTextIdx) { |
258 | previous(status); |
259 | } else { |
260 | // seek() leaves the BreakCache positioned at the preceding boundary |
261 | // if the requested position is between two boundaries. |
262 | // current() pushes the BreakCache position out to the BreakIterator itself. |
263 | U_ASSERT(startPos > fTextIdx); |
264 | current(); |
265 | } |
266 | } |
267 | return; |
268 | } |
269 | |
270 | |
271 | /* |
272 | * Out-of-line code for BreakCache::next(). |
273 | * Cache does not already contain the boundary |
274 | */ |
275 | void RuleBasedBreakIterator::BreakCache::nextOL() { |
276 | fBI->fDone = !populateFollowing(); |
277 | fBI->fPosition = fTextIdx; |
278 | fBI->fRuleStatusIndex = fStatuses[fBufIdx]; |
279 | return; |
280 | } |
281 | |
282 | |
283 | void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) { |
284 | if (U_FAILURE(status)) { |
285 | return; |
286 | } |
287 | int32_t initialBufIdx = fBufIdx; |
288 | if (fBufIdx == fStartBufIdx) { |
289 | // At start of cache. Prepend to it. |
290 | populatePreceding(status); |
291 | } else { |
292 | // Cache already holds the next boundary |
293 | fBufIdx = modChunkSize(fBufIdx - 1); |
294 | fTextIdx = fBoundaries[fBufIdx]; |
295 | } |
296 | fBI->fDone = (fBufIdx == initialBufIdx); |
297 | fBI->fPosition = fTextIdx; |
298 | fBI->fRuleStatusIndex = fStatuses[fBufIdx]; |
299 | return; |
300 | } |
301 | |
302 | |
303 | UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) { |
304 | if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { |
305 | return false; |
306 | } |
307 | if (pos == fBoundaries[fStartBufIdx]) { |
308 | // Common case: seek(0), from BreakIterator::first() |
309 | fBufIdx = fStartBufIdx; |
310 | fTextIdx = fBoundaries[fBufIdx]; |
311 | return true; |
312 | } |
313 | if (pos == fBoundaries[fEndBufIdx]) { |
314 | fBufIdx = fEndBufIdx; |
315 | fTextIdx = fBoundaries[fBufIdx]; |
316 | return true; |
317 | } |
318 | |
319 | int32_t min = fStartBufIdx; |
320 | int32_t max = fEndBufIdx; |
321 | while (min != max) { |
322 | int32_t probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; |
323 | probe = modChunkSize(probe); |
324 | if (fBoundaries[probe] > pos) { |
325 | max = probe; |
326 | } else { |
327 | min = modChunkSize(probe + 1); |
328 | } |
329 | } |
330 | U_ASSERT(fBoundaries[max] > pos); |
331 | fBufIdx = modChunkSize(max - 1); |
332 | fTextIdx = fBoundaries[fBufIdx]; |
333 | U_ASSERT(fTextIdx <= pos); |
334 | return true; |
335 | } |
336 | |
337 | |
338 | UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) { |
339 | if (U_FAILURE(status)) { |
340 | return false; |
341 | } |
342 | U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); |
343 | |
344 | // Add boundaries to the cache near the specified position. |
345 | // The given position need not be a boundary itself. |
346 | // The input position must be within the range of the text, and |
347 | // on a code point boundary. |
348 | // If the requested position is a break boundary, leave the iteration |
349 | // position on it. |
350 | // If the requested position is not a boundary, leave the iteration |
351 | // position on the preceding boundary and include both the |
352 | // preceding and following boundaries in the cache. |
353 | // Additional boundaries, either preceding or following, may be added |
354 | // to the cache as a side effect. |
355 | |
356 | // If the requested position is not near already cached positions, clear the existing cache, |
357 | // find a near-by boundary and begin new cache contents there. |
358 | |
359 | // Threshold for a text position to be considered near to existing cache contents. |
360 | // TODO: See issue ICU-22024 "perf tuning of Cache needed." |
361 | // This value is subject to change. See the ticket for more details. |
362 | static constexpr int32_t CACHE_NEAR = 15; |
363 | |
364 | int32_t aBoundary = -1; |
365 | int32_t ruleStatusIndex = 0; |
366 | bool retainCache = false; |
367 | if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) { |
368 | // Requested position is near the existing cache. Retain it. |
369 | retainCache = true; |
370 | } else if (position <= CACHE_NEAR) { |
371 | // Requested position is near the start of the text. Fill cache from start, skipping |
372 | // the need to find a safe point. |
373 | retainCache = false; |
374 | aBoundary = 0; |
375 | } else { |
376 | // Requested position is not near the existing cache. |
377 | // Find a safe point to refill the cache from. |
378 | int32_t backupPos = fBI->handleSafePrevious(position); |
379 | |
380 | if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) { |
381 | // The requested position is beyond the end of the existing cache, but the |
382 | // reverse rules produced a position near or before the cached region. |
383 | // Retain the existing cache, and fill from the end of it. |
384 | retainCache = true; |
385 | } else if (backupPos < CACHE_NEAR) { |
386 | // The safe reverse rules moved us to near the start of text. |
387 | // Take that (index 0) as the backup boundary, avoiding the complication |
388 | // (in the following block) of moving forward from the safe point to a known boundary. |
389 | // |
390 | // Retain the cache if it begins not too far from the requested position. |
391 | aBoundary = 0; |
392 | retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR)); |
393 | } else { |
394 | // The safe reverse rules produced a position that is neither near the existing |
395 | // cache, nor near the start of text. |
396 | // Advance to the boundary following. |
397 | // There is a complication: the safe reverse rules identify pairs of code points |
398 | // that are safe. If advancing from the safe point moves forwards by less than |
399 | // two code points, we need to advance one more time to ensure that the boundary |
400 | // is good, including a correct rules status value. |
401 | retainCache = false; |
402 | fBI->fPosition = backupPos; |
403 | aBoundary = fBI->handleNext(); |
404 | if (aBoundary != UBRK_DONE && aBoundary <= backupPos + 4) { |
405 | // +4 is a quick test for possibly having advanced only one codepoint. |
406 | // Four being the length of the longest potential code point, a supplementary in UTF-8 |
407 | utext_setNativeIndex(&fBI->fText, aBoundary); |
408 | if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) { |
409 | // The initial handleNext() only advanced by a single code point. Go again. |
410 | aBoundary = fBI->handleNext(); // Safe rules identify safe pairs. |
411 | } |
412 | } |
413 | if (aBoundary == UBRK_DONE) { |
414 | // Note (Andy Heninger): I don't think this condition can occur, but it's hard |
415 | // to prove that it can't. We ran off the end of the string looking a boundary |
416 | // following a safe point; choose the end of the string as that boundary. |
417 | aBoundary = utext_nativeLength(&fBI->fText); |
418 | } |
419 | ruleStatusIndex = fBI->fRuleStatusIndex; |
420 | } |
421 | } |
422 | |
423 | if (!retainCache) { |
424 | U_ASSERT(aBoundary != -1); |
425 | reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. |
426 | } |
427 | |
428 | // Fill in boundaries between existing cache content and the new requested position. |
429 | |
430 | if (fBoundaries[fEndBufIdx] < position) { |
431 | // The last position in the cache precedes the requested position. |
432 | // Add following position(s) to the cache. |
433 | while (fBoundaries[fEndBufIdx] < position) { |
434 | if (!populateFollowing()) { |
435 | UPRV_UNREACHABLE_EXIT; |
436 | } |
437 | } |
438 | fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. |
439 | fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. |
440 | while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. |
441 | previous(status); |
442 | } |
443 | return true; |
444 | } |
445 | |
446 | if (fBoundaries[fStartBufIdx] > position) { |
447 | // The first position in the cache is beyond the requested position. |
448 | // back up more until we get a boundary <= the requested position. |
449 | while (fBoundaries[fStartBufIdx] > position) { |
450 | populatePreceding(status); |
451 | } |
452 | fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. |
453 | fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. |
454 | while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. |
455 | next(); |
456 | } |
457 | if (fTextIdx > position) { |
458 | // If position is not itself a boundary, the next() loop above will overshoot. |
459 | // Back up one, leaving cache position at the boundary preceding the requested position. |
460 | previous(status); |
461 | } |
462 | return true; |
463 | } |
464 | |
465 | U_ASSERT(fTextIdx == position); |
466 | return true; |
467 | } |
468 | |
469 | |
470 | |
471 | UBool RuleBasedBreakIterator::BreakCache::populateFollowing() { |
472 | int32_t fromPosition = fBoundaries[fEndBufIdx]; |
473 | int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx]; |
474 | int32_t pos = 0; |
475 | int32_t ruleStatusIdx = 0; |
476 | |
477 | if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) { |
478 | addFollowing(pos, ruleStatusIdx, UpdateCachePosition); |
479 | return true; |
480 | } |
481 | |
482 | fBI->fPosition = fromPosition; |
483 | pos = fBI->handleNext(); |
484 | if (pos == UBRK_DONE) { |
485 | return false; |
486 | } |
487 | |
488 | ruleStatusIdx = fBI->fRuleStatusIndex; |
489 | if (fBI->fDictionaryCharCount > 0) { |
490 | // The text segment obtained from the rules includes dictionary characters. |
491 | // Subdivide it, with subdivided results going into the dictionary cache. |
492 | fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); |
493 | if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) { |
494 | addFollowing(pos, ruleStatusIdx, UpdateCachePosition); |
495 | return true; |
496 | // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point. |
497 | // But be careful with interactions with populateNear(). |
498 | } |
499 | } |
500 | |
501 | // Rule based segment did not include dictionary characters. |
502 | // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, |
503 | // meaning that we didn't take the return, above. |
504 | // Add its end point to the cache. |
505 | addFollowing(pos, ruleStatusIdx, UpdateCachePosition); |
506 | |
507 | // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. |
508 | // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. |
509 | // |
510 | for (int count=0; count<6; ++count) { |
511 | pos = fBI->handleNext(); |
512 | if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) { |
513 | break; |
514 | } |
515 | addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition); |
516 | } |
517 | |
518 | return true; |
519 | } |
520 | |
521 | |
522 | UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) { |
523 | if (U_FAILURE(status)) { |
524 | return false; |
525 | } |
526 | |
527 | int32_t fromPosition = fBoundaries[fStartBufIdx]; |
528 | if (fromPosition == 0) { |
529 | return false; |
530 | } |
531 | |
532 | int32_t position = 0; |
533 | int32_t positionStatusIdx = 0; |
534 | |
535 | if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) { |
536 | addPreceding(position, positionStatusIdx, UpdateCachePosition); |
537 | return true; |
538 | } |
539 | |
540 | int32_t backupPosition = fromPosition; |
541 | |
542 | // Find a boundary somewhere preceding the first already-cached boundary |
543 | do { |
544 | backupPosition = backupPosition - 30; |
545 | if (backupPosition <= 0) { |
546 | backupPosition = 0; |
547 | } else { |
548 | backupPosition = fBI->handleSafePrevious(backupPosition); |
549 | } |
550 | if (backupPosition == UBRK_DONE || backupPosition == 0) { |
551 | position = 0; |
552 | positionStatusIdx = 0; |
553 | } else { |
554 | // Advance to the boundary following the backup position. |
555 | // There is a complication: the safe reverse rules identify pairs of code points |
556 | // that are safe. If advancing from the safe point moves forwards by less than |
557 | // two code points, we need to advance one more time to ensure that the boundary |
558 | // is good, including a correct rules status value. |
559 | // |
560 | fBI->fPosition = backupPosition; |
561 | position = fBI->handleNext(); |
562 | if (position <= backupPosition + 4) { |
563 | // +4 is a quick test for possibly having advanced only one codepoint. |
564 | // Four being the length of the longest potential code point, a supplementary in UTF-8 |
565 | utext_setNativeIndex(&fBI->fText, position); |
566 | if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) { |
567 | // The initial handleNext() only advanced by a single code point. Go again. |
568 | position = fBI->handleNext(); // Safe rules identify safe pairs. |
569 | } |
570 | } |
571 | positionStatusIdx = fBI->fRuleStatusIndex; |
572 | } |
573 | } while (position >= fromPosition); |
574 | |
575 | // Find boundaries between the one we just located and the first already-cached boundary |
576 | // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.. |
577 | |
578 | fSideBuffer.removeAllElements(); |
579 | fSideBuffer.addElement(position, status); |
580 | fSideBuffer.addElement(positionStatusIdx, status); |
581 | |
582 | do { |
583 | int32_t prevPosition = fBI->fPosition = position; |
584 | int32_t prevStatusIdx = positionStatusIdx; |
585 | position = fBI->handleNext(); |
586 | positionStatusIdx = fBI->fRuleStatusIndex; |
587 | if (position == UBRK_DONE) { |
588 | break; |
589 | } |
590 | |
591 | UBool segmentHandledByDictionary = false; |
592 | if (fBI->fDictionaryCharCount != 0) { |
593 | // Segment from the rules includes dictionary characters. |
594 | // Subdivide it, with subdivided results going into the dictionary cache. |
595 | int32_t dictSegEndPosition = position; |
596 | fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); |
597 | while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) { |
598 | segmentHandledByDictionary = true; |
599 | U_ASSERT(position > prevPosition); |
600 | if (position >= fromPosition) { |
601 | break; |
602 | } |
603 | U_ASSERT(position <= dictSegEndPosition); |
604 | fSideBuffer.addElement(position, status); |
605 | fSideBuffer.addElement(positionStatusIdx, status); |
606 | prevPosition = position; |
607 | } |
608 | U_ASSERT(position==dictSegEndPosition || position>=fromPosition); |
609 | } |
610 | |
611 | if (!segmentHandledByDictionary && position < fromPosition) { |
612 | fSideBuffer.addElement(position, status); |
613 | fSideBuffer.addElement(positionStatusIdx, status); |
614 | } |
615 | } while (position < fromPosition); |
616 | |
617 | // Move boundaries from the side buffer to the main circular buffer. |
618 | UBool success = false; |
619 | if (!fSideBuffer.isEmpty()) { |
620 | positionStatusIdx = fSideBuffer.popi(); |
621 | position = fSideBuffer.popi(); |
622 | addPreceding(position, positionStatusIdx, UpdateCachePosition); |
623 | success = true; |
624 | } |
625 | |
626 | while (!fSideBuffer.isEmpty()) { |
627 | positionStatusIdx = fSideBuffer.popi(); |
628 | position = fSideBuffer.popi(); |
629 | if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { |
630 | // No space in circular buffer to hold a new preceding result while |
631 | // also retaining the current cache (iteration) position. |
632 | // Bailing out is safe; the cache will refill again if needed. |
633 | break; |
634 | } |
635 | } |
636 | |
637 | return success; |
638 | } |
639 | |
640 | |
641 | void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) { |
642 | U_ASSERT(position > fBoundaries[fEndBufIdx]); |
643 | U_ASSERT(ruleStatusIdx <= UINT16_MAX); |
644 | int32_t nextIdx = modChunkSize(fEndBufIdx + 1); |
645 | if (nextIdx == fStartBufIdx) { |
646 | fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. |
647 | } |
648 | fBoundaries[nextIdx] = position; |
649 | fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx); |
650 | fEndBufIdx = nextIdx; |
651 | if (update == UpdateCachePosition) { |
652 | // Set current position to the newly added boundary. |
653 | fBufIdx = nextIdx; |
654 | fTextIdx = position; |
655 | } else { |
656 | // Retaining the original cache position. |
657 | // Check if the added boundary wraps around the buffer, and would over-write the original position. |
658 | // It's the responsibility of callers of this function to not add too many. |
659 | U_ASSERT(nextIdx != fBufIdx); |
660 | } |
661 | } |
662 | |
663 | bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) { |
664 | U_ASSERT(position < fBoundaries[fStartBufIdx]); |
665 | U_ASSERT(ruleStatusIdx <= UINT16_MAX); |
666 | int32_t nextIdx = modChunkSize(fStartBufIdx - 1); |
667 | if (nextIdx == fEndBufIdx) { |
668 | if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { |
669 | // Failure. The insertion of the new boundary would claim the buffer position that is the |
670 | // current iteration position. And we also want to retain the current iteration position. |
671 | // (The buffer is already completely full of entries that precede the iteration position.) |
672 | return false; |
673 | } |
674 | fEndBufIdx = modChunkSize(fEndBufIdx - 1); |
675 | } |
676 | fBoundaries[nextIdx] = position; |
677 | fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx); |
678 | fStartBufIdx = nextIdx; |
679 | if (update == UpdateCachePosition) { |
680 | fBufIdx = nextIdx; |
681 | fTextIdx = position; |
682 | } |
683 | return true; |
684 | } |
685 | |
686 | |
687 | void RuleBasedBreakIterator::BreakCache::dumpCache() { |
688 | #ifdef RBBI_DEBUG |
689 | RBBIDebugPrintf("fTextIdx:%d fBufIdx:%d\n" , fTextIdx, fBufIdx); |
690 | for (int32_t i=fStartBufIdx; ; i=modChunkSize(i+1)) { |
691 | RBBIDebugPrintf("%d %d\n" , i, fBoundaries[i]); |
692 | if (i == fEndBufIdx) { |
693 | break; |
694 | } |
695 | } |
696 | #endif |
697 | } |
698 | |
699 | U_NAMESPACE_END |
700 | |
701 | #endif // #if !UCONFIG_NO_BREAK_ITERATION |
702 | |