1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3
4// file: rbbi_cache.h
5//
6#ifndef RBBI_CACHE_H
7#define RBBI_CACHE_H
8
9#include "unicode/utypes.h"
10
11#if !UCONFIG_NO_BREAK_ITERATION
12
13#include "unicode/rbbi.h"
14#include "unicode/uobject.h"
15
16#include "uvectr32.h"
17
18U_NAMESPACE_BEGIN
19
20/* DictionaryCache stores the boundaries obtained from a run of dictionary characters.
21 * Dictionary boundaries are moved first to this cache, then from here
22 * to the main BreakCache, where they may inter-leave with non-dictionary
23 * boundaries. The public BreakIterator API always fetches directly
24 * from the main BreakCache, not from here.
25 *
26 * In common situations, the number of boundaries in a single dictionary run
27 * should be quite small, it will be terminated by punctuation, spaces,
28 * or any other non-dictionary characters. The main BreakCache may end
29 * up with boundaries from multiple dictionary based runs.
30 *
31 * The boundaries are stored in a simple ArrayList (vector), with the
32 * assumption that they will be accessed sequentially.
33 */
34class RuleBasedBreakIterator::DictionaryCache: public UMemory {
35 public:
36 DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status);
37 ~DictionaryCache();
38
39 void reset();
40
41 UBool following(int32_t fromPos, int32_t *pos, int32_t *statusIndex);
42 UBool preceding(int32_t fromPos, int32_t *pos, int32_t *statusIndex);
43
44 /**
45 * Populate the cache with the dictionary based boundaries within a region of text.
46 * @param startPos The start position of a range of text
47 * @param endPos The end position of a range of text
48 * @param firstRuleStatus The rule status index that applies to the break at startPos
49 * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
50 * @internal
51 */
52 void populateDictionary(int32_t startPos, int32_t endPos,
53 int32_t firstRuleStatus, int32_t otherRuleStatus);
54
55
56
57 RuleBasedBreakIterator *fBI;
58
59 UVector32 fBreaks; // A vector containing the boundaries.
60 int32_t fPositionInCache; // Index in fBreaks of last boundary returned by following()
61 // or preceding(). Optimizes sequential access.
62 int32_t fStart; // Text position of first boundary in cache.
63 int32_t fLimit; // Last boundary in cache. Which is the limit of the
64 // text segment being handled by the dictionary.
65 int32_t fFirstRuleStatusIndex; // Rule status info for first boundary.
66 int32_t fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries.
67};
68
69
70/*
71 * class BreakCache
72 *
73 * Cache of break boundary positions and rule status values.
74 * Break iterator API functions, next(), previous(), etc., will use cached results
75 * when possible, and otherwise cache new results as they are obtained.
76 *
77 * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
78 *
79 * The cache is implemented as a single circular buffer.
80 */
81
82/*
83 * size of the circular cache buffer.
84 */
85
86class RuleBasedBreakIterator::BreakCache: public UMemory {
87 public:
88 BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status);
89 virtual ~BreakCache();
90 void reset(int32_t pos = 0, int32_t ruleStatus = 0);
91 void next() { if (fBufIdx == fEndBufIdx) {
92 nextOL();
93 } else {
94 fBufIdx = modChunkSize(fBufIdx + 1);
95 fTextIdx = fBI->fPosition = fBoundaries[fBufIdx];
96 fBI->fRuleStatusIndex = fStatuses[fBufIdx];
97 }
98 }
99
100
101 void nextOL();
102 void previous(UErrorCode &status);
103
104 // Move the iteration state to the position following the startPosition.
105 // Input position must be pinned to the input length.
106 void following(int32_t startPosition, UErrorCode &status);
107
108 void preceding(int32_t startPosition, UErrorCode &status);
109
110 /*
111 * Update the state of the public BreakIterator (fBI) to reflect the
112 * current state of the break iterator cache (this).
113 */
114 int32_t current();
115
116 /**
117 * Add boundaries to the cache near the specified position.
118 * The given position need not be a boundary itself.
119 * The input position must be within the range of the text, and
120 * on a code point boundary.
121 * If the requested position is a break boundary, leave the iteration
122 * position on it.
123 * If the requested position is not a boundary, leave the iteration
124 * position on the preceding boundary and include both the
125 * preceding and following boundaries in the cache.
126 * Additional boundaries, either preceding or following, may be added
127 * to the cache as a side effect.
128 *
129 * Return FALSE if the operation failed.
130 */
131 UBool populateNear(int32_t position, UErrorCode &status);
132
133 /**
134 * Add boundary(s) to the cache following the current last boundary.
135 * Return FALSE if at the end of the text, and no more boundaries can be added.
136 * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
137 */
138 UBool populateFollowing();
139
140 /**
141 * Add one or more boundaries to the cache preceding the first currently cached boundary.
142 * Leave the iteration position on the first added boundary.
143 * Return false if no boundaries could be added (if at the start of the text.)
144 */
145 UBool populatePreceding(UErrorCode &status);
146
147 enum UpdatePositionValues {
148 RetainCachePosition = 0,
149 UpdateCachePosition = 1
150 };
151
152 /*
153 * Add the boundary following the current position.
154 * The current position can be left as it was, or changed to the newly added boundary,
155 * as specified by the update parameter.
156 */
157 void addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);
158
159
160 /*
161 * Add the boundary preceding the current position.
162 * The current position can be left as it was, or changed to the newly added boundary,
163 * as specified by the update parameter.
164 */
165 bool addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);
166
167 /**
168 * Set the cache position to the specified position, or, if the position
169 * falls between to cached boundaries, to the preceding boundary.
170 * Fails if the requested position is outside of the range of boundaries currently held by the cache.
171 * The startPosition must be on a code point boundary.
172 *
173 * Return TRUE if successful, FALSE if the specified position is after
174 * the last cached boundary or before the first.
175 */
176 UBool seek(int32_t startPosition);
177
178 void dumpCache();
179
180 private:
181 static inline int32_t modChunkSize(int index) { return index & (CACHE_SIZE - 1); }
182
183 static constexpr int32_t CACHE_SIZE = 128;
184 static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
185
186 RuleBasedBreakIterator *fBI;
187 int32_t fStartBufIdx;
188 int32_t fEndBufIdx; // inclusive
189
190 int32_t fTextIdx;
191 int32_t fBufIdx;
192
193 int32_t fBoundaries[CACHE_SIZE];
194 uint16_t fStatuses[CACHE_SIZE];
195
196 UVector32 fSideBuffer;
197};
198
199U_NAMESPACE_END
200
201#endif // #if !UCONFIG_NO_BREAK_ITERATION
202
203#endif // RBBI_CACHE_H
204