1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | |
4 | // edits.h |
5 | // created: 2016dec30 Markus W. Scherer |
6 | |
7 | #ifndef __EDITS_H__ |
8 | #define __EDITS_H__ |
9 | |
10 | #include "unicode/utypes.h" |
11 | |
12 | #if U_SHOW_CPLUSPLUS_API |
13 | |
14 | #include "unicode/uobject.h" |
15 | |
16 | /** |
17 | * \file |
18 | * \brief C++ API: C++ class Edits for low-level string transformations on styled text. |
19 | */ |
20 | |
21 | U_NAMESPACE_BEGIN |
22 | |
23 | class UnicodeString; |
24 | |
25 | /** |
26 | * Records lengths of string edits but not replacement text. Supports replacements, insertions, deletions |
27 | * in linear progression. Does not support moving/reordering of text. |
28 | * |
29 | * There are two types of edits: <em>change edits</em> and <em>no-change edits</em>. Add edits to |
30 | * instances of this class using {@link #addReplace(int32_t, int32_t)} (for change edits) and |
31 | * {@link #addUnchanged(int32_t)} (for no-change edits). Change edits are retained with full granularity, |
32 | * whereas adjacent no-change edits are always merged together. In no-change edits, there is a one-to-one |
33 | * mapping between code points in the source and destination strings. |
34 | * |
35 | * After all edits have been added, instances of this class should be considered immutable, and an |
36 | * {@link Edits::Iterator} can be used for queries. |
37 | * |
38 | * There are four flavors of Edits::Iterator: |
39 | * |
40 | * <ul> |
41 | * <li>{@link #getFineIterator()} retains full granularity of change edits. |
42 | * <li>{@link #getFineChangesIterator()} retains full granularity of change edits, and when calling |
43 | * next() on the iterator, skips over no-change edits (unchanged regions). |
44 | * <li>{@link #getCoarseIterator()} treats adjacent change edits as a single edit. (Adjacent no-change |
45 | * edits are automatically merged during the construction phase.) |
46 | * <li>{@link #getCoarseChangesIterator()} treats adjacent change edits as a single edit, and when |
47 | * calling next() on the iterator, skips over no-change edits (unchanged regions). |
48 | * </ul> |
49 | * |
50 | * For example, consider the string "abcßDeF", which case-folds to "abcssdef". This string has the |
51 | * following fine edits: |
52 | * <ul> |
53 | * <li>abc ⇨ abc (no-change) |
54 | * <li>ß ⇨ ss (change) |
55 | * <li>D ⇨ d (change) |
56 | * <li>e ⇨ e (no-change) |
57 | * <li>F ⇨ f (change) |
58 | * </ul> |
59 | * and the following coarse edits (note how adjacent change edits get merged together): |
60 | * <ul> |
61 | * <li>abc ⇨ abc (no-change) |
62 | * <li>ßD ⇨ ssd (change) |
63 | * <li>e ⇨ e (no-change) |
64 | * <li>F ⇨ f (change) |
65 | * </ul> |
66 | * |
67 | * The "fine changes" and "coarse changes" iterators will step through only the change edits when their |
68 | * `Edits::Iterator::next()` methods are called. They are identical to the non-change iterators when |
69 | * their `Edits::Iterator::findSourceIndex()` or `Edits::Iterator::findDestinationIndex()` |
70 | * methods are used to walk through the string. |
71 | * |
72 | * For examples of how to use this class, see the test `TestCaseMapEditsIteratorDocs` in |
73 | * UCharacterCaseTest.java. |
74 | * |
75 | * An Edits object tracks a separate UErrorCode, but ICU string transformation functions |
76 | * (e.g., case mapping functions) merge any such errors into their API's UErrorCode. |
77 | * |
78 | * @stable ICU 59 |
79 | */ |
80 | class U_COMMON_API Edits final : public UMemory { |
81 | public: |
82 | /** |
83 | * Constructs an empty object. |
84 | * @stable ICU 59 |
85 | */ |
86 | Edits() : |
87 | array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), numChanges(0), |
88 | errorCode_(U_ZERO_ERROR) {} |
89 | /** |
90 | * Copy constructor. |
91 | * @param other source edits |
92 | * @stable ICU 60 |
93 | */ |
94 | Edits(const Edits &other) : |
95 | array(stackArray), capacity(STACK_CAPACITY), length(other.length), |
96 | delta(other.delta), numChanges(other.numChanges), |
97 | errorCode_(other.errorCode_) { |
98 | copyArray(other); |
99 | } |
100 | /** |
101 | * Move constructor, might leave src empty. |
102 | * This object will have the same contents that the source object had. |
103 | * @param src source edits |
104 | * @stable ICU 60 |
105 | */ |
106 | Edits(Edits &&src) noexcept : |
107 | array(stackArray), capacity(STACK_CAPACITY), length(src.length), |
108 | delta(src.delta), numChanges(src.numChanges), |
109 | errorCode_(src.errorCode_) { |
110 | moveArray(src); |
111 | } |
112 | |
113 | /** |
114 | * Destructor. |
115 | * @stable ICU 59 |
116 | */ |
117 | ~Edits(); |
118 | |
119 | /** |
120 | * Assignment operator. |
121 | * @param other source edits |
122 | * @return *this |
123 | * @stable ICU 60 |
124 | */ |
125 | Edits &operator=(const Edits &other); |
126 | |
127 | /** |
128 | * Move assignment operator, might leave src empty. |
129 | * This object will have the same contents that the source object had. |
130 | * The behavior is undefined if *this and src are the same object. |
131 | * @param src source edits |
132 | * @return *this |
133 | * @stable ICU 60 |
134 | */ |
135 | Edits &operator=(Edits &&src) noexcept; |
136 | |
137 | /** |
138 | * Resets the data but may not release memory. |
139 | * @stable ICU 59 |
140 | */ |
141 | void reset() noexcept; |
142 | |
143 | /** |
144 | * Adds a no-change edit: a record for an unchanged segment of text. |
145 | * Normally called from inside ICU string transformation functions, not user code. |
146 | * @stable ICU 59 |
147 | */ |
148 | void addUnchanged(int32_t unchangedLength); |
149 | /** |
150 | * Adds a change edit: a record for a text replacement/insertion/deletion. |
151 | * Normally called from inside ICU string transformation functions, not user code. |
152 | * @stable ICU 59 |
153 | */ |
154 | void addReplace(int32_t oldLength, int32_t newLength); |
155 | /** |
156 | * Sets the UErrorCode if an error occurred while recording edits. |
157 | * Preserves older error codes in the outErrorCode. |
158 | * Normally called from inside ICU string transformation functions, not user code. |
159 | * @param outErrorCode Set to an error code if it does not contain one already |
160 | * and an error occurred while recording edits. |
161 | * Otherwise unchanged. |
162 | * @return true if U_FAILURE(outErrorCode) |
163 | * @stable ICU 59 |
164 | */ |
165 | UBool copyErrorTo(UErrorCode &outErrorCode) const; |
166 | |
167 | /** |
168 | * How much longer is the new text compared with the old text? |
169 | * @return new length minus old length |
170 | * @stable ICU 59 |
171 | */ |
172 | int32_t lengthDelta() const { return delta; } |
173 | /** |
174 | * @return true if there are any change edits |
175 | * @stable ICU 59 |
176 | */ |
177 | UBool hasChanges() const { return numChanges != 0; } |
178 | |
179 | /** |
180 | * @return the number of change edits |
181 | * @stable ICU 60 |
182 | */ |
183 | int32_t numberOfChanges() const { return numChanges; } |
184 | |
185 | /** |
186 | * Access to the list of edits. |
187 | * |
188 | * At any moment in time, an instance of this class points to a single edit: a "window" into a span |
189 | * of the source string and the corresponding span of the destination string. The source string span |
190 | * starts at {@link #sourceIndex()} and runs for {@link #oldLength()} chars; the destination string |
191 | * span starts at {@link #destinationIndex()} and runs for {@link #newLength()} chars. |
192 | * |
193 | * The iterator can be moved between edits using the `next()`, `findSourceIndex(int32_t, UErrorCode &)`, |
194 | * and `findDestinationIndex(int32_t, UErrorCode &)` methods. |
195 | * Calling any of these methods mutates the iterator to make it point to the corresponding edit. |
196 | * |
197 | * For more information, see the documentation for {@link Edits}. |
198 | * |
199 | * @see getCoarseIterator |
200 | * @see getFineIterator |
201 | * @stable ICU 59 |
202 | */ |
203 | struct U_COMMON_API Iterator final : public UMemory { |
204 | /** |
205 | * Default constructor, empty iterator. |
206 | * @stable ICU 60 |
207 | */ |
208 | Iterator() : |
209 | array(nullptr), index(0), length(0), |
210 | remaining(0), onlyChanges_(false), coarse(false), |
211 | dir(0), changed(false), oldLength_(0), newLength_(0), |
212 | srcIndex(0), replIndex(0), destIndex(0) {} |
213 | /** |
214 | * Copy constructor. |
215 | * @stable ICU 59 |
216 | */ |
217 | Iterator(const Iterator &other) = default; |
218 | /** |
219 | * Assignment operator. |
220 | * @stable ICU 59 |
221 | */ |
222 | Iterator &operator=(const Iterator &other) = default; |
223 | |
224 | /** |
225 | * Advances the iterator to the next edit. |
226 | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
227 | * or else the function returns immediately. Check for U_FAILURE() |
228 | * on output or use with function chaining. (See User Guide for details.) |
229 | * @return true if there is another edit |
230 | * @stable ICU 59 |
231 | */ |
232 | UBool next(UErrorCode &errorCode) { return next(onlyChanges_, errorCode); } |
233 | |
234 | /** |
235 | * Moves the iterator to the edit that contains the source index. |
236 | * The source index may be found in a no-change edit |
237 | * even if normal iteration would skip no-change edits. |
238 | * Normal iteration can continue from a found edit. |
239 | * |
240 | * The iterator state before this search logically does not matter. |
241 | * (It may affect the performance of the search.) |
242 | * |
243 | * The iterator state after this search is undefined |
244 | * if the source index is out of bounds for the source string. |
245 | * |
246 | * @param i source index |
247 | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
248 | * or else the function returns immediately. Check for U_FAILURE() |
249 | * on output or use with function chaining. (See User Guide for details.) |
250 | * @return true if the edit for the source index was found |
251 | * @stable ICU 59 |
252 | */ |
253 | UBool findSourceIndex(int32_t i, UErrorCode &errorCode) { |
254 | return findIndex(i, true, errorCode) == 0; |
255 | } |
256 | |
257 | /** |
258 | * Moves the iterator to the edit that contains the destination index. |
259 | * The destination index may be found in a no-change edit |
260 | * even if normal iteration would skip no-change edits. |
261 | * Normal iteration can continue from a found edit. |
262 | * |
263 | * The iterator state before this search logically does not matter. |
264 | * (It may affect the performance of the search.) |
265 | * |
266 | * The iterator state after this search is undefined |
267 | * if the source index is out of bounds for the source string. |
268 | * |
269 | * @param i destination index |
270 | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
271 | * or else the function returns immediately. Check for U_FAILURE() |
272 | * on output or use with function chaining. (See User Guide for details.) |
273 | * @return true if the edit for the destination index was found |
274 | * @stable ICU 60 |
275 | */ |
276 | UBool findDestinationIndex(int32_t i, UErrorCode &errorCode) { |
277 | return findIndex(i, false, errorCode) == 0; |
278 | } |
279 | |
280 | /** |
281 | * Computes the destination index corresponding to the given source index. |
282 | * If the source index is inside a change edit (not at its start), |
283 | * then the destination index at the end of that edit is returned, |
284 | * since there is no information about index mapping inside a change edit. |
285 | * |
286 | * (This means that indexes to the start and middle of an edit, |
287 | * for example around a grapheme cluster, are mapped to indexes |
288 | * encompassing the entire edit. |
289 | * The alternative, mapping an interior index to the start, |
290 | * would map such an interval to an empty one.) |
291 | * |
292 | * This operation will usually but not always modify this object. |
293 | * The iterator state after this search is undefined. |
294 | * |
295 | * @param i source index |
296 | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
297 | * or else the function returns immediately. Check for U_FAILURE() |
298 | * on output or use with function chaining. (See User Guide for details.) |
299 | * @return destination index; undefined if i is not 0..string length |
300 | * @stable ICU 60 |
301 | */ |
302 | int32_t destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode); |
303 | |
304 | /** |
305 | * Computes the source index corresponding to the given destination index. |
306 | * If the destination index is inside a change edit (not at its start), |
307 | * then the source index at the end of that edit is returned, |
308 | * since there is no information about index mapping inside a change edit. |
309 | * |
310 | * (This means that indexes to the start and middle of an edit, |
311 | * for example around a grapheme cluster, are mapped to indexes |
312 | * encompassing the entire edit. |
313 | * The alternative, mapping an interior index to the start, |
314 | * would map such an interval to an empty one.) |
315 | * |
316 | * This operation will usually but not always modify this object. |
317 | * The iterator state after this search is undefined. |
318 | * |
319 | * @param i destination index |
320 | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
321 | * or else the function returns immediately. Check for U_FAILURE() |
322 | * on output or use with function chaining. (See User Guide for details.) |
323 | * @return source index; undefined if i is not 0..string length |
324 | * @stable ICU 60 |
325 | */ |
326 | int32_t sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode); |
327 | |
328 | /** |
329 | * Returns whether the edit currently represented by the iterator is a change edit. |
330 | * |
331 | * @return true if this edit replaces oldLength() units with newLength() different ones. |
332 | * false if oldLength units remain unchanged. |
333 | * @stable ICU 59 |
334 | */ |
335 | UBool hasChange() const { return changed; } |
336 | |
337 | /** |
338 | * The length of the current span in the source string, which starts at {@link #sourceIndex}. |
339 | * |
340 | * @return the number of units in the original string which are replaced or remain unchanged. |
341 | * @stable ICU 59 |
342 | */ |
343 | int32_t oldLength() const { return oldLength_; } |
344 | |
345 | /** |
346 | * The length of the current span in the destination string, which starts at |
347 | * {@link #destinationIndex}, or in the replacement string, which starts at |
348 | * {@link #replacementIndex}. |
349 | * |
350 | * @return the number of units in the modified string, if hasChange() is true. |
351 | * Same as oldLength if hasChange() is false. |
352 | * @stable ICU 59 |
353 | */ |
354 | int32_t newLength() const { return newLength_; } |
355 | |
356 | /** |
357 | * The start index of the current span in the source string; the span has length |
358 | * {@link #oldLength}. |
359 | * |
360 | * @return the current index into the source string |
361 | * @stable ICU 59 |
362 | */ |
363 | int32_t sourceIndex() const { return srcIndex; } |
364 | |
365 | /** |
366 | * The start index of the current span in the replacement string; the span has length |
367 | * {@link #newLength}. Well-defined only if the current edit is a change edit. |
368 | * |
369 | * The *replacement string* is the concatenation of all substrings of the destination |
370 | * string corresponding to change edits. |
371 | * |
372 | * This method is intended to be used together with operations that write only replacement |
373 | * characters (e.g. operations specifying the \ref U_OMIT_UNCHANGED_TEXT option). |
374 | * The source string can then be modified in-place. |
375 | * |
376 | * @return the current index into the replacement-characters-only string, |
377 | * not counting unchanged spans |
378 | * @stable ICU 59 |
379 | */ |
380 | int32_t replacementIndex() const { |
381 | // TODO: Throw an exception if we aren't in a change edit? |
382 | return replIndex; |
383 | } |
384 | |
385 | /** |
386 | * The start index of the current span in the destination string; the span has length |
387 | * {@link #newLength}. |
388 | * |
389 | * @return the current index into the full destination string |
390 | * @stable ICU 59 |
391 | */ |
392 | int32_t destinationIndex() const { return destIndex; } |
393 | |
394 | #ifndef U_HIDE_INTERNAL_API |
395 | /** |
396 | * A string representation of the current edit represented by the iterator for debugging. You |
397 | * should not depend on the contents of the return string. |
398 | * @internal |
399 | */ |
400 | UnicodeString& toString(UnicodeString& appendTo) const; |
401 | #endif // U_HIDE_INTERNAL_API |
402 | |
403 | private: |
404 | friend class Edits; |
405 | |
406 | Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs); |
407 | |
408 | int32_t readLength(int32_t head); |
409 | void updateNextIndexes(); |
410 | void updatePreviousIndexes(); |
411 | UBool noNext(); |
412 | UBool next(UBool onlyChanges, UErrorCode &errorCode); |
413 | UBool previous(UErrorCode &errorCode); |
414 | /** @return -1: error or i<0; 0: found; 1: i>=string length */ |
415 | int32_t findIndex(int32_t i, UBool findSource, UErrorCode &errorCode); |
416 | |
417 | const uint16_t *array; |
418 | int32_t index, length; |
419 | // 0 if we are not within compressed equal-length changes. |
420 | // Otherwise the number of remaining changes, including the current one. |
421 | int32_t remaining; |
422 | UBool onlyChanges_, coarse; |
423 | |
424 | int8_t dir; // iteration direction: back(<0), initial(0), forward(>0) |
425 | UBool changed; |
426 | int32_t oldLength_, newLength_; |
427 | int32_t srcIndex, replIndex, destIndex; |
428 | }; |
429 | |
430 | /** |
431 | * Returns an Iterator for coarse-grained change edits |
432 | * (adjacent change edits are treated as one). |
433 | * Can be used to perform simple string updates. |
434 | * Skips no-change edits. |
435 | * @return an Iterator that merges adjacent changes. |
436 | * @stable ICU 59 |
437 | */ |
438 | Iterator getCoarseChangesIterator() const { |
439 | return Iterator(array, length, true, true); |
440 | } |
441 | |
442 | /** |
443 | * Returns an Iterator for coarse-grained change and no-change edits |
444 | * (adjacent change edits are treated as one). |
445 | * Can be used to perform simple string updates. |
446 | * Adjacent change edits are treated as one edit. |
447 | * @return an Iterator that merges adjacent changes. |
448 | * @stable ICU 59 |
449 | */ |
450 | Iterator getCoarseIterator() const { |
451 | return Iterator(array, length, false, true); |
452 | } |
453 | |
454 | /** |
455 | * Returns an Iterator for fine-grained change edits |
456 | * (full granularity of change edits is retained). |
457 | * Can be used for modifying styled text. |
458 | * Skips no-change edits. |
459 | * @return an Iterator that separates adjacent changes. |
460 | * @stable ICU 59 |
461 | */ |
462 | Iterator getFineChangesIterator() const { |
463 | return Iterator(array, length, true, false); |
464 | } |
465 | |
466 | /** |
467 | * Returns an Iterator for fine-grained change and no-change edits |
468 | * (full granularity of change edits is retained). |
469 | * Can be used for modifying styled text. |
470 | * @return an Iterator that separates adjacent changes. |
471 | * @stable ICU 59 |
472 | */ |
473 | Iterator getFineIterator() const { |
474 | return Iterator(array, length, false, false); |
475 | } |
476 | |
477 | /** |
478 | * Merges the two input Edits and appends the result to this object. |
479 | * |
480 | * Consider two string transformations (for example, normalization and case mapping) |
481 | * where each records Edits in addition to writing an output string.<br> |
482 | * Edits ab reflect how substrings of input string a |
483 | * map to substrings of intermediate string b.<br> |
484 | * Edits bc reflect how substrings of intermediate string b |
485 | * map to substrings of output string c.<br> |
486 | * This function merges ab and bc such that the additional edits |
487 | * recorded in this object reflect how substrings of input string a |
488 | * map to substrings of output string c. |
489 | * |
490 | * If unrelated Edits are passed in where the output string of the first |
491 | * has a different length than the input string of the second, |
492 | * then a U_ILLEGAL_ARGUMENT_ERROR is reported. |
493 | * |
494 | * @param ab reflects how substrings of input string a |
495 | * map to substrings of intermediate string b. |
496 | * @param bc reflects how substrings of intermediate string b |
497 | * map to substrings of output string c. |
498 | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
499 | * or else the function returns immediately. Check for U_FAILURE() |
500 | * on output or use with function chaining. (See User Guide for details.) |
501 | * @return *this, with the merged edits appended |
502 | * @stable ICU 60 |
503 | */ |
504 | Edits &mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode); |
505 | |
506 | private: |
507 | void releaseArray() noexcept; |
508 | Edits ©Array(const Edits &other); |
509 | Edits &moveArray(Edits &src) noexcept; |
510 | |
511 | void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; } |
512 | int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; } |
513 | |
514 | void append(int32_t r); |
515 | UBool growArray(); |
516 | |
517 | static const int32_t STACK_CAPACITY = 100; |
518 | uint16_t *array; |
519 | int32_t capacity; |
520 | int32_t length; |
521 | int32_t delta; |
522 | int32_t numChanges; |
523 | UErrorCode errorCode_; |
524 | uint16_t stackArray[STACK_CAPACITY]; |
525 | }; |
526 | |
527 | U_NAMESPACE_END |
528 | |
529 | #endif /* U_SHOW_CPLUSPLUS_API */ |
530 | |
531 | #endif // __EDITS_H__ |
532 | |