1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ******************************************************************************* |
5 | * |
6 | * Copyright (C) 2004-2012, International Business Machines |
7 | * Corporation and others. All Rights Reserved. |
8 | * |
9 | ******************************************************************************* |
10 | * file name: utext.h |
11 | * encoding: UTF-8 |
12 | * tab size: 8 (not used) |
13 | * indentation:4 |
14 | * |
15 | * created on: 2004oct06 |
16 | * created by: Markus W. Scherer |
17 | */ |
18 | |
19 | #ifndef __UTEXT_H__ |
20 | #define __UTEXT_H__ |
21 | |
22 | /** |
23 | * \file |
24 | * \brief C API: Abstract Unicode Text API |
25 | * |
26 | * The Text Access API provides a means to allow text that is stored in alternative |
27 | * formats to work with ICU services. ICU normally operates on text that is |
28 | * stored in UTF-16 format, in (UChar *) arrays for the C APIs or as type |
29 | * UnicodeString for C++ APIs. |
30 | * |
31 | * ICU Text Access allows other formats, such as UTF-8 or non-contiguous |
32 | * UTF-16 strings, to be placed in a UText wrapper and then passed to ICU services. |
33 | * |
34 | * There are three general classes of usage for UText: |
35 | * |
36 | * Application Level Use. This is the simplest usage - applications would |
37 | * use one of the utext_open() functions on their input text, and pass |
38 | * the resulting UText to the desired ICU service. |
39 | * |
40 | * Second is usage in ICU Services, such as break iteration, that will need to |
41 | * operate on input presented to them as a UText. These implementations |
42 | * will need to use the iteration and related UText functions to gain |
43 | * access to the actual text. |
44 | * |
45 | * The third class of UText users are "text providers." These are the |
46 | * UText implementations for the various text storage formats. An application |
47 | * or system with a unique text storage format can implement a set of |
48 | * UText provider functions for that format, which will then allow |
49 | * ICU services to operate on that format. |
50 | * |
51 | * |
52 | * <em>Iterating over text</em> |
53 | * |
54 | * Here is sample code for a forward iteration over the contents of a UText |
55 | * |
56 | * \code |
57 | * UChar32 c; |
58 | * UText *ut = whatever(); |
59 | * |
60 | * for (c=utext_next32From(ut, 0); c>=0; c=utext_next32(ut)) { |
61 | * // do whatever with the codepoint c here. |
62 | * } |
63 | * \endcode |
64 | * |
65 | * And here is similar code to iterate in the reverse direction, from the end |
66 | * of the text towards the beginning. |
67 | * |
68 | * \code |
69 | * UChar32 c; |
70 | * UText *ut = whatever(); |
71 | * int64_t textLength = utext_nativeLength(ut); |
72 | * for (c=utext_previous32From(ut, textLength); c>=0; c=utext_previous32(ut)) { |
73 | * // do whatever with the codepoint c here. |
74 | * } |
75 | * \endcode |
76 | * |
77 | * <em>Characters and Indexing</em> |
78 | * |
79 | * Indexing into text by UText functions is nearly always in terms of the native |
80 | * indexing of the underlying text storage. The storage format could be UTF-8 |
81 | * or UTF-32, for example. When coding to the UText access API, no assumptions |
82 | * can be made regarding the size of characters, or how far an index |
83 | * may move when iterating between characters. |
84 | * |
85 | * All indices supplied to UText functions are pinned to the length of the |
86 | * text. An out-of-bounds index is not considered to be an error, but is |
87 | * adjusted to be in the range 0 <= index <= length of input text. |
88 | * |
89 | * |
90 | * When an index position is returned from a UText function, it will be |
91 | * a native index to the underlying text. In the case of multi-unit characters, |
92 | * it will always refer to the first position of the character, |
93 | * never to the interior. This is essentially the same thing as saying that |
94 | * a returned index will always point to a boundary between characters. |
95 | * |
96 | * When a native index is supplied to a UText function, all indices that |
97 | * refer to any part of a multi-unit character representation are considered |
98 | * to be equivalent. In the case of multi-unit characters, an incoming index |
99 | * will be logically normalized to refer to the start of the character. |
100 | * |
101 | * It is possible to test whether a native index is on a code point boundary |
102 | * by doing a utext_setNativeIndex() followed by a utext_getNativeIndex(). |
103 | * If the index is returned unchanged, it was on a code point boundary. If |
104 | * an adjusted index is returned, the original index referred to the |
105 | * interior of a character. |
106 | * |
107 | * <em>Conventions for calling UText functions</em> |
108 | * |
109 | * Most UText access functions have as their first parameter a (UText *) pointer, |
110 | * which specifies the UText to be used. Unless otherwise noted, the |
111 | * pointer must refer to a valid, open UText. Attempting to |
112 | * use a closed UText or passing a NULL pointer is a programming error and |
113 | * will produce undefined results or NULL pointer exceptions. |
114 | * |
115 | * The utext_open family of functions can either open an existing (closed) |
116 | * UText, or heap allocate a new UText. Here is sample code for creating |
117 | * a stack-allocated UText. |
118 | * |
119 | * \code |
120 | * char *s = whatever(); // A utf-8 string |
121 | * UErrorCode status = U_ZERO_ERROR; |
122 | * UText ut = UTEXT_INITIALIZER; |
123 | * utext_openUTF8(&ut, s, -1, &status); |
124 | * if (U_FAILURE(status)) { |
125 | * // error handling |
126 | * } else { |
127 | * // work with the UText |
128 | * } |
129 | * \endcode |
130 | * |
131 | * Any existing UText passed to an open function _must_ have been initialized, |
132 | * either by the UTEXT_INITIALIZER, or by having been originally heap-allocated |
133 | * by an open function. Passing NULL will cause the open function to |
134 | * heap-allocate and fully initialize a new UText. |
135 | * |
136 | */ |
137 | |
138 | |
139 | |
140 | #include "unicode/utypes.h" |
141 | #include "unicode/uchar.h" |
142 | #if U_SHOW_CPLUSPLUS_API |
143 | #include "unicode/localpointer.h" |
144 | #include "unicode/rep.h" |
145 | #include "unicode/unistr.h" |
146 | #include "unicode/chariter.h" |
147 | #endif |
148 | |
149 | |
150 | U_CDECL_BEGIN |
151 | |
152 | struct UText; /* Forward declaration; the struct body is not defined at this point in the header. */ |
153 | typedef struct UText UText; /**< C typedef for struct UText. @stable ICU 3.6 */ |
154 | |
155 | |
156 | /*************************************************************************************** |
157 | * |
158 | * C Functions for creating UText wrappers around various kinds of text strings. |
159 | * |
160 | ****************************************************************************************/ |
161 | |
162 | |
163 | /** |
164 | * Close function for UText instances. |
165 | * Cleans up and releases any resources being held by an open UText. |
166 | * <p> |
167 | * If the UText was originally allocated by one of the utext_open functions, |
168 | * the storage associated with the UText will also be freed. |
169 | * If the UText storage originated with the application, as it would with |
170 | * a local or static instance, the storage will not be deleted. |
171 | * |
172 | * An open UText can be reset to refer to a new string by using one of the utext_open() |
173 | * functions without first closing the UText. |
174 | * |
175 | * @param ut The UText to be closed. |
176 | * @return NULL if the UText struct was deleted by the close. If the UText struct |
177 | * was originally provided by the caller to the open function, it is |
178 | * returned by this function, and may be safely used again in |
179 | * a subsequent utext_open. |
180 | * |
181 | * @stable ICU 3.4 |
182 | */ |
183 | U_STABLE UText * U_EXPORT2 |
184 | utext_close(UText *ut); |
185 | |
186 | /** |
187 | * Open a read-only UText implementation for UTF-8 strings. |
188 | * |
189 | * \htmlonly |
190 | * Any invalid UTF-8 in the input will be handled in this way: |
191 | * a sequence of bytes that has the form of a truncated, but otherwise valid, |
192 | * UTF-8 sequence will be replaced by a single Unicode replacement character, \uFFFD. |
193 | * Any other illegal bytes will each be replaced by a \uFFFD. |
194 | * \endhtmlonly |
195 | * |
196 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. |
197 | * If non-NULL, must refer to an initialized UText struct, which will then |
198 | * be reset to reference the specified UTF-8 string. |
199 | * @param s A UTF-8 string. Must not be NULL. |
200 | * @param length The length of the UTF-8 string in bytes, or -1 if the string is |
201 | * NUL-terminated. |
202 | * @param status Errors are returned here. |
203 | * @return A pointer to the UText. If a pre-allocated UText was provided, it |
204 | * will always be used and returned. |
205 | * @stable ICU 3.4 |
206 | */ |
207 | U_STABLE UText * U_EXPORT2 |
208 | utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status); |
209 | |
210 | |
211 | /** |
212 | * Open a read-only UText for a UChar * string. |
213 | * |
214 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. |
215 | * If non-NULL, must refer to an initialized UText struct, which will then |
216 | * be reset to reference the specified UChar string. |
217 | * @param s A UChar (UTF-16) string. |
218 | * @param length The number of UChars in the input string, or -1 if the string is |
219 | * NUL-terminated. |
220 | * @param status Errors are returned here. |
221 | * @return A pointer to the UText. If a pre-allocated UText was provided, it |
222 | * will always be used and returned. |
223 | * @stable ICU 3.4 |
224 | */ |
225 | U_STABLE UText * U_EXPORT2 |
226 | utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status); |
227 | |
228 | |
229 | #if U_SHOW_CPLUSPLUS_API |
230 | /** |
231 | * Open a writable UText for a non-const UnicodeString. |
232 | * |
233 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. |
234 | * If non-NULL, must refer to an initialized UText struct, which will then |
235 | * be reset to reference the specified input string. |
236 | * @param s A UnicodeString, passed as a non-const pointer. |
237 | * @param status Errors are returned here. |
238 | * @return Pointer to the UText. If a UText was supplied as input, this |
239 | * will always be used and returned. |
240 | * @stable ICU 3.4 |
241 | */ |
242 | U_STABLE UText * U_EXPORT2 |
243 | utext_openUnicodeString(UText *ut, icu::UnicodeString *s, UErrorCode *status); |
244 | |
245 | |
246 | /** |
247 | * Open a read-only UText for a const UnicodeString. The resulting UText will not be writable. |
248 | * |
249 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. |
250 | * If non-NULL, must refer to an initialized UText struct, which will then |
251 | * be reset to reference the specified input string. |
252 | * @param s A const UnicodeString to be wrapped. |
253 | * @param status Errors are returned here. |
254 | * @return Pointer to the UText. If a UText was supplied as input, this |
255 | * will always be used and returned. |
256 | * @stable ICU 3.4 |
257 | */ |
258 | U_STABLE UText * U_EXPORT2 |
259 | utext_openConstUnicodeString(UText *ut, const icu::UnicodeString *s, UErrorCode *status); |
260 | |
261 | |
262 | /** |
263 | * Open a writable UText implementation for an ICU Replaceable object. |
264 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. |
265 | * If non-NULL, must refer to an initialized UText struct, which will then |
266 | * be reset to reference the specified replaceable text. |
267 | * @param rep A Replaceable text object. |
268 | * @param status Errors are returned here. |
269 | * @return Pointer to the UText. If a UText was supplied as input, this |
270 | * will always be used and returned. |
271 | * @see Replaceable |
272 | * @stable ICU 3.4 |
273 | */ |
274 | U_STABLE UText * U_EXPORT2 |
275 | utext_openReplaceable(UText *ut, icu::Replaceable *rep, UErrorCode *status); |
276 | |
277 | /** |
278 | * Open a UText implementation over an ICU CharacterIterator. |
279 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. |
280 | * If non-NULL, must refer to an initialized UText struct, which will then |
281 | * be reset to reference the specified character iterator. |
282 | * @param ci A Character Iterator. |
283 | * @param status Errors are returned here. |
284 | * @return Pointer to the UText. If a UText was supplied as input, this |
285 | * will always be used and returned. |
286 | * @see CharacterIterator |
287 | * @stable ICU 3.4 |
288 | */ |
289 | U_STABLE UText * U_EXPORT2 |
290 | utext_openCharacterIterator(UText *ut, icu::CharacterIterator *ci, UErrorCode *status); |
291 | |
292 | #endif |
293 | |
294 | |
295 | /** |
296 | * Clone a UText. This is much like opening a UText where the source text is itself |
297 | * another UText. |
298 | * |
299 | * A deep clone will copy both the UText data structures and the underlying text. |
300 | * The original and cloned UText will operate completely independently; modifications |
301 | * made to the text in one will not affect the other. Text providers are not |
302 | * required to support deep clones. The user of clone() must check the status return |
303 | * and be prepared to handle failures. |
304 | * |
305 | * The standard UText implementations for UTF8, UChar *, UnicodeString and |
306 | * Replaceable all support deep cloning. |
307 | * |
308 | * The UText returned from a deep clone will be writable, assuming that the text |
309 | * provider is able to support writing, even if the source UText had been made |
310 | * non-writable by means of utext_freeze(). |
311 | * |
312 | * A shallow clone replicates only the UText data structures; it does not make |
313 | * a copy of the underlying text. Shallow clones can be used as an efficient way to |
314 | * have multiple iterators active in a single text string that is not being |
315 | * modified. |
316 | * |
317 | * A shallow clone operation will not fail, barring truly exceptional conditions such |
318 | * as memory allocation failures. |
319 | * |
320 | * Shallow UText clones should be avoided if the UText functions that modify the |
321 | * text are expected to be used, either on the original or the cloned UText. |
322 | * Any such modifications can cause unpredictable behavior. Read-only |
323 | * shallow clones provide some protection against errors of this type by |
324 | * disabling text modification via the cloned UText. |
325 | * |
326 | * A shallow clone made with the readOnly parameter == FALSE will preserve the |
327 | * utext_isWritable() state of the source object. Note, however, that |
328 | * write operations must be avoided while more than one UText exists that refers |
329 | * to the same underlying text. |
330 | * |
331 | * A UText and its clone may be safely concurrently accessed by separate threads. |
332 | * This is true for read access only with shallow clones, and for both read and |
333 | * write access with deep clones. |
334 | * It is the responsibility of the Text Provider to ensure that this thread safety |
335 | * constraint is met. |
336 | * |
337 | * @param dest A UText struct to be filled in with the result of the clone operation, |
338 | * or NULL if the clone function should heap-allocate a new UText struct. |
339 | * If non-NULL, must refer to an already existing UText, which will then |
340 | * be reset to become the clone. |
341 | * @param src The UText to be cloned. |
342 | * @param deep TRUE to request a deep clone, FALSE for a shallow clone. |
343 | * @param readOnly TRUE to request that the cloned UText have read only access to the |
344 | * underlying text. |
345 | * |
346 | * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR |
347 | * will be returned if the text provider is unable to clone the |
348 | * original text. |
349 | * @return The newly created clone, or NULL if the clone operation failed. |
350 | * @stable ICU 3.4 |
351 | */ |
352 | U_STABLE UText * U_EXPORT2 |
353 | utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status); |
354 | |
355 | |
356 | /** |
357 | * Compare two UText objects for equality. |
358 | * UTexts are equal if they are iterating over the same text, and |
359 | * have the same iteration position within the text. |
360 | * If either or both of the parameters are NULL, the comparison is FALSE. |
361 | * |
362 | * @param a The first of the two UTexts to compare. |
363 | * @param b The other UText to be compared. |
364 | * @return TRUE if the two UTexts are equal; FALSE otherwise, including when either is NULL. |
365 | * @stable ICU 3.6 |
366 | */ |
367 | U_STABLE UBool U_EXPORT2 |
368 | utext_equals(const UText *a, const UText *b); |
369 | |
370 | |
371 | /***************************************************************************** |
372 | * |
373 | * Functions to work with the text represented by a UText wrapper |
374 | * |
375 | *****************************************************************************/ |
376 | |
377 | /** |
378 | * Get the length of the text. Depending on the characteristics |
379 | * of the underlying text representation, this may be expensive. |
380 | * Call utext_isLengthExpensive() to find out whether that could be the case. |
381 | * |
382 | * @see utext_isLengthExpensive() |
383 | * @param ut the text to be accessed. |
384 | * @return the length of the text, expressed in native units. |
385 | * |
386 | * @stable ICU 3.4 |
387 | */ |
388 | U_STABLE int64_t U_EXPORT2 |
389 | utext_nativeLength(UText *ut); |
390 | |
391 | /** |
392 | * Return TRUE if calculating the length of the text could be expensive. |
393 | * Finding the length of NUL-terminated strings is considered to be expensive. |
394 | * |
395 | * Note that the value returned by this function may change |
396 | * as the result of other operations on a UText. |
397 | * Once the length of a string has been discovered, it will no longer |
398 | * be expensive to report it. |
399 | * |
400 | * @param ut the text to be accessed. |
401 | * @return TRUE if determining the length of the text could be time consuming. |
402 | * @stable ICU 3.4 |
403 | */ |
404 | U_STABLE UBool U_EXPORT2 |
405 | utext_isLengthExpensive(const UText *ut); |
406 | |
407 | /** |
408 | * Returns the code point at the requested index, |
409 | * or U_SENTINEL (-1) if it is out of bounds. |
410 | * |
411 | * If the specified index points to the interior of a multi-unit |
412 | * character - one of the trail bytes of a UTF-8 sequence, for example - |
413 | * the complete code point will be returned. |
414 | * |
415 | * The iteration position will be set to the start of the returned code point. |
416 | * |
417 | * This function is roughly equivalent to the sequence |
418 | * utext_setNativeIndex(index); |
419 | * utext_current32(); |
420 | * (There is a subtle difference if the index is out of bounds by being less than zero - |
421 | * utext_setNativeIndex(negative value) sets the index to zero, after which utext_current32() |
422 | * will return the char at zero. utext_char32At(negative index), on the other hand, will |
423 | * return the U_SENTINEL value of -1.) |
424 | * |
425 | * @param ut the text to be accessed |
426 | * @param nativeIndex the native index of the character to be accessed. If the index points |
427 | * to other than the first unit of a multi-unit character, it will be adjusted |
428 | * to the start of the character. |
429 | * @return the code point at the specified index, or U_SENTINEL (-1) if the index is out of bounds. |
430 | * @stable ICU 3.4 |
431 | */ |
432 | U_STABLE UChar32 U_EXPORT2 |
433 | utext_char32At(UText *ut, int64_t nativeIndex); |
434 | |
435 | |
436 | /** |
437 | * Get the code point at the current iteration position, |
438 | * or U_SENTINEL (-1) if the iteration has reached the end of |
439 | * the input text. |
440 | * |
441 | * @param ut the text to be accessed. |
442 | * @return the Unicode code point at the current iterator position, |
443 | * or U_SENTINEL (-1) at the end of the input text. |
444 | * @stable ICU 3.4 |
445 | */ |
446 | U_STABLE UChar32 U_EXPORT2 |
447 | utext_current32(UText *ut); |
448 | |
449 | |
450 | /** |
451 | * Get the code point at the current iteration position of the UText, and |
452 | * advance the position to the first index following the character. |
453 | * |
454 | * If the position is at the end of the text (the index following |
455 | * the last character, which is also the length of the text), |
456 | * return U_SENTINEL (-1) and do not advance the index. |
457 | * |
458 | * This is a post-increment operation. |
459 | * |
460 | * An inline macro version of this function, UTEXT_NEXT32(), |
461 | * is available for performance critical use. |
462 | * |
463 | * @param ut the text to be accessed. |
464 | * @return the Unicode code point at the iteration position, or U_SENTINEL (-1) at the end of the text. |
465 | * @see UTEXT_NEXT32 |
466 | * @stable ICU 3.4 |
467 | */ |
468 | U_STABLE UChar32 U_EXPORT2 |
469 | utext_next32(UText *ut); |
470 | |
471 | |
472 | /** |
473 | * Move the iterator position to the character (code point) whose |
474 | * index precedes the current position, and return that character. |
475 | * This is a pre-decrement operation. |
476 | * |
477 | * If the initial position is at the start of the text (index of 0), |
478 | * return U_SENTINEL (-1) and leave the position unchanged. |
479 | * |
480 | * An inline macro version of this function, UTEXT_PREVIOUS32(), |
481 | * is available for performance critical use. |
482 | * |
483 | * @param ut the text to be accessed. |
484 | * @return the previous UChar32 code point, or U_SENTINEL (-1) |
485 | * if the iteration has reached the start of the text. |
486 | * @see UTEXT_PREVIOUS32 |
487 | * @stable ICU 3.4 |
488 | */ |
489 | U_STABLE UChar32 U_EXPORT2 |
490 | utext_previous32(UText *ut); |
491 | |
492 | |
493 | /** |
494 | * Set the iteration index and return the code point at that index. |
495 | * Leave the iteration index at the start of the following code point. |
496 | * |
497 | * This function is the most efficient and convenient way to |
498 | * begin a forward iteration. The results are identical to those |
499 | * from the sequence |
500 | * \code |
501 | * utext_setNativeIndex(); |
502 | * utext_next32(); |
503 | * \endcode |
504 | * |
505 | * @param ut the text to be accessed. |
506 | * @param nativeIndex Iteration index, in the native units of the text provider. |
507 | * @return Code point which starts at or before index, |
508 | * or U_SENTINEL (-1) if it is out of bounds. |
509 | * @stable ICU 3.4 |
510 | */ |
511 | U_STABLE UChar32 U_EXPORT2 |
512 | utext_next32From(UText *ut, int64_t nativeIndex); |
513 | |
514 | |
515 | |
516 | /** |
517 | * Set the iteration index, and return the code point preceding the |
518 | * one specified by the initial index. Leave the iteration position |
519 | * at the start of the returned code point. |
520 | * |
521 | * This function is the most efficient and convenient way to |
522 | * begin a backwards iteration. |
523 | * |
524 | * @param ut the text to be accessed. |
525 | * @param nativeIndex Iteration index, in the native units of the text provider. |
526 | * @return Code point preceding the one at the initial index, |
527 | * or U_SENTINEL (-1) if it is out of bounds. |
528 | * |
529 | * @stable ICU 3.4 |
530 | */ |
531 | U_STABLE UChar32 U_EXPORT2 |
532 | utext_previous32From(UText *ut, int64_t nativeIndex); |
533 | |
534 | /** |
535 | * Get the current iterator position, which can range from 0 to the length of the text. |
536 | * The position is a native index into the input text, in whatever format it |
537 | * may have (possibly UTF-8 for example), and may not always be the same as |
538 | * the corresponding UChar (UTF-16) index. |
539 | * The returned position will always be aligned to a code point boundary. |
540 | * An inline macro version of this function, UTEXT_GETNATIVEINDEX(), is also declared in this header. |
541 | * |
542 | * @param ut the text to be accessed. |
543 | * @return the current index position, in the native units of the text provider. |
544 | * @stable ICU 3.4 |
545 | */ |
546 | U_STABLE int64_t U_EXPORT2 |
547 | utext_getNativeIndex(const UText *ut); |
548 | |
549 | /** |
550 | * Set the current iteration position to the nearest code point |
551 | * boundary at or preceding the specified index. |
552 | * The index is in the native units of the original input text. |
553 | * If the index is out of range, it will be pinned to be within |
554 | * the range of the input text. |
555 | * <p> |
556 | * It will usually be more efficient to begin an iteration |
557 | * using the functions utext_next32From() or utext_previous32From() |
558 | * rather than utext_setNativeIndex(). |
559 | * <p> |
560 | * Moving the index position to an adjacent character is best done |
561 | * with utext_next32(), utext_previous32() or utext_moveIndex32(). |
562 | * Attempting to do direct arithmetic on the index position is |
563 | * complicated by the fact that the size (in native units) of a |
564 | * character depends on the underlying representation of the character |
565 | * (UTF-8, UTF-16, UTF-32, arbitrary codepage), and is not |
566 | * easily knowable. |
567 | * |
568 | * @param ut the text to be accessed. |
569 | * @param nativeIndex the native unit index of the new iteration position. |
570 | * @stable ICU 3.4 |
571 | */ |
572 | U_STABLE void U_EXPORT2 |
573 | utext_setNativeIndex(UText *ut, int64_t nativeIndex); |
574 | |
575 | /** |
576 | * Move the iterator position by delta code points. The number of code points |
577 | * is a signed number; a negative delta will move the iterator backwards, |
578 | * towards the start of the text. |
579 | * <p> |
580 | * The index is moved by <code>delta</code> code points |
581 | * forward or backward, but no further backward than to 0 and |
582 | * no further forward than to utext_nativeLength(). |
583 | * The resulting index value will be between 0 and the text length, inclusive. |
584 | * |
585 | * @param ut the text to be accessed. |
586 | * @param delta the signed number of code points to move the iteration position. |
587 | * @return TRUE if the position could be moved the requested number of positions while |
588 | * staying within the range [0, text length]; FALSE otherwise. |
589 | * @stable ICU 3.4 |
590 | */ |
591 | U_STABLE UBool U_EXPORT2 |
592 | utext_moveIndex32(UText *ut, int32_t delta); |
593 | |
594 | /** |
595 | * Get the native index of the character preceding the current position. |
596 | * If the iteration position is already at the start of the text, zero |
597 | * is returned. |
598 | * The value returned is the same as that obtained from the following sequence, |
599 | * but without the side effect of changing the iteration position. |
600 | * |
601 | * \code |
602 | * UText *ut = whatever; |
603 | * ... |
604 | * utext_previous32(ut); |
605 | * utext_getNativeIndex(ut); |
606 | * \endcode |
607 | * |
608 | * This function is most useful during forwards iteration, where it will get the |
609 | * native index of the character most recently returned from utext_next32(). |
610 | * |
611 | * @param ut the text to be accessed |
612 | * @return the native index of the character preceding the current index position, |
613 | * or zero if the current position is at the start of the text. |
614 | * @stable ICU 3.6 |
615 | */ |
616 | U_STABLE int64_t U_EXPORT2 |
617 | utext_getPreviousNativeIndex(UText *ut); |
618 | |
619 | |
620 | /** |
621 | * |
622 | * Extract text from a UText into a UChar buffer. The range of text to be extracted |
623 | * is specified in the native indices of the UText provider. These may not necessarily |
624 | * be UTF-16 indices. |
625 | * <p> |
626 | * The size (number of 16 bit UChars) of the data to be extracted is returned. The |
627 | * full number of UChars is returned, even when the extracted text is truncated |
628 | * because the specified buffer size is too small. |
629 | * <p> |
630 | * If there is sufficient space in the destination buffer, the extracted string is |
631 | * NUL-terminated: callers may rely on this, and text providers must guarantee it. |
632 | * This terminating NUL is not included in the returned length. |
633 | * <p> |
634 | * The iteration index is left at the position following the last extracted character. |
635 | * |
636 | * @param ut the UText from which to extract data. |
637 | * @param nativeStart the native index of the first character to extract. |
638 | * If the specified index is out of range, |
639 | * it will be pinned to be within 0 <= index <= textLength. |
640 | * @param nativeLimit the native string index of the position following the last |
641 | * character to extract. If the specified index is out of range, |
642 | * it will be pinned to be within 0 <= index <= textLength. |
643 | * nativeLimit must be >= nativeStart. |
644 | * @param dest the UChar (UTF-16) buffer into which the extracted text is placed |
645 | * @param destCapacity The size, in UChars, of the destination buffer. May be zero |
646 | * for precomputing the required size. |
647 | * @param status receives any error status. |
648 | * U_BUFFER_OVERFLOW_ERROR: the extracted text was truncated because the |
649 | * buffer was too small. Returns number of UChars for preflighting. |
650 | * @return Number of UChars in the data to be extracted. Does not include a trailing NUL. |
651 | * |
652 | * @stable ICU 3.4 |
653 | */ |
654 | U_STABLE int32_t U_EXPORT2 |
655 | utext_extract(UText *ut, |
656 | int64_t nativeStart, int64_t nativeLimit, |
657 | UChar *dest, int32_t destCapacity, |
658 | UErrorCode *status); |
659 | |
660 | |
661 | |
662 | /************************************************************************************ |
663 | * |
664 | * #define inline versions of selected performance-critical text access functions |
665 | * Caution: do not use auto increment++ or decrement-- expressions |
666 | * as parameters to these macros. |
667 | * |
668 | * For most use, where there is no extreme performance constraint, the |
669 | * normal, non-inline functions are a better choice. The resulting code |
670 | * will be smaller, and, if the need ever arises, easier to debug. |
671 | * |
672 | * These are implemented as #defines rather than real functions |
673 | * because there is no fully portable way to do inline functions in plain C. |
674 | * |
675 | ************************************************************************************/ |
676 | |
677 | #ifndef U_HIDE_INTERNAL_API |
678 | /** |
679 | * inline version of utext_current32(), for performance-critical situations. |
680 | * |
681 | * Get the code point at the current iteration position of the UText, or U_SENTINEL (-1) at the end of the text. |
682 | * When chunkOffset is below chunkLength and the code unit there is below 0xd800, that code unit is |
683 | * used directly; otherwise utext_current32() is called. ut is expanded several times: pass no side effects. |
684 | * |
685 | * @internal ICU 4.4 technology preview |
686 | */ |
687 | #define UTEXT_CURRENT32(ut) \ |
688 | ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? \ |
689 | ((ut)->chunkContents)[((ut)->chunkOffset)] : utext_current32(ut)) |
690 | #endif /* U_HIDE_INTERNAL_API */ |
691 | |
692 | /** |
693 | * inline version of utext_next32(), for performance-critical situations. |
694 | * |
695 | * Get the code point at the current iteration position of the UText, and |
696 | * advance the position to the first index following the character. |
697 | * This is a post-increment operation. Returns U_SENTINEL (-1) if the position is at the end of the text. |
698 | * When chunkOffset is below chunkLength and the code unit there is below 0xd800, chunkOffset is incremented |
699 | * in place; otherwise utext_next32() is called. ut is expanded several times: pass no side effects. |
700 | * |
701 | * @stable ICU 3.4 |
702 | */ |
703 | #define UTEXT_NEXT32(ut) \ |
704 | ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? \ |
705 | ((ut)->chunkContents)[((ut)->chunkOffset)++] : utext_next32(ut)) |
706 | |
707 | /** |
708 | * inline version of utext_previous32(), for performance-critical situations. |
709 | * |
710 | * Move the iterator position to the character (code point) whose |
711 | * index precedes the current position, and return that character. |
712 | * This is a pre-decrement operation. Returns U_SENTINEL (-1) if the position is at the start of the text. |
713 | * When chunkOffset is above 0 and the preceding code unit is below 0xd800, chunkOffset is decremented |
714 | * in place; otherwise utext_previous32() is called. ut is expanded several times: pass no side effects. |
715 | * @stable ICU 3.4 |
716 | */ |
717 | #define UTEXT_PREVIOUS32(ut) \ |
718 | ((ut)->chunkOffset > 0 && \ |
719 | (ut)->chunkContents[(ut)->chunkOffset-1] < 0xd800 ? \ |
720 | (ut)->chunkContents[--((ut)->chunkOffset)] : utext_previous32(ut)) |
721 | |
/**
 * Macro form of utext_getNativeIndex(), intended for performance-critical code.
 *
 * Yields the current iteration position, which ranges from 0 to the length
 * of the text. The position is a native index into the input text, in
 * whatever format that text has (possibly UTF-8, for example), and is not
 * necessarily equal to the corresponding UChar (UTF-16) index.
 * The returned position is always aligned to a code point boundary.
 *
 * While the chunk offset does not exceed nativeIndexingLimit the native
 * index is computed by simple addition; otherwise the provider's
 * mapOffsetToNative() function is called.
 *
 * The argument is expanded several times, so it must be free of side effects.
 *
 * @stable ICU 3.6
 */
#define UTEXT_GETNATIVEINDEX(ut) \
    (((ut)->chunkOffset <= (ut)->nativeIndexingLimit) \
         ? ((ut)->chunkNativeStart + (ut)->chunkOffset) \
         : (ut)->pFuncs->mapOffsetToNative(ut))
738 | |
/**
 * Macro form of utext_setNativeIndex(), intended for performance-critical code.
 *
 * Sets the current iteration position to the nearest code point
 * boundary at or preceding the specified index.
 * The index is in the native units of the original input text.
 * If the index is out of range, it will be pinned to be within
 * the range of the input text.
 *
 * Fast path: when the requested index falls inside the part of the current
 * chunk where native and UTF-16 indexing coincide, and the code unit at that
 * position is below 0xdc00, chunkOffset is updated directly. Every other
 * case is delegated to utext_setNativeIndex().
 *
 * Both arguments are evaluated exactly once, so expressions with side
 * effects (for example `i++`) are safe to pass.
 *
 * @param ut the UText whose iteration position is to be set.
 * @param ix the requested native index.
 *
 * @stable ICU 3.8
 */
#define UTEXT_SETNATIVEINDEX(ut, ix) UPRV_BLOCK_MACRO_BEGIN { \
    /* Capture the arguments once; the names avoid reserved "__" identifiers. */ \
    UText *utextSni_ut_ = (ut); \
    int64_t utextSni_ix_ = (ix); \
    int64_t utextSni_off_ = utextSni_ix_ - utextSni_ut_->chunkNativeStart; \
    if (utextSni_off_ >= 0 && \
        utextSni_off_ < (int64_t)utextSni_ut_->nativeIndexingLimit && \
        utextSni_ut_->chunkContents[utextSni_off_] < 0xdc00) { \
        utextSni_ut_->chunkOffset = (int32_t)utextSni_off_; \
    } else { \
        utext_setNativeIndex(utextSni_ut_, utextSni_ix_); \
    } \
} UPRV_BLOCK_MACRO_END
758 | |
759 | |
760 | |
761 | /************************************************************************************ |
762 | * |
763 | * Functions related to writing or modifying the text. |
764 | * These will work only with modifiable UTexts. Attempting to |
765 | * modify a read-only UText will return an error status. |
766 | * |
767 | ************************************************************************************/ |
768 | |
769 | |
770 | /** |
771 | * Return TRUE if the text can be written (modified) with utext_replace() or |
772 | * utext_copy(). For the text to be writable, the text provider must |
773 | * be of a type that supports writing and the UText must not be frozen. |
774 | * |
775 | * Attempting to modify text when utext_isWriteable() is FALSE will fail - |
776 | * the text will not be modified, and an error will be returned from the function |
777 | * that attempted the modification. |
778 | * |
779 | * @param ut the UText to be tested. |
780 | * @return TRUE if the text is modifiable. |
781 | * |
782 | * @see utext_freeze() |
783 | * @see utext_replace() |
784 | * @see utext_copy() |
785 | * @stable ICU 3.4 |
786 | * |
787 | */ |
788 | U_STABLE UBool U_EXPORT2 |
789 | utext_isWritable(const UText *ut); |
790 | |
791 | |
792 | /** |
793 | * Test whether there is meta data associated with the text. |
794 | * @see Replaceable::hasMetaData() |
795 | * |
796 | * @param ut The UText to be tested |
797 | * @return TRUE if the underlying text includes meta data. |
798 | * @stable ICU 3.4 |
799 | */ |
800 | U_STABLE UBool U_EXPORT2 |
801 | utext_hasMetaData(const UText *ut); |
802 | |
803 | |
804 | /** |
805 | * Replace a range of the original text with a replacement text. |
806 | * |
807 | * Leaves the current iteration position at the position following the |
808 | * newly inserted replacement text. |
809 | * |
810 | * This function is only available on UText types that support writing, |
811 | * that is, ones where utext_isWritable() returns TRUE. |
812 | * |
813 | * When using this function, there should be only a single UText opened onto the |
814 | * underlying native text string. Behavior after a replace operation |
815 | * on a UText is undefined for any other additional UTexts that refer to the |
816 | * modified string. |
817 | * |
818 | * @param ut the UText representing the text to be operated on. |
819 | * @param nativeStart the native index of the start of the region to be replaced |
820 | * @param nativeLimit the native index of the character following the region to be replaced. |
821 | * @param replacementText pointer to the replacement text |
822 | * @param replacementLength length of the replacement text, or -1 if the text is NUL terminated. |
823 | * @param status receives any error status. Possible errors include |
824 | * U_NO_WRITE_PERMISSION |
825 | * |
826 | * @return The signed number of (native) storage units by which |
827 | * the length of the text expanded or contracted. |
828 | * |
829 | * @stable ICU 3.4 |
830 | */ |
831 | U_STABLE int32_t U_EXPORT2 |
832 | utext_replace(UText *ut, |
833 | int64_t nativeStart, int64_t nativeLimit, |
834 | const UChar *replacementText, int32_t replacementLength, |
835 | UErrorCode *status); |
836 | |
837 | |
838 | |
839 | /** |
840 | * |
841 | * Copy or move a substring from one position to another within the text, |
842 | * while retaining any metadata associated with the text. |
843 | * This function is used to duplicate or reorder substrings. |
844 | * The destination index must not overlap the source range. |
845 | * |
846 | * The text to be copied or moved is inserted at destIndex; |
847 | * it does not replace or overwrite any existing text. |
848 | * |
849 | * The iteration position is left following the newly inserted text |
850 | * at the destination position. |
851 | * |
852 | * This function is only available on UText types that support writing, |
853 | * that is, ones where utext_isWritable() returns TRUE. |
854 | * |
855 | * When using this function, there should be only a single UText opened onto the |
856 | * underlying native text string. Behavior after a copy operation |
857 | * on a UText is undefined in any other additional UTexts that refer to the |
858 | * modified string. |
859 | * |
860 | * @param ut The UText representing the text to be operated on. |
861 | * @param nativeStart The native index of the start of the region to be copied or moved |
862 | * @param nativeLimit The native index of the character position following the region |
863 | * to be copied. |
864 | * @param destIndex The native destination index to which the source substring is |
865 | * copied or moved. |
866 | * @param move If TRUE, then the substring is moved, not copied/duplicated. |
867 | * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION |
868 | * |
869 | * @stable ICU 3.4 |
870 | */ |
871 | U_STABLE void U_EXPORT2 |
872 | utext_copy(UText *ut, |
873 | int64_t nativeStart, int64_t nativeLimit, |
874 | int64_t destIndex, |
875 | UBool move, |
876 | UErrorCode *status); |
877 | |
878 | |
879 | /** |
880 | * <p> |
881 | * Freeze a UText. This prevents any modification to the underlying text itself |
882 | * by means of functions operating on this UText. |
883 | * </p> |
884 | * <p> |
885 | * Once frozen, a UText can not be unfrozen. The intent is to ensure |
886 | * that a the text underlying a frozen UText wrapper cannot be modified via that UText. |
887 | * </p> |
888 | * <p> |
889 | * Caution: freezing a UText will disable changes made via the specific |
890 | * frozen UText wrapper only; it will not have any effect on the ability to |
891 | * directly modify the text by bypassing the UText. Any such backdoor modifications |
892 | * are always an error while UText access is occurring because the underlying |
893 | * text can get out of sync with UText's buffering. |
894 | * </p> |
895 | * |
896 | * @param ut The UText to be frozen. |
897 | * @see utext_isWritable() |
898 | * @stable ICU 3.6 |
899 | */ |
900 | U_STABLE void U_EXPORT2 |
901 | utext_freeze(UText *ut); |
902 | |
903 | |
/**
 * UText provider properties (bit field indexes).
 *
 * @see UText
 * @stable ICU 3.4
 */
enum {
    /**
     * Determining the length of the text is potentially time consuming
     * for the provider.
     * @stable ICU 3.4
     */
    UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE = 1,

    /**
     * Text chunks stay valid and usable until the text object is modified
     * or deleted, rather than only until the next call to the access()
     * function (which is the default).
     * @stable ICU 3.4
     */
    UTEXT_PROVIDER_STABLE_CHUNKS       = 2,

    /**
     * The provider supports modification of the text through the
     * replace() and copy() functions.
     * @see Replaceable
     * @stable ICU 3.4
     */
    UTEXT_PROVIDER_WRITABLE            = 3,

    /**
     * Meta data is associated with the text.
     * @see Replaceable::hasMetaData()
     * @stable ICU 3.4
     */
    UTEXT_PROVIDER_HAS_META_DATA       = 4,

    /**
     * The text provider owns the text storage.
     * This generally occurs as the result of a deep clone of the UText.
     * When the UText is closed, the associated text must also be
     * closed/deleted/freed/whatever is appropriate.
     * @stable ICU 3.6
     */
    UTEXT_PROVIDER_OWNS_TEXT           = 5
};
945 | |
946 | /** |
947 | * Function type declaration for UText.clone(). |
948 | * |
949 | * clone a UText. Much like opening a UText where the source text is itself |
950 | * another UText. |
951 | * |
952 | * A deep clone will copy both the UText data structures and the underlying text. |
953 | * The original and cloned UText will operate completely independently; modifications |
954 | * made to the text in one will not effect the other. Text providers are not |
955 | * required to support deep clones. The user of clone() must check the status return |
956 | * and be prepared to handle failures. |
957 | * |
958 | * A shallow clone replicates only the UText data structures; it does not make |
959 | * a copy of the underlying text. Shallow clones can be used as an efficient way to |
960 | * have multiple iterators active in a single text string that is not being |
961 | * modified. |
962 | * |
963 | * A shallow clone operation must not fail except for truly exceptional conditions such |
964 | * as memory allocation failures. |
965 | * |
966 | * A UText and its clone may be safely concurrently accessed by separate threads. |
967 | * This is true for both shallow and deep clones. |
968 | * It is the responsibility of the Text Provider to ensure that this thread safety |
969 | * constraint is met. |
970 | |
971 | * |
972 | * @param dest A UText struct to be filled in with the result of the clone operation, |
973 | * or NULL if the clone function should heap-allocate a new UText struct. |
974 | * @param src The UText to be cloned. |
975 | * @param deep TRUE to request a deep clone, FALSE for a shallow clone. |
976 | * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR |
977 | * should be returned if the text provider is unable to clone the |
978 | * original text. |
979 | * @return The newly created clone, or NULL if the clone operation failed. |
980 | * |
981 | * @stable ICU 3.4 |
982 | */ |
983 | typedef UText * U_CALLCONV |
984 | UTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status); |
985 | |
986 | |
987 | /** |
988 | * Function type declaration for UText.nativeLength(). |
989 | * |
990 | * @param ut the UText to get the length of. |
991 | * @return the length, in the native units of the original text string. |
992 | * @see UText |
993 | * @stable ICU 3.4 |
994 | */ |
995 | typedef int64_t U_CALLCONV |
996 | UTextNativeLength(UText *ut); |
997 | |
998 | /** |
999 | * Function type declaration for UText.access(). Get the description of the text chunk |
1000 | * containing the text at a requested native index. The UText's iteration |
1001 | * position will be left at the requested index. If the index is out |
1002 | * of bounds, the iteration position will be left at the start or end |
1003 | * of the string, as appropriate. |
1004 | * |
1005 | * Chunks must begin and end on code point boundaries. A single code point |
1006 | * comprised of multiple storage units must never span a chunk boundary. |
1007 | * |
1008 | * |
1009 | * @param ut the UText being accessed. |
1010 | * @param nativeIndex Requested index of the text to be accessed. |
1011 | * @param forward If TRUE, then the returned chunk must contain text |
1012 | * starting from the index, so that start<=index<limit. |
1013 | * If FALSE, then the returned chunk must contain text |
1014 | * before the index, so that start<index<=limit. |
1015 | * @return True if the requested index could be accessed. The chunk |
1016 | * will contain the requested text. |
1017 | * False value if a chunk cannot be accessed |
1018 | * (the requested index is out of bounds). |
1019 | * |
1020 | * @see UText |
1021 | * @stable ICU 3.4 |
1022 | */ |
1023 | typedef UBool U_CALLCONV |
1024 | UTextAccess(UText *ut, int64_t nativeIndex, UBool forward); |
1025 | |
1026 | /** |
1027 | * Function type declaration for UText.extract(). |
1028 | * |
1029 | * Extract text from a UText into a UChar buffer. The range of text to be extracted |
1030 | * is specified in the native indices of the UText provider. These may not necessarily |
1031 | * be UTF-16 indices. |
1032 | * <p> |
1033 | * The size (number of 16 bit UChars) in the data to be extracted is returned. The |
1034 | * full amount is returned, even when the specified buffer size is smaller. |
1035 | * <p> |
1036 | * The extracted string will (if you are a user) / must (if you are a text provider) |
1037 | * be NUL-terminated if there is sufficient space in the destination buffer. |
1038 | * |
1039 | * @param ut the UText from which to extract data. |
1040 | * @param nativeStart the native index of the first character to extract. |
1041 | * @param nativeLimit the native string index of the position following the last |
1042 | * character to extract. |
1043 | * @param dest the UChar (UTF-16) buffer into which the extracted text is placed |
1044 | * @param destCapacity The size, in UChars, of the destination buffer. May be zero |
1045 | * for precomputing the required size. |
1046 | * @param status receives any error status. |
1047 | * If U_BUFFER_OVERFLOW_ERROR: Returns number of UChars for |
1048 | * preflighting. |
1049 | * @return Number of UChars in the data. Does not include a trailing NUL. |
1050 | * |
1051 | * @stable ICU 3.4 |
1052 | */ |
1053 | typedef int32_t U_CALLCONV |
1054 | (UText *ut, |
1055 | int64_t nativeStart, int64_t nativeLimit, |
1056 | UChar *dest, int32_t destCapacity, |
1057 | UErrorCode *status); |
1058 | |
1059 | /** |
1060 | * Function type declaration for UText.replace(). |
1061 | * |
1062 | * Replace a range of the original text with a replacement text. |
1063 | * |
1064 | * Leaves the current iteration position at the position following the |
1065 | * newly inserted replacement text. |
1066 | * |
1067 | * This function need only be implemented on UText types that support writing. |
1068 | * |
1069 | * When using this function, there should be only a single UText opened onto the |
1070 | * underlying native text string. The function is responsible for updating the |
1071 | * text chunk within the UText to reflect the updated iteration position, |
1072 | * taking into account any changes to the underlying string's structure caused |
1073 | * by the replace operation. |
1074 | * |
1075 | * @param ut the UText representing the text to be operated on. |
1076 | * @param nativeStart the index of the start of the region to be replaced |
1077 | * @param nativeLimit the index of the character following the region to be replaced. |
1078 | * @param replacementText pointer to the replacement text |
1079 | * @param replacmentLength length of the replacement text in UChars, or -1 if the text is NUL terminated. |
1080 | * @param status receives any error status. Possible errors include |
1081 | * U_NO_WRITE_PERMISSION |
1082 | * |
1083 | * @return The signed number of (native) storage units by which |
1084 | * the length of the text expanded or contracted. |
1085 | * |
1086 | * @stable ICU 3.4 |
1087 | */ |
1088 | typedef int32_t U_CALLCONV |
1089 | UTextReplace(UText *ut, |
1090 | int64_t nativeStart, int64_t nativeLimit, |
1091 | const UChar *replacementText, int32_t replacmentLength, |
1092 | UErrorCode *status); |
1093 | |
1094 | /** |
1095 | * Function type declaration for UText.copy(). |
1096 | * |
1097 | * Copy or move a substring from one position to another within the text, |
1098 | * while retaining any metadata associated with the text. |
1099 | * This function is used to duplicate or reorder substrings. |
1100 | * The destination index must not overlap the source range. |
1101 | * |
1102 | * The text to be copied or moved is inserted at destIndex; |
1103 | * it does not replace or overwrite any existing text. |
1104 | * |
1105 | * This function need only be implemented for UText types that support writing. |
1106 | * |
1107 | * When using this function, there should be only a single UText opened onto the |
1108 | * underlying native text string. The function is responsible for updating the |
1109 | * text chunk within the UText to reflect the updated iteration position, |
1110 | * taking into account any changes to the underlying string's structure caused |
1111 | * by the replace operation. |
1112 | * |
1113 | * @param ut The UText representing the text to be operated on. |
1114 | * @param nativeStart The index of the start of the region to be copied or moved |
1115 | * @param nativeLimit The index of the character following the region to be replaced. |
1116 | * @param nativeDest The destination index to which the source substring is copied or moved. |
1117 | * @param move If TRUE, then the substring is moved, not copied/duplicated. |
1118 | * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION |
1119 | * |
1120 | * @stable ICU 3.4 |
1121 | */ |
1122 | typedef void U_CALLCONV |
1123 | UTextCopy(UText *ut, |
1124 | int64_t nativeStart, int64_t nativeLimit, |
1125 | int64_t nativeDest, |
1126 | UBool move, |
1127 | UErrorCode *status); |
1128 | |
1129 | /** |
1130 | * Function type declaration for UText.mapOffsetToNative(). |
1131 | * Map from the current UChar offset within the current text chunk to |
1132 | * the corresponding native index in the original source text. |
1133 | * |
1134 | * This is required only for text providers that do not use native UTF-16 indexes. |
1135 | * |
1136 | * @param ut the UText. |
1137 | * @return Absolute (native) index corresponding to chunkOffset in the current chunk. |
1138 | * The returned native index should always be to a code point boundary. |
1139 | * |
1140 | * @stable ICU 3.4 |
1141 | */ |
1142 | typedef int64_t U_CALLCONV |
1143 | UTextMapOffsetToNative(const UText *ut); |
1144 | |
1145 | /** |
1146 | * Function type declaration for UText.mapIndexToUTF16(). |
1147 | * Map from a native index to a UChar offset within a text chunk. |
1148 | * Behavior is undefined if the native index does not fall within the |
1149 | * current chunk. |
1150 | * |
1151 | * This function is required only for text providers that do not use native UTF-16 indexes. |
1152 | * |
1153 | * @param ut The UText containing the text chunk. |
1154 | * @param nativeIndex Absolute (native) text index, chunk->start<=index<=chunk->limit. |
1155 | * @return Chunk-relative UTF-16 offset corresponding to the specified native |
1156 | * index. |
1157 | * |
1158 | * @stable ICU 3.4 |
1159 | */ |
1160 | typedef int32_t U_CALLCONV |
1161 | UTextMapNativeIndexToUTF16(const UText *ut, int64_t nativeIndex); |
1162 | |
1163 | |
1164 | /** |
1165 | * Function type declaration for UText.utextClose(). |
1166 | * |
1167 | * A Text Provider close function is only required for provider types that make |
1168 | * allocations in their open function (or other functions) that must be |
1169 | * cleaned when the UText is closed. |
1170 | * |
1171 | * The allocation of the UText struct itself and any "extra" storage |
1172 | * associated with the UText is handled by the common UText implementation |
1173 | * and does not require provider specific cleanup in a close function. |
1174 | * |
1175 | * Most UText provider implementations do not need to implement this function. |
1176 | * |
1177 | * @param ut A UText object to be closed. |
1178 | * |
1179 | * @stable ICU 3.4 |
1180 | */ |
1181 | typedef void U_CALLCONV |
1182 | UTextClose(UText *ut); |
1183 | |
1184 | |
1185 | /** |
1186 | * (public) Function dispatch table for UText. |
1187 | * Conceptually very much like a C++ Virtual Function Table. |
1188 | * This struct defines the organization of the table. |
1189 | * Each text provider implementation must provide an |
1190 | * actual table that is initialized with the appropriate functions |
1191 | * for the type of text being handled. |
1192 | * @stable ICU 3.6 |
1193 | */ |
1194 | struct UTextFuncs { |
1195 | /** |
1196 | * (public) Function table size, sizeof(UTextFuncs) |
1197 | * Intended for use should the table grow to accommodate added |
1198 | * functions in the future, to allow tests for older format |
1199 | * function tables that do not contain the extensions. |
1200 | * |
1201 | * Fields are placed for optimal alignment on |
1202 | * 32/64/128-bit-pointer machines, by normally grouping together |
1203 | * 4 32-bit fields, |
1204 | * 4 pointers, |
1205 | * 2 64-bit fields |
1206 | * in sequence. |
1207 | * @stable ICU 3.6 |
1208 | */ |
1209 | int32_t tableSize; |
1210 | |
1211 | /** |
1212 | * (private) Alignment padding. |
1213 | * Do not use, reserved for use by the UText framework only. |
1214 | * @internal |
1215 | */ |
1216 | int32_t reserved1, /** @internal */ reserved2, /** @internal */ reserved3; |
1217 | |
1218 | |
1219 | /** |
1220 | * (public) Function pointer for UTextClone |
1221 | * |
1222 | * @see UTextClone |
1223 | * @stable ICU 3.6 |
1224 | */ |
1225 | UTextClone *clone; |
1226 | |
1227 | /** |
1228 | * (public) function pointer for UTextLength |
1229 | * May be expensive to compute! |
1230 | * |
1231 | * @see UTextLength |
1232 | * @stable ICU 3.6 |
1233 | */ |
1234 | UTextNativeLength *nativeLength; |
1235 | |
1236 | /** |
1237 | * (public) Function pointer for UTextAccess. |
1238 | * |
1239 | * @see UTextAccess |
1240 | * @stable ICU 3.6 |
1241 | */ |
1242 | UTextAccess *access; |
1243 | |
1244 | /** |
1245 | * (public) Function pointer for UTextExtract. |
1246 | * |
1247 | * @see UTextExtract |
1248 | * @stable ICU 3.6 |
1249 | */ |
1250 | UTextExtract *; |
1251 | |
1252 | /** |
1253 | * (public) Function pointer for UTextReplace. |
1254 | * |
1255 | * @see UTextReplace |
1256 | * @stable ICU 3.6 |
1257 | */ |
1258 | UTextReplace *replace; |
1259 | |
1260 | /** |
1261 | * (public) Function pointer for UTextCopy. |
1262 | * |
1263 | * @see UTextCopy |
1264 | * @stable ICU 3.6 |
1265 | */ |
1266 | UTextCopy *copy; |
1267 | |
1268 | /** |
1269 | * (public) Function pointer for UTextMapOffsetToNative. |
1270 | * |
1271 | * @see UTextMapOffsetToNative |
1272 | * @stable ICU 3.6 |
1273 | */ |
1274 | UTextMapOffsetToNative *mapOffsetToNative; |
1275 | |
1276 | /** |
1277 | * (public) Function pointer for UTextMapNativeIndexToUTF16. |
1278 | * |
1279 | * @see UTextMapNativeIndexToUTF16 |
1280 | * @stable ICU 3.6 |
1281 | */ |
1282 | UTextMapNativeIndexToUTF16 *mapNativeIndexToUTF16; |
1283 | |
1284 | /** |
1285 | * (public) Function pointer for UTextClose. |
1286 | * |
1287 | * @see UTextClose |
1288 | * @stable ICU 3.6 |
1289 | */ |
1290 | UTextClose *close; |
1291 | |
1292 | /** |
1293 | * (private) Spare function pointer |
1294 | * @internal |
1295 | */ |
1296 | UTextClose *spare1; |
1297 | |
1298 | /** |
1299 | * (private) Spare function pointer |
1300 | * @internal |
1301 | */ |
1302 | UTextClose *spare2; |
1303 | |
1304 | /** |
1305 | * (private) Spare function pointer |
1306 | * @internal |
1307 | */ |
1308 | UTextClose *spare3; |
1309 | |
1310 | }; |
/**
 * Convenience name for struct UTextFuncs, the function dispatch table for UText.
 * @see UTextFuncs
 */
typedef struct UTextFuncs
    UTextFuncs;
1316 | |
1317 | /** |
1318 | * UText struct. Provides the interface between the generic UText access code |
1319 | * and the UText provider code that works on specific kinds of |
1320 | * text (UTF-8, noncontiguous UTF-16, whatever.) |
1321 | * |
1322 | * Applications that are using predefined types of text providers |
1323 | * to pass text data to ICU services will have no need to view the |
1324 | * internals of the UText structs that they open. |
1325 | * |
1326 | * @stable ICU 3.6 |
1327 | */ |
1328 | struct UText { |
1329 | /** |
1330 | * (private) Magic. Used to help detect when UText functions are handed |
1331 | * invalid or uninitialized UText structs. |
1332 | * utext_openXYZ() functions take an initialized, |
1333 | * but not necessarily open, UText struct as an |
1334 | * optional fill-in parameter. This magic field |
1335 | * is used to check for that initialization. |
1336 | * Text provider close functions must NOT clear |
1337 | * the magic field because that would prevent |
1338 | * reuse of the UText struct. |
1339 | * @internal |
1340 | */ |
1341 | uint32_t magic; |
1342 | |
1343 | |
1344 | /** |
1345 | * (private) Flags for managing the allocation and freeing of |
1346 | * memory associated with this UText. |
1347 | * @internal |
1348 | */ |
1349 | int32_t flags; |
1350 | |
1351 | |
1352 | /** |
1353 | * Text provider properties. This set of flags is maintained by the |
1354 | * text provider implementation. |
1355 | * @stable ICU 3.4 |
1356 | */ |
1357 | int32_t providerProperties; |
1358 | |
1359 | /** |
1360 | * (public) sizeOfStruct=sizeof(UText) |
1361 | * Allows possible backward compatible extension. |
1362 | * |
1363 | * @stable ICU 3.4 |
1364 | */ |
1365 | int32_t sizeOfStruct; |
1366 | |
1367 | /* ------ 16 byte alignment boundary ----------- */ |
1368 | |
1369 | |
1370 | /** |
1371 | * (protected) Native index of the first character position following |
1372 | * the current chunk. |
1373 | * @stable ICU 3.6 |
1374 | */ |
1375 | int64_t chunkNativeLimit; |
1376 | |
1377 | /** |
1378 | * (protected) Size in bytes of the extra space (pExtra). |
1379 | * @stable ICU 3.4 |
1380 | */ |
1381 | int32_t ; |
1382 | |
1383 | /** |
1384 | * (protected) The highest chunk offset where native indexing and |
1385 | * chunk (UTF-16) indexing correspond. For UTF-16 sources, value |
1386 | * will be equal to chunkLength. |
1387 | * |
1388 | * @stable ICU 3.6 |
1389 | */ |
1390 | int32_t nativeIndexingLimit; |
1391 | |
1392 | /* ---- 16 byte alignment boundary------ */ |
1393 | |
1394 | /** |
1395 | * (protected) Native index of the first character in the text chunk. |
1396 | * @stable ICU 3.6 |
1397 | */ |
1398 | int64_t chunkNativeStart; |
1399 | |
1400 | /** |
1401 | * (protected) Current iteration position within the text chunk (UTF-16 buffer). |
1402 | * This is the index to the character that will be returned by utext_next32(). |
1403 | * @stable ICU 3.6 |
1404 | */ |
1405 | int32_t chunkOffset; |
1406 | |
1407 | /** |
1408 | * (protected) Length the text chunk (UTF-16 buffer), in UChars. |
1409 | * @stable ICU 3.6 |
1410 | */ |
1411 | int32_t chunkLength; |
1412 | |
1413 | /* ---- 16 byte alignment boundary-- */ |
1414 | |
1415 | |
1416 | /** |
1417 | * (protected) pointer to a chunk of text in UTF-16 format. |
1418 | * May refer either to original storage of the source of the text, or |
1419 | * if conversion was required, to a buffer owned by the UText. |
1420 | * @stable ICU 3.6 |
1421 | */ |
1422 | const UChar *chunkContents; |
1423 | |
1424 | /** |
1425 | * (public) Pointer to Dispatch table for accessing functions for this UText. |
1426 | * @stable ICU 3.6 |
1427 | */ |
1428 | const UTextFuncs *pFuncs; |
1429 | |
1430 | /** |
1431 | * (protected) Pointer to additional space requested by the |
1432 | * text provider during the utext_open operation. |
1433 | * @stable ICU 3.4 |
1434 | */ |
1435 | void *; |
1436 | |
1437 | /** |
1438 | * (protected) Pointer to string or text-containing object or similar. |
1439 | * This is the source of the text that this UText is wrapping, in a format |
1440 | * that is known to the text provider functions. |
1441 | * @stable ICU 3.4 |
1442 | */ |
1443 | const void *context; |
1444 | |
1445 | /* --- 16 byte alignment boundary--- */ |
1446 | |
1447 | /** |
1448 | * (protected) Pointer fields available for use by the text provider. |
1449 | * Not used by UText common code. |
1450 | * @stable ICU 3.6 |
1451 | */ |
1452 | const void *p; |
1453 | /** |
1454 | * (protected) Pointer fields available for use by the text provider. |
1455 | * Not used by UText common code. |
1456 | * @stable ICU 3.6 |
1457 | */ |
1458 | const void *q; |
1459 | /** |
1460 | * (protected) Pointer fields available for use by the text provider. |
1461 | * Not used by UText common code. |
1462 | * @stable ICU 3.6 |
1463 | */ |
1464 | const void *r; |
1465 | |
1466 | /** |
1467 | * Private field reserved for future use by the UText framework |
1468 | * itself. This is not to be touched by the text providers. |
1469 | * @internal ICU 3.4 |
1470 | */ |
1471 | void *privP; |
1472 | |
1473 | |
1474 | /* --- 16 byte alignment boundary--- */ |
1475 | |
1476 | |
1477 | /** |
1478 | * (protected) Integer field reserved for use by the text provider. |
1479 | * Not used by the UText framework, or by the client (user) of the UText. |
1480 | * @stable ICU 3.4 |
1481 | */ |
1482 | int64_t a; |
1483 | |
1484 | /** |
1485 | * (protected) Integer field reserved for use by the text provider. |
1486 | * Not used by the UText framework, or by the client (user) of the UText. |
1487 | * @stable ICU 3.4 |
1488 | */ |
1489 | int32_t b; |
1490 | |
1491 | /** |
1492 | * (protected) Integer field reserved for use by the text provider. |
1493 | * Not used by the UText framework, or by the client (user) of the UText. |
1494 | * @stable ICU 3.4 |
1495 | */ |
1496 | int32_t c; |
1497 | |
1498 | /* ---- 16 byte alignment boundary---- */ |
1499 | |
1500 | |
1501 | /** |
1502 | * Private field reserved for future use by the UText framework |
1503 | * itself. This is not to be touched by the text providers. |
1504 | * @internal ICU 3.4 |
1505 | */ |
1506 | int64_t privA; |
1507 | /** |
1508 | * Private field reserved for future use by the UText framework |
1509 | * itself. This is not to be touched by the text providers. |
1510 | * @internal ICU 3.4 |
1511 | */ |
1512 | int32_t privB; |
1513 | /** |
1514 | * Private field reserved for future use by the UText framework |
1515 | * itself. This is not to be touched by the text providers. |
1516 | * @internal ICU 3.4 |
1517 | */ |
1518 | int32_t privC; |
1519 | }; |
1520 | |
1521 | |
1522 | /** |
1523 | * Common function for use by Text Provider implementations to allocate and/or initialize |
1524 | * a new UText struct. To be called in the implementation of utext_open() functions. |
1525 | * If the supplied UText parameter is null, a new UText struct will be allocated on the heap. |
1526 | * If the supplied UText is already open, the provider's close function will be called |
1527 | * so that the struct can be reused by the open that is in progress. |
1528 | * |
1529 | * @param ut pointer to a UText struct to be re-used, or null if a new UText |
1530 | * should be allocated. |
1531 | * @param extraSpace The amount of additional space to be allocated as part |
1532 | * of this UText, for use by types of providers that require |
1533 | * additional storage. |
1534 | * @param status Errors are returned here. |
1535 | * @return pointer to the UText, allocated if necessary, with extra space set up if requested. |
1536 | * @stable ICU 3.4 |
1537 | */ |
1538 | U_STABLE UText * U_EXPORT2 |
1539 | utext_setup(UText *ut, int32_t , UErrorCode *status); |
1540 | |
// do not use #ifndef U_HIDE_INTERNAL_API around the following!
/**
 * @internal
 * Marker value that helps recognize UText structs that have been
 * correctly initialized.
 * Note: this has to stay publicly visible, because UTEXT_INITIALIZER
 *       refers to it.
 */
enum { UTEXT_MAGIC = 0x345ad82c };
1550 | |
/**
 * Initializer for local (stack) instances of a UText struct.
 * A UText struct must be initialized before it is passed to
 * one of the utext_open functions.
 *
 * The values are positional; the trailing comment on each row names the
 * struct field(s) that the row initializes.
 *
 * @stable ICU 3.6
 */
#define UTEXT_INITIALIZER {                                                    \
    UTEXT_MAGIC,       /* magic                                             */ \
    0, 0,              /* flags, providerProps                              */ \
    sizeof(UText),     /* sizeOfStruct                                      */ \
    0, 0, 0,           /* chunkNativeLimit, extraSize, nativeIndexingLimit  */ \
    0, 0, 0,           /* chunkNativeStart, chunkOffset, chunkLength        */ \
    NULL, NULL,        /* chunkContents, pFuncs                             */ \
    NULL, NULL,        /* pExtra, context                                   */ \
    NULL, NULL, NULL,  /* p, q, r                                           */ \
    NULL,              /* privP                                             */ \
    0, 0, 0,           /* a, b, c                                           */ \
    0, 0, 0            /* privA, privB, privC                               */ \
}
1578 | |
1579 | |
1580 | U_CDECL_END |
1581 | |
1582 | |
#if U_SHOW_CPLUSPLUS_API

U_NAMESPACE_BEGIN

/**
 * \class LocalUTextPointer
 * "Smart pointer" class for UText; it closes the UText via utext_close().
 * Most of its methods are documented on the LocalPointerBase base class.
 *
 * @see LocalPointerBase
 * @see LocalPointer
 * @stable ICU 4.4
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalUTextPointer, UText, utext_close);

U_NAMESPACE_END

#endif  /* U_SHOW_CPLUSPLUS_API */
1601 | |
1602 | |
1603 | #endif |
1604 | |