1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ******************************************************************************* |
5 | * |
6 | * Copyright (C) 2004-2012, International Business Machines |
7 | * Corporation and others. All Rights Reserved. |
8 | * |
9 | ******************************************************************************* |
10 | * file name: utext.h |
11 | * encoding: UTF-8 |
12 | * tab size: 8 (not used) |
13 | * indentation:4 |
14 | * |
15 | * created on: 2004oct06 |
16 | * created by: Markus W. Scherer |
17 | */ |
18 | |
19 | #ifndef __UTEXT_H__ |
20 | #define __UTEXT_H__ |
21 | |
22 | /** |
23 | * \file |
24 | * \brief C API: Abstract Unicode Text API |
25 | * |
26 | * The Text Access API provides a means to allow text that is stored in alternative |
27 | * formats to work with ICU services. ICU normally operates on text that is |
28 | * stored in UTF-16 format, in (UChar *) arrays for the C APIs or as type |
29 | * UnicodeString for C++ APIs. |
30 | * |
31 | * ICU Text Access allows other formats, such as UTF-8 or non-contiguous |
32 | * UTF-16 strings, to be placed in a UText wrapper and then passed to ICU services. |
33 | * |
34 | * There are three general classes of usage for UText: |
35 | * |
36 | * Application Level Use. This is the simplest usage - applications would |
37 | * use one of the utext_open() functions on their input text, and pass |
38 | * the resulting UText to the desired ICU service. |
39 | * |
40 | * Second is usage in ICU Services, such as break iteration, that will need to |
41 | * operate on input presented to them as a UText. These implementations |
42 | * will need to use the iteration and related UText functions to gain |
43 | * access to the actual text. |
44 | * |
45 | * The third class of UText users are "text providers." These are the |
46 | * UText implementations for the various text storage formats. An application |
47 | * or system with a unique text storage format can implement a set of |
48 | * UText provider functions for that format, which will then allow |
49 | * ICU services to operate on that format. |
50 | * |
51 | * |
52 | * <em>Iterating over text</em> |
53 | * |
54 | * Here is sample code for a forward iteration over the contents of a UText |
55 | * |
56 | * \code |
57 | * UChar32 c; |
58 | * UText *ut = whatever(); |
59 | * |
60 | * for (c=utext_next32From(ut, 0); c>=0; c=utext_next32(ut)) { |
61 | * // do whatever with the codepoint c here. |
62 | * } |
63 | * \endcode |
64 | * |
65 | * And here is similar code to iterate in the reverse direction, from the end |
66 | * of the text towards the beginning. |
67 | * |
68 | * \code |
69 | * UChar32 c; |
70 | * UText *ut = whatever(); |
71 | * int64_t textLength = utext_nativeLength(ut); |
72 | * for (c=utext_previous32From(ut, textLength); c>=0; c=utext_previous32(ut)) { |
73 | * // do whatever with the codepoint c here. |
74 | * } |
75 | * \endcode |
76 | * |
77 | * <em>Characters and Indexing</em> |
78 | * |
79 | * Indexing into text by UText functions is nearly always in terms of the native |
80 | * indexing of the underlying text storage. The storage format could be UTF-8 |
81 | * or UTF-32, for example. When coding to the UText access API, no assumptions |
82 | * can be made regarding the size of characters, or how far an index |
83 | * may move when iterating between characters. |
84 | * |
85 | * All indices supplied to UText functions are pinned to the length of the |
86 | * text. An out-of-bounds index is not considered to be an error, but is |
87 | * adjusted to be in the range 0 <= index <= length of input text. |
88 | * |
89 | * |
90 | * When an index position is returned from a UText function, it will be |
91 | * a native index to the underlying text. In the case of multi-unit characters, |
92 | * it will always refer to the first position of the character, |
93 | * never to the interior. This is essentially the same thing as saying that |
94 | * a returned index will always point to a boundary between characters. |
95 | * |
96 | * When a native index is supplied to a UText function, all indices that |
97 | * refer to any part of a multi-unit character representation are considered |
98 | * to be equivalent. In the case of multi-unit characters, an incoming index |
99 | * will be logically normalized to refer to the start of the character. |
100 | * |
101 | * It is possible to test whether a native index is on a code point boundary |
102 | * by doing a utext_setNativeIndex() followed by a utext_getNativeIndex(). |
103 | * If the index is returned unchanged, it was on a code point boundary. If |
104 | * an adjusted index is returned, the original index referred to the |
105 | * interior of a character. |
106 | * |
107 | * <em>Conventions for calling UText functions</em> |
108 | * |
109 | * Most UText access functions have as their first parameter a (UText *) pointer, |
110 | * which specifies the UText to be used. Unless otherwise noted, the |
111 | * pointer must refer to a valid, open UText. Attempting to |
112 | * use a closed UText or passing a NULL pointer is a programming error and |
113 | * will produce undefined results or NULL pointer exceptions. |
114 | * |
115 | * The utext_open family of functions can either open an existing (closed) |
116 | * UText, or heap allocate a new UText. Here is sample code for creating |
117 | * a stack-allocated UText. |
118 | * |
119 | * \code |
120 | * char *s = whatever(); // A utf-8 string |
121 | * UErrorCode status = U_ZERO_ERROR; |
122 | * UText ut = UTEXT_INITIALIZER; |
123 | * utext_openUTF8(&ut, s, -1, &status); |
124 | * if (U_FAILURE(status)) { |
125 | * // error handling |
126 | * } else { |
127 | * // work with the UText |
128 | * } |
129 | * \endcode |
130 | * |
131 | * Any existing UText passed to an open function _must_ have been initialized, |
132 | * either by the UTEXT_INITIALIZER, or by having been originally heap-allocated |
133 | * by an open function. Passing NULL will cause the open function to |
134 | * heap-allocate and fully initialize a new UText. |
135 | * |
136 | */ |
137 | |
138 | |
139 | |
140 | #include "unicode/utypes.h" |
141 | #include "unicode/uchar.h" |
142 | #if U_SHOW_CPLUSPLUS_API |
143 | #include "unicode/localpointer.h" |
144 | #include "unicode/rep.h" |
145 | #include "unicode/unistr.h" |
146 | #include "unicode/chariter.h" |
147 | #endif |
148 | |
149 | |
150 | U_CDECL_BEGIN |
151 | |
152 | struct UText; /* Forward declaration; the members are not defined at this point. */ |
153 | typedef struct UText UText; /**< C typedef for struct UText. @stable ICU 3.6 */ |
154 | |
155 | |
156 | /*************************************************************************************** |
157 | * |
158 | * C Functions for creating UText wrappers around various kinds of text strings. |
159 | * |
160 | ****************************************************************************************/ |
161 | |
162 | |
163 | /** |
164 | * Close function for UText instances. |
165 | * Cleans up, releases any resources being held by an open UText. |
166 | * <p> |
167 | * If the UText was originally allocated by one of the utext_open functions, |
168 | * the storage associated with the UText will also be freed. |
169 | * If the UText storage originated with the application, as it would with |
170 | * a local or static instance, the storage will not be deleted. |
171 | * |
172 | * An open UText can be reset to refer to a new string by using one of the utext_open() |
173 | * functions without first closing the UText. |
174 | * |
175 | * @param ut The UText to be closed. |
176 | * @return NULL if the UText struct was deleted by the close. If the UText struct |
177 | * was originally provided by the caller to the open function, it is |
178 | * returned by this function, and may be safely used again in |
179 | * a subsequent utext_open. |
180 | * |
181 | * @stable ICU 3.4 |
182 | */ |
183 | U_STABLE UText * U_EXPORT2 |
184 | utext_close(UText *ut); |
185 | |
186 | #if U_SHOW_CPLUSPLUS_API |
187 | |
188 | U_NAMESPACE_BEGIN |
189 | |
190 | /** |
191 | * \class LocalUTextPointer |
192 | * "Smart pointer" class, closes a UText via utext_close(). |
193 | * For most methods, see the LocalPointerBase base class. |
194 | * |
195 | * @see LocalPointerBase |
196 | * @see LocalPointer |
197 | * @stable ICU 4.4 |
198 | */ |
199 | U_DEFINE_LOCAL_OPEN_POINTER(LocalUTextPointer, UText, utext_close); |
200 | |
201 | U_NAMESPACE_END |
202 | |
203 | #endif /* U_SHOW_CPLUSPLUS_API */ |
204 | |
205 | /** |
206 | * Open a read-only UText implementation for UTF-8 strings. |
207 | * |
208 | * \htmlonly |
209 | * Any invalid UTF-8 in the input will be handled in this way: |
210 | * a sequence of bytes that has the form of a truncated, but otherwise valid, |
211 | * UTF-8 sequence will be replaced by a single Unicode replacement character, \uFFFD. |
212 | * Any other illegal bytes will each be replaced by a \uFFFD. |
213 | * \endhtmlonly |
214 | * |
215 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. |
216 | * If non-NULL, must refer to an initialized UText struct, which will then |
217 | * be reset to reference the specified UTF-8 string. |
218 | * @param s A UTF-8 string. Must not be NULL. |
219 | * @param length The length of the UTF-8 string in bytes, or -1 if the string is |
220 | * zero-terminated. |
221 | * @param status Errors are returned here. |
222 | * @return A pointer to the UText. If a pre-allocated UText was provided, it |
223 | * will always be used and returned. |
224 | * @stable ICU 3.4 |
225 | */ |
226 | U_STABLE UText * U_EXPORT2 |
227 | utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status); |
228 | |
229 | |
230 | /** |
231 | * Open a read-only UText for a UChar * string. |
232 | * |
233 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. |
234 | * If non-NULL, must refer to an initialized UText struct, which will then |
235 | * be reset to reference the specified UChar string. |
236 | * @param s A UChar (UTF-16) string. |
237 | * @param length The number of UChars in the input string, or -1 if the string is |
238 | * zero-terminated. |
239 | * @param status Errors are returned here. |
240 | * @return A pointer to the UText. If a pre-allocated UText was provided, it |
241 | * will always be used and returned. |
242 | * @stable ICU 3.4 |
243 | */ |
244 | U_STABLE UText * U_EXPORT2 |
245 | utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status); |
246 | |
247 | |
248 | #if U_SHOW_CPLUSPLUS_API |
249 | /** |
250 | * Open a writable UText for a non-const UnicodeString. |
251 | * |
252 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. |
253 | * If non-NULL, must refer to an initialized UText struct, which will then |
254 | * be reset to reference the specified input string. |
255 | * @param s A non-const UnicodeString to be wrapped. |
256 | * @param status Errors are returned here. |
257 | * @return Pointer to the UText. If a UText was supplied as input, this |
258 | * will always be used and returned. |
259 | * @stable ICU 3.4 |
260 | */ |
261 | U_STABLE UText * U_EXPORT2 |
262 | utext_openUnicodeString(UText *ut, icu::UnicodeString *s, UErrorCode *status); |
263 | |
264 | |
265 | /** |
266 | * Open a read-only UText for a const UnicodeString; the resulting UText will not be writable. |
267 | * |
268 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. |
269 | * If non-NULL, must refer to an initialized UText struct, which will then |
270 | * be reset to reference the specified input string. |
271 | * @param s A const UnicodeString to be wrapped. |
272 | * @param status Errors are returned here. |
273 | * @return Pointer to the UText. If a UText was supplied as input, this |
274 | * will always be used and returned. |
275 | * @stable ICU 3.4 |
276 | */ |
277 | U_STABLE UText * U_EXPORT2 |
278 | utext_openConstUnicodeString(UText *ut, const icu::UnicodeString *s, UErrorCode *status); |
279 | |
280 | |
281 | /** |
282 | * Open a writable UText implementation for an ICU Replaceable object. |
283 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. |
284 | * If non-NULL, must refer to an initialized UText struct, which will then |
285 | * be reset to reference the specified replaceable text. |
286 | * @param rep A Replaceable text object. |
287 | * @param status Errors are returned here. |
288 | * @return Pointer to the UText. If a UText was supplied as input, this |
289 | * will always be used and returned. |
290 | * @see Replaceable |
291 | * @stable ICU 3.4 |
292 | */ |
293 | U_STABLE UText * U_EXPORT2 |
294 | utext_openReplaceable(UText *ut, icu::Replaceable *rep, UErrorCode *status); |
295 | |
296 | /** |
297 | * Open a UText implementation over an ICU CharacterIterator. |
298 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. |
299 | * If non-NULL, must refer to an initialized UText struct, which will then |
300 | * be reset to reference the specified character iterator. |
301 | * @param ci A Character Iterator. |
302 | * @param status Errors are returned here. |
303 | * @return Pointer to the UText. If a UText was supplied as input, this |
304 | * will always be used and returned. |
305 | * @see CharacterIterator |
306 | * @stable ICU 3.4 |
307 | */ |
308 | U_STABLE UText * U_EXPORT2 |
309 | utext_openCharacterIterator(UText *ut, icu::CharacterIterator *ci, UErrorCode *status); |
310 | |
311 | #endif |
312 | |
313 | |
314 | /** |
315 | * Clone a UText. This is much like opening a UText where the source text is itself |
316 | * another UText. |
317 | * |
318 | * A deep clone will copy both the UText data structures and the underlying text. |
319 | * The original and cloned UText will operate completely independently; modifications |
320 | * made to the text in one will not affect the other. Text providers are not |
321 | * required to support deep clones. The user of clone() must check the status return |
322 | * and be prepared to handle failures. |
323 | * |
324 | * The standard UText implementations for UTF8, UChar *, UnicodeString and |
325 | * Replaceable all support deep cloning. |
326 | * |
327 | * The UText returned from a deep clone will be writable, assuming that the text |
328 | * provider is able to support writing, even if the source UText had been made |
329 | * non-writable by means of utext_freeze(). |
330 | * |
331 | * A shallow clone replicates only the UText data structures; it does not make |
332 | * a copy of the underlying text. Shallow clones can be used as an efficient way to |
333 | * have multiple iterators active in a single text string that is not being |
334 | * modified. |
335 | * |
336 | * A shallow clone operation will not fail, barring truly exceptional conditions such |
337 | * as memory allocation failures. |
338 | * |
339 | * Shallow UText clones should be avoided if the UText functions that modify the |
340 | * text are expected to be used, either on the original or the cloned UText. |
341 | * Any such modifications can cause unpredictable behavior. Read-only |
342 | * shallow clones provide some protection against errors of this type by |
343 | * disabling text modification via the cloned UText. |
344 | * |
345 | * A shallow clone made with the readOnly parameter == FALSE will preserve the |
346 | * utext_isWritable() state of the source object. Note, however, that |
347 | * write operations must be avoided while more than one UText exists that refers |
348 | * to the same underlying text. |
349 | * |
350 | * A UText and its clone may be safely concurrently accessed by separate threads. |
351 | * This is true for read access only with shallow clones, and for both read and |
352 | * write access with deep clones. |
353 | * It is the responsibility of the Text Provider to ensure that this thread safety |
354 | * constraint is met. |
355 | * |
356 | * @param dest A UText struct to be filled in with the result of the clone operation, |
357 | * or NULL if the clone function should heap-allocate a new UText struct. |
358 | * If non-NULL, must refer to an already existing UText, which will then |
359 | * be reset to become the clone. |
360 | * @param src The UText to be cloned. |
361 | * @param deep TRUE to request a deep clone, FALSE for a shallow clone. |
362 | * @param readOnly TRUE to request that the cloned UText have read-only access to the |
363 | * underlying text. |
364 | * |
365 | * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR |
366 | * will be returned if the text provider is unable to clone the |
367 | * original text. |
368 | * @return The newly created clone, or NULL if the clone operation failed. |
369 | * @stable ICU 3.4 |
370 | */ |
371 | U_STABLE UText * U_EXPORT2 |
372 | utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status); |
373 | |
374 | |
375 | /** |
376 | * Compare two UText objects for equality. |
377 | * UTexts are equal if they are iterating over the same text, and |
378 | * have the same iteration position within the text. |
379 | * If either or both of the parameters are NULL, the comparison is FALSE. |
380 | * |
381 | * @param a The first of the two UTexts to compare. |
382 | * @param b The other UText to be compared. |
383 | * @return TRUE if the two UTexts are equal; FALSE otherwise, including when either is NULL. |
384 | * @stable ICU 3.6 |
385 | */ |
386 | U_STABLE UBool U_EXPORT2 |
387 | utext_equals(const UText *a, const UText *b); |
388 | |
389 | |
390 | /***************************************************************************** |
391 | * |
392 | * Functions to work with the text represented by a UText wrapper |
393 | * |
394 | *****************************************************************************/ |
395 | |
396 | /** |
397 | * Get the length of the text. Depending on the characteristics |
398 | * of the underlying text representation, this may be expensive. |
399 | * Use utext_isLengthExpensive() to find out in advance. |
400 | * |
401 | * @param ut the text to be accessed. |
402 | * @return the length of the text, expressed in native units. |
403 | * @see utext_isLengthExpensive |
404 | * |
405 | * @stable ICU 3.4 |
406 | */ |
407 | U_STABLE int64_t U_EXPORT2 |
408 | utext_nativeLength(UText *ut); |
409 | |
410 | /** |
411 | * Return TRUE if calculating the length of the text could be expensive. |
412 | * Finding the length of NUL-terminated strings is considered to be expensive. |
413 | * |
414 | * Note that the value returned by this function may change |
415 | * as the result of other operations on a UText. |
416 | * Once the length of a string has been discovered, it will no longer |
417 | * be expensive to report it. |
418 | * |
419 | * @param ut the text to be accessed. |
420 | * @return TRUE if determining the length of the text could be time-consuming. |
421 | * @stable ICU 3.4 |
422 | */ |
423 | U_STABLE UBool U_EXPORT2 |
424 | utext_isLengthExpensive(const UText *ut); |
425 | |
426 | /** |
427 | * Returns the code point at the requested index, |
428 | * or U_SENTINEL (-1) if it is out of bounds. |
429 | * |
430 | * If the specified index points to the interior of a multi-unit |
431 | * character - one of the trail bytes of a UTF-8 sequence, for example - |
432 | * the complete code point will be returned. |
433 | * |
434 | * The iteration position will be set to the start of the returned code point. |
435 | * |
436 | * This function is roughly equivalent to the sequence |
437 | * utext_setNativeIndex(ut, index); |
438 | * utext_current32(ut); |
439 | * (There is a subtle difference if the index is out of bounds by being less than zero - |
440 | * utext_setNativeIndex(negative value) sets the index to zero, after which utext_current32() |
441 | * will return the char at zero. utext_char32At(negative index), on the other hand, will |
442 | * return the U_SENTINEL value of -1.) |
443 | * |
444 | * @param ut the text to be accessed |
445 | * @param nativeIndex the native index of the character to be accessed. If the index points |
446 | * to other than the first unit of a multi-unit character, it will be adjusted |
447 | * to the start of the character. |
448 | * @return the code point at the specified index, or U_SENTINEL (-1) if it is out of bounds. |
449 | * @stable ICU 3.4 |
450 | */ |
451 | U_STABLE UChar32 U_EXPORT2 |
452 | utext_char32At(UText *ut, int64_t nativeIndex); |
453 | |
454 | |
455 | /** |
456 | * Get the code point at the current iteration position, |
457 | * or U_SENTINEL (-1) if the iteration has reached the end of |
458 | * the input text. |
459 | * |
460 | * @param ut the text to be accessed. |
461 | * @return the Unicode code point at the current iterator position, |
462 | * or U_SENTINEL (-1) at the end of the text. |
463 | * @stable ICU 3.4 |
464 | */ |
465 | U_STABLE UChar32 U_EXPORT2 |
466 | utext_current32(UText *ut); |
467 | |
468 | |
469 | /** |
470 | * Get the code point at the current iteration position of the UText, and |
471 | * advance the position to the first index following the character. |
472 | * |
473 | * If the position is at the end of the text (the index following |
474 | * the last character, which is also the length of the text), |
475 | * return U_SENTINEL (-1) and do not advance the index. |
476 | * |
477 | * This is a post-increment operation. |
478 | * |
479 | * An inline macro version of this function, UTEXT_NEXT32(), |
480 | * is available for performance-critical use. |
481 | * |
482 | * @param ut the text to be accessed. |
483 | * @return the Unicode code point at the iteration position, or U_SENTINEL (-1) at the end of the text. |
484 | * @see UTEXT_NEXT32 |
485 | * @stable ICU 3.4 |
486 | */ |
487 | U_STABLE UChar32 U_EXPORT2 |
488 | utext_next32(UText *ut); |
489 | |
490 | |
491 | /** |
492 | * Move the iterator position to the character (code point) whose |
493 | * index precedes the current position, and return that character. |
494 | * This is a pre-decrement operation. |
495 | * |
496 | * If the initial position is at the start of the text (index of 0), |
497 | * return U_SENTINEL (-1) and leave the position unchanged. |
498 | * |
499 | * An inline macro version of this function, UTEXT_PREVIOUS32(), |
500 | * is available for performance-critical use. |
501 | * |
502 | * @param ut the text to be accessed. |
503 | * @return the previous UChar32 code point, or U_SENTINEL (-1) |
504 | * if the iteration has reached the start of the text. |
505 | * @see UTEXT_PREVIOUS32 |
506 | * @stable ICU 3.4 |
507 | */ |
508 | U_STABLE UChar32 U_EXPORT2 |
509 | utext_previous32(UText *ut); |
510 | |
511 | |
512 | /** |
513 | * Set the iteration index and return the code point at that index. |
514 | * Leave the iteration index at the start of the following code point. |
515 | * |
516 | * This function is the most efficient and convenient way to |
517 | * begin a forward iteration. The results are identical to those |
518 | * from the sequence |
519 | * \code |
520 | * utext_setNativeIndex(ut, nativeIndex); |
521 | * utext_next32(ut); |
522 | * \endcode |
523 | * |
524 | * @param ut the text to be accessed. |
525 | * @param nativeIndex Iteration index, in the native units of the text provider. |
526 | * @return Code point which starts at or before index, |
527 | * or U_SENTINEL (-1) if it is out of bounds. |
528 | * @stable ICU 3.4 |
529 | */ |
530 | U_STABLE UChar32 U_EXPORT2 |
531 | utext_next32From(UText *ut, int64_t nativeIndex); |
532 | |
533 | |
534 | |
535 | /** |
536 | * Set the iteration index, and return the code point preceding the |
537 | * one specified by the initial index. Leave the iteration position |
538 | * at the start of the returned code point. |
539 | * |
540 | * This function is the most efficient and convenient way to |
541 | * begin a backwards iteration. |
542 | * |
543 | * @param ut the text to be accessed. |
544 | * @param nativeIndex Iteration index, in the native units of the text provider. |
545 | * @return Code point preceding the one at the initial index, |
546 | * or U_SENTINEL (-1) if it is out of bounds. |
547 | * @see utext_next32From |
548 | * @stable ICU 3.4 |
549 | */ |
550 | U_STABLE UChar32 U_EXPORT2 |
551 | utext_previous32From(UText *ut, int64_t nativeIndex); |
552 | |
553 | /** |
554 | * Get the current iterator position, which can range from 0 to |
555 | * the length of the text. |
556 | * The position is a native index into the input text, in whatever format it |
557 | * may have (UTF-8, for example), and may not always be the same as |
558 | * the corresponding UChar (UTF-16) index. |
559 | * The returned position will always be aligned to a code point boundary. |
560 | * |
561 | * @param ut the text to be accessed. |
562 | * @return the current index position, in the native units of the text provider. |
563 | * @stable ICU 3.4 |
564 | */ |
565 | U_STABLE int64_t U_EXPORT2 |
566 | utext_getNativeIndex(const UText *ut); |
567 | |
568 | /** |
569 | * Set the current iteration position to the nearest code point |
570 | * boundary at or preceding the specified index. |
571 | * The index is in the native units of the original input text. |
572 | * If the index is out of range, it will be pinned to be within |
573 | * the range of the input text. |
574 | * <p> |
575 | * It will usually be more efficient to begin an iteration |
576 | * using the functions utext_next32From() or utext_previous32From() |
577 | * rather than utext_setNativeIndex(). |
578 | * <p> |
579 | * Moving the index position to an adjacent character is best done |
580 | * with utext_next32(), utext_previous32() or utext_moveIndex32(). |
581 | * Attempting to do direct arithmetic on the index position is |
582 | * complicated by the fact that the size (in native units) of a |
583 | * character depends on the underlying representation of the character |
584 | * (UTF-8, UTF-16, UTF-32, arbitrary codepage), and is not |
585 | * easily knowable. |
586 | * |
587 | * @param ut the text to be accessed. |
588 | * @param nativeIndex the native unit index of the new iteration position. |
589 | * @stable ICU 3.4 |
590 | */ |
591 | U_STABLE void U_EXPORT2 |
592 | utext_setNativeIndex(UText *ut, int64_t nativeIndex); |
593 | |
594 | /** |
595 | * Move the iterator position by delta code points. The number of code points |
596 | * is a signed number; a negative delta will move the iterator backwards, |
597 | * towards the start of the text. |
598 | * <p> |
599 | * The index is moved by <code>delta</code> code points |
600 | * forward or backward, but no further backward than to 0 and |
601 | * no further forward than to utext_nativeLength(). |
602 | * The resulting index value will be between 0 and length, inclusive. |
603 | * |
604 | * @param ut the text to be accessed. |
605 | * @param delta the signed number of code points to move the iteration position. |
606 | * @return TRUE if the position could be moved the requested number of positions while |
607 | * staying within the range [0, text length]; FALSE otherwise. |
608 | * @stable ICU 3.4 |
609 | */ |
610 | U_STABLE UBool U_EXPORT2 |
611 | utext_moveIndex32(UText *ut, int32_t delta); |
612 | |
613 | /** |
614 | * Get the native index of the character preceding the current position. |
615 | * If the iteration position is already at the start of the text, zero |
616 | * is returned. |
617 | * The value returned is the same as that obtained from the following sequence, |
618 | * but without the side effect of changing the iteration position. |
619 | * |
620 | * \code |
621 | * UText *ut = whatever(); |
622 | * ... |
623 | * utext_previous32(ut); |
624 | * utext_getNativeIndex(ut); |
625 | * \endcode |
626 | * |
627 | * This function is most useful during forwards iteration, where it will get the |
628 | * native index of the character most recently returned from utext_next32(). |
629 | * |
630 | * @param ut the text to be accessed |
631 | * @return the native index of the character preceding the current index position, |
632 | * or zero if the current position is at the start of the text. |
633 | * @stable ICU 3.6 |
634 | */ |
635 | U_STABLE int64_t U_EXPORT2 |
636 | utext_getPreviousNativeIndex(UText *ut); |
637 | |
638 | |
639 | /** |
640 | * |
641 | * Extract text from a UText into a UChar buffer. The range of text to be extracted |
642 | * is specified in the native indices of the UText provider. These may not necessarily |
643 | * be UTF-16 indices. |
644 | * <p> |
645 | * The size (number of 16 bit UChars) of the data to be extracted is returned. The |
646 | * full number of UChars is returned, even when the extracted text is truncated |
647 | * because the specified buffer size is too small. |
648 | * <p> |
649 | * The extracted string will (if you are a user) / must (if you are a text provider) |
650 | * be NUL-terminated if there is sufficient space in the destination buffer. This |
651 | * terminating NUL is not included in the returned length. |
652 | * <p> |
653 | * The iteration index is left at the position following the last extracted character. |
654 | * |
655 | * @param ut the UText from which to extract data. |
656 | * @param nativeStart the native index of the first character to extract. |
657 | * If the specified index is out of range, |
658 | * it will be pinned to be within 0 <= index <= textLength. |
659 | * @param nativeLimit the native string index of the position following the last |
660 | * character to extract. If the specified index is out of range, |
661 | * it will be pinned to be within 0 <= index <= textLength. |
662 | * nativeLimit must be >= nativeStart. |
663 | * @param dest the UChar (UTF-16) buffer into which the extracted text is placed |
664 | * @param destCapacity The size, in UChars, of the destination buffer. May be zero |
665 | * for precomputing the required size. |
666 | * @param status receives any error status. |
667 | * U_BUFFER_OVERFLOW_ERROR: the extracted text was truncated because the |
668 | * buffer was too small. Returns number of UChars for preflighting. |
669 | * @return Number of UChars in the data to be extracted. Does not include a trailing NUL. |
670 | * |
671 | * @stable ICU 3.4 |
672 | */ |
673 | U_STABLE int32_t U_EXPORT2 |
674 | utext_extract(UText *ut, |
675 | int64_t nativeStart, int64_t nativeLimit, |
676 | UChar *dest, int32_t destCapacity, |
677 | UErrorCode *status); |
678 | |
679 | |
680 | |
681 | /************************************************************************************ |
682 | * |
683 | * #define inline versions of selected performance-critical text access functions |
684 | * Caution: do not use auto increment++ or decrement-- expressions |
685 | * as parameters to these macros. |
686 | * |
687 | * For most use, where there is no extreme performance constraint, the |
688 | * normal, non-inline functions are a better choice. The resulting code |
689 | * will be smaller, and, if the need ever arises, easier to debug. |
690 | * |
691 | * These are implemented as #defines rather than real functions |
692 | * because there is no fully portable way to do inline functions in plain C. |
693 | * |
694 | ************************************************************************************/ |
695 | |
696 | #ifndef U_HIDE_INTERNAL_API |
697 | /** |
698 | * inline version of utext_current32(), for performance-critical situations. |
699 | * |
700 | * Get the code point at the current iteration position of the UText. |
701 | * Returns U_SENTINEL (-1) if the position is at the end of the text. |
702 | * A code unit below 0xd800 inside the current chunk is returned directly; every other |
703 | * case is delegated to utext_current32(). The argument is evaluated more than once. |
704 | * @internal ICU 4.4 technology preview |
705 | */ |
706 | #define UTEXT_CURRENT32(ut) \ |
707 | ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? \ |
708 | ((ut)->chunkContents)[((ut)->chunkOffset)] : utext_current32(ut)) |
709 | #endif /* U_HIDE_INTERNAL_API */ |
710 | |
711 | /** |
712 | * inline version of utext_next32(), for performance-critical situations. |
713 | * |
714 | * Get the code point at the current iteration position of the UText, and |
715 | * advance the position to the first index following the character. |
716 | * This is a post-increment operation. |
717 | * Returns U_SENTINEL (-1) if the position is at the end of the text. |
718 | * A code unit below 0xd800 inside the current chunk is returned directly; every other |
719 | * case is delegated to utext_next32(). The argument is evaluated more than once. |
720 | * @stable ICU 3.4 |
721 | */ |
722 | #define UTEXT_NEXT32(ut) \ |
723 | ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? \ |
724 | ((ut)->chunkContents)[((ut)->chunkOffset)++] : utext_next32(ut)) |
725 | |
/**
 * Macro form of utext_previous32(), for performance-critical situations.
 *
 * Moves the iterator position back to the character (code point) whose
 * index precedes the current position and yields that character
 * (a pre-decrement operation).
 * Returns U_SENTINEL (-1) if the position is at the start of the text.
 *
 * When the chunk offset is greater than zero and the preceding code unit
 * is below 0xd800, the value is read straight from the chunk buffer;
 * every other case is handed to utext_previous32().
 *
 * Caution: the argument is evaluated more than once; do not pass an
 * expression with side effects.
 *
 * @stable ICU 3.4
 */
#define UTEXT_PREVIOUS32(ut)  \
    (((ut)->chunkOffset <= 0 || \
      (ut)->chunkContents[(ut)->chunkOffset - 1] >= 0xd800) ? \
        utext_previous32(ut) : \
        (ut)->chunkContents[--((ut)->chunkOffset)])
740 | |
/**
 * Macro form of utext_getNativeIndex(), for performance-critical situations.
 *
 * Yields the current iterator position, which can range from 0 to the
 * length of the text.
 * The position is a native index into the input text, in whatever format
 * it may have (possibly UTF-8, for example), and may not always be the same
 * as the corresponding UChar (UTF-16) index.
 * The returned position will always be aligned to a code point boundary.
 *
 * While the chunk offset does not exceed nativeIndexingLimit the result is
 * chunkNativeStart + chunkOffset; beyond that limit the provider's
 * mapOffsetToNative() function is called.
 *
 * Caution: the argument is evaluated more than once; do not pass an
 * expression with side effects.
 *
 * @stable ICU 3.6
 */
#define UTEXT_GETNATIVEINDEX(ut)  \
    (((ut)->chunkOffset > (ut)->nativeIndexingLimit) ? \
        (ut)->pFuncs->mapOffsetToNative(ut) : \
        (ut)->chunkNativeStart + (ut)->chunkOffset)
757 | |
/**
 * Macro form of utext_setNativeIndex(), for performance-critical situations.
 *
 * Sets the current iteration position to the nearest code point boundary
 * at or preceding the specified index.
 * The index is in the native units of the original input text.
 * If the index is out of range, it will be pinned to be within the range
 * of the input text.
 *
 * The chunk offset is assigned directly only when the requested index,
 * taken relative to chunkNativeStart, is non-negative, is below
 * nativeIndexingLimit, and addresses a code unit below 0xdc00.
 * In every other case the request is forwarded to utext_setNativeIndex().
 *
 * Caution: both arguments are evaluated more than once; do not pass
 * expressions with side effects.
 *
 * @stable ICU 3.8
 */
#define UTEXT_SETNATIVEINDEX(ut, ix) UPRV_BLOCK_MACRO_BEGIN { \
    int64_t __offset = (ix) - (ut)->chunkNativeStart; \
    if (__offset < 0 || \
        __offset >= (int64_t)(ut)->nativeIndexingLimit || \
        (ut)->chunkContents[__offset] >= 0xdc00) { \
        utext_setNativeIndex((ut), (ix)); \
    } else { \
        (ut)->chunkOffset = (int32_t)__offset; \
    } \
} UPRV_BLOCK_MACRO_END
777 | |
778 | |
779 | |
780 | /************************************************************************************ |
781 | * |
782 | * Functions related to writing or modifying the text. |
783 | * These will work only with modifiable UTexts. Attempting to |
784 | * modify a read-only UText will return an error status. |
785 | * |
786 | ************************************************************************************/ |
787 | |
788 | |
789 | /** |
790 | * Return TRUE if the text can be written (modified) with utext_replace() or |
791 | * utext_copy(). For the text to be writable, the text provider must |
792 | * be of a type that supports writing and the UText must not be frozen. |
793 | * |
794 | * Attempting to modify text when utext_isWriteable() is FALSE will fail - |
795 | * the text will not be modified, and an error will be returned from the function |
796 | * that attempted the modification. |
797 | * |
798 | * @param ut the UText to be tested. |
799 | * @return TRUE if the text is modifiable. |
800 | * |
801 | * @see utext_freeze() |
802 | * @see utext_replace() |
803 | * @see utext_copy() |
804 | * @stable ICU 3.4 |
805 | * |
806 | */ |
807 | U_STABLE UBool U_EXPORT2 |
808 | utext_isWritable(const UText *ut); |
809 | |
810 | |
811 | /** |
812 | * Test whether there is meta data associated with the text. |
813 | * @see Replaceable::hasMetaData() |
814 | * |
815 | * @param ut The UText to be tested |
816 | * @return TRUE if the underlying text includes meta data. |
817 | * @stable ICU 3.4 |
818 | */ |
819 | U_STABLE UBool U_EXPORT2 |
820 | utext_hasMetaData(const UText *ut); |
821 | |
822 | |
823 | /** |
824 | * Replace a range of the original text with a replacement text. |
825 | * |
826 | * Leaves the current iteration position at the position following the |
827 | * newly inserted replacement text. |
828 | * |
829 | * This function is only available on UText types that support writing, |
830 | * that is, ones where utext_isWritable() returns TRUE. |
831 | * |
832 | * When using this function, there should be only a single UText opened onto the |
833 | * underlying native text string. Behavior after a replace operation |
834 | * on a UText is undefined for any other additional UTexts that refer to the |
835 | * modified string. |
836 | * |
837 | * @param ut the UText representing the text to be operated on. |
838 | * @param nativeStart the native index of the start of the region to be replaced |
839 | * @param nativeLimit the native index of the character following the region to be replaced. |
840 | * @param replacementText pointer to the replacement text |
841 | * @param replacementLength length of the replacement text, or -1 if the text is NUL terminated. |
842 | * @param status receives any error status. Possible errors include |
843 | * U_NO_WRITE_PERMISSION |
844 | * |
845 | * @return The signed number of (native) storage units by which |
846 | * the length of the text expanded or contracted. |
847 | * |
848 | * @stable ICU 3.4 |
849 | */ |
850 | U_STABLE int32_t U_EXPORT2 |
851 | utext_replace(UText *ut, |
852 | int64_t nativeStart, int64_t nativeLimit, |
853 | const UChar *replacementText, int32_t replacementLength, |
854 | UErrorCode *status); |
855 | |
856 | |
857 | |
858 | /** |
859 | * |
860 | * Copy or move a substring from one position to another within the text, |
861 | * while retaining any metadata associated with the text. |
862 | * This function is used to duplicate or reorder substrings. |
863 | * The destination index must not overlap the source range. |
864 | * |
865 | * The text to be copied or moved is inserted at destIndex; |
866 | * it does not replace or overwrite any existing text. |
867 | * |
868 | * The iteration position is left following the newly inserted text |
869 | * at the destination position. |
870 | * |
871 | * This function is only available on UText types that support writing, |
872 | * that is, ones where utext_isWritable() returns TRUE. |
873 | * |
874 | * When using this function, there should be only a single UText opened onto the |
875 | * underlying native text string. Behavior after a copy operation |
876 | * on a UText is undefined in any other additional UTexts that refer to the |
877 | * modified string. |
878 | * |
879 | * @param ut The UText representing the text to be operated on. |
880 | * @param nativeStart The native index of the start of the region to be copied or moved |
881 | * @param nativeLimit The native index of the character position following the region |
882 | * to be copied. |
883 | * @param destIndex The native destination index to which the source substring is |
884 | * copied or moved. |
885 | * @param move If TRUE, then the substring is moved, not copied/duplicated. |
886 | * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION |
887 | * |
888 | * @stable ICU 3.4 |
889 | */ |
890 | U_STABLE void U_EXPORT2 |
891 | utext_copy(UText *ut, |
892 | int64_t nativeStart, int64_t nativeLimit, |
893 | int64_t destIndex, |
894 | UBool move, |
895 | UErrorCode *status); |
896 | |
897 | |
898 | /** |
899 | * <p> |
900 | * Freeze a UText. This prevents any modification to the underlying text itself |
901 | * by means of functions operating on this UText. |
902 | * </p> |
903 | * <p> |
904 | * Once frozen, a UText can not be unfrozen. The intent is to ensure |
905 | * that a the text underlying a frozen UText wrapper cannot be modified via that UText. |
906 | * </p> |
907 | * <p> |
908 | * Caution: freezing a UText will disable changes made via the specific |
909 | * frozen UText wrapper only; it will not have any effect on the ability to |
910 | * directly modify the text by bypassing the UText. Any such backdoor modifications |
911 | * are always an error while UText access is occurring because the underlying |
912 | * text can get out of sync with UText's buffering. |
913 | * </p> |
914 | * |
915 | * @param ut The UText to be frozen. |
916 | * @see utext_isWritable() |
917 | * @stable ICU 3.6 |
918 | */ |
919 | U_STABLE void U_EXPORT2 |
920 | utext_freeze(UText *ut); |
921 | |
922 | |
/**
 * UText provider properties (bit field indexes).
 *
 * @see UText
 * @stable ICU 3.4
 */
enum {
    /**
     * Determining the length of the text is potentially time consuming
     * for the provider.
     * @stable ICU 3.4
     */
    UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE = 1,

    /**
     * Text chunks remain valid and usable until the text object is modified
     * or deleted, not just until the next time the access() function is
     * called (which is the default).
     * @stable ICU 3.4
     */
    UTEXT_PROVIDER_STABLE_CHUNKS       = 2,

    /**
     * The provider supports modifying the text via the replace() and copy()
     * functions.
     * @see Replaceable
     * @stable ICU 3.4
     */
    UTEXT_PROVIDER_WRITABLE            = 3,

    /**
     * There is meta data associated with the text.
     * @see Replaceable::hasMetaData()
     * @stable ICU 3.4
     */
    UTEXT_PROVIDER_HAS_META_DATA       = 4,

    /**
     * The text provider owns the text storage.
     * Generally occurs as the result of a deep clone of the UText.
     * When closing the UText, the associated text must also be
     * closed, deleted or freed, whichever is appropriate.
     * @stable ICU 3.6
     */
    UTEXT_PROVIDER_OWNS_TEXT           = 5
};
964 | |
965 | /** |
966 | * Function type declaration for UText.clone(). |
967 | * |
968 | * clone a UText. Much like opening a UText where the source text is itself |
969 | * another UText. |
970 | * |
971 | * A deep clone will copy both the UText data structures and the underlying text. |
972 | * The original and cloned UText will operate completely independently; modifications |
973 | * made to the text in one will not effect the other. Text providers are not |
974 | * required to support deep clones. The user of clone() must check the status return |
975 | * and be prepared to handle failures. |
976 | * |
977 | * A shallow clone replicates only the UText data structures; it does not make |
978 | * a copy of the underlying text. Shallow clones can be used as an efficient way to |
979 | * have multiple iterators active in a single text string that is not being |
980 | * modified. |
981 | * |
982 | * A shallow clone operation must not fail except for truly exceptional conditions such |
983 | * as memory allocation failures. |
984 | * |
985 | * A UText and its clone may be safely concurrently accessed by separate threads. |
986 | * This is true for both shallow and deep clones. |
987 | * It is the responsibility of the Text Provider to ensure that this thread safety |
988 | * constraint is met. |
989 | |
990 | * |
991 | * @param dest A UText struct to be filled in with the result of the clone operation, |
992 | * or NULL if the clone function should heap-allocate a new UText struct. |
993 | * @param src The UText to be cloned. |
994 | * @param deep TRUE to request a deep clone, FALSE for a shallow clone. |
995 | * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR |
996 | * should be returned if the text provider is unable to clone the |
997 | * original text. |
998 | * @return The newly created clone, or NULL if the clone operation failed. |
999 | * |
1000 | * @stable ICU 3.4 |
1001 | */ |
1002 | typedef UText * U_CALLCONV |
1003 | UTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status); |
1004 | |
1005 | |
1006 | /** |
1007 | * Function type declaration for UText.nativeLength(). |
1008 | * |
1009 | * @param ut the UText to get the length of. |
1010 | * @return the length, in the native units of the original text string. |
1011 | * @see UText |
1012 | * @stable ICU 3.4 |
1013 | */ |
1014 | typedef int64_t U_CALLCONV |
1015 | UTextNativeLength(UText *ut); |
1016 | |
1017 | /** |
1018 | * Function type declaration for UText.access(). Get the description of the text chunk |
1019 | * containing the text at a requested native index. The UText's iteration |
1020 | * position will be left at the requested index. If the index is out |
1021 | * of bounds, the iteration position will be left at the start or end |
1022 | * of the string, as appropriate. |
1023 | * |
1024 | * Chunks must begin and end on code point boundaries. A single code point |
1025 | * comprised of multiple storage units must never span a chunk boundary. |
1026 | * |
1027 | * |
1028 | * @param ut the UText being accessed. |
1029 | * @param nativeIndex Requested index of the text to be accessed. |
1030 | * @param forward If TRUE, then the returned chunk must contain text |
1031 | * starting from the index, so that start<=index<limit. |
1032 | * If FALSE, then the returned chunk must contain text |
1033 | * before the index, so that start<index<=limit. |
1034 | * @return True if the requested index could be accessed. The chunk |
1035 | * will contain the requested text. |
1036 | * False value if a chunk cannot be accessed |
1037 | * (the requested index is out of bounds). |
1038 | * |
1039 | * @see UText |
1040 | * @stable ICU 3.4 |
1041 | */ |
1042 | typedef UBool U_CALLCONV |
1043 | UTextAccess(UText *ut, int64_t nativeIndex, UBool forward); |
1044 | |
1045 | /** |
1046 | * Function type declaration for UText.extract(). |
1047 | * |
1048 | * Extract text from a UText into a UChar buffer. The range of text to be extracted |
1049 | * is specified in the native indices of the UText provider. These may not necessarily |
1050 | * be UTF-16 indices. |
1051 | * <p> |
1052 | * The size (number of 16 bit UChars) in the data to be extracted is returned. The |
1053 | * full amount is returned, even when the specified buffer size is smaller. |
1054 | * <p> |
1055 | * The extracted string will (if you are a user) / must (if you are a text provider) |
1056 | * be NUL-terminated if there is sufficient space in the destination buffer. |
1057 | * |
1058 | * @param ut the UText from which to extract data. |
1059 | * @param nativeStart the native index of the first character to extract. |
1060 | * @param nativeLimit the native string index of the position following the last |
1061 | * character to extract. |
1062 | * @param dest the UChar (UTF-16) buffer into which the extracted text is placed |
1063 | * @param destCapacity The size, in UChars, of the destination buffer. May be zero |
1064 | * for precomputing the required size. |
1065 | * @param status receives any error status. |
1066 | * If U_BUFFER_OVERFLOW_ERROR: Returns number of UChars for |
1067 | * preflighting. |
1068 | * @return Number of UChars in the data. Does not include a trailing NUL. |
1069 | * |
1070 | * @stable ICU 3.4 |
1071 | */ |
1072 | typedef int32_t U_CALLCONV |
1073 | (UText *ut, |
1074 | int64_t nativeStart, int64_t nativeLimit, |
1075 | UChar *dest, int32_t destCapacity, |
1076 | UErrorCode *status); |
1077 | |
1078 | /** |
1079 | * Function type declaration for UText.replace(). |
1080 | * |
1081 | * Replace a range of the original text with a replacement text. |
1082 | * |
1083 | * Leaves the current iteration position at the position following the |
1084 | * newly inserted replacement text. |
1085 | * |
1086 | * This function need only be implemented on UText types that support writing. |
1087 | * |
1088 | * When using this function, there should be only a single UText opened onto the |
1089 | * underlying native text string. The function is responsible for updating the |
1090 | * text chunk within the UText to reflect the updated iteration position, |
1091 | * taking into account any changes to the underlying string's structure caused |
1092 | * by the replace operation. |
1093 | * |
1094 | * @param ut the UText representing the text to be operated on. |
1095 | * @param nativeStart the index of the start of the region to be replaced |
1096 | * @param nativeLimit the index of the character following the region to be replaced. |
1097 | * @param replacementText pointer to the replacement text |
1098 | * @param replacmentLength length of the replacement text in UChars, or -1 if the text is NUL terminated. |
1099 | * @param status receives any error status. Possible errors include |
1100 | * U_NO_WRITE_PERMISSION |
1101 | * |
1102 | * @return The signed number of (native) storage units by which |
1103 | * the length of the text expanded or contracted. |
1104 | * |
1105 | * @stable ICU 3.4 |
1106 | */ |
1107 | typedef int32_t U_CALLCONV |
1108 | UTextReplace(UText *ut, |
1109 | int64_t nativeStart, int64_t nativeLimit, |
1110 | const UChar *replacementText, int32_t replacmentLength, |
1111 | UErrorCode *status); |
1112 | |
1113 | /** |
1114 | * Function type declaration for UText.copy(). |
1115 | * |
1116 | * Copy or move a substring from one position to another within the text, |
1117 | * while retaining any metadata associated with the text. |
1118 | * This function is used to duplicate or reorder substrings. |
1119 | * The destination index must not overlap the source range. |
1120 | * |
1121 | * The text to be copied or moved is inserted at destIndex; |
1122 | * it does not replace or overwrite any existing text. |
1123 | * |
1124 | * This function need only be implemented for UText types that support writing. |
1125 | * |
1126 | * When using this function, there should be only a single UText opened onto the |
1127 | * underlying native text string. The function is responsible for updating the |
1128 | * text chunk within the UText to reflect the updated iteration position, |
1129 | * taking into account any changes to the underlying string's structure caused |
1130 | * by the replace operation. |
1131 | * |
1132 | * @param ut The UText representing the text to be operated on. |
1133 | * @param nativeStart The index of the start of the region to be copied or moved |
1134 | * @param nativeLimit The index of the character following the region to be replaced. |
1135 | * @param nativeDest The destination index to which the source substring is copied or moved. |
1136 | * @param move If TRUE, then the substring is moved, not copied/duplicated. |
1137 | * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION |
1138 | * |
1139 | * @stable ICU 3.4 |
1140 | */ |
1141 | typedef void U_CALLCONV |
1142 | UTextCopy(UText *ut, |
1143 | int64_t nativeStart, int64_t nativeLimit, |
1144 | int64_t nativeDest, |
1145 | UBool move, |
1146 | UErrorCode *status); |
1147 | |
1148 | /** |
1149 | * Function type declaration for UText.mapOffsetToNative(). |
1150 | * Map from the current UChar offset within the current text chunk to |
1151 | * the corresponding native index in the original source text. |
1152 | * |
1153 | * This is required only for text providers that do not use native UTF-16 indexes. |
1154 | * |
1155 | * @param ut the UText. |
1156 | * @return Absolute (native) index corresponding to chunkOffset in the current chunk. |
1157 | * The returned native index should always be to a code point boundary. |
1158 | * |
1159 | * @stable ICU 3.4 |
1160 | */ |
1161 | typedef int64_t U_CALLCONV |
1162 | UTextMapOffsetToNative(const UText *ut); |
1163 | |
1164 | /** |
1165 | * Function type declaration for UText.mapIndexToUTF16(). |
1166 | * Map from a native index to a UChar offset within a text chunk. |
1167 | * Behavior is undefined if the native index does not fall within the |
1168 | * current chunk. |
1169 | * |
1170 | * This function is required only for text providers that do not use native UTF-16 indexes. |
1171 | * |
1172 | * @param ut The UText containing the text chunk. |
1173 | * @param nativeIndex Absolute (native) text index, chunk->start<=index<=chunk->limit. |
1174 | * @return Chunk-relative UTF-16 offset corresponding to the specified native |
1175 | * index. |
1176 | * |
1177 | * @stable ICU 3.4 |
1178 | */ |
1179 | typedef int32_t U_CALLCONV |
1180 | UTextMapNativeIndexToUTF16(const UText *ut, int64_t nativeIndex); |
1181 | |
1182 | |
1183 | /** |
1184 | * Function type declaration for UText.utextClose(). |
1185 | * |
1186 | * A Text Provider close function is only required for provider types that make |
1187 | * allocations in their open function (or other functions) that must be |
1188 | * cleaned when the UText is closed. |
1189 | * |
1190 | * The allocation of the UText struct itself and any "extra" storage |
1191 | * associated with the UText is handled by the common UText implementation |
1192 | * and does not require provider specific cleanup in a close function. |
1193 | * |
1194 | * Most UText provider implementations do not need to implement this function. |
1195 | * |
1196 | * @param ut A UText object to be closed. |
1197 | * |
1198 | * @stable ICU 3.4 |
1199 | */ |
1200 | typedef void U_CALLCONV |
1201 | UTextClose(UText *ut); |
1202 | |
1203 | |
1204 | /** |
1205 | * (public) Function dispatch table for UText. |
1206 | * Conceptually very much like a C++ Virtual Function Table. |
1207 | * This struct defines the organization of the table. |
1208 | * Each text provider implementation must provide an |
1209 | * actual table that is initialized with the appropriate functions |
1210 | * for the type of text being handled. |
1211 | * @stable ICU 3.6 |
1212 | */ |
1213 | struct UTextFuncs { |
1214 | /** |
1215 | * (public) Function table size, sizeof(UTextFuncs) |
1216 | * Intended for use should the table grow to accommodate added |
1217 | * functions in the future, to allow tests for older format |
1218 | * function tables that do not contain the extensions. |
1219 | * |
1220 | * Fields are placed for optimal alignment on |
1221 | * 32/64/128-bit-pointer machines, by normally grouping together |
1222 | * 4 32-bit fields, |
1223 | * 4 pointers, |
1224 | * 2 64-bit fields |
1225 | * in sequence. |
1226 | * @stable ICU 3.6 |
1227 | */ |
1228 | int32_t tableSize; |
1229 | |
1230 | /** |
1231 | * (private) Alignment padding. |
1232 | * Do not use, reserved for use by the UText framework only. |
1233 | * @internal |
1234 | */ |
1235 | int32_t reserved1, /** @internal */ reserved2, /** @internal */ reserved3; |
1236 | |
1237 | |
1238 | /** |
1239 | * (public) Function pointer for UTextClone |
1240 | * |
1241 | * @see UTextClone |
1242 | * @stable ICU 3.6 |
1243 | */ |
1244 | UTextClone *clone; |
1245 | |
1246 | /** |
1247 | * (public) function pointer for UTextLength |
1248 | * May be expensive to compute! |
1249 | * |
1250 | * @see UTextLength |
1251 | * @stable ICU 3.6 |
1252 | */ |
1253 | UTextNativeLength *nativeLength; |
1254 | |
1255 | /** |
1256 | * (public) Function pointer for UTextAccess. |
1257 | * |
1258 | * @see UTextAccess |
1259 | * @stable ICU 3.6 |
1260 | */ |
1261 | UTextAccess *access; |
1262 | |
1263 | /** |
1264 | * (public) Function pointer for UTextExtract. |
1265 | * |
1266 | * @see UTextExtract |
1267 | * @stable ICU 3.6 |
1268 | */ |
1269 | UTextExtract *; |
1270 | |
1271 | /** |
1272 | * (public) Function pointer for UTextReplace. |
1273 | * |
1274 | * @see UTextReplace |
1275 | * @stable ICU 3.6 |
1276 | */ |
1277 | UTextReplace *replace; |
1278 | |
1279 | /** |
1280 | * (public) Function pointer for UTextCopy. |
1281 | * |
1282 | * @see UTextCopy |
1283 | * @stable ICU 3.6 |
1284 | */ |
1285 | UTextCopy *copy; |
1286 | |
1287 | /** |
1288 | * (public) Function pointer for UTextMapOffsetToNative. |
1289 | * |
1290 | * @see UTextMapOffsetToNative |
1291 | * @stable ICU 3.6 |
1292 | */ |
1293 | UTextMapOffsetToNative *mapOffsetToNative; |
1294 | |
1295 | /** |
1296 | * (public) Function pointer for UTextMapNativeIndexToUTF16. |
1297 | * |
1298 | * @see UTextMapNativeIndexToUTF16 |
1299 | * @stable ICU 3.6 |
1300 | */ |
1301 | UTextMapNativeIndexToUTF16 *mapNativeIndexToUTF16; |
1302 | |
1303 | /** |
1304 | * (public) Function pointer for UTextClose. |
1305 | * |
1306 | * @see UTextClose |
1307 | * @stable ICU 3.6 |
1308 | */ |
1309 | UTextClose *close; |
1310 | |
1311 | /** |
1312 | * (private) Spare function pointer |
1313 | * @internal |
1314 | */ |
1315 | UTextClose *spare1; |
1316 | |
1317 | /** |
1318 | * (private) Spare function pointer |
1319 | * @internal |
1320 | */ |
1321 | UTextClose *spare2; |
1322 | |
1323 | /** |
1324 | * (private) Spare function pointer |
1325 | * @internal |
1326 | */ |
1327 | UTextClose *spare3; |
1328 | |
1329 | }; |
1330 | /** |
1331 | * Function dispatch table for UText |
1332 | * @see UTextFuncs |
1333 | */ |
1334 | typedef struct UTextFuncs UTextFuncs; |
1335 | |
1336 | /** |
1337 | * UText struct. Provides the interface between the generic UText access code |
1338 | * and the UText provider code that works on specific kinds of |
1339 | * text (UTF-8, noncontiguous UTF-16, whatever.) |
1340 | * |
1341 | * Applications that are using predefined types of text providers |
1342 | * to pass text data to ICU services will have no need to view the |
1343 | * internals of the UText structs that they open. |
1344 | * |
1345 | * @stable ICU 3.6 |
1346 | */ |
1347 | struct UText { |
1348 | /** |
1349 | * (private) Magic. Used to help detect when UText functions are handed |
1350 | * invalid or uninitialized UText structs. |
1351 | * utext_openXYZ() functions take an initialized, |
1352 | * but not necessarily open, UText struct as an |
1353 | * optional fill-in parameter. This magic field |
1354 | * is used to check for that initialization. |
1355 | * Text provider close functions must NOT clear |
1356 | * the magic field because that would prevent |
1357 | * reuse of the UText struct. |
1358 | * @internal |
1359 | */ |
1360 | uint32_t magic; |
1361 | |
1362 | |
1363 | /** |
1364 | * (private) Flags for managing the allocation and freeing of |
1365 | * memory associated with this UText. |
1366 | * @internal |
1367 | */ |
1368 | int32_t flags; |
1369 | |
1370 | |
1371 | /** |
1372 | * Text provider properties. This set of flags is maintained by the |
1373 | * text provider implementation. |
1374 | * @stable ICU 3.4 |
1375 | */ |
1376 | int32_t providerProperties; |
1377 | |
1378 | /** |
1379 | * (public) sizeOfStruct=sizeof(UText) |
1380 | * Allows possible backward compatible extension. |
1381 | * |
1382 | * @stable ICU 3.4 |
1383 | */ |
1384 | int32_t sizeOfStruct; |
1385 | |
1386 | /* ------ 16 byte alignment boundary ----------- */ |
1387 | |
1388 | |
1389 | /** |
1390 | * (protected) Native index of the first character position following |
1391 | * the current chunk. |
1392 | * @stable ICU 3.6 |
1393 | */ |
1394 | int64_t chunkNativeLimit; |
1395 | |
1396 | /** |
1397 | * (protected) Size in bytes of the extra space (pExtra). |
1398 | * @stable ICU 3.4 |
1399 | */ |
1400 | int32_t ; |
1401 | |
1402 | /** |
1403 | * (protected) The highest chunk offset where native indexing and |
1404 | * chunk (UTF-16) indexing correspond. For UTF-16 sources, value |
1405 | * will be equal to chunkLength. |
1406 | * |
1407 | * @stable ICU 3.6 |
1408 | */ |
1409 | int32_t nativeIndexingLimit; |
1410 | |
1411 | /* ---- 16 byte alignment boundary------ */ |
1412 | |
1413 | /** |
1414 | * (protected) Native index of the first character in the text chunk. |
1415 | * @stable ICU 3.6 |
1416 | */ |
1417 | int64_t chunkNativeStart; |
1418 | |
1419 | /** |
1420 | * (protected) Current iteration position within the text chunk (UTF-16 buffer). |
1421 | * This is the index to the character that will be returned by utext_next32(). |
1422 | * @stable ICU 3.6 |
1423 | */ |
1424 | int32_t chunkOffset; |
1425 | |
1426 | /** |
1427 | * (protected) Length the text chunk (UTF-16 buffer), in UChars. |
1428 | * @stable ICU 3.6 |
1429 | */ |
1430 | int32_t chunkLength; |
1431 | |
1432 | /* ---- 16 byte alignment boundary-- */ |
1433 | |
1434 | |
1435 | /** |
1436 | * (protected) pointer to a chunk of text in UTF-16 format. |
1437 | * May refer either to original storage of the source of the text, or |
1438 | * if conversion was required, to a buffer owned by the UText. |
1439 | * @stable ICU 3.6 |
1440 | */ |
1441 | const UChar *chunkContents; |
1442 | |
1443 | /** |
1444 | * (public) Pointer to Dispatch table for accessing functions for this UText. |
1445 | * @stable ICU 3.6 |
1446 | */ |
1447 | const UTextFuncs *pFuncs; |
1448 | |
1449 | /** |
1450 | * (protected) Pointer to additional space requested by the |
1451 | * text provider during the utext_open operation. |
1452 | * @stable ICU 3.4 |
1453 | */ |
1454 | void *; |
1455 | |
1456 | /** |
1457 | * (protected) Pointer to string or text-containing object or similar. |
1458 | * This is the source of the text that this UText is wrapping, in a format |
1459 | * that is known to the text provider functions. |
1460 | * @stable ICU 3.4 |
1461 | */ |
1462 | const void *context; |
1463 | |
1464 | /* --- 16 byte alignment boundary--- */ |
1465 | |
1466 | /** |
1467 | * (protected) Pointer fields available for use by the text provider. |
1468 | * Not used by UText common code. |
1469 | * @stable ICU 3.6 |
1470 | */ |
1471 | const void *p; |
1472 | /** |
1473 | * (protected) Pointer fields available for use by the text provider. |
1474 | * Not used by UText common code. |
1475 | * @stable ICU 3.6 |
1476 | */ |
1477 | const void *q; |
1478 | /** |
1479 | * (protected) Pointer fields available for use by the text provider. |
1480 | * Not used by UText common code. |
1481 | * @stable ICU 3.6 |
1482 | */ |
1483 | const void *r; |
1484 | |
1485 | /** |
1486 | * Private field reserved for future use by the UText framework |
1487 | * itself. This is not to be touched by the text providers. |
1488 | * @internal ICU 3.4 |
1489 | */ |
1490 | void *privP; |
1491 | |
1492 | |
1493 | /* --- 16 byte alignment boundary--- */ |
1494 | |
1495 | |
1496 | /** |
1497 | * (protected) Integer field reserved for use by the text provider. |
1498 | * Not used by the UText framework, or by the client (user) of the UText. |
1499 | * @stable ICU 3.4 |
1500 | */ |
1501 | int64_t a; |
1502 | |
1503 | /** |
1504 | * (protected) Integer field reserved for use by the text provider. |
1505 | * Not used by the UText framework, or by the client (user) of the UText. |
1506 | * @stable ICU 3.4 |
1507 | */ |
1508 | int32_t b; |
1509 | |
1510 | /** |
1511 | * (protected) Integer field reserved for use by the text provider. |
1512 | * Not used by the UText framework, or by the client (user) of the UText. |
1513 | * @stable ICU 3.4 |
1514 | */ |
1515 | int32_t c; |
1516 | |
1517 | /* ---- 16 byte alignment boundary---- */ |
1518 | |
1519 | |
1520 | /** |
1521 | * Private field reserved for future use by the UText framework |
1522 | * itself. This is not to be touched by the text providers. |
1523 | * @internal ICU 3.4 |
1524 | */ |
1525 | int64_t privA; |
1526 | /** |
1527 | * Private field reserved for future use by the UText framework |
1528 | * itself. This is not to be touched by the text providers. |
1529 | * @internal ICU 3.4 |
1530 | */ |
1531 | int32_t privB; |
1532 | /** |
1533 | * Private field reserved for future use by the UText framework |
1534 | * itself. This is not to be touched by the text providers. |
1535 | * @internal ICU 3.4 |
1536 | */ |
1537 | int32_t privC; |
1538 | }; |
1539 | |
1540 | |
1541 | /** |
1542 | * Common function for use by Text Provider implementations to allocate and/or initialize |
1543 | * a new UText struct. To be called in the implementation of utext_open() functions. |
1544 | * If the supplied UText parameter is null, a new UText struct will be allocated on the heap. |
1545 | * If the supplied UText is already open, the provider's close function will be called |
1546 | * so that the struct can be reused by the open that is in progress. |
1547 | * |
1548 | * @param ut Pointer to a UText struct to be re-used, or null if a new UText |
1549 | * should be allocated. |
1550 | * @param extraSpace The amount of additional space to be allocated as part |
1551 | * of this UText, for use by types of providers that require |
1552 | * additional storage. |
1553 | * @param status Errors are returned here. |
1554 | * @return Pointer to the UText, allocated if necessary, with extra space set up if requested. |
1555 | * @stable ICU 3.4 |
1556 | */ |
1557 | U_STABLE UText * U_EXPORT2 |
1558 | utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status); |
1559 | |
1560 | // Keep this visible: do NOT wrap the following in #ifndef U_HIDE_INTERNAL_API! |
1561 | /** |
1562 | * @internal |
1563 | * Marker value that helps recognize UText structs which were initialized correctly. |
1564 | * Note: it has to stay publicly visible because UTEXT_INITIALIZER refers to it. |
1565 | */ |
1566 | enum { |
1567 | UTEXT_MAGIC = 0x345AD82C |
1568 | }; |
1569 | |
1570 | /** |
1571 | * Initializer for local (stack) instances of a UText struct; a UText must be |
1572 | * initialized before it is passed to one of the utext_open functions. |
1573 | * Supplies UTEXT_MAGIC and sizeof(UText), and 0 or NULL for everything else. |
1574 | * |
1575 | * @stable ICU 3.6 |
1576 | */ |
1577 | #define UTEXT_INITIALIZER { \ |
1578 | UTEXT_MAGIC, /* magic */ \ |
1579 | 0, /* flags */ \ |
1580 | 0, /* providerProps */ \ |
1581 | sizeof(UText), /* sizeOfStruct */ \ |
1582 | 0, /* chunkNativeLimit */ \ |
1583 | 0, /* extraSize */ \ |
1584 | 0, /* nativeIndexingLimit */ \ |
1585 | 0, /* chunkNativeStart */ \ |
1586 | 0, /* chunkOffset */ \ |
1587 | 0, /* chunkLength */ \ |
1588 | NULL, /* chunkContents */ \ |
1589 | NULL, /* pFuncs */ \ |
1590 | NULL, /* pExtra */ \ |
1591 | NULL, /* context */ \ |
1592 | NULL, NULL, NULL, /* p, q, r */ \ |
1593 | NULL, /* privP */ \ |
1594 | 0, 0, 0, /* a, b, c */ \ |
1595 | 0, 0, 0 /* privA, privB, privC */ \ |
1596 | } |
1597 | |
1598 | |
1599 | U_CDECL_END |
1600 | |
1601 | |
1602 | |
1603 | #endif |
1604 | |