utext.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/utext.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	*
6	* Copyright (C) 2005-2016, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	*******************************************************************************
10	* file name: utext.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 2005apr12
16	* created by: Markus W. Scherer
17	*/
18
19	#include "unicode/utypes.h"
20	#include "unicode/ustring.h"
21	#include "unicode/unistr.h"
22	#include "unicode/chariter.h"
23	#include "unicode/utext.h"
24	#include "unicode/utf.h"
25	#include "unicode/utf8.h"
26	#include "unicode/utf16.h"
27	#include "ustr_imp.h"
28	#include "cmemory.h"
29	#include "cstring.h"
30	#include "uassert.h"
31	#include "putilimp.h"
32
33	U_NAMESPACE_USE
34
// Produce an int32_t value in which only bit number `bitIndex` is set.
#define I32_FLAG(bitIndex) ((int32_t)1 << (bitIndex))
36
37
38	static UBool
39	utext_access(UText *ut, int64_t index, UBool forward) {
40	return ut->pFuncs->access(ut, index, forward);
41	}
42
43
44
45	U_CAPI UBool U_EXPORT2
46	utext_moveIndex32(UText *ut, int32_t delta) {
47	UChar32 c;
48	if (delta > `0`) {
49	do {
50	if(ut->chunkOffset>=ut->chunkLength && !utext_access(ut, ut->chunkNativeLimit, TRUE)) {
51	return FALSE;
52	}
53	c = ut->chunkContents[ut->chunkOffset];
54	if (U16_IS_SURROGATE(c)) {
55	c = utext_next32(ut);
56	if (c == U_SENTINEL) {
57	return FALSE;
58	}
59	} else {
60	ut->chunkOffset++;
61	}
62	} while(--delta>`0`);
63
64	} else if (delta<`0`) {
65	do {
66	if(ut->chunkOffset<=`0` && !utext_access(ut, ut->chunkNativeStart, FALSE)) {
67	return FALSE;
68	}
69	c = ut->chunkContents[ut->chunkOffset-`1`];
70	if (U16_IS_SURROGATE(c)) {
71	c = utext_previous32(ut);
72	if (c == U_SENTINEL) {
73	return FALSE;
74	}
75	} else {
76	ut->chunkOffset--;
77	}
78	} while(++delta<`0`);
79	}
80
81	return TRUE;
82	}
83
84
85	U_CAPI int64_t U_EXPORT2
86	utext_nativeLength(UText *ut) {
87	return ut->pFuncs->nativeLength(ut);
88	}
89
90
91	U_CAPI UBool U_EXPORT2
92	utext_isLengthExpensive(const UText *ut) {
93	UBool r = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE)) != `0`;
94	return r;
95	}
96
97
98	U_CAPI int64_t U_EXPORT2
99	utext_getNativeIndex(const UText *ut) {
100	if(ut->chunkOffset <= ut->nativeIndexingLimit) {
101	return ut->chunkNativeStart+ut->chunkOffset;
102	} else {
103	return ut->pFuncs->mapOffsetToNative(ut);
104	}
105	}
106
107
108	U_CAPI void U_EXPORT2
109	utext_setNativeIndex(UText *ut, int64_t index) {
110	if(index<ut->chunkNativeStart \|\| index>=ut->chunkNativeLimit) {
111	// The desired position is outside of the current chunk.
112	// Access the new position. Assume a forward iteration from here,
113	// which will also be optimimum for a single random access.
114	// Reverse iterations may suffer slightly.
115	ut->pFuncs->access(ut, index, TRUE);
116	} else if((int32_t)(index - ut->chunkNativeStart) <= ut->nativeIndexingLimit) {
117	// utf-16 indexing.
118	ut->chunkOffset=(int32_t)(index-ut->chunkNativeStart);
119	} else {
120	ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
121	}
122	// The convention is that the index must always be on a code point boundary.
123	// Adjust the index position if it is in the middle of a surrogate pair.
124	if (ut->chunkOffset<ut->chunkLength) {
125	UChar c= ut->chunkContents[ut->chunkOffset];
126	if (U16_IS_TRAIL(c)) {
127	if (ut->chunkOffset==`0`) {
128	ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE);
129	}
130	if (ut->chunkOffset>`0`) {
131	UChar lead = ut->chunkContents[ut->chunkOffset-`1`];
132	if (U16_IS_LEAD(lead)) {
133	ut->chunkOffset--;
134	}
135	}
136	}
137	}
138	}
139
140
141
142	U_CAPI int64_t U_EXPORT2
143	utext_getPreviousNativeIndex(UText *ut) {
144	//
145	// Fast-path the common case.
146	// Common means current position is not at the beginning of a chunk
147	// and the preceding character is not supplementary.
148	//
149	int32_t i = ut->chunkOffset - `1`;
150	int64_t result;
151	if (i >= `0`) {
152	UChar c = ut->chunkContents[i];
153	if (U16_IS_TRAIL(c) == FALSE) {
154	if (i <= ut->nativeIndexingLimit) {
155	result = ut->chunkNativeStart + i;
156	} else {
157	ut->chunkOffset = i;
158	result = ut->pFuncs->mapOffsetToNative(ut);
159	ut->chunkOffset++;
160	}
161	return result;
162	}
163	}
164
165	// If at the start of text, simply return 0.
166	if (ut->chunkOffset==`0` && ut->chunkNativeStart==`0`) {
167	return `0`;
168	}
169
170	// Harder, less common cases. We are at a chunk boundary, or on a surrogate.
171	// Keep it simple, use other functions to handle the edges.
172	//
173	utext_previous32(ut);
174	result = UTEXT_GETNATIVEINDEX(ut);
175	utext_next32(ut);
176	return result;
177	}
178
179
180	//
181	// utext_current32. Get the UChar32 at the current position.
182	// UText iteration position is always on a code point boundary,
183	// never on the trail half of a surrogate pair.
184	//
185	U_CAPI UChar32 U_EXPORT2
186	utext_current32(UText *ut) {
187	UChar32 c;
188	if (ut->chunkOffset==ut->chunkLength) {
189	// Current position is just off the end of the chunk.
190	if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
191	// Off the end of the text.
192	return U_SENTINEL;
193	}
194	}
195
196	c = ut->chunkContents[ut->chunkOffset];
197	if (U16_IS_LEAD(c) == FALSE) {
198	// Normal, non-supplementary case.
199	return c;
200	}
201
202	//
203	// Possible supplementary char.
204	//
205	UChar32 trail = `0`;
206	UChar32 supplementaryC = c;
207	if ((ut->chunkOffset+`1`) < ut->chunkLength) {
208	// The trail surrogate is in the same chunk.
209	trail = ut->chunkContents[ut->chunkOffset+`1`];
210	} else {
211	// The trail surrogate is in a different chunk.
212	// Because we must maintain the iteration position, we need to switch forward
213	// into the new chunk, get the trail surrogate, then revert the chunk back to the
214	// original one.
215	// An edge case to be careful of: the entire text may end with an unpaired
216	// leading surrogate. The attempt to access the trail will fail, but
217	// the original position before the unpaired lead still needs to be restored.
218	int64_t nativePosition = ut->chunkNativeLimit;
219	int32_t originalOffset = ut->chunkOffset;
220	if (ut->pFuncs->access(ut, nativePosition, TRUE)) {
221	trail = ut->chunkContents[ut->chunkOffset];
222	}
223	UBool r = ut->pFuncs->access(ut, nativePosition, FALSE); // reverse iteration flag loads preceding chunk
224	U_ASSERT(r==TRUE);
225	ut->chunkOffset = originalOffset;
226	if(!r) {
227	return U_SENTINEL;
228	}
229	}
230
231	if (U16_IS_TRAIL(trail)) {
232	supplementaryC = U16_GET_SUPPLEMENTARY(c, trail);
233	}
234	return supplementaryC;
235
236	}
237
238
239	U_CAPI UChar32 U_EXPORT2
240	utext_char32At(UText *ut, int64_t nativeIndex) {
241	UChar32 c = U_SENTINEL;
242
243	// Fast path the common case.
244	if (nativeIndex>=ut->chunkNativeStart && nativeIndex < ut->chunkNativeStart + ut->nativeIndexingLimit) {
245	ut->chunkOffset = (int32_t)(nativeIndex - ut->chunkNativeStart);
246	c = ut->chunkContents[ut->chunkOffset];
247	if (U16_IS_SURROGATE(c) == FALSE) {
248	return c;
249	}
250	}
251
252
253	utext_setNativeIndex(ut, nativeIndex);
254	if (nativeIndex>=ut->chunkNativeStart && ut->chunkOffset<ut->chunkLength) {
255	c = ut->chunkContents[ut->chunkOffset];
256	if (U16_IS_SURROGATE(c)) {
257	// For surrogates, let current32() deal with the complications
258	// of supplementaries that may span chunk boundaries.
259	c = utext_current32(ut);
260	}
261	}
262	return c;
263	}
264
265
266	U_CAPI UChar32 U_EXPORT2
267	utext_next32(UText *ut) {
268	UChar32 c;
269
270	if (ut->chunkOffset >= ut->chunkLength) {
271	if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
272	return U_SENTINEL;
273	}
274	}
275
276	c = ut->chunkContents[ut->chunkOffset++];
277	if (U16_IS_LEAD(c) == FALSE) {
278	// Normal case, not supplementary.
279	// (A trail surrogate seen here is just returned as is, as a surrogate value.
280	// It cannot be part of a pair.)
281	return c;
282	}
283
284	if (ut->chunkOffset >= ut->chunkLength) {
285	if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
286	// c is an unpaired lead surrogate at the end of the text.
287	// return it as it is.
288	return c;
289	}
290	}
291	UChar32 trail = ut->chunkContents[ut->chunkOffset];
292	if (U16_IS_TRAIL(trail) == FALSE) {
293	// c was an unpaired lead surrogate, not at the end of the text.
294	// return it as it is (unpaired). Iteration position is on the
295	// following character, possibly in the next chunk, where the
296	// trail surrogate would have been if it had existed.
297	return c;
298	}
299
300	UChar32 supplementary = U16_GET_SUPPLEMENTARY(c, trail);
301	ut->chunkOffset++; // move iteration position over the trail surrogate.
302	return supplementary;
303	}
304
305
306	U_CAPI UChar32 U_EXPORT2
307	utext_previous32(UText *ut) {
308	UChar32 c;
309
310	if (ut->chunkOffset <= `0`) {
311	if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
312	return U_SENTINEL;
313	}
314	}
315	ut->chunkOffset--;
316	c = ut->chunkContents[ut->chunkOffset];
317	if (U16_IS_TRAIL(c) == FALSE) {
318	// Normal case, not supplementary.
319	// (A lead surrogate seen here is just returned as is, as a surrogate value.
320	// It cannot be part of a pair.)
321	return c;
322	}
323
324	if (ut->chunkOffset <= `0`) {
325	if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
326	// c is an unpaired trail surrogate at the start of the text.
327	// return it as it is.
328	return c;
329	}
330	}
331
332	UChar32 lead = ut->chunkContents[ut->chunkOffset-`1`];
333	if (U16_IS_LEAD(lead) == FALSE) {
334	// c was an unpaired trail surrogate, not at the end of the text.
335	// return it as it is (unpaired). Iteration position is at c
336	return c;
337	}
338
339	UChar32 supplementary = U16_GET_SUPPLEMENTARY(lead, c);
340	ut->chunkOffset--; // move iteration position over the lead surrogate.
341	return supplementary;
342	}
343
344
345
346	U_CAPI UChar32 U_EXPORT2
347	utext_next32From(UText *ut, int64_t index) {
348	UChar32 c = U_SENTINEL;
349
350	if(index<ut->chunkNativeStart \|\| index>=ut->chunkNativeLimit) {
351	// Desired position is outside of the current chunk.
352	if(!ut->pFuncs->access(ut, index, TRUE)) {
353	// no chunk available here
354	return U_SENTINEL;
355	}
356	} else if (index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
357	// Desired position is in chunk, with direct 1:1 native to UTF16 indexing
358	ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
359	} else {
360	// Desired position is in chunk, with non-UTF16 indexing.
361	ut->chunkOffset = ut->pFuncs->mapNativeIndexToUTF16(ut, index);
362	}
363
364	c = ut->chunkContents[ut->chunkOffset++];
365	if (U16_IS_SURROGATE(c)) {
366	// Surrogates. Many edge cases. Use other functions that already
367	// deal with the problems.
368	utext_setNativeIndex(ut, index);
369	c = utext_next32(ut);
370	}
371	return c;
372	}
373
374
375	U_CAPI UChar32 U_EXPORT2
376	utext_previous32From(UText *ut, int64_t index) {
377	//
378	// Return the character preceding the specified index.
379	// Leave the iteration position at the start of the character that was returned.
380	//
381	UChar32 cPrev; // The character preceding cCurr, which is what we will return.
382
383	// Address the chunk containg the position preceding the incoming index
384	// A tricky edge case:
385	// We try to test the requested native index against the chunkNativeStart to determine
386	// whether the character preceding the one at the index is in the current chunk.
387	// BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the
388	// requested index is on something other than the first position of the first char.
389	//
390	if(index<=ut->chunkNativeStart \|\| index>ut->chunkNativeLimit) {
391	// Requested native index is outside of the current chunk.
392	if(!ut->pFuncs->access(ut, index, FALSE)) {
393	// no chunk available here
394	return U_SENTINEL;
395	}
396	} else if(index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
397	// Direct UTF-16 indexing.
398	ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
399	} else {
400	ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
401	if (ut->chunkOffset==`0` && !ut->pFuncs->access(ut, index, FALSE)) {
402	// no chunk available here
403	return U_SENTINEL;
404	}
405	}
406
407	//
408	// Simple case with no surrogates.
409	//
410	ut->chunkOffset--;
411	cPrev = ut->chunkContents[ut->chunkOffset];
412
413	if (U16_IS_SURROGATE(cPrev)) {
414	// Possible supplementary. Many edge cases.
415	// Let other functions do the heavy lifting.
416	utext_setNativeIndex(ut, index);
417	cPrev = utext_previous32(ut);
418	}
419	return cPrev;
420	}
421
422
423	U_CAPI int32_t U_EXPORT2
424	utext_extract(UText *ut,
425	int64_t start, int64_t limit,
426	UChar *dest, int32_t destCapacity,
427	UErrorCode *status) {
428	return ut->pFuncs->extract(ut, start, limit, dest, destCapacity, status);
429	}
430
431
432
433	U_CAPI UBool U_EXPORT2
434	utext_equals(const UText a, const* UText *b) {
435	if (a==NULL \|\| b==NULL \|\|
436	a->magic != UTEXT_MAGIC \|\|
437	b->magic != UTEXT_MAGIC) {
438	// Null or invalid arguments don't compare equal to anything.
439	return FALSE;
440	}
441
442	if (a->pFuncs != b->pFuncs) {
443	// Different types of text providers.
444	return FALSE;
445	}
446
447	if (a->context != b->context) {
448	// Different sources (different strings)
449	return FALSE;
450	}
451	if (utext_getNativeIndex(a) != utext_getNativeIndex(b)) {
452	// Different current position in the string.
453	return FALSE;
454	}
455
456	return TRUE;
457	}
458
459	U_CAPI UBool U_EXPORT2
460	utext_isWritable(const UText *ut)
461	{
462	UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) != `0`;
463	return b;
464	}
465
466
467	U_CAPI void U_EXPORT2
468	utext_freeze(UText *ut) {
469	// Zero out the WRITABLE flag.
470	ut->providerProperties &= ~(I32_FLAG(UTEXT_PROVIDER_WRITABLE));
471	}
472
473
474	U_CAPI UBool U_EXPORT2
475	utext_hasMetaData(const UText *ut)
476	{
477	UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA)) != `0`;
478	return b;
479	}
480
481
482
483	U_CAPI int32_t U_EXPORT2
484	utext_replace(UText *ut,
485	int64_t nativeStart, int64_t nativeLimit,
486	const UChar *replacementText, int32_t replacementLength,
487	UErrorCode *status)
488	{
489	if (U_FAILURE(*status)) {
490	return `0`;
491	}
492	if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == `0`) {
493	*status = U_NO_WRITE_PERMISSION;
494	return `0`;
495	}
496	int32_t i = ut->pFuncs->replace(ut, nativeStart, nativeLimit, replacementText, replacementLength, status);
497	return i;
498	}
499
500	U_CAPI void U_EXPORT2
501	utext_copy(UText *ut,
502	int64_t nativeStart, int64_t nativeLimit,
503	int64_t destIndex,
504	UBool move,
505	UErrorCode *status)
506	{
507	if (U_FAILURE(*status)) {
508	return;
509	}
510	if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == `0`) {
511	*status = U_NO_WRITE_PERMISSION;
512	return;
513	}
514	ut->pFuncs->copy(ut, nativeStart, nativeLimit, destIndex, move, status);
515	}
516
517
518
519	U_CAPI UText * U_EXPORT2
520	utext_clone(UText dest, const* UText src, UBool deep, UBool readOnly, UErrorCode status) {
521	if (U_FAILURE(*status)) {
522	return dest;
523	}
524	UText *result = src->pFuncs->clone(dest, src, deep, status);
525	if (U_FAILURE(*status)) {
526	return result;
527	}
528	if (result == NULL) {
529	*status = U_MEMORY_ALLOCATION_ERROR;
530	return result;
531	}
532	if (readOnly) {
533	utext_freeze(result);
534	}
535	return result;
536	}
537
538
539
540	//------------------------------------------------------------------------------
541	//
542	// UText common functions implementation
543	//
544	//------------------------------------------------------------------------------
545
//
//  UText.flags bit definitions
//     Each enumerator is a single-bit mask that is OR-ed into / tested
//     against UText.flags.
//
enum {
    UTEXT_HEAP_ALLOCATED  = 1,      //  1 if ICU has allocated this UText struct on the heap.
                                    //  0 if caller provided storage for the UText.

    UTEXT_EXTRA_HEAP_ALLOCATED = 2, //  1 if ICU has allocated extra storage as a separate
                                    //     heap block.
                                    //  0 if there is no separate allocation.  Either no extra
                                    //     storage was requested, or it is appended to the end
                                    //     of the main UText storage.

    UTEXT_OPEN = 4                  //  1 if this UText is currently open
                                    //  0 if this UText is not open.
};
562
563
564	//
565	// Extended form of a UText. The purpose is to aid in computing the total size required
566	// when a provider asks for a UText to be allocated with extra storage.
567
568	struct ExtendedUText {
569	UText ut;
570	max_align_t extension;
571	};
572
573	static const UText emptyText = UTEXT_INITIALIZER;
574
575	U_CAPI UText * U_EXPORT2
576	utext_setup(UText ut, int32_t extraSpace, UErrorCode status) {
577	if (U_FAILURE(*status)) {
578	return ut;
579	}
580
581	if (ut == NULL) {
582	// We need to heap-allocate storage for the new UText
583	int32_t spaceRequired = sizeof(UText);
584	if (extraSpace > `0`) {
585	spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(max_align_t);
586	}
587	ut = (UText *)uprv_malloc(spaceRequired);
588	if (ut == NULL) {
589	*status = U_MEMORY_ALLOCATION_ERROR;
590	return NULL;
591	} else {
592	*ut = emptyText;
593	ut->flags \|= UTEXT_HEAP_ALLOCATED;
594	if (spaceRequired>`0`) {
595	ut->extraSize = extraSpace;
596	ut->pExtra = &((ExtendedUText *)ut)->extension;
597	}
598	}
599	} else {
600	// We have been supplied with an already existing UText.
601	// Verify that it really appears to be a UText.
602	if (ut->magic != UTEXT_MAGIC) {
603	*status = U_ILLEGAL_ARGUMENT_ERROR;
604	return ut;
605	}
606	// If the ut is already open and there's a provider supplied close
607	// function, call it.
608	if ((ut->flags & UTEXT_OPEN) && ut->pFuncs->close != NULL) {
609	ut->pFuncs->close(ut);
610	}
611	ut->flags &= ~UTEXT_OPEN;
612
613	// If extra space was requested by our caller, check whether
614	// sufficient already exists, and allocate new if needed.
615	if (extraSpace > ut->extraSize) {
616	// Need more space. If there is existing separately allocated space,
617	// delete it first, then allocate new space.
618	if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
619	uprv_free(ut->pExtra);
620	ut->extraSize = `0`;
621	}
622	ut->pExtra = uprv_malloc(extraSpace);
623	if (ut->pExtra == NULL) {
624	*status = U_MEMORY_ALLOCATION_ERROR;
625	} else {
626	ut->extraSize = extraSpace;
627	ut->flags \|= UTEXT_EXTRA_HEAP_ALLOCATED;
628	}
629	}
630	}
631	if (U_SUCCESS(*status)) {
632	ut->flags \|= UTEXT_OPEN;
633
634	// Initialize all remaining fields of the UText.
635	//
636	ut->context = NULL;
637	ut->chunkContents = NULL;
638	ut->p = NULL;
639	ut->q = NULL;
640	ut->r = NULL;
641	ut->a = `0`;
642	ut->b = `0`;
643	ut->c = `0`;
644	ut->chunkOffset = `0`;
645	ut->chunkLength = `0`;
646	ut->chunkNativeStart = `0`;
647	ut->chunkNativeLimit = `0`;
648	ut->nativeIndexingLimit = `0`;
649	ut->providerProperties = `0`;
650	ut->privA = `0`;
651	ut->privB = `0`;
652	ut->privC = `0`;
653	ut->privP = NULL;
654	if (ut->pExtra!=NULL && ut->extraSize>`0`)
655	uprv_memset(ut->pExtra, `0`, ut->extraSize);
656
657	}
658	return ut;
659	}
660
661
662	U_CAPI UText * U_EXPORT2
663	utext_close(UText *ut) {
664	if (ut==NULL \|\|
665	ut->magic != UTEXT_MAGIC \|\|
666	(ut->flags & UTEXT_OPEN) == `0`)
667	{
668	// The supplied ut is not an open UText.
669	// Do nothing.
670	return ut;
671	}
672
673	// If the provider gave us a close function, call it now.
674	// This will clean up anything allocated specifically by the provider.
675	if (ut->pFuncs->close != NULL) {
676	ut->pFuncs->close(ut);
677	}
678	ut->flags &= ~UTEXT_OPEN;
679
680	// If we (the framework) allocated the UText or subsidiary storage,
681	// delete it.
682	if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
683	uprv_free(ut->pExtra);
684	ut->pExtra = NULL;
685	ut->flags &= ~UTEXT_EXTRA_HEAP_ALLOCATED;
686	ut->extraSize = `0`;
687	}
688
689	// Zero out function table of the closed UText. This is a defensive move,
690	// inteded to cause applications that inadvertantly use a closed
691	// utext to crash with null pointer errors.
692	ut->pFuncs = NULL;
693
694	if (ut->flags & UTEXT_HEAP_ALLOCATED) {
695	// This UText was allocated by UText setup. We need to free it.
696	// Clear magic, so we can detect if the user messes up and immediately
697	// tries to reopen another UText using the deleted storage.
698	ut->magic = `0`;
699	uprv_free(ut);
700	ut = NULL;
701	}
702	return ut;
703	}
704
705
706
707
708	//
709	// invalidateChunk Reset a chunk to have no contents, so that the next call
710	// to access will cause new data to load.
711	// This is needed when copy/move/replace operate directly on the
712	// backing text, potentially putting it out of sync with the
713	// contents in the chunk.
714	//
715	static void
716	invalidateChunk(UText *ut) {
717	ut->chunkLength = `0`;
718	ut->chunkNativeLimit = `0`;
719	ut->chunkNativeStart = `0`;
720	ut->chunkOffset = `0`;
721	ut->nativeIndexingLimit = `0`;
722	}
723
//
// pinIndex        Do range pinning on a native index parameter.
//                 64 bit pinning is done in place: `index` is clamped to
//                 the range [0, limit].
//                 32 bit truncated result is returned as a convenience for
//                 use in providers that don't need 64 bits.
static int32_t
pinIndex(int64_t &index, int64_t limit) {
    if (index<0) {
        index = 0;
    } else if (index > limit) {
        index = limit;
    }
    return (int32_t)index;
}
738
739
740	U_CDECL_BEGIN
741
742	//
743	// Pointer relocation function,
744	// a utility used by shallow clone.
745	// Adjust a pointer that refers to something within one UText (the source)
746	// to refer to the same relative offset within a another UText (the target)
747	//
748	static void adjustPointer(UText dest, const* void *destPtr, const* UText *src) {
749	// convert all pointers to (char ) so that byte address arithmetic will work.*
750	char dptr = (char* )destPtr;
751	char dUText = (char* *)dest;
752	char sUText = (char* *)src;
753
754	if (dptr >= (char )src->pExtra && dptr < ((char**)src->pExtra)+src->extraSize) {
755	// target ptr was to something within the src UText's pExtra storage.
756	// relocate it into the target UText's pExtra region.
757	destPtr = ((char* )dest->pExtra) + (dptr - (char* *)src->pExtra);
758	} else if (dptr>=sUText && dptr < sUText+src->sizeOfStruct) {
759	// target ptr was pointing to somewhere within the source UText itself.
760	// Move it to the same offset within the target UText.
761	*destPtr = dUText + (dptr-sUText);
762	}
763	}
764
765
766	//
767	// Clone. This is a generic copy-the-utext-by-value clone function that can be
768	// used as-is with some utext types, and as a helper by other clones.
769	//
770	static UText * U_CALLCONV
771	shallowTextClone(UText * dest, const UText * src, UErrorCode * status) {
772	if (U_FAILURE(*status)) {
773	return NULL;
774	}
775	int32_t srcExtraSize = src->extraSize;
776
777	//
778	// Use the generic text_setup to allocate storage if required.
779	//
780	dest = utext_setup(dest, srcExtraSize, status);
781	if (U_FAILURE(*status)) {
782	return dest;
783	}
784
785	//
786	// flags (how the UText was allocated) and the pointer to the
787	// extra storage must retain the values in the cloned utext that
788	// were set up by utext_setup. Save them separately before
789	// copying the whole struct.
790	//
791	void *destExtra = dest->pExtra;
792	int32_t flags = dest->flags;
793
794
795	//
796	// Copy the whole UText struct by value.
797	// Any "Extra" storage is copied also.
798	//
799	int sizeToCopy = src->sizeOfStruct;
800	if (sizeToCopy > dest->sizeOfStruct) {
801	sizeToCopy = dest->sizeOfStruct;
802	}
803	uprv_memcpy(dest, src, sizeToCopy);
804	dest->pExtra = destExtra;
805	dest->flags = flags;
806	if (srcExtraSize > `0`) {
807	uprv_memcpy(dest->pExtra, src->pExtra, srcExtraSize);
808	}
809
810	//
811	// Relocate any pointers in the target that refer to the UText itself
812	// to point to the cloned copy rather than the original source.
813	//
814	adjustPointer(dest, &dest->context, src);
815	adjustPointer(dest, &dest->p, src);
816	adjustPointer(dest, &dest->q, src);
817	adjustPointer(dest, &dest->r, src);
818	adjustPointer(dest, (const void **)&dest->chunkContents, src);
819
820	// The newly shallow-cloned UText does _not_ own the underlying storage for the text.
821	// (The source for the clone may or may not have owned the text.)
822
823	dest->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
824
825	return dest;
826	}
827
828
829	U_CDECL_END
830
831
832
833	//------------------------------------------------------------------------------
834	//
835	// UText implementation for UTF-8 char * strings (read-only)
836	// Limitation: string length must be <= 0x7fffffff in length.
837	// (length must fit in an int32_t variable)
838	//
839	// Use of UText data members:
840	// context pointer to UTF-8 string
841	// utext.b is the input string length (bytes).
842	// utext.c Length scanned so far in string
843	// (for optimizing finding length of zero terminated strings.)
844	// utext.p pointer to the current buffer
845	// utext.q pointer to the other buffer.
846	//
847	//------------------------------------------------------------------------------
848
// Chunk size.
//     Must be less than 85 (256/3), because of byte mapping from UChar indexes to native indexes.
//     Worst case is three native bytes to one UChar.  (Supplementaries are 4 native bytes
//     to two UChars.)
//     The longest illegal byte sequence treated as a single error (and converted to U+FFFD)
//     is a three-byte sequence (truncated four-byte sequence).
//
enum { UTF8_TEXT_CHUNK_SIZE=32 };
857
858	//
859	// UTF8Buf Two of these structs will be set up in the UText's extra allocated space.
860	// Each contains the UChar chunk buffer, the to and from native maps, and
861	// header info.
862	//
863	// because backwards iteration fills the buffers starting at the end and
864	// working towards the front, the filled part of the buffers may not begin
865	// at the start of the available storage for the buffers.
866	//
867	// Buffer size is one bigger than the specified UTF8_TEXT_CHUNK_SIZE to allow for
868	// the last character added being a supplementary, and thus requiring a surrogate
869	// pair. Doing this is simpler than checking for the edge case.
870	//
871
872	struct UTF8Buf {
873	int32_t bufNativeStart; // Native index of first char in UChar buf
874	int32_t bufNativeLimit; // Native index following last char in buf.
875	int32_t bufStartIdx; // First filled position in buf.
876	int32_t bufLimitIdx; // Limit of filled range in buf.
877	int32_t bufNILimit; // Limit of native indexing part of buf
878	int32_t toUCharsMapStart; // Native index corresponding to
879	// mapToUChars[0].
880	// Set to bufNativeStart when filling forwards.
881	// Set to computed value when filling backwards.
882
883	UChar buf[UTF8_TEXT_CHUNK_SIZE+`4`]; // The UChar buffer. Requires one extra position beyond the
884	// the chunk size, to allow for surrogate at the end.
885	// Length must be identical to mapToNative array, below,
886	// because of the way indexing works when the array is
887	// filled backwards during a reverse iteration. Thus,
888	// the additional extra size.
889	uint8_t mapToNative[UTF8_TEXT_CHUNK_SIZE+`4`]; // map UChar index in buf to
890	// native offset from bufNativeStart.
891	// Requires two extra slots,
892	// one for a supplementary starting in the last normal position,
893	// and one for an entry for the buffer limit position.
894	uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE`3`+`6`]; // Map native offset from bufNativeStart to*
895	// correspoding offset in filled part of buf.
896	int32_t align;
897	};
898
899	U_CDECL_BEGIN
900
901	//
902	// utf8TextLength
903	//
904	// Get the length of the string. If we don't already know it,
905	// we'll need to scan for the trailing nul.
906	//
907	static int64_t U_CALLCONV
908	utf8TextLength(UText *ut) {
909	if (ut->b < `0`) {
910	// Zero terminated string, and we haven't scanned to the end yet.
911	// Scan it now.
912	const char r = (const* char *)ut->context + ut->c;
913	while (*r != `0`) {
914	r++;
915	}
916	if ((r - (const char *)ut->context) < `0x7fffffff`) {
917	ut->b = (int32_t)(r - (const char *)ut->context);
918	} else {
919	// Actual string was bigger (more than 2 gig) than we
920	// can handle. Clip it to 2 GB.
921	ut->b = `0x7fffffff`;
922	}
923	ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
924	}
925	return ut->b;
926	}
927
928
929
930
931
932
933	static UBool U_CALLCONV
934	utf8TextAccess(UText *ut, int64_t index, UBool forward) {
935	//
936	// Apologies to those who are allergic to goto statements.
937	// Consider each goto to a labelled block to be the equivalent of
938	// call the named block as if it were a function();
939	// return;
940	//
941	const uint8_t s8=(const* uint8_t *)ut->context;
942	UTF8Buf *u8b = NULL;
943	int32_t length = ut->b; // Length of original utf-8
944	int32_t ix= (int32_t)index; // Requested index, trimmed to 32 bits.
945	int32_t mapIndex = `0`;
946	if (index<`0`) {
947	ix=`0`;
948	} else if (index > `0x7fffffff`) {
949	// Strings with 64 bit lengths not supported by this UTF-8 provider.
950	ix = `0x7fffffff`;
951	}
952
953	// Pin requested index to the string length.
954	if (ix>length) {
955	if (length>=`0`) {
956	ix=length;
957	} else if (ix>=ut->c) {
958	// Zero terminated string, and requested index is beyond
959	// the region that has already been scanned.
960	// Scan up to either the end of the string or to the
961	// requested position, whichever comes first.
962	while (ut->c<ix && s8[ut->c]!=`0`) {
963	ut->c++;
964	}
965	// TODO: support for null terminated string length > 32 bits.
966	if (s8[ut->c] == `0`) {
967	// We just found the actual length of the string.
968	// Trim the requested index back to that.
969	ix = ut->c;
970	ut->b = ut->c;
971	length = ut->c;
972	ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
973	}
974	}
975	}
976
977	//
978	// Dispatch to the appropriate action for a forward iteration request.
979	//
980	if (forward) {
981	if (ix==ut->chunkNativeLimit) {
982	// Check for normal sequential iteration cases first.
983	if (ix==length) {
984	// Just reached end of string
985	// Don't swap buffers, but do set the
986	// current buffer position.
987	ut->chunkOffset = ut->chunkLength;
988	return FALSE;
989	} else {
990	// End of current buffer.
991	// check whether other buffer already has what we need.
992	UTF8Buf altB = (UTF8Buf )ut->q;
993	if (ix>=altB->bufNativeStart && ix<altB->bufNativeLimit) {
994	goto swapBuffers;
995	}
996	}
997	}
998
999	// A random access. Desired index could be in either or niether buf.
1000	// For optimizing the order of testing, first check for the index
1001	// being in the other buffer. This will be the case for uses that
1002	// move back and forth over a fairly limited range
1003	{
1004	u8b = (UTF8Buf )ut->q; // the alternate buffer*
1005	if (ix>=u8b->bufNativeStart && ix<u8b->bufNativeLimit) {
1006	// Requested index is in the other buffer.
1007	goto swapBuffers;
1008	}
1009	if (ix == length) {
1010	// Requested index is end-of-string.
1011	// (this is the case of randomly seeking to the end.
1012	// The case of iterating off the end is handled earlier.)
1013	if (ix == ut->chunkNativeLimit) {
1014	// Current buffer extends up to the end of the string.
1015	// Leave it as the current buffer.
1016	ut->chunkOffset = ut->chunkLength;
1017	return FALSE;
1018	}
1019	if (ix == u8b->bufNativeLimit) {
1020	// Alternate buffer extends to the end of string.
1021	// Swap it in as the current buffer.
1022	goto swapBuffersAndFail;
1023	}
1024
1025	// Neither existing buffer extends to the end of the string.
1026	goto makeStubBuffer;
1027	}
1028
1029	if (ix<ut->chunkNativeStart \|\| ix>=ut->chunkNativeLimit) {
1030	// Requested index is in neither buffer.
1031	goto fillForward;
1032	}
1033
1034	// Requested index is in this buffer.
1035	u8b = (UTF8Buf )ut->p; // the current buffer*
1036	mapIndex = ix - u8b->toUCharsMapStart;
1037	U_ASSERT(mapIndex < (int32_t)sizeof(UTF8Buf::mapToUChars));
1038	ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1039	return TRUE;
1040
1041	}
1042	}
1043
1044
1045	//
1046	// Dispatch to the appropriate action for a
1047	// Backwards Diretion iteration request.
1048	//
1049	if (ix==ut->chunkNativeStart) {
1050	// Check for normal sequential iteration cases first.
1051	if (ix==`0`) {
1052	// Just reached the start of string
1053	// Don't swap buffers, but do set the
1054	// current buffer position.
1055	ut->chunkOffset = `0`;
1056	return FALSE;
1057	} else {
1058	// Start of current buffer.
1059	// check whether other buffer already has what we need.
1060	UTF8Buf altB = (UTF8Buf )ut->q;
1061	if (ix>altB->bufNativeStart && ix<=altB->bufNativeLimit) {
1062	goto swapBuffers;
1063	}
1064	}
1065	}
1066
1067	// A random access. Desired index could be in either or niether buf.
1068	// For optimizing the order of testing,
1069	// Most likely case: in the other buffer.
1070	// Second most likely: in neither buffer.
1071	// Unlikely, but must work: in the current buffer.
1072	u8b = (UTF8Buf )ut->q; // the alternate buffer*
1073	if (ix>u8b->bufNativeStart && ix<=u8b->bufNativeLimit) {
1074	// Requested index is in the other buffer.
1075	goto swapBuffers;
1076	}
1077	// Requested index is start-of-string.
1078	// (this is the case of randomly seeking to the start.
1079	// The case of iterating off the start is handled earlier.)
1080	if (ix==`0`) {
1081	if (u8b->bufNativeStart==`0`) {
1082	// Alternate buffer contains the data for the start string.
1083	// Make it be the current buffer.
1084	goto swapBuffersAndFail;
1085	} else {
1086	// Request for data before the start of string,
1087	// neither buffer is usable.
1088	// set up a zero-length buffer.
1089	goto makeStubBuffer;
1090	}
1091	}
1092
1093	if (ix<=ut->chunkNativeStart \|\| ix>ut->chunkNativeLimit) {
1094	// Requested index is in neither buffer.
1095	goto fillReverse;
1096	}
1097
1098	// Requested index is in this buffer.
1099	// Set the utf16 buffer index.
1100	u8b = (UTF8Buf *)ut->p;
1101	mapIndex = ix - u8b->toUCharsMapStart;
1102	ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1103	if (ut->chunkOffset==`0`) {
1104	// This occurs when the first character in the text is
1105	// a multi-byte UTF-8 char, and the requested index is to
1106	// one of the trailing bytes. Because there is no preceding ,
1107	// character, this access fails. We can't pick up on the
1108	// situation sooner because the requested index is not zero.
1109	return FALSE;
1110	} else {
1111	return TRUE;
1112	}
1113
1114
1115
1116	swapBuffers:
1117	// The alternate buffer (ut->q) has the string data that was requested.
1118	// Swap the primary and alternate buffers, and set the
1119	// chunk index into the new primary buffer.
1120	{
1121	u8b = (UTF8Buf *)ut->q;
1122	ut->q = ut->p;
1123	ut->p = u8b;
1124	ut->chunkContents = &u8b->buf[u8b->bufStartIdx];
1125	ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx;
1126	ut->chunkNativeStart = u8b->bufNativeStart;
1127	ut->chunkNativeLimit = u8b->bufNativeLimit;
1128	ut->nativeIndexingLimit = u8b->bufNILimit;
1129
1130	// Index into the (now current) chunk
1131	// Use the map to set the chunk index. It's more trouble than it's worth
1132	// to check whether native indexing can be used.
1133	U_ASSERT(ix>=u8b->bufNativeStart);
1134	U_ASSERT(ix<=u8b->bufNativeLimit);
1135	mapIndex = ix - u8b->toUCharsMapStart;
1136	U_ASSERT(mapIndex>=`0`);
1137	U_ASSERT(mapIndex<(int32_t)sizeof(u8b->mapToUChars));
1138	ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1139
1140	return TRUE;
1141	}
1142
1143
1144	swapBuffersAndFail:
1145	// We got a request for either the start or end of the string,
1146	// with iteration continuing in the out-of-bounds direction.
1147	// The alternate buffer already contains the data up to the
1148	// start/end.
1149	// Swap the buffers, then return failure, indicating that we couldn't
1150	// make things correct for continuing the iteration in the requested
1151	// direction. The position & buffer are correct should the
1152	// user decide to iterate in the opposite direction.
1153	u8b = (UTF8Buf *)ut->q;
1154	ut->q = ut->p;
1155	ut->p = u8b;
1156	ut->chunkContents = &u8b->buf[u8b->bufStartIdx];
1157	ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx;
1158	ut->chunkNativeStart = u8b->bufNativeStart;
1159	ut->chunkNativeLimit = u8b->bufNativeLimit;
1160	ut->nativeIndexingLimit = u8b->bufNILimit;
1161
1162	// Index into the (now current) chunk
1163	// For this function (swapBuffersAndFail), the requested index
1164	// will always be at either the start or end of the chunk.
1165	if (ix==u8b->bufNativeLimit) {
1166	ut->chunkOffset = ut->chunkLength;
1167	} else {
1168	ut->chunkOffset = `0`;
1169	U_ASSERT(ix == u8b->bufNativeStart);
1170	}
1171	return FALSE;
1172
1173	makeStubBuffer:
1174	// The user has done a seek/access past the start or end
1175	// of the string. Rather than loading data that is likely
1176	// to never be used, just set up a zero-length buffer at
1177	// the position.
1178	u8b = (UTF8Buf *)ut->q;
1179	u8b->bufNativeStart = ix;
1180	u8b->bufNativeLimit = ix;
1181	u8b->bufStartIdx = `0`;
1182	u8b->bufLimitIdx = `0`;
1183	u8b->bufNILimit = `0`;
1184	u8b->toUCharsMapStart = ix;
1185	u8b->mapToNative[`0`] = `0`;
1186	u8b->mapToUChars[`0`] = `0`;
1187	goto swapBuffersAndFail;
1188
1189
1190
1191	fillForward:
1192	{
1193	// Move the incoming index to a code point boundary.
1194	U8_SET_CP_START(s8, `0`, ix);
1195
1196	// Swap the UText buffers.
1197	// We want to fill what was previously the alternate buffer,
1198	// and make what was the current buffer be the new alternate.
1199	UTF8Buf u8b_swap = (UTF8Buf )ut->q;
1200	ut->q = ut->p;
1201	ut->p = u8b_swap;
1202
1203	int32_t strLen = ut->b;
1204	UBool nulTerminated = FALSE;
1205	if (strLen < `0`) {
1206	strLen = `0x7fffffff`;
1207	nulTerminated = TRUE;
1208	}
1209
1210	UChar *buf = u8b_swap->buf;
1211	uint8_t *mapToNative = u8b_swap->mapToNative;
1212	uint8_t *mapToUChars = u8b_swap->mapToUChars;
1213	int32_t destIx = `0`;
1214	int32_t srcIx = ix;
1215	UBool seenNonAscii = FALSE;
1216	UChar32 c = `0`;
1217
1218	// Fill the chunk buffer and mapping arrays.
1219	while (destIx<UTF8_TEXT_CHUNK_SIZE) {
1220	c = s8[srcIx];
1221	if (c>`0` && c<`0x80`) {
1222	// Special case ASCII range for speed.
1223	// zero is excluded to simplify bounds checking.
1224	buf[destIx] = (UChar)c;
1225	mapToNative[destIx] = (uint8_t)(srcIx - ix);
1226	mapToUChars[srcIx-ix] = (uint8_t)destIx;
1227	srcIx++;
1228	destIx++;
1229	} else {
1230	// General case, handle everything.
1231	if (seenNonAscii == FALSE) {
1232	seenNonAscii = TRUE;
1233	u8b_swap->bufNILimit = destIx;
1234	}
1235
1236	int32_t cIx = srcIx;
1237	int32_t dIx = destIx;
1238	int32_t dIxSaved = destIx;
1239	U8_NEXT_OR_FFFD(s8, srcIx, strLen, c);
1240	if (c==`0` && nulTerminated) {
1241	srcIx--;
1242	break;
1243	}
1244
1245	U16_APPEND_UNSAFE(buf, destIx, c);
1246	do {
1247	mapToNative[dIx++] = (uint8_t)(cIx - ix);
1248	} while (dIx < destIx);
1249
1250	do {
1251	mapToUChars[cIx++ - ix] = (uint8_t)dIxSaved;
1252	} while (cIx < srcIx);
1253	}
1254	if (srcIx>=strLen) {
1255	break;
1256	}
1257
1258	}
1259
1260	// store Native <--> Chunk Map entries for the end of the buffer.
1261	// There is no actual character here, but the index position is valid.
1262	mapToNative[destIx] = (uint8_t)(srcIx - ix);
1263	mapToUChars[srcIx - ix] = (uint8_t)destIx;
1264
1265	// fill in Buffer descriptor
1266	u8b_swap->bufNativeStart = ix;
1267	u8b_swap->bufNativeLimit = srcIx;
1268	u8b_swap->bufStartIdx = `0`;
1269	u8b_swap->bufLimitIdx = destIx;
1270	if (seenNonAscii == FALSE) {
1271	u8b_swap->bufNILimit = destIx;
1272	}
1273	u8b_swap->toUCharsMapStart = u8b_swap->bufNativeStart;
1274
1275	// Set UText chunk to refer to this buffer.
1276	ut->chunkContents = buf;
1277	ut->chunkOffset = `0`;
1278	ut->chunkLength = u8b_swap->bufLimitIdx;
1279	ut->chunkNativeStart = u8b_swap->bufNativeStart;
1280	ut->chunkNativeLimit = u8b_swap->bufNativeLimit;
1281	ut->nativeIndexingLimit = u8b_swap->bufNILimit;
1282
1283	// For zero terminated strings, keep track of the maximum point
1284	// scanned so far.
1285	if (nulTerminated && srcIx>ut->c) {
1286	ut->c = srcIx;
1287	if (c==`0`) {
1288	// We scanned to the end.
1289	// Remember the actual length.
1290	ut->b = srcIx;
1291	ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
1292	}
1293	}
1294	return TRUE;
1295	}
1296
1297
1298	fillReverse:
1299	{
1300	// Move the incoming index to a code point boundary.
1301	// Can only do this if the incoming index is somewhere in the interior of the string.
1302	// If index is at the end, there is no character there to look at.
1303	if (ix != ut->b) {
1304	// Note: this function will only move the index back if it is on a trail byte
1305	// and there is a preceding lead byte and the sequence from the lead
1306	// through this trail could be part of a valid UTF-8 sequence
1307	// Otherwise the index remains unchanged.
1308	U8_SET_CP_START(s8, `0`, ix);
1309	}
1310
1311	// Swap the UText buffers.
1312	// We want to fill what was previously the alternate buffer,
1313	// and make what was the current buffer be the new alternate.
1314	UTF8Buf u8b_swap = (UTF8Buf )ut->q;
1315	ut->q = ut->p;
1316	ut->p = u8b_swap;
1317
1318	UChar *buf = u8b_swap->buf;
1319	uint8_t *mapToNative = u8b_swap->mapToNative;
1320	uint8_t *mapToUChars = u8b_swap->mapToUChars;
1321	int32_t toUCharsMapStart = ix - sizeof(UTF8Buf::mapToUChars) + `1`;
1322	// Note that toUCharsMapStart can be negative. Happens when the remaining
1323	// text from current position to the beginning is less than the buffer size.
1324	// + 1 because mapToUChars must have a slot at the end for the bufNativeLimit entry.
1325	int32_t destIx = UTF8_TEXT_CHUNK_SIZE+`2`; // Start in the overflow region
1326	// at end of buffer to leave room
1327	// for a surrogate pair at the
1328	// buffer start.
1329	int32_t srcIx = ix;
1330	int32_t bufNILimit = destIx;
1331	UChar32 c;
1332
1333	// Map to/from Native Indexes, fill in for the position at the end of
1334	// the buffer.
1335	//
1336	mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
1337	mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
1338
1339	// Fill the chunk buffer
1340	// Work backwards, filling from the end of the buffer towards the front.
1341	//
1342	while (destIx>`2` && (srcIx - toUCharsMapStart > `5`) && (srcIx > `0`)) {
1343	srcIx--;
1344	destIx--;
1345
1346	// Get last byte of the UTF-8 character
1347	c = s8[srcIx];
1348	if (c<`0x80`) {
1349	// Special case ASCII range for speed.
1350	buf[destIx] = (UChar)c;
1351	U_ASSERT(toUCharsMapStart <= srcIx);
1352	mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
1353	mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
1354	} else {
1355	// General case, handle everything non-ASCII.
1356
1357	int32_t sIx = srcIx; // ix of last byte of multi-byte u8 char
1358
1359	// Get the full character from the UTF8 string.
1360	// use code derived from tbe macros in utf8.h
1361	// Leaves srcIx pointing at the first byte of the UTF-8 char.
1362	//
1363	c=utf8_prevCharSafeBody(s8, `0`, &srcIx, c, -`3`);
1364	// leaves srcIx at first byte of the multi-byte char.
1365
1366	// Store the character in UTF-16 buffer.
1367	if (c<`0x10000`) {
1368	buf[destIx] = (UChar)c;
1369	mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
1370	} else {
1371	buf[destIx] = U16_TRAIL(c);
1372	mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
1373	buf[--destIx] = U16_LEAD(c);
1374	mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
1375	}
1376
1377	// Fill in the map from native indexes to UChars buf index.
1378	do {
1379	mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx;
1380	} while (sIx >= srcIx);
1381	U_ASSERT(toUCharsMapStart <= (srcIx+`1`));
1382
1383	// Set native indexing limit to be the current position.
1384	// We are processing a non-ascii, non-native-indexing char now;
1385	// the limit will be here if the rest of the chars to be
1386	// added to this buffer are ascii.
1387	bufNILimit = destIx;
1388	}
1389	}
1390	u8b_swap->bufNativeStart = srcIx;
1391	u8b_swap->bufNativeLimit = ix;
1392	u8b_swap->bufStartIdx = destIx;
1393	u8b_swap->bufLimitIdx = UTF8_TEXT_CHUNK_SIZE+`2`;
1394	u8b_swap->bufNILimit = bufNILimit - u8b_swap->bufStartIdx;
1395	u8b_swap->toUCharsMapStart = toUCharsMapStart;
1396
1397	ut->chunkContents = &buf[u8b_swap->bufStartIdx];
1398	ut->chunkLength = u8b_swap->bufLimitIdx - u8b_swap->bufStartIdx;
1399	ut->chunkOffset = ut->chunkLength;
1400	ut->chunkNativeStart = u8b_swap->bufNativeStart;
1401	ut->chunkNativeLimit = u8b_swap->bufNativeLimit;
1402	ut->nativeIndexingLimit = u8b_swap->bufNILimit;
1403	return TRUE;
1404	}
1405
1406	}
1407
1408
1409
1410	//
1411	// This is a slightly modified copy of u_strFromUTF8,
1412	// Inserts a Replacement Char rather than failing on invalid UTF-8
1413	// Removes unnecessary features.
1414	//
1415	static UChar*
1416	utext_strFromUTF8(UChar *dest,
1417	int32_t destCapacity,
1418	int32_t *pDestLength,
1419	const char* src,
1420	int32_t srcLength, // required. NUL terminated not supported.
1421	UErrorCode *pErrorCode
1422	)
1423	{
1424
1425	UChar *pDest = dest;
1426	UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
1427	UChar32 ch=`0`;
1428	int32_t index = `0`;
1429	int32_t reqLength = `0`;
1430	uint8_t* pSrc = (uint8_t*) src;
1431
1432
1433	while((index < srcLength)&&(pDest<pDestLimit)){
1434	ch = pSrc[index++];
1435	if(ch <=`0x7f`){
1436	*pDest++=(UChar)ch;
1437	}else{
1438	ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -`3`);
1439	if(U_IS_BMP(ch)){
1440	*(pDest++)=(UChar)ch;
1441	}else{
1442	*(pDest++)=U16_LEAD(ch);
1443	if(pDest<pDestLimit){
1444	*(pDest++)=U16_TRAIL(ch);
1445	}else{
1446	reqLength++;
1447	break;
1448	}
1449	}
1450	}
1451	}
1452	/ donot fill the dest buffer just count the UChars needed /
1453	while(index < srcLength){
1454	ch = pSrc[index++];
1455	if(ch <= `0x7f`){
1456	reqLength++;
1457	}else{
1458	ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -`3`);
1459	reqLength+=U16_LENGTH(ch);
1460	}
1461	}
1462
1463	reqLength+=(int32_t)(pDest - dest);
1464
1465	if(pDestLength){
1466	*pDestLength = reqLength;
1467	}
1468
1469	/ Terminate the buffer /
1470	u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
1471
1472	return dest;
1473	}
1474
1475
1476
1477	static int32_t U_CALLCONV
1478	utf8TextExtract(UText *ut,
1479	int64_t start, int64_t limit,
1480	UChar *dest, int32_t destCapacity,
1481	UErrorCode *pErrorCode) {
1482	if(U_FAILURE(*pErrorCode)) {
1483	return `0`;
1484	}
1485	if(destCapacity<`0` \|\| (dest==NULL && destCapacity>`0`)) {
1486	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1487	return `0`;
1488	}
1489	int32_t length = ut->b;
1490	int32_t start32 = pinIndex(start, length);
1491	int32_t limit32 = pinIndex(limit, length);
1492
1493	if(start32>limit32) {
1494	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1495	return `0`;
1496	}
1497
1498
1499	// adjust the incoming indexes to land on code point boundaries if needed.
1500	// adjust by no more than three, because that is the largest number of trail bytes
1501	// in a well formed UTF8 character.
1502	const uint8_t buf = (const* uint8_t *)ut->context;
1503	int i;
1504	if (start32 < ut->chunkNativeLimit) {
1505	for (i=`0`; i<`3`; i++) {
1506	if (U8_IS_SINGLE(buf[start32]) \|\| U8_IS_LEAD(buf[start32]) \|\| start32==`0`) {
1507	break;
1508	}
1509	start32--;
1510	}
1511	}
1512
1513	if (limit32 < ut->chunkNativeLimit) {
1514	for (i=`0`; i<`3`; i++) {
1515	if (U8_IS_SINGLE(buf[limit32]) \|\| U8_IS_LEAD(buf[limit32]) \|\| limit32==`0`) {
1516	break;
1517	}
1518	limit32--;
1519	}
1520	}
1521
1522	// Do the actual extract.
1523	int32_t destLength=`0`;
1524	utext_strFromUTF8(dest, destCapacity, &destLength,
1525	(const char *)ut->context+start32, limit32-start32,
1526	pErrorCode);
1527	utf8TextAccess(ut, limit32, TRUE);
1528	return destLength;
1529	}
1530
1531	//
1532	// utf8TextMapOffsetToNative
1533	//
1534	// Map a chunk (UTF-16) offset to a native index.
1535	static int64_t U_CALLCONV
1536	utf8TextMapOffsetToNative(const UText *ut) {
1537	//
1538	UTF8Buf u8b = (UTF8Buf )ut->p;
1539	U_ASSERT(ut->chunkOffset>ut->nativeIndexingLimit && ut->chunkOffset<=ut->chunkLength);
1540	int32_t nativeOffset = u8b->mapToNative[ut->chunkOffset + u8b->bufStartIdx] + u8b->toUCharsMapStart;
1541	U_ASSERT(nativeOffset >= ut->chunkNativeStart && nativeOffset <= ut->chunkNativeLimit);
1542	return nativeOffset;
1543	}
1544
1545	//
1546	// Map a native index to the corresponding chunk offset
1547	//
1548	static int32_t U_CALLCONV
1549	utf8TextMapIndexToUTF16(const UText *ut, int64_t index64) {
1550	U_ASSERT(index64 <= `0x7fffffff`);
1551	int32_t index = (int32_t)index64;
1552	UTF8Buf u8b = (UTF8Buf )ut->p;
1553	U_ASSERT(index>=ut->chunkNativeStart+ut->nativeIndexingLimit);
1554	U_ASSERT(index<=ut->chunkNativeLimit);
1555	int32_t mapIndex = index - u8b->toUCharsMapStart;
1556	U_ASSERT(mapIndex < (int32_t)sizeof(UTF8Buf::mapToUChars));
1557	int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1558	U_ASSERT(offset>=`0` && offset<=ut->chunkLength);
1559	return offset;
1560	}
1561
1562	static UText * U_CALLCONV
1563	utf8TextClone(UText dest, const* UText src, UBool deep, UErrorCode status)
1564	{
1565	// First do a generic shallow clone. Does everything needed for the UText struct itself.
1566	dest = shallowTextClone(dest, src, status);
1567
1568	// For deep clones, make a copy of the string.
1569	// The copied storage is owned by the newly created clone.
1570	//
1571	// TODO: There is an isssue with using utext_nativeLength().
1572	// That function is non-const in cases where the input was NUL terminated
1573	// and the length has not yet been determined.
1574	// This function (clone()) is const.
1575	// There potentially a thread safety issue lurking here.
1576	//
1577	if (deep && U_SUCCESS(*status)) {
1578	int32_t len = (int32_t)utext_nativeLength((UText *)src);
1579	char copyStr = (char* *)uprv_malloc(len+`1`);
1580	if (copyStr == NULL) {
1581	*status = U_MEMORY_ALLOCATION_ERROR;
1582	} else {
1583	uprv_memcpy(copyStr, src->context, len+`1`);
1584	dest->context = copyStr;
1585	dest->providerProperties \|= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
1586	}
1587	}
1588	return dest;
1589	}
1590
1591
1592	static void U_CALLCONV
1593	utf8TextClose(UText *ut) {
1594	// Most of the work of close is done by the generic UText framework close.
1595	// All that needs to be done here is to delete the UTF8 string if the UText
1596	// owns it. This occurs if the UText was created by cloning.
1597	if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
1598	char s = (char* *)ut->context;
1599	uprv_free(s);
1600	ut->context = NULL;
1601	}
1602	}
1603
1604	U_CDECL_END
1605
1606
1607	static const struct UTextFuncs utf8Funcs =
1608	{
1609	sizeof(UTextFuncs),
1610	`0`, `0`, `0`, // Reserved alignment padding
1611	utf8TextClone,
1612	utf8TextLength,
1613	utf8TextAccess,
1614	utf8TextExtract,
1615	NULL, / replace/
1616	NULL, / copy /
1617	utf8TextMapOffsetToNative,
1618	utf8TextMapIndexToUTF16,
1619	utf8TextClose,
1620	NULL, // spare 1
1621	NULL, // spare 2
1622	NULL // spare 3
1623	};
1624
1625
1626	static const char gEmptyString[] = {`0`};
1627
1628	U_CAPI UText * U_EXPORT2
1629	utext_openUTF8(UText ut, const* char s, int64_t length, UErrorCode status) {
1630	if(U_FAILURE(*status)) {
1631	return NULL;
1632	}
1633	if(s==NULL && length==`0`) {
1634	s = gEmptyString;
1635	}
1636
1637	if(s==NULL \|\| length<-`1` \|\| length>INT32_MAX) {
1638	*status=U_ILLEGAL_ARGUMENT_ERROR;
1639	return NULL;
1640	}
1641
1642	ut = utext_setup(ut, sizeof(UTF8Buf) * `2`, status);
1643	if (U_FAILURE(*status)) {
1644	return ut;
1645	}
1646
1647	ut->pFuncs = &utf8Funcs;
1648	ut->context = s;
1649	ut->b = (int32_t)length;
1650	ut->c = (int32_t)length;
1651	if (ut->c < `0`) {
1652	ut->c = `0`;
1653	ut->providerProperties \|= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
1654	}
1655	ut->p = ut->pExtra;
1656	ut->q = (char )ut->pExtra + sizeof*(UTF8Buf);
1657	return ut;
1658
1659	}
1660
1661
1662
1663
1664
1665
1666
1667
1668	//------------------------------------------------------------------------------
1669	//
1670	// UText implementation wrapper for Replaceable (read/write)
1671	//
1672	// Use of UText data members:
1673	// context pointer to Replaceable.
1674	// p pointer to Replaceable if it is owned by the UText.
1675	//
1676	//------------------------------------------------------------------------------
1677
1678
1679
1680	// minimum chunk size for this implementation: 3
1681	// to allow for possible trimming for code point boundaries
1682	enum { REP_TEXT_CHUNK_SIZE=`10` };
1683
1684	struct ReplExtra {
1685	/*
1686	* Chunk UChars.
1687	* +1 to simplify filling with surrogate pair at the end.
1688	*/
1689	UChar s[REP_TEXT_CHUNK_SIZE+`1`];
1690	};
1691
1692
1693	U_CDECL_BEGIN
1694
1695	static UText * U_CALLCONV
1696	repTextClone(UText dest, const* UText src, UBool deep, UErrorCode status) {
1697	// First do a generic shallow clone. Does everything needed for the UText struct itself.
1698	dest = shallowTextClone(dest, src, status);
1699
1700	// For deep clones, make a copy of the Replaceable.
1701	// The copied Replaceable storage is owned by the newly created UText clone.
1702	// A non-NULL pointer in UText.p is the signal to the close() function to delete
1703	// it.
1704	//
1705	if (deep && U_SUCCESS(*status)) {
1706	const Replaceable replSrc = (const* Replaceable *)src->context;
1707	dest->context = replSrc->clone();
1708	dest->providerProperties \|= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
1709
1710	// with deep clone, the copy is writable, even when the source is not.
1711	dest->providerProperties \|= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
1712	}
1713	return dest;
1714	}
1715
1716
1717	static void U_CALLCONV
1718	repTextClose(UText *ut) {
1719	// Most of the work of close is done by the generic UText framework close.
1720	// All that needs to be done here is delete the Replaceable if the UText
1721	// owns it. This occurs if the UText was created by cloning.
1722	if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
1723	Replaceable rep = (Replaceable )ut->context;
1724	delete rep;
1725	ut->context = NULL;
1726	}
1727	}
1728
1729
1730	static int64_t U_CALLCONV
1731	repTextLength(UText *ut) {
1732	const Replaceable replSrc = (const* Replaceable *)ut->context;
1733	int32_t len = replSrc->length();
1734	return len;
1735	}
1736
1737
1738	static UBool U_CALLCONV
1739	repTextAccess(UText *ut, int64_t index, UBool forward) {
1740	const Replaceable rep=(const* Replaceable *)ut->context;
1741	int32_t length=rep->length(); // Full length of the input text (bigger than a chunk)
1742
1743	// clip the requested index to the limits of the text.
1744	int32_t index32 = pinIndex(index, length);
1745	U_ASSERT(index<=INT32_MAX);
1746
1747
1748	/*
1749	* Compute start/limit boundaries around index, for a segment of text
1750	* to be extracted.
1751	* To allow for the possibility that our user gave an index to the trailing
1752	* half of a surrogate pair, we must request one extra preceding UChar when
1753	* going in the forward direction. This will ensure that the buffer has the
1754	* entire code point at the specified index.
1755	*/
1756	if(forward) {
1757
1758	if (index32>=ut->chunkNativeStart && index32<ut->chunkNativeLimit) {
1759	// Buffer already contains the requested position.
1760	ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
1761	return TRUE;
1762	}
1763	if (index32>=length && ut->chunkNativeLimit==length) {
1764	// Request for end of string, and buffer already extends up to it.
1765	// Can't get the data, but don't change the buffer.
1766	ut->chunkOffset = length - (int32_t)ut->chunkNativeStart;
1767	return FALSE;
1768	}
1769
1770	ut->chunkNativeLimit = index + REP_TEXT_CHUNK_SIZE - `1`;
1771	// Going forward, so we want to have the buffer with stuff at and beyond
1772	// the requested index. The -1 gets us one code point before the
1773	// requested index also, to handle the case of the index being on
1774	// a trail surrogate of a surrogate pair.
1775	if(ut->chunkNativeLimit > length) {
1776	ut->chunkNativeLimit = length;
1777	}
1778	// unless buffer ran off end, start is index-1.
1779	ut->chunkNativeStart = ut->chunkNativeLimit - REP_TEXT_CHUNK_SIZE;
1780	if(ut->chunkNativeStart < `0`) {
1781	ut->chunkNativeStart = `0`;
1782	}
1783	} else {
1784	// Reverse iteration. Fill buffer with data preceding the requested index.
1785	if (index32>ut->chunkNativeStart && index32<=ut->chunkNativeLimit) {
1786	// Requested position already in buffer.
1787	ut->chunkOffset = index32 - (int32_t)ut->chunkNativeStart;
1788	return TRUE;
1789	}
1790	if (index32==`0` && ut->chunkNativeStart==`0`) {
1791	// Request for start, buffer already begins at start.
1792	// No data, but keep the buffer as is.
1793	ut->chunkOffset = `0`;
1794	return FALSE;
1795	}
1796
1797	// Figure out the bounds of the chunk to extract for reverse iteration.
1798	// Need to worry about chunk not splitting surrogate pairs, and while still
1799	// containing the data we need.
1800	// Fix by requesting a chunk that includes an extra UChar at the end.
1801	// If this turns out to be a lead surrogate, we can lop it off and still have
1802	// the data we wanted.
1803	ut->chunkNativeStart = index32 + `1` - REP_TEXT_CHUNK_SIZE;
1804	if (ut->chunkNativeStart < `0`) {
1805	ut->chunkNativeStart = `0`;
1806	}
1807
1808	ut->chunkNativeLimit = index32 + `1`;
1809	if (ut->chunkNativeLimit > length) {
1810	ut->chunkNativeLimit = length;
1811	}
1812	}
1813
1814	// Extract the new chunk of text from the Replaceable source.
1815	ReplExtra ex = (ReplExtra )ut->pExtra;
1816	// UnicodeString with its buffer a writable alias to the chunk buffer
1817	UnicodeString buffer(ex->s, `0` /buffer length/, REP_TEXT_CHUNK_SIZE /buffer capacity/);
1818	rep->extractBetween((int32_t)ut->chunkNativeStart, (int32_t)ut->chunkNativeLimit, buffer);
1819
1820	ut->chunkContents = ex->s;
1821	ut->chunkLength = (int32_t)(ut->chunkNativeLimit - ut->chunkNativeStart);
1822	ut->chunkOffset = (int32_t)(index32 - ut->chunkNativeStart);
1823
1824	// Surrogate pairs from the input text must not span chunk boundaries.
1825	// If end of chunk could be the start of a surrogate, trim it off.
1826	if (ut->chunkNativeLimit < length &&
1827	U16_IS_LEAD(ex->s[ut->chunkLength-`1`])) {
1828	ut->chunkLength--;
1829	ut->chunkNativeLimit--;
1830	if (ut->chunkOffset > ut->chunkLength) {
1831	ut->chunkOffset = ut->chunkLength;
1832	}
1833	}
1834
1835	// if the first UChar in the chunk could be the trailing half of a surrogate pair,
1836	// trim it off.
1837	if(ut->chunkNativeStart>`0` && U16_IS_TRAIL(ex->s[`0`])) {
1838	++(ut->chunkContents);
1839	++(ut->chunkNativeStart);
1840	--(ut->chunkLength);
1841	--(ut->chunkOffset);
1842	}
1843
1844	// adjust the index/chunkOffset to a code point boundary
1845	U16_SET_CP_START(ut->chunkContents, `0`, ut->chunkOffset);
1846
1847	// Use fast indexing for get/setNativeIndex()
1848	ut->nativeIndexingLimit = ut->chunkLength;
1849
1850	return TRUE;
1851	}
1852
1853
1854
1855	static int32_t U_CALLCONV
1856	repTextExtract(UText *ut,
1857	int64_t start, int64_t limit,
1858	UChar *dest, int32_t destCapacity,
1859	UErrorCode *status) {
1860	const Replaceable rep=(const* Replaceable *)ut->context;
1861	int32_t length=rep->length();
1862
1863	if(U_FAILURE(*status)) {
1864	return `0`;
1865	}
1866	if(destCapacity<`0` \|\| (dest==NULL && destCapacity>`0`)) {
1867	*status=U_ILLEGAL_ARGUMENT_ERROR;
1868	}
1869	if(start>limit) {
1870	*status=U_INDEX_OUTOFBOUNDS_ERROR;
1871	return `0`;
1872	}
1873
1874	int32_t start32 = pinIndex(start, length);
1875	int32_t limit32 = pinIndex(limit, length);
1876
1877	// adjust start, limit if they point to trail half of surrogates
1878	if (start32<length && U16_IS_TRAIL(rep->charAt(start32)) &&
1879	U_IS_SUPPLEMENTARY(rep->char32At(start32))){
1880	start32--;
1881	}
1882	if (limit32<length && U16_IS_TRAIL(rep->charAt(limit32)) &&
1883	U_IS_SUPPLEMENTARY(rep->char32At(limit32))){
1884	limit32--;
1885	}
1886
1887	length=limit32-start32;
1888	if(length>destCapacity) {
1889	limit32 = start32 + destCapacity;
1890	}
1891	UnicodeString buffer(dest, `0`, destCapacity); // writable alias
1892	rep->extractBetween(start32, limit32, buffer);
1893	repTextAccess(ut, limit32, TRUE);
1894
1895	return u_terminateUChars(dest, destCapacity, length, status);
1896	}
1897
1898	static int32_t U_CALLCONV
1899	repTextReplace(UText *ut,
1900	int64_t start, int64_t limit,
1901	const UChar *src, int32_t length,
1902	UErrorCode *status) {
1903	Replaceable rep=(Replaceable )ut->context;
1904	int32_t oldLength;
1905
1906	if(U_FAILURE(*status)) {
1907	return `0`;
1908	}
1909	if(src==NULL && length!=`0`) {
1910	*status=U_ILLEGAL_ARGUMENT_ERROR;
1911	return `0`;
1912	}
1913	oldLength=rep->length(); // will subtract from new length
1914	if(start>limit ) {
1915	*status=U_INDEX_OUTOFBOUNDS_ERROR;
1916	return `0`;
1917	}
1918
1919	int32_t start32 = pinIndex(start, oldLength);
1920	int32_t limit32 = pinIndex(limit, oldLength);
1921
1922	// Snap start & limit to code point boundaries.
1923	if (start32<oldLength && U16_IS_TRAIL(rep->charAt(start32)) &&
1924	start32>`0` && U16_IS_LEAD(rep->charAt(start32-`1`)))
1925	{
1926	start32--;
1927	}
1928	if (limit32<oldLength && U16_IS_LEAD(rep->charAt(limit32-`1`)) &&
1929	U16_IS_TRAIL(rep->charAt(limit32)))
1930	{
1931	limit32++;
1932	}
1933
1934	// Do the actual replace operation using methods of the Replaceable class
1935	UnicodeString replStr((UBool)(length<`0`), src, length); // read-only alias
1936	rep->handleReplaceBetween(start32, limit32, replStr);
1937	int32_t newLength = rep->length();
1938	int32_t lengthDelta = newLength - oldLength;
1939
1940	// Is the UText chunk buffer OK?
1941	if (ut->chunkNativeLimit > start32) {
1942	// this replace operation may have impacted the current chunk.
1943	// invalidate it, which will force a reload on the next access.
1944	invalidateChunk(ut);
1945	}
1946
1947	// set the iteration position to the end of the newly inserted replacement text.
1948	int32_t newIndexPos = limit32 + lengthDelta;
1949	repTextAccess(ut, newIndexPos, TRUE);
1950
1951	return lengthDelta;
1952	}
1953
1954
1955	static void U_CALLCONV
1956	repTextCopy(UText *ut,
1957	int64_t start, int64_t limit,
1958	int64_t destIndex,
1959	UBool move,
1960	UErrorCode *status)
1961	{
1962	Replaceable rep=(Replaceable )ut->context;
1963	int32_t length=rep->length();
1964
1965	if(U_FAILURE(*status)) {
1966	return;
1967	}
1968	if (start>limit \|\| (start<destIndex && destIndex<limit))
1969	{
1970	*status=U_INDEX_OUTOFBOUNDS_ERROR;
1971	return;
1972	}
1973
1974	int32_t start32 = pinIndex(start, length);
1975	int32_t limit32 = pinIndex(limit, length);
1976	int32_t destIndex32 = pinIndex(destIndex, length);
1977
1978	// TODO: snap input parameters to code point boundaries.
1979
1980	if(move) {
1981	// move: copy to destIndex, then replace original with nothing
1982	int32_t segLength=limit32-start32;
1983	rep->copy(start32, limit32, destIndex32);
1984	if(destIndex32<start32) {
1985	start32+=segLength;
1986	limit32+=segLength;
1987	}
1988	rep->handleReplaceBetween(start32, limit32, UnicodeString ());
1989	} else {
1990	// copy
1991	rep->copy(start32, limit32, destIndex32);
1992	}
1993
1994	// If the change to the text touched the region in the chunk buffer,
1995	// invalidate the buffer.
1996	int32_t firstAffectedIndex = destIndex32;
1997	if (move && start32<firstAffectedIndex) {
1998	firstAffectedIndex = start32;
1999	}
2000	if (firstAffectedIndex < ut->chunkNativeLimit) {
2001	// changes may have affected range covered by the chunk
2002	invalidateChunk(ut);
2003	}
2004
2005	// Put iteration position at the newly inserted (moved) block,
2006	int32_t nativeIterIndex = destIndex32 + limit32 - start32;
2007	if (move && destIndex32>start32) {
2008	// moved a block of text towards the end of the string.
2009	nativeIterIndex = destIndex32;
2010	}
2011
2012	// Set position, reload chunk if needed.
2013	repTextAccess(ut, nativeIterIndex, TRUE);
2014	}
2015
2016	static const struct UTextFuncs repFuncs =
2017	{
2018	sizeof(UTextFuncs),
2019	`0`, `0`, `0`, // Reserved alignment padding
2020	repTextClone,
2021	repTextLength,
2022	repTextAccess,
2023	repTextExtract,
2024	repTextReplace,
2025	repTextCopy,
2026	NULL, // MapOffsetToNative,
2027	NULL, // MapIndexToUTF16,
2028	repTextClose,
2029	NULL, // spare 1
2030	NULL, // spare 2
2031	NULL // spare 3
2032	};
2033
2034
2035	U_CAPI UText * U_EXPORT2
2036	utext_openReplaceable(UText ut, Replaceable rep, UErrorCode *status)
2037	{
2038	if(U_FAILURE(*status)) {
2039	return NULL;
2040	}
2041	if(rep==NULL) {
2042	*status=U_ILLEGAL_ARGUMENT_ERROR;
2043	return NULL;
2044	}
2045	ut = utext_setup(ut, sizeof(ReplExtra), status);
2046	if(U_FAILURE(*status)) {
2047	return ut;
2048	}
2049
2050	ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2051	if(rep->hasMetaData()) {
2052	ut->providerProperties \|=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA);
2053	}
2054
2055	ut->pFuncs = &repFuncs;
2056	ut->context = rep;
2057	return ut;
2058	}
2059
2060	U_CDECL_END
2061
2062
2063
2064
2065
2066
2067
2068
2069	//------------------------------------------------------------------------------
2070	//
2071	// UText implementation for UnicodeString (read/write) and
2072	// for const UnicodeString (read only)
2073	// (same implementation, only the flags are different)
2074	//
2075	// Use of UText data members:
2076	// context pointer to UnicodeString
2077	// p pointer to UnicodeString IF this UText owns the string
2078	// and it must be deleted on close(). NULL otherwise.
2079	//
2080	//------------------------------------------------------------------------------
2081
2082	U_CDECL_BEGIN
2083
2084
2085	static UText * U_CALLCONV
2086	unistrTextClone(UText dest, const* UText src, UBool deep, UErrorCode status) {
2087	// First do a generic shallow clone. Does everything needed for the UText struct itself.
2088	dest = shallowTextClone(dest, src, status);
2089
2090	// For deep clones, make a copy of the UnicodeSring.
2091	// The copied UnicodeString storage is owned by the newly created UText clone.
2092	// A non-NULL pointer in UText.p is the signal to the close() function to delete
2093	// the UText.
2094	//
2095	if (deep && U_SUCCESS(*status)) {
2096	const UnicodeString srcString = (const* UnicodeString *)src->context;
2097	dest->context = new UnicodeString (*srcString);
2098	dest->providerProperties \|= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
2099
2100	// with deep clone, the copy is writable, even when the source is not.
2101	dest->providerProperties \|= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2102	}
2103	return dest;
2104	}
2105
2106	static void U_CALLCONV
2107	unistrTextClose(UText *ut) {
2108	// Most of the work of close is done by the generic UText framework close.
2109	// All that needs to be done here is delete the UnicodeString if the UText
2110	// owns it. This occurs if the UText was created by cloning.
2111	if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
2112	UnicodeString str = (UnicodeString )ut->context;
2113	delete str;
2114	ut->context = NULL;
2115	}
2116	}
2117
2118
2119	static int64_t U_CALLCONV
2120	unistrTextLength(UText *t) {
2121	return ((const UnicodeString *)t->context)->length();
2122	}
2123
2124
2125	static UBool U_CALLCONV
2126	unistrTextAccess(UText *ut, int64_t index, UBool forward) {
2127	int32_t length = ut->chunkLength;
2128	ut->chunkOffset = pinIndex(index, length);
2129
2130	// Check whether request is at the start or end
2131	UBool retVal = (forward && index<length) \|\| (!forward && index>`0`);
2132	return retVal;
2133	}
2134
2135
2136
2137	static int32_t U_CALLCONV
2138	unistrTextExtract(UText *t,
2139	int64_t start, int64_t limit,
2140	UChar *dest, int32_t destCapacity,
2141	UErrorCode *pErrorCode) {
2142	const UnicodeString us=(const* UnicodeString *)t->context;
2143	int32_t length=us->length();
2144
2145	if(U_FAILURE(*pErrorCode)) {
2146	return `0`;
2147	}
2148	if(destCapacity<`0` \|\| (dest==NULL && destCapacity>`0`)) {
2149	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2150	}
2151	if(start<`0` \|\| start>limit) {
2152	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2153	return `0`;
2154	}
2155
2156	int32_t start32 = start<length ? us->getChar32Start((int32_t)start) : length;
2157	int32_t limit32 = limit<length ? us->getChar32Start((int32_t)limit) : length;
2158
2159	length=limit32-start32;
2160	if (destCapacity>`0` && dest!=NULL) {
2161	int32_t trimmedLength = length;
2162	if(trimmedLength>destCapacity) {
2163	trimmedLength=destCapacity;
2164	}
2165	us->extract(start32, trimmedLength, dest);
2166	t->chunkOffset = start32+trimmedLength;
2167	} else {
2168	t->chunkOffset = start32;
2169	}
2170	u_terminateUChars(dest, destCapacity, length, pErrorCode);
2171	return length;
2172	}
2173
2174	static int32_t U_CALLCONV
2175	unistrTextReplace(UText *ut,
2176	int64_t start, int64_t limit,
2177	const UChar *src, int32_t length,
2178	UErrorCode *pErrorCode) {
2179	UnicodeString us=(UnicodeString )ut->context;
2180	int32_t oldLength;
2181
2182	if(U_FAILURE(*pErrorCode)) {
2183	return `0`;
2184	}
2185	if(src==NULL && length!=`0`) {
2186	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2187	}
2188	if(start>limit) {
2189	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2190	return `0`;
2191	}
2192	oldLength=us->length();
2193	int32_t start32 = pinIndex(start, oldLength);
2194	int32_t limit32 = pinIndex(limit, oldLength);
2195	if (start32 < oldLength) {
2196	start32 = us->getChar32Start(start32);
2197	}
2198	if (limit32 < oldLength) {
2199	limit32 = us->getChar32Start(limit32);
2200	}
2201
2202	// replace
2203	us->replace(start32, limit32-start32, src, length);
2204	int32_t newLength = us->length();
2205
2206	// Update the chunk description.
2207	ut->chunkContents = us->getBuffer();
2208	ut->chunkLength = newLength;
2209	ut->chunkNativeLimit = newLength;
2210	ut->nativeIndexingLimit = newLength;
2211
2212	// Set iteration position to the point just following the newly inserted text.
2213	int32_t lengthDelta = newLength - oldLength;
2214	ut->chunkOffset = limit32 + lengthDelta;
2215
2216	return lengthDelta;
2217	}
2218
2219	static void U_CALLCONV
2220	unistrTextCopy(UText *ut,
2221	int64_t start, int64_t limit,
2222	int64_t destIndex,
2223	UBool move,
2224	UErrorCode *pErrorCode) {
2225	UnicodeString us=(UnicodeString )ut->context;
2226	int32_t length=us->length();
2227
2228	if(U_FAILURE(*pErrorCode)) {
2229	return;
2230	}
2231	int32_t start32 = pinIndex(start, length);
2232	int32_t limit32 = pinIndex(limit, length);
2233	int32_t destIndex32 = pinIndex(destIndex, length);
2234
2235	if( start32>limit32 \|\| (start32<destIndex32 && destIndex32<limit32)) {
2236	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2237	return;
2238	}
2239
2240	if(move) {
2241	// move: copy to destIndex, then remove original
2242	int32_t segLength=limit32-start32;
2243	us->copy(start32, limit32, destIndex32);
2244	if(destIndex32<start32) {
2245	start32+=segLength;
2246	}
2247	us->remove(start32, segLength);
2248	} else {
2249	// copy
2250	us->copy(start32, limit32, destIndex32);
2251	}
2252
2253	// update chunk description, set iteration position.
2254	ut->chunkContents = us->getBuffer();
2255	if (move==FALSE) {
2256	// copy operation, string length grows
2257	ut->chunkLength += limit32-start32;
2258	ut->chunkNativeLimit = ut->chunkLength;
2259	ut->nativeIndexingLimit = ut->chunkLength;
2260	}
2261
2262	// Iteration position to end of the newly inserted text.
2263	ut->chunkOffset = destIndex32+limit32-start32;
2264	if (move && destIndex32>start32) {
2265	ut->chunkOffset = destIndex32;
2266	}
2267
2268	}
2269
2270	static const struct UTextFuncs unistrFuncs =
2271	{
2272	sizeof(UTextFuncs),
2273	`0`, `0`, `0`, // Reserved alignment padding
2274	unistrTextClone,
2275	unistrTextLength,
2276	unistrTextAccess,
2277	unistrTextExtract,
2278	unistrTextReplace,
2279	unistrTextCopy,
2280	NULL, // MapOffsetToNative,
2281	NULL, // MapIndexToUTF16,
2282	unistrTextClose,
2283	NULL, // spare 1
2284	NULL, // spare 2
2285	NULL // spare 3
2286	};
2287
2288
2289
2290	U_CDECL_END
2291
2292
2293	U_CAPI UText * U_EXPORT2
2294	utext_openUnicodeString(UText ut, UnicodeString s, UErrorCode *status) {
2295	ut = utext_openConstUnicodeString(ut, s, status);
2296	if (U_SUCCESS(*status)) {
2297	ut->providerProperties \|= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2298	}
2299	return ut;
2300	}
2301
2302
2303
2304	U_CAPI UText * U_EXPORT2
2305	utext_openConstUnicodeString(UText ut, const* UnicodeString s, UErrorCode status) {
2306	if (U_SUCCESS(*status) && s->isBogus()) {
2307	// The UnicodeString is bogus, but we still need to detach the UText
2308	// from whatever it was hooked to before, if anything.
2309	utext_openUChars(ut, NULL, `0`, status);
2310	*status = U_ILLEGAL_ARGUMENT_ERROR;
2311	return ut;
2312	}
2313	ut = utext_setup(ut, `0`, status);
2314	// note: use the standard (writable) function table for UnicodeString.
2315	// The flag settings disable writing, so having the functions in
2316	// the table is harmless.
2317	if (U_SUCCESS(*status)) {
2318	ut->pFuncs = &unistrFuncs;
2319	ut->context = s;
2320	ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
2321	ut->chunkContents = s->getBuffer();
2322	ut->chunkLength = s->length();
2323	ut->chunkNativeStart = `0`;
2324	ut->chunkNativeLimit = ut->chunkLength;
2325	ut->nativeIndexingLimit = ut->chunkLength;
2326	}
2327	return ut;
2328	}
2329
//------------------------------------------------------------------------------
//
//     UText implementation for const UChar * strings
//
//         Use of UText data members:
//            context    pointer to the UChar string
//            a          length.  -1 if not yet known.
//
//         TODO:  support 64 bit lengths.
//
//------------------------------------------------------------------------------
2341
2342	U_CDECL_BEGIN
2343
2344
2345	static UText * U_CALLCONV
2346	ucstrTextClone(UText dest, const* UText * src, UBool deep, UErrorCode * status) {
2347	// First do a generic shallow clone.
2348	dest = shallowTextClone(dest, src, status);
2349
2350	// For deep clones, make a copy of the string.
2351	// The copied storage is owned by the newly created clone.
2352	// A non-NULL pointer in UText.p is the signal to the close() function to delete
2353	// it.
2354	//
2355	if (deep && U_SUCCESS(*status)) {
2356	U_ASSERT(utext_nativeLength(dest) < INT32_MAX);
2357	int32_t len = (int32_t)utext_nativeLength(dest);
2358
2359	// The cloned string IS going to be NUL terminated, whether or not the original was.
2360	const UChar srcStr = (const* UChar *)src->context;
2361	UChar copyStr = (UChar )uprv_malloc((len+`1`) * sizeof(UChar));
2362	if (copyStr == NULL) {
2363	*status = U_MEMORY_ALLOCATION_ERROR;
2364	} else {
2365	int64_t i;
2366	for (i=`0`; i<len; i++) {
2367	copyStr[i] = srcStr[i];
2368	}
2369	copyStr[len] = `0`;
2370	dest->context = copyStr;
2371	dest->providerProperties \|= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
2372	}
2373	}
2374	return dest;
2375	}
2376
2377
2378	static void U_CALLCONV
2379	ucstrTextClose(UText *ut) {
2380	// Most of the work of close is done by the generic UText framework close.
2381	// All that needs to be done here is delete the string if the UText
2382	// owns it. This occurs if the UText was created by cloning.
2383	if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
2384	UChar s = (UChar )ut->context;
2385	uprv_free(s);
2386	ut->context = NULL;
2387	}
2388	}
2389
2390
2391
2392	static int64_t U_CALLCONV
2393	ucstrTextLength(UText *ut) {
2394	if (ut->a < `0`) {
2395	// null terminated, we don't yet know the length. Scan for it.
2396	// Access is not convenient for doing this
2397	// because the current interation postion can't be changed.
2398	const UChar str = (const* UChar *)ut->context;
2399	for (;;) {
2400	if (str[ut->chunkNativeLimit] == `0`) {
2401	break;
2402	}
2403	ut->chunkNativeLimit++;
2404	}
2405	ut->a = ut->chunkNativeLimit;
2406	ut->chunkLength = (int32_t)ut->chunkNativeLimit;
2407	ut->nativeIndexingLimit = ut->chunkLength;
2408	ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2409	}
2410	return ut->a;
2411	}
2412
2413
2414	static UBool U_CALLCONV
2415	ucstrTextAccess(UText *ut, int64_t index, UBool forward) {
2416	const UChar str = (const* UChar *)ut->context;
2417
2418	// pin the requested index to the bounds of the string,
2419	// and set current iteration position.
2420	if (index<`0`) {
2421	index = `0`;
2422	} else if (index < ut->chunkNativeLimit) {
2423	// The request data is within the chunk as it is known so far.
2424	// Put index on a code point boundary.
2425	U16_SET_CP_START(str, `0`, index);
2426	} else if (ut->a >= `0`) {
2427	// We know the length of this string, and the user is requesting something
2428	// at or beyond the length. Pin the requested index to the length.
2429	index = ut->a;
2430	} else {
2431	// Null terminated string, length not yet known, and the requested index
2432	// is beyond where we have scanned so far.
2433	// Scan to 32 UChars beyond the requested index. The strategy here is
2434	// to avoid fully scanning a long string when the caller only wants to
2435	// see a few characters at its beginning.
2436	int32_t scanLimit = (int32_t)index + `32`;
2437	if ((index + `32`)>INT32_MAX \|\| (index + `32`)<`0` ) { // note: int64 expression
2438	scanLimit = INT32_MAX;
2439	}
2440
2441	int32_t chunkLimit = (int32_t)ut->chunkNativeLimit;
2442	for (; chunkLimit<scanLimit; chunkLimit++) {
2443	if (str[chunkLimit] == `0`) {
2444	// We found the end of the string. Remember it, pin the requested index to it,
2445	// and bail out of here.
2446	ut->a = chunkLimit;
2447	ut->chunkLength = chunkLimit;
2448	ut->nativeIndexingLimit = chunkLimit;
2449	if (index >= chunkLimit) {
2450	index = chunkLimit;
2451	} else {
2452	U16_SET_CP_START(str, `0`, index);
2453	}
2454
2455	ut->chunkNativeLimit = chunkLimit;
2456	ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2457	goto breakout;
2458	}
2459	}
2460	// We scanned through the next batch of UChars without finding the end.
2461	U16_SET_CP_START(str, `0`, index);
2462	if (chunkLimit == INT32_MAX) {
2463	// Scanned to the limit of a 32 bit length.
2464	// Forceably trim the overlength string back so length fits in int32
2465	// TODO: add support for 64 bit strings.
2466	ut->a = chunkLimit;
2467	ut->chunkLength = chunkLimit;
2468	ut->nativeIndexingLimit = chunkLimit;
2469	if (index > chunkLimit) {
2470	index = chunkLimit;
2471	}
2472	ut->chunkNativeLimit = chunkLimit;
2473	ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2474	} else {
2475	// The endpoint of a chunk must not be left in the middle of a surrogate pair.
2476	// If the current end is on a lead surrogate, back the end up by one.
2477	// It doesn't matter if the end char happens to be an unpaired surrogate,
2478	// and it's simpler not to worry about it.
2479	if (U16_IS_LEAD(str[chunkLimit-`1`])) {
2480	--chunkLimit;
2481	}
2482	// Null-terminated chunk with end still unknown.
2483	// Update the chunk length to reflect what has been scanned thus far.
2484	// That the full length is still unknown is (still) flagged by
2485	// ut->a being < 0.
2486	ut->chunkNativeLimit = chunkLimit;
2487	ut->nativeIndexingLimit = chunkLimit;
2488	ut->chunkLength = chunkLimit;
2489	}
2490
2491	}
2492	breakout:
2493	U_ASSERT(index<=INT32_MAX);
2494	ut->chunkOffset = (int32_t)index;
2495
2496	// Check whether request is at the start or end
2497	UBool retVal = (forward && index<ut->chunkNativeLimit) \|\| (!forward && index>`0`);
2498	return retVal;
2499	}
2500
2501
2502
2503	static int32_t U_CALLCONV
2504	ucstrTextExtract(UText *ut,
2505	int64_t start, int64_t limit,
2506	UChar *dest, int32_t destCapacity,
2507	UErrorCode *pErrorCode)
2508	{
2509	if(U_FAILURE(*pErrorCode)) {
2510	return `0`;
2511	}
2512	if(destCapacity<`0` \|\| (dest==NULL && destCapacity>`0`) \|\| start>limit) {
2513	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2514	return `0`;
2515	}
2516
2517	//const UChar s=(const UChar )ut->context;
2518	int32_t si, di;
2519
2520	int32_t start32;
2521	int32_t limit32;
2522
2523	// Access the start. Does two things we need:
2524	// Pins 'start' to the length of the string, if it came in out-of-bounds.
2525	// Snaps 'start' to the beginning of a code point.
2526	ucstrTextAccess(ut, start, TRUE);
2527	const UChar *s=ut->chunkContents;
2528	start32 = ut->chunkOffset;
2529
2530	int32_t strLength=(int32_t)ut->a;
2531	if (strLength >= `0`) {
2532	limit32 = pinIndex(limit, strLength);
2533	} else {
2534	limit32 = pinIndex(limit, INT32_MAX);
2535	}
2536	di = `0`;
2537	for (si=start32; si<limit32; si++) {
2538	if (strLength<`0` && s[si]==`0`) {
2539	// Just hit the end of a null-terminated string.
2540	ut->a = si; // set string length for this UText
2541	ut->chunkNativeLimit = si;
2542	ut->chunkLength = si;
2543	ut->nativeIndexingLimit = si;
2544	strLength = si;
2545	limit32 = si;
2546	break;
2547	}
2548	U_ASSERT(di>=`0`); / to ensure di never exceeds INT32_MAX, which must not happen logically /
2549	if (di<destCapacity) {
2550	// only store if there is space.
2551	dest[di] = s[si];
2552	} else {
2553	if (strLength>=`0`) {
2554	// We have filled the destination buffer, and the string length is known.
2555	// Cut the loop short. There is no need to scan string termination.
2556	di = limit32 - start32;
2557	si = limit32;
2558	break;
2559	}
2560	}
2561	di++;
2562	}
2563
2564	// If the limit index points to a lead surrogate of a pair,
2565	// add the corresponding trail surrogate to the destination.
2566	if (si>`0` && U16_IS_LEAD(s[si-`1`]) &&
2567	((si<strLength \|\| strLength<`0`) && U16_IS_TRAIL(s[si])))
2568	{
2569	if (di<destCapacity) {
2570	// store only if there is space in the output buffer.
2571	dest[di++] = s[si];
2572	}
2573	si++;
2574	}
2575
2576	// Put iteration position at the point just following the extracted text
2577	if (si <= ut->chunkNativeLimit) {
2578	ut->chunkOffset = si;
2579	} else {
2580	ucstrTextAccess(ut, si, TRUE);
2581	}
2582
2583	// Add a terminating NUL if space in the buffer permits,
2584	// and set the error status as required.
2585	u_terminateUChars(dest, destCapacity, di, pErrorCode);
2586	return di;
2587	}
2588
2589	static const struct UTextFuncs ucstrFuncs =
2590	{
2591	sizeof(UTextFuncs),
2592	`0`, `0`, `0`, // Reserved alignment padding
2593	ucstrTextClone,
2594	ucstrTextLength,
2595	ucstrTextAccess,
2596	ucstrTextExtract,
2597	NULL, // Replace
2598	NULL, // Copy
2599	NULL, // MapOffsetToNative,
2600	NULL, // MapIndexToUTF16,
2601	ucstrTextClose,
2602	NULL, // spare 1
2603	NULL, // spare 2
2604	NULL, // spare 3
2605	};
2606
2607	U_CDECL_END
2608
2609	static const UChar gEmptyUString[] = {`0`};
2610
2611	U_CAPI UText * U_EXPORT2
2612	utext_openUChars(UText ut, const* UChar s, int64_t length, UErrorCode status) {
2613	if (U_FAILURE(*status)) {
2614	return NULL;
2615	}
2616	if(s==NULL && length==`0`) {
2617	s = gEmptyUString;
2618	}
2619	if (s==NULL \|\| length < -`1` \|\| length>INT32_MAX) {
2620	*status = U_ILLEGAL_ARGUMENT_ERROR;
2621	return NULL;
2622	}
2623	ut = utext_setup(ut, `0`, status);
2624	if (U_SUCCESS(*status)) {
2625	ut->pFuncs = &ucstrFuncs;
2626	ut->context = s;
2627	ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
2628	if (length==-`1`) {
2629	ut->providerProperties \|= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2630	}
2631	ut->a = length;
2632	ut->chunkContents = s;
2633	ut->chunkNativeStart = `0`;
2634	ut->chunkNativeLimit = length>=`0`? length : `0`;
2635	ut->chunkLength = (int32_t)ut->chunkNativeLimit;
2636	ut->chunkOffset = `0`;
2637	ut->nativeIndexingLimit = ut->chunkLength;
2638	}
2639	return ut;
2640	}
2641
2642
2643	//------------------------------------------------------------------------------
2644	//
2645	// UText implementation for text from ICU CharacterIterators
2646	//
2647	// Use of UText data members:
2648	// context pointer to the CharacterIterator
2649	// a length of the full text.
2650	// p pointer to buffer 1
2651	// b start index of local buffer 1 contents
2652	// q pointer to buffer 2
2653	// c start index of local buffer 2 contents
2654	// r pointer to the character iterator if the UText owns it.
2655	// Null otherwise.
2656	//
2657	//------------------------------------------------------------------------------
// Size, in UChars, of each of the two local buffers used by the
// CharacterIterator-based UText provider.
#define CIBufSize 16
2659
2660	U_CDECL_BEGIN
2661	static void U_CALLCONV
2662	charIterTextClose(UText *ut) {
2663	// Most of the work of close is done by the generic UText framework close.
2664	// All that needs to be done here is delete the CharacterIterator if the UText
2665	// owns it. This occurs if the UText was created by cloning.
2666	CharacterIterator ci = (CharacterIterator )ut->r;
2667	delete ci;
2668	ut->r = NULL;
2669	}
2670
2671	static int64_t U_CALLCONV
2672	charIterTextLength(UText *ut) {
2673	return (int32_t)ut->a;
2674	}
2675
2676	static UBool U_CALLCONV
2677	charIterTextAccess(UText *ut, int64_t index, UBool forward) {
2678	CharacterIterator ci = (CharacterIterator )ut->context;
2679
2680	int32_t clippedIndex = (int32_t)index;
2681	if (clippedIndex<`0`) {
2682	clippedIndex=`0`;
2683	} else if (clippedIndex>=ut->a) {
2684	clippedIndex=(int32_t)ut->a;
2685	}
2686	int32_t neededIndex = clippedIndex;
2687	if (!forward && neededIndex>`0`) {
2688	// reverse iteration, want the position just before what was asked for.
2689	neededIndex--;
2690	} else if (forward && neededIndex==ut->a && neededIndex>`0`) {
2691	// Forward iteration, don't ask for something past the end of the text.
2692	neededIndex--;
2693	}
2694
2695	// Find the native index of the start of the buffer containing what we want.
2696	neededIndex -= neededIndex % CIBufSize;
2697
2698	UChar *buf = NULL;
2699	UBool needChunkSetup = TRUE;
2700	int i;
2701	if (ut->chunkNativeStart == neededIndex) {
2702	// The buffer we want is already the current chunk.
2703	needChunkSetup = FALSE;
2704	} else if (ut->b == neededIndex) {
2705	// The first buffer (buffer p) has what we need.
2706	buf = (UChar *)ut->p;
2707	} else if (ut->c == neededIndex) {
2708	// The second buffer (buffer q) has what we need.
2709	buf = (UChar *)ut->q;
2710	} else {
2711	// Neither buffer already has what we need.
2712	// Load new data from the character iterator.
2713	// Use the buf that is not the current buffer.
2714	buf = (UChar *)ut->p;
2715	if (ut->p == ut->chunkContents) {
2716	buf = (UChar *)ut->q;
2717	}
2718	ci->setIndex(neededIndex);
2719	for (i=`0`; i<CIBufSize; i++) {
2720	buf[i] = ci->nextPostInc();
2721	if (i+neededIndex > ut->a) {
2722	break;
2723	}
2724	}
2725	}
2726
2727	// We have a buffer with the data we need.
2728	// Set it up as the current chunk, if it wasn't already.
2729	if (needChunkSetup) {
2730	ut->chunkContents = buf;
2731	ut->chunkLength = CIBufSize;
2732	ut->chunkNativeStart = neededIndex;
2733	ut->chunkNativeLimit = neededIndex + CIBufSize;
2734	if (ut->chunkNativeLimit > ut->a) {
2735	ut->chunkNativeLimit = ut->a;
2736	ut->chunkLength = (int32_t)(ut->chunkNativeLimit)-(int32_t)(ut->chunkNativeStart);
2737	}
2738	ut->nativeIndexingLimit = ut->chunkLength;
2739	U_ASSERT(ut->chunkOffset>=`0` && ut->chunkOffset<=CIBufSize);
2740	}
2741	ut->chunkOffset = clippedIndex - (int32_t)ut->chunkNativeStart;
2742	UBool success = (forward? ut->chunkOffset<ut->chunkLength : ut->chunkOffset>`0`);
2743	return success;
2744	}
2745
2746	static UText * U_CALLCONV
2747	charIterTextClone(UText dest, const* UText src, UBool deep, UErrorCode status) {
2748	if (U_FAILURE(*status)) {
2749	return NULL;
2750	}
2751
2752	if (deep) {
2753	// There is no CharacterIterator API for cloning the underlying text storage.
2754	*status = U_UNSUPPORTED_ERROR;
2755	return NULL;
2756	} else {
2757	CharacterIterator srcCI =(CharacterIterator )src->context;
2758	srcCI = srcCI->clone();
2759	dest = utext_openCharacterIterator(dest, srcCI, status);
2760	if (U_FAILURE(*status)) {
2761	return dest;
2762	}
2763	// cast off const on getNativeIndex.
2764	// For CharacterIterator based UTexts, this is safe, the operation is const.
2765	int64_t ix = utext_getNativeIndex((UText *)src);
2766	utext_setNativeIndex(dest, ix);
2767	dest->r = srcCI; // flags that this UText owns the CharacterIterator
2768	}
2769	return dest;
2770	}
2771
2772	static int32_t U_CALLCONV
2773	charIterTextExtract(UText *ut,
2774	int64_t start, int64_t limit,
2775	UChar *dest, int32_t destCapacity,
2776	UErrorCode *status)
2777	{
2778	if(U_FAILURE(*status)) {
2779	return `0`;
2780	}
2781	if(destCapacity<`0` \|\| (dest==NULL && destCapacity>`0`) \|\| start>limit) {
2782	*status=U_ILLEGAL_ARGUMENT_ERROR;
2783	return `0`;
2784	}
2785	int32_t length = (int32_t)ut->a;
2786	int32_t start32 = pinIndex(start, length);
2787	int32_t limit32 = pinIndex(limit, length);
2788	int32_t desti = `0`;
2789	int32_t srci;
2790	int32_t copyLimit;
2791
2792	CharacterIterator ci = (CharacterIterator )ut->context;
2793	ci->setIndex32(start32); // Moves ix to lead of surrogate pair, if needed.
2794	srci = ci->getIndex();
2795	copyLimit = srci;
2796	while (srci<limit32) {
2797	UChar32 c = ci->next32PostInc();
2798	int32_t len = U16_LENGTH(c);
2799	U_ASSERT(desti+len>`0`); / to ensure desti+len never exceeds MAX_INT32, which must not happen logically /
2800	if (desti+len <= destCapacity) {
2801	U16_APPEND_UNSAFE(dest, desti, c);
2802	copyLimit = srci+len;
2803	} else {
2804	desti += len;
2805	*status = U_BUFFER_OVERFLOW_ERROR;
2806	}
2807	srci += len;
2808	}
2809
2810	charIterTextAccess(ut, copyLimit, TRUE);
2811
2812	u_terminateUChars(dest, destCapacity, desti, status);
2813	return desti;
2814	}
2815
2816	static const struct UTextFuncs charIterFuncs =
2817	{
2818	sizeof(UTextFuncs),
2819	`0`, `0`, `0`, // Reserved alignment padding
2820	charIterTextClone,
2821	charIterTextLength,
2822	charIterTextAccess,
2823	charIterTextExtract,
2824	NULL, // Replace
2825	NULL, // Copy
2826	NULL, // MapOffsetToNative,
2827	NULL, // MapIndexToUTF16,
2828	charIterTextClose,
2829	NULL, // spare 1
2830	NULL, // spare 2
2831	NULL // spare 3
2832	};
2833	U_CDECL_END
2834
2835
2836	U_CAPI UText * U_EXPORT2
2837	utext_openCharacterIterator(UText ut, CharacterIterator ci, UErrorCode *status) {
2838	if (U_FAILURE(*status)) {
2839	return NULL;
2840	}
2841
2842	if (ci->startIndex() > `0`) {
2843	// No support for CharacterIterators that do not start indexing from zero.
2844	*status = U_UNSUPPORTED_ERROR;
2845	return NULL;
2846	}
2847
2848	// Extra space in UText for 2 buffers of CIBufSize UChars each.
2849	int32_t extraSpace = `2` * CIBufSize * sizeof(UChar);
2850	ut = utext_setup(ut, extraSpace, status);
2851	if (U_SUCCESS(*status)) {
2852	ut->pFuncs = &charIterFuncs;
2853	ut->context = ci;
2854	ut->providerProperties = `0`;
2855	ut->a = ci->endIndex(); // Length of text
2856	ut->p = ut->pExtra; // First buffer
2857	ut->b = -`1`; // Native index of first buffer contents
2858	ut->q = (UChar)ut->pExtra+CIBufSize; // Second buffer*
2859	ut->c = -`1`; // Native index of second buffer contents
2860
2861	// Initialize current chunk contents to be empty.
2862	// First access will fault something in.
2863	// Note: The initial nativeStart and chunkOffset must sum to zero
2864	// so that getNativeIndex() will correctly compute to zero
2865	// if no call to Access() has ever been made. They can't be both
2866	// zero without Access() thinking that the chunk is valid.
2867	ut->chunkContents = (UChar *)ut->p;
2868	ut->chunkNativeStart = -`1`;
2869	ut->chunkOffset = `1`;
2870	ut->chunkNativeLimit = `0`;
2871	ut->chunkLength = `0`;
2872	ut->nativeIndexingLimit = ut->chunkOffset; // enables native indexing
2873	}
2874	return ut;
2875	}
2876

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/utext.cpp