unisetspan.cpp source code [Godot/thirdparty/icu4c/common/unisetspan.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	******************************************************************************
5	*
6	* Copyright (C) 2007-2012, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	******************************************************************************
10	* file name: unisetspan.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 2007mar01
16	* created by: Markus W. Scherer
17	*/
18
19	#include "unicode/utypes.h"
20	#include "unicode/uniset.h"
21	#include "unicode/ustring.h"
22	#include "unicode/utf8.h"
23	#include "unicode/utf16.h"
24	#include "cmemory.h"
25	#include "uvector.h"
26	#include "unisetspan.h"
27
28	U_NAMESPACE_BEGIN
29
30	/*
31	* List of offsets from the current position from where to try matching
32	* a code point or a string.
33	* Store offsets rather than indexes to simplify the code and use the same list
34	* for both increments (in span()) and decrements (in spanBack()).
35	*
36	* Assumption: The maximum offset is limited, and the offsets that are stored
37	* at any one time are relatively dense, that is, there are normally no gaps of
38	* hundreds or thousands of offset values.
39	*
40	* The implementation uses a circular buffer of byte flags,
41	* each indicating whether the corresponding offset is in the list.
42	* This avoids inserting into a sorted list of offsets (or absolute indexes) and
43	* physically moving part of the list.
44	*
45	* Note: In principle, the caller should setMaxLength() to the maximum of the
46	* max string length and U16_LENGTH/U8_LENGTH to account for
47	* "long" single code points.
48	* However, this implementation uses at least a staticList with more than
49	* U8_LENGTH entries anyway.
50	*
51	* Note: If maxLength were guaranteed to be no more than 32 or 64,
52	* the list could be stored as bit flags in a single integer.
53	* Rather than handling a circular buffer with a start list index,
54	* the integer would simply be shifted when lower offsets are removed.
55	* UnicodeSet does not have a limit on the lengths of strings.
56	*/
57	class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory.
58	public:
59	OffsetList() : list(staticList), capacity(`0`), length(`0`), start(`0`) {}
60
61	~OffsetList() {
62	if(list!=staticList) {
63	uprv_free(list);
64	}
65	}
66
67	// Call exactly once if the list is to be used.
68	void setMaxLength(int32_t maxLength) {
69	if(maxLength<=(int32_t)sizeof(staticList)) {
70	capacity=(int32_t)sizeof(staticList);
71	} else {
72	UBool l=(UBool )uprv_malloc(maxLength);
73	if(l!=nullptr) {
74	list=l;
75	capacity=maxLength;
76	}
77	}
78	uprv_memset(list, `0`, capacity);
79	}
80
81	void clear() {
82	uprv_memset(list, `0`, capacity);
83	start=length=`0`;
84	}
85
86	UBool isEmpty() const {
87	return (UBool)(length==`0`);
88	}
89
90	// Reduce all stored offsets by delta, used when the current position
91	// moves by delta.
92	// There must not be any offsets lower than delta.
93	// If there is an offset equal to delta, it is removed.
94	// delta=[1..maxLength]
95	void shift(int32_t delta) {
96	int32_t i=start+delta;
97	if(i>=capacity) {
98	i-=capacity;
99	}
100	if(list[i]) {
101	list[i]=false;
102	--length;
103	}
104	start=i;
105	}
106
107	// Add an offset. The list must not contain it yet.
108	// offset=[1..maxLength]
109	void addOffset(int32_t offset) {
110	int32_t i=start+offset;
111	if(i>=capacity) {
112	i-=capacity;
113	}
114	list[i]=true;
115	++length;
116	}
117
118	// offset=[1..maxLength]
119	UBool containsOffset(int32_t offset) const {
120	int32_t i=start+offset;
121	if(i>=capacity) {
122	i-=capacity;
123	}
124	return list[i];
125	}
126
127	// Find the lowest stored offset from a non-empty list, remove it,
128	// and reduce all other offsets by this minimum.
129	// Returns [1..maxLength].
130	int32_t popMinimum() {
131	// Look for the next offset in list[start+1..capacity-1].
132	int32_t i=start, result;
133	while(++i<capacity) {
134	if(list[i]) {
135	list[i]=false;
136	--length;
137	result=i-start;
138	start=i;
139	return result;
140	}
141	}
142	// i==capacity
143
144	// Wrap around and look for the next offset in list[0..start].
145	// Since the list is not empty, there will be one.
146	result=capacity-start;
147	i=`0`;
148	while(!list[i]) {
149	++i;
150	}
151	list[i]=false;
152	--length;
153	start=i;
154	return result+=i;
155	}
156
157	private:
158	UBool *list;
159	int32_t capacity;
160	int32_t length;
161	int32_t start;
162
163	UBool staticList[`16`];
164	};
165
166	// Get the number of UTF-8 bytes for a UTF-16 (sub)string.
167	static int32_t
168	getUTF8Length(const char16_t *s, int32_t length) {
169	UErrorCode errorCode=U_ZERO_ERROR;
170	int32_t length8=`0`;
171	u_strToUTF8(nullptr, `0`, &length8, s, length, &errorCode);
172	if(U_SUCCESS(errorCode) \|\| errorCode==U_BUFFER_OVERFLOW_ERROR) {
173	return length8;
174	} else {
175	// The string contains an unpaired surrogate.
176	// Ignore this string.
177	return `0`;
178	}
179	}
180
181	// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
182	static int32_t
183	appendUTF8(const char16_t s, int32_t length, uint8_t t, int32_t capacity) {
184	UErrorCode errorCode=U_ZERO_ERROR;
185	int32_t length8=`0`;
186	u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
187	if(U_SUCCESS(errorCode)) {
188	return length8;
189	} else {
190	// The string contains an unpaired surrogate.
191	// Ignore this string.
192	return `0`;
193	}
194	}
195
196	static inline uint8_t
197	makeSpanLengthByte(int32_t spanLength) {
198	// 0xfe==UnicodeSetStringSpan::LONG_SPAN
199	return spanLength<`0xfe` ? (uint8_t)spanLength : (uint8_t)`0xfe`;
200	}
201
202	// Construct for all variants of span(), or only for any one variant.
203	// Initialize as little as possible, for single use.
204	UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
205	const UVector &setStrings,
206	uint32_t which)
207	: spanSet (`0`, `0x10ffff`), pSpanNotSet(nullptr), strings(setStrings),
208	utf8Lengths(nullptr), spanLengths(nullptr), utf8(nullptr),
209	utf8Length(`0`),
210	maxLength16(`0`), maxLength8(`0`),
211	all((UBool)(which==ALL)) {
212	spanSet.retainAll(set);
213	if(which&NOT_CONTAINED) {
214	// Default to the same sets.
215	// addToSpanNotSet() will create a separate set if necessary.
216	pSpanNotSet=&spanSet;
217	}
218
219	// Determine if the strings even need to be taken into account at all for span() etc.
220	// If any string is relevant, then all strings need to be used for
221	// span(longest match) but only the relevant ones for span(while contained).
222	// TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
223	// and do not store UTF-8 strings if !thisRelevant and CONTAINED.
224	// (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
225	// Also count the lengths of the UTF-8 versions of the strings for memory allocation.
226	int32_t stringsLength=strings.size();
227
228	int32_t i, spanLength;
229	UBool someRelevant=false;
230	for(i=`0`; i<stringsLength; ++i) {
231	const UnicodeString &string=(const* UnicodeString *)strings.elementAt(i);
232	const char16_t *s16=string.getBuffer();
233	int32_t length16=string.length();
234	if (length16==`0`) {
235	continue; // skip the empty string
236	}
237	UBool thisRelevant;
238	spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
239	if(spanLength<length16) { // Relevant string.
240	someRelevant=thisRelevant=true;
241	} else {
242	thisRelevant=false;
243	}
244	if((which&UTF16) && length16>maxLength16) {
245	maxLength16=length16;
246	}
247	if((which&UTF8) && (thisRelevant \|\| (which&CONTAINED))) {
248	int32_t length8=getUTF8Length(s16, length16);
249	utf8Length+=length8;
250	if(length8>maxLength8) {
251	maxLength8=length8;
252	}
253	}
254	}
255	if(!someRelevant) {
256	maxLength16=maxLength8=`0`;
257	return;
258	}
259
260	// Freeze after checking for the need to use strings at all because freezing
261	// a set takes some time and memory which are wasted if there are no relevant strings.
262	if(all) {
263	spanSet.freeze();
264	}
265
266	uint8_t *spanBackLengths;
267	uint8_t *spanUTF8Lengths;
268	uint8_t *spanBackUTF8Lengths;
269
270	// Allocate a block of meta data.
271	int32_t allocSize;
272	if(all) {
273	// UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
274	allocSize=stringsLength*(`4`+`1`+`1`+`1`+`1`)+utf8Length;
275	} else {
276	allocSize=stringsLength; // One set of span lengths.
277	if(which&UTF8) {
278	// UTF-8 lengths and UTF-8 strings.
279	allocSize+=stringsLength*`4`+utf8Length;
280	}
281	}
282	if(allocSize<=(int32_t)sizeof(staticLengths)) {
283	utf8Lengths=staticLengths;
284	} else {
285	utf8Lengths=(int32_t *)uprv_malloc(allocSize);
286	if(utf8Lengths==nullptr) {
287	maxLength16=maxLength8=`0`; // Prevent usage by making needsStringSpanUTF16/8() return false.
288	return; // Out of memory.
289	}
290	}
291
292	if(all) {
293	// Store span lengths for all span() variants.
294	spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
295	spanBackLengths=spanLengths+stringsLength;
296	spanUTF8Lengths=spanBackLengths+stringsLength;
297	spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
298	utf8=spanBackUTF8Lengths+stringsLength;
299	} else {
300	// Store span lengths for only one span() variant.
301	if(which&UTF8) {
302	spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
303	utf8=spanLengths+stringsLength;
304	} else {
305	spanLengths=(uint8_t *)utf8Lengths;
306	}
307	spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
308	}
309
310	// Set the meta data and pSpanNotSet and write the UTF-8 strings.
311	int32_t utf8Count=`0`; // Count UTF-8 bytes written so far.
312
313	for(i=`0`; i<stringsLength; ++i) {
314	const UnicodeString &string=(const* UnicodeString *)strings.elementAt(i);
315	const char16_t *s16=string.getBuffer();
316	int32_t length16=string.length();
317	spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
318	if(spanLength<length16 && length16>`0`) { // Relevant string.
319	if(which&UTF16) {
320	if(which&CONTAINED) {
321	if(which&FWD) {
322	spanLengths[i]=makeSpanLengthByte(spanLength);
323	}
324	if(which&BACK) {
325	spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
326	spanBackLengths[i]=makeSpanLengthByte(spanLength);
327	}
328	} else / not CONTAINED, not all, but NOT_CONTAINED / {
329	spanLengths[i]=spanBackLengths[i]=`0`; // Only store a relevant/irrelevant flag.
330	}
331	}
332	if(which&UTF8) {
333	uint8_t *s8=utf8+utf8Count;
334	int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
335	utf8Count+=utf8Lengths[i]=length8;
336	if(length8==`0`) { // Irrelevant for UTF-8 because not representable in UTF-8.
337	spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
338	} else { // Relevant for UTF-8.
339	if(which&CONTAINED) {
340	if(which&FWD) {
341	spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
342	spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
343	}
344	if(which&BACK) {
345	spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
346	spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
347	}
348	} else / not CONTAINED, not all, but NOT_CONTAINED / {
349	spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=`0`; // Only store a relevant/irrelevant flag.
350	}
351	}
352	}
353	if(which&NOT_CONTAINED) {
354	// Add string start and end code points to the spanNotSet so that
355	// a span(while not contained) stops before any string.
356	UChar32 c;
357	if(which&FWD) {
358	int32_t len=`0`;
359	U16_NEXT(s16, len, length16, c);
360	addToSpanNotSet(c);
361	}
362	if(which&BACK) {
363	int32_t len=length16;
364	U16_PREV(s16, `0`, len, c);
365	addToSpanNotSet(c);
366	}
367	}
368	} else { // Irrelevant string. (Also the empty string.)
369	if(which&UTF8) {
370	if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.
371	uint8_t *s8=utf8+utf8Count;
372	int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
373	utf8Count+=utf8Lengths[i]=length8;
374	} else {
375	utf8Lengths[i]=`0`;
376	}
377	}
378	if(all) {
379	spanLengths[i]=spanBackLengths[i]=
380	spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
381	(uint8_t)ALL_CP_CONTAINED;
382	} else {
383	// All spanXYZLengths pointers contain the same address.
384	spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
385	}
386	}
387	}
388
389	// Finish.
390	if(all) {
391	pSpanNotSet->freeze();
392	}
393	}
394
395	// Copy constructor. Assumes which==ALL for a frozen set.
396	UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
397	const UVector &newParentSetStrings)
398	: spanSet (otherStringSpan.spanSet), pSpanNotSet(nullptr), strings(newParentSetStrings),
399	utf8Lengths(nullptr), spanLengths(nullptr), utf8(nullptr),
400	utf8Length(otherStringSpan.utf8Length),
401	maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
402	all(true) {
403	if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
404	pSpanNotSet=&spanSet;
405	} else {
406	pSpanNotSet=otherStringSpan.pSpanNotSet->clone();
407	}
408
409	// Allocate a block of meta data.
410	// UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
411	int32_t stringsLength=strings.size();
412	int32_t allocSize=stringsLength*(`4`+`1`+`1`+`1`+`1`)+utf8Length;
413	if(allocSize<=(int32_t)sizeof(staticLengths)) {
414	utf8Lengths=staticLengths;
415	} else {
416	utf8Lengths=(int32_t *)uprv_malloc(allocSize);
417	if(utf8Lengths==nullptr) {
418	maxLength16=maxLength8=`0`; // Prevent usage by making needsStringSpanUTF16/8() return false.
419	return; // Out of memory.
420	}
421	}
422
423	spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
424	utf8=spanLengths+stringsLength*`4`;
425	uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
426	}
427
428	UnicodeSetStringSpan::~UnicodeSetStringSpan() {
429	if(pSpanNotSet!=nullptr && pSpanNotSet!=&spanSet) {
430	delete pSpanNotSet;
431	}
432	if(utf8Lengths!=nullptr && utf8Lengths!=staticLengths) {
433	uprv_free(utf8Lengths);
434	}
435	}
436
437	void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
438	if(pSpanNotSet==nullptr \|\| pSpanNotSet==&spanSet) {
439	if(spanSet.contains(c)) {
440	return; // Nothing to do.
441	}
442	UnicodeSet *newSet=spanSet.cloneAsThawed();
443	if(newSet==nullptr) {
444	return; // Out of memory.
445	} else {
446	pSpanNotSet=newSet;
447	}
448	}
449	pSpanNotSet->add(c);
450	}
451
452	// Compare strings without any argument checks. Requires length>0.
453	static inline UBool
454	matches16(const char16_t s, const* char16_t *t, int32_t length) {
455	do {
456	if(s++!=t++) {
457	return false;
458	}
459	} while(--length>`0`);
460	return true;
461	}
462
463	static inline UBool
464	matches8(const uint8_t s, const* uint8_t *t, int32_t length) {
465	do {
466	if(s++!=t++) {
467	return false;
468	}
469	} while(--length>`0`);
470	return true;
471	}
472
473	// Compare 16-bit Unicode strings (which may be malformed UTF-16)
474	// at code point boundaries.
475	// That is, each edge of a match must not be in the middle of a surrogate pair.
476	static inline UBool
477	matches16CPB(const char16_t s, int32_t start, int32_t limit, const* char16_t *t, int32_t length) {
478	s+=start;
479	limit-=start;
480	return matches16(s, t, length) &&
481	!(`0`<start && U16_IS_LEAD(s[-`1`]) && U16_IS_TRAIL(s[`0`])) &&
482	!(length<limit && U16_IS_LEAD(s[length-`1`]) && U16_IS_TRAIL(s[length]));
483	}
484
485	// Does the set contain the next code point?
486	// If so, return its length; otherwise return its negative length.
487	static inline int32_t
488	spanOne(const UnicodeSet &set, const char16_t *s, int32_t length) {
489	char16_t c=*s, c2;
490	if(c>=`0xd800` && c<=`0xdbff` && length>=`2` && U16_IS_TRAIL(c2=s[`1`])) {
491	return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? `2` : -`2`;
492	}
493	return set.contains(c) ? `1` : -`1`;
494	}
495
496	static inline int32_t
497	spanOneBack(const UnicodeSet &set, const char16_t *s, int32_t length) {
498	char16_t c=s[length-`1`], c2;
499	if(c>=`0xdc00` && c<=`0xdfff` && length>=`2` && U16_IS_LEAD(c2=s[length-`2`])) {
500	return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? `2` : -`2`;
501	}
502	return set.contains(c) ? `1` : -`1`;
503	}
504
505	static inline int32_t
506	spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
507	UChar32 c=*s;
508	if(U8_IS_SINGLE(c)) {
509	return set.contains(c) ? `1` : -`1`;
510	}
511	// Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
512	int32_t i=`0`;
513	U8_NEXT_OR_FFFD(s, i, length, c);
514	return set.contains(c) ? i : -i;
515	}
516
517	static inline int32_t
518	spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
519	UChar32 c=s[length-`1`];
520	if(U8_IS_SINGLE(c)) {
521	return set.contains(c) ? `1` : -`1`;
522	}
523	int32_t i=length-`1`;
524	c=utf8_prevCharSafeBody(s, `0`, &i, c, -`3`);
525	length-=i;
526	return set.contains(c) ? length : -length;
527	}
528
529	/*
530	* Note: In span() when spanLength==0 (after a string match, or at the beginning
531	* after an empty code point span) and in spanNot() and spanNotUTF8(),
532	* string matching could use a binary search
533	* because all string matches are done from the same start index.
534	*
535	* For UTF-8, this would require a comparison function that returns UTF-16 order.
536	*
537	* This optimization should not be necessary for normal UnicodeSets because
538	* most sets have no strings, and most sets with strings have
539	* very few very short strings.
540	* For cases with many strings, it might be better to use a different API
541	* and implementation with a DFA (state machine).
542	*/
543
544	/*
545	* Algorithm for span(USET_SPAN_CONTAINED)
546	*
547	* Theoretical algorithm:
548	* - Iterate through the string, and at each code point boundary:
549	* + If the code point there is in the set, then remember to continue after it.
550	* + If a set string matches at the current position, then remember to continue after it.
551	* + Either recursively span for each code point or string match,
552	* or recursively span for all but the shortest one and
553	* iteratively continue the span with the shortest local match.
554	* + Remember the longest recursive span (the farthest end point).
555	* + If there is no match at the current position, neither for the code point there
556	* nor for any set string, then stop and return the longest recursive span length.
557	*
558	* Optimized implementation:
559	*
560	* (We assume that most sets will have very few very short strings.
561	* A span using a string-less set is extremely fast.)
562	*
563	* Create and cache a spanSet which contains all of the single code points
564	* of the original set but none of its strings.
565	*
566	* - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
567	* - Loop:
568	* + Try to match each set string at the end of the spanLength.
569	* ~ Set strings that start with set-contained code points must be matched
570	* with a partial overlap because the recursive algorithm would have tried
571	* to match them at every position.
572	* ~ Set strings that entirely consist of set-contained code points
573	* are irrelevant for span(USET_SPAN_CONTAINED) because the
574	* recursive algorithm would continue after them anyway
575	* and find the longest recursive match from their end.
576	* ~ Rather than recursing, note each end point of a set string match.
577	* + If no set string matched after spanSet.span(), then return
578	* with where the spanSet.span() ended.
579	* + If at least one set string matched after spanSet.span(), then
580	* pop the shortest string match end point and continue
581	* the loop, trying to match all set strings from there.
582	* + If at least one more set string matched after a previous string match,
583	* then test if the code point after the previous string match is also
584	* contained in the set.
585	* Continue the loop with the shortest end point of either this code point
586	* or a matching set string.
587	* + If no more set string matched after a previous string match,
588	* then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
589	* Stop if spanLength==0, otherwise continue the loop.
590	*
591	* By noting each end point of a set string match,
592	* the function visits each string position at most once and finishes
593	* in linear time.
594	*
595	* The recursive algorithm may visit the same string position many times
596	* if multiple paths lead to it and finishes in exponential time.
597	*/
598
599	/*
600	* Algorithm for span(USET_SPAN_SIMPLE)
601	*
602	* Theoretical algorithm:
603	* - Iterate through the string, and at each code point boundary:
604	* + If the code point there is in the set, then remember to continue after it.
605	* + If a set string matches at the current position, then remember to continue after it.
606	* + Continue from the farthest match position and ignore all others.
607	* + If there is no match at the current position,
608	* then stop and return the current position.
609	*
610	* Optimized implementation:
611	*
612	* (Same assumption and spanSet as above.)
613	*
614	* - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
615	* - Loop:
616	* + Try to match each set string at the end of the spanLength.
617	* ~ Set strings that start with set-contained code points must be matched
618	* with a partial overlap because the standard algorithm would have tried
619	* to match them earlier.
620	* ~ Set strings that entirely consist of set-contained code points
621	* must be matched with a full overlap because the longest-match algorithm
622	* would hide set string matches that end earlier.
623	* Such set strings need not be matched earlier inside the code point span
624	* because the standard algorithm would then have continued after
625	* the set string match anyway.
626	* ~ Remember the longest set string match (farthest end point) from the earliest
627	* starting point.
628	* + If no set string matched after spanSet.span(), then return
629	* with where the spanSet.span() ended.
630	* + If at least one set string matched, then continue the loop after the
631	* longest match from the earliest position.
632	* + If no more set string matched after a previous string match,
633	* then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
634	* Stop if spanLength==0, otherwise continue the loop.
635	*/
636
637	int32_t UnicodeSetStringSpan::span(const char16_t s, int32_t length, USetSpanCondition spanCondition) const* {
638	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
639	return spanNot(s, length);
640	}
641	int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
642	if(spanLength==length) {
643	return length;
644	}
645
646	// Consider strings; they may overlap with the span.
647	OffsetList offsets;
648	if(spanCondition==USET_SPAN_CONTAINED) {
649	// Use offset list to try all possibilities.
650	offsets.setMaxLength(maxLength16);
651	}
652	int32_t pos=spanLength, rest=length-pos;
653	int32_t i, stringsLength=strings.size();
654	for(;;) {
655	if(spanCondition==USET_SPAN_CONTAINED) {
656	for(i=`0`; i<stringsLength; ++i) {
657	int32_t overlap=spanLengths[i];
658	if(overlap==ALL_CP_CONTAINED) {
659	continue; // Irrelevant string. (Also the empty string.)
660	}
661	const UnicodeString &string=(const* UnicodeString *)strings.elementAt(i);
662	const char16_t *s16=string.getBuffer();
663	int32_t length16=string.length();
664	U_ASSERT(length>`0`);
665
666	// Try to match this string at pos-overlap..pos.
667	if(overlap>=LONG_SPAN) {
668	overlap=length16;
669	// While contained: No point matching fully inside the code point span.
670	U16_BACK_1(s16, `0`, overlap); // Length of the string minus the last code point.
671	}
672	if(overlap>spanLength) {
673	overlap=spanLength;
674	}
675	int32_t inc=length16-overlap; // Keep overlap+inc==length16.
676	for(;;) {
677	if(inc>rest) {
678	break;
679	}
680	// Try to match if the increment is not listed already.
681	if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
682	if(inc==rest) {
683	return length; // Reached the end of the string.
684	}
685	offsets.addOffset(inc);
686	}
687	if(overlap==`0`) {
688	break;
689	}
690	--overlap;
691	++inc;
692	}
693	}
694	} else / USET_SPAN_SIMPLE / {
695	int32_t maxInc=`0`, maxOverlap=`0`;
696	for(i=`0`; i<stringsLength; ++i) {
697	int32_t overlap=spanLengths[i];
698	// For longest match, we do need to try to match even an all-contained string
699	// to find the match from the earliest start.
700
701	const UnicodeString &string=(const* UnicodeString *)strings.elementAt(i);
702	const char16_t *s16=string.getBuffer();
703	int32_t length16=string.length();
704	if (length16==`0`) {
705	continue; // skip the empty string
706	}
707
708	// Try to match this string at pos-overlap..pos.
709	if(overlap>=LONG_SPAN) {
710	overlap=length16;
711	// Longest match: Need to match fully inside the code point span
712	// to find the match from the earliest start.
713	}
714	if(overlap>spanLength) {
715	overlap=spanLength;
716	}
717	int32_t inc=length16-overlap; // Keep overlap+inc==length16.
718	for(;;) {
719	if(inc>rest \|\| overlap<maxOverlap) {
720	break;
721	}
722	// Try to match if the string is longer or starts earlier.
723	if( (overlap>maxOverlap \|\| / redundant overlap==maxOverlap && / inc>maxInc) &&
724	matches16CPB(s, pos-overlap, length, s16, length16)
725	) {
726	maxInc=inc; // Longest match from earliest start.
727	maxOverlap=overlap;
728	break;
729	}
730	--overlap;
731	++inc;
732	}
733	}
734
735	if(maxInc!=`0` \|\| maxOverlap!=`0`) {
736	// Longest-match algorithm, and there was a string match.
737	// Simply continue after it.
738	pos+=maxInc;
739	rest-=maxInc;
740	if(rest==`0`) {
741	return length; // Reached the end of the string.
742	}
743	spanLength=`0`; // Match strings from after a string match.
744	continue;
745	}
746	}
747	// Finished trying to match all strings at pos.
748
749	if(spanLength!=`0` \|\| pos==`0`) {
750	// The position is after an unlimited code point span (spanLength!=0),
751	// not after a string match.
752	// The only position where spanLength==0 after a span is pos==0.
753	// Otherwise, an unlimited code point span is only tried again when no
754	// strings match, and if such a non-initial span fails we stop.
755	if(offsets.isEmpty()) {
756	return pos; // No strings matched after a span.
757	}
758	// Match strings from after the next string match.
759	} else {
760	// The position is after a string match (or a single code point).
761	if(offsets.isEmpty()) {
762	// No more strings matched after a previous string match.
763	// Try another code point span from after the last string match.
764	spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
765	if( spanLength==rest \|\| // Reached the end of the string, or
766	spanLength==`0` // neither strings nor span progressed.
767	) {
768	return pos+spanLength;
769	}
770	pos+=spanLength;
771	rest-=spanLength;
772	continue; // spanLength>0: Match strings from after a span.
773	} else {
774	// Try to match only one code point from after a string match if some
775	// string matched beyond it, so that we try all possible positions
776	// and don't overshoot.
777	spanLength=spanOne(spanSet, s+pos, rest);
778	if(spanLength>`0`) {
779	if(spanLength==rest) {
780	return length; // Reached the end of the string.
781	}
782	// Match strings after this code point.
783	// There cannot be any increments below it because UnicodeSet strings
784	// contain multiple code points.
785	pos+=spanLength;
786	rest-=spanLength;
787	offsets.shift(spanLength);
788	spanLength=`0`;
789	continue; // Match strings from after a single code point.
790	}
791	// Match strings from after the next string match.
792	}
793	}
794	int32_t minOffset=offsets.popMinimum();
795	pos+=minOffset;
796	rest-=minOffset;
797	spanLength=`0`; // Match strings from after a string match.
798	}
799	}
800
801	int32_t UnicodeSetStringSpan::spanBack(const char16_t s, int32_t length, USetSpanCondition spanCondition) const* {
802	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
803	return spanNotBack(s, length);
804	}
805	int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
806	if(pos==`0`) {
807	return `0`;
808	}
809	int32_t spanLength=length-pos;
810
811	// Consider strings; they may overlap with the span.
812	OffsetList offsets;
813	if(spanCondition==USET_SPAN_CONTAINED) {
814	// Use offset list to try all possibilities.
815	offsets.setMaxLength(maxLength16);
816	}
817	int32_t i, stringsLength=strings.size();
818	uint8_t *spanBackLengths=spanLengths;
819	if(all) {
820	spanBackLengths+=stringsLength;
821	}
822	for(;;) {
823	if(spanCondition==USET_SPAN_CONTAINED) {
824	for(i=`0`; i<stringsLength; ++i) {
825	int32_t overlap=spanBackLengths[i];
826	if(overlap==ALL_CP_CONTAINED) {
827	continue; // Irrelevant string. (Also the empty string.)
828	}
829	const UnicodeString &string=(const* UnicodeString *)strings.elementAt(i);
830	const char16_t *s16=string.getBuffer();
831	int32_t length16=string.length();
832	U_ASSERT(length>`0`);
833
834	// Try to match this string at pos-(length16-overlap)..pos-length16.
835	if(overlap>=LONG_SPAN) {
836	overlap=length16;
837	// While contained: No point matching fully inside the code point span.
838	int32_t len1=`0`;
839	U16_FWD_1(s16, len1, overlap);
840	overlap-=len1; // Length of the string minus the first code point.
841	}
842	if(overlap>spanLength) {
843	overlap=spanLength;
844	}
845	int32_t dec=length16-overlap; // Keep dec+overlap==length16.
846	for(;;) {
847	if(dec>pos) {
848	break;
849	}
850	// Try to match if the decrement is not listed already.
851	if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
852	if(dec==pos) {
853	return `0`; // Reached the start of the string.
854	}
855	offsets.addOffset(dec);
856	}
857	if(overlap==`0`) {
858	break;
859	}
860	--overlap;
861	++dec;
862	}
863	}
864	} else / USET_SPAN_SIMPLE / {
865	int32_t maxDec=`0`, maxOverlap=`0`;
866	for(i=`0`; i<stringsLength; ++i) {
867	int32_t overlap=spanBackLengths[i];
868	// For longest match, we do need to try to match even an all-contained string
869	// to find the match from the latest end.
870
871	const UnicodeString &string=(const* UnicodeString *)strings.elementAt(i);
872	const char16_t *s16=string.getBuffer();
873	int32_t length16=string.length();
874	if (length16==`0`) {
875	continue; // skip the empty string
876	}
877
878	// Try to match this string at pos-(length16-overlap)..pos-length16.
879	if(overlap>=LONG_SPAN) {
880	overlap=length16;
881	// Longest match: Need to match fully inside the code point span
882	// to find the match from the latest end.
883	}
884	if(overlap>spanLength) {
885	overlap=spanLength;
886	}
887	int32_t dec=length16-overlap; // Keep dec+overlap==length16.
888	for(;;) {
889	if(dec>pos \|\| overlap<maxOverlap) {
890	break;
891	}
892	// Try to match if the string is longer or ends later.
893	if( (overlap>maxOverlap \|\| / redundant overlap==maxOverlap && / dec>maxDec) &&
894	matches16CPB(s, pos-dec, length, s16, length16)
895	) {
896	maxDec=dec; // Longest match from latest end.
897	maxOverlap=overlap;
898	break;
899	}
900	--overlap;
901	++dec;
902	}
903	}
904
905	if(maxDec!=`0` \|\| maxOverlap!=`0`) {
906	// Longest-match algorithm, and there was a string match.
907	// Simply continue before it.
908	pos-=maxDec;
909	if(pos==`0`) {
910	return `0`; // Reached the start of the string.
911	}
912	spanLength=`0`; // Match strings from before a string match.
913	continue;
914	}
915	}
916	// Finished trying to match all strings at pos.
917
918	if(spanLength!=`0` \|\| pos==length) {
919	// The position is before an unlimited code point span (spanLength!=0),
920	// not before a string match.
921	// The only position where spanLength==0 before a span is pos==length.
922	// Otherwise, an unlimited code point span is only tried again when no
923	// strings match, and if such a non-initial span fails we stop.
924	if(offsets.isEmpty()) {
925	return pos; // No strings matched before a span.
926	}
927	// Match strings from before the next string match.
928	} else {
929	// The position is before a string match (or a single code point).
930	if(offsets.isEmpty()) {
931	// No more strings matched before a previous string match.
932	// Try another code point span from before the last string match.
933	int32_t oldPos=pos;
934	pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
935	spanLength=oldPos-pos;
936	if( pos==`0` \|\| // Reached the start of the string, or
937	spanLength==`0` // neither strings nor span progressed.
938	) {
939	return pos;
940	}
941	continue; // spanLength>0: Match strings from before a span.
942	} else {
943	// Try to match only one code point from before a string match if some
944	// string matched beyond it, so that we try all possible positions
945	// and don't overshoot.
946	spanLength=spanOneBack(spanSet, s, pos);
947	if(spanLength>`0`) {
948	if(spanLength==pos) {
949	return `0`; // Reached the start of the string.
950	}
951	// Match strings before this code point.
952	// There cannot be any decrements below it because UnicodeSet strings
953	// contain multiple code points.
954	pos-=spanLength;
955	offsets.shift(spanLength);
956	spanLength=`0`;
957	continue; // Match strings from before a single code point.
958	}
959	// Match strings from before the next string match.
960	}
961	}
962	pos-=offsets.popMinimum();
963	spanLength=`0`; // Match strings from before a string match.
964	}
965	}
966
967	int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t s, int32_t length, USetSpanCondition spanCondition) const* {
968	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
969	return spanNotUTF8(s, length);
970	}
971	int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
972	if(spanLength==length) {
973	return length;
974	}
975
976	// Consider strings; they may overlap with the span.
977	OffsetList offsets;
978	if(spanCondition==USET_SPAN_CONTAINED) {
979	// Use offset list to try all possibilities.
980	offsets.setMaxLength(maxLength8);
981	}
982	int32_t pos=spanLength, rest=length-pos;
983	int32_t i, stringsLength=strings.size();
984	uint8_t *spanUTF8Lengths=spanLengths;
985	if(all) {
986	spanUTF8Lengths+=`2`*stringsLength;
987	}
988	for(;;) {
989	const uint8_t *s8=utf8;
990	int32_t length8;
991	if(spanCondition==USET_SPAN_CONTAINED) {
992	for(i=`0`; i<stringsLength; ++i) {
993	length8=utf8Lengths[i];
994	if(length8==`0`) {
995	continue; // String not representable in UTF-8.
996	}
997	int32_t overlap=spanUTF8Lengths[i];
998	if(overlap==ALL_CP_CONTAINED) {
999	s8+=length8;
1000	continue; // Irrelevant string.
1001	}
1002
1003	// Try to match this string at pos-overlap..pos.
1004	if(overlap>=LONG_SPAN) {
1005	overlap=length8;
1006	// While contained: No point matching fully inside the code point span.
1007	U8_BACK_1(s8, `0`, overlap); // Length of the string minus the last code point.
1008	}
1009	if(overlap>spanLength) {
1010	overlap=spanLength;
1011	}
1012	int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1013	for(;;) {
1014	if(inc>rest) {
1015	break;
1016	}
1017	// Try to match if the increment is not listed already.
1018	// Match at code point boundaries. (The UTF-8 strings were converted
1019	// from UTF-16 and are guaranteed to be well-formed.)
1020	if(!U8_IS_TRAIL(s[pos-overlap]) &&
1021	!offsets.containsOffset(inc) &&
1022	matches8(s+pos-overlap, s8, length8)) {
1023	if(inc==rest) {
1024	return length; // Reached the end of the string.
1025	}
1026	offsets.addOffset(inc);
1027	}
1028	if(overlap==`0`) {
1029	break;
1030	}
1031	--overlap;
1032	++inc;
1033	}
1034	s8+=length8;
1035	}
1036	} else / USET_SPAN_SIMPLE / {
1037	int32_t maxInc=`0`, maxOverlap=`0`;
1038	for(i=`0`; i<stringsLength; ++i) {
1039	length8=utf8Lengths[i];
1040	if(length8==`0`) {
1041	continue; // String not representable in UTF-8.
1042	}
1043	int32_t overlap=spanUTF8Lengths[i];
1044	// For longest match, we do need to try to match even an all-contained string
1045	// to find the match from the earliest start.
1046
1047	// Try to match this string at pos-overlap..pos.
1048	if(overlap>=LONG_SPAN) {
1049	overlap=length8;
1050	// Longest match: Need to match fully inside the code point span
1051	// to find the match from the earliest start.
1052	}
1053	if(overlap>spanLength) {
1054	overlap=spanLength;
1055	}
1056	int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1057	for(;;) {
1058	if(inc>rest \|\| overlap<maxOverlap) {
1059	break;
1060	}
1061	// Try to match if the string is longer or starts earlier.
1062	// Match at code point boundaries. (The UTF-8 strings were converted
1063	// from UTF-16 and are guaranteed to be well-formed.)
1064	if(!U8_IS_TRAIL(s[pos-overlap]) &&
1065	(overlap>maxOverlap \|\|
1066	/ redundant overlap==maxOverlap && / inc>maxInc) &&
1067	matches8(s+pos-overlap, s8, length8)) {
1068	maxInc=inc; // Longest match from earliest start.
1069	maxOverlap=overlap;
1070	break;
1071	}
1072	--overlap;
1073	++inc;
1074	}
1075	s8+=length8;
1076	}
1077
1078	if(maxInc!=`0` \|\| maxOverlap!=`0`) {
1079	// Longest-match algorithm, and there was a string match.
1080	// Simply continue after it.
1081	pos+=maxInc;
1082	rest-=maxInc;
1083	if(rest==`0`) {
1084	return length; // Reached the end of the string.
1085	}
1086	spanLength=`0`; // Match strings from after a string match.
1087	continue;
1088	}
1089	}
1090	// Finished trying to match all strings at pos.
1091
1092	if(spanLength!=`0` \|\| pos==`0`) {
1093	// The position is after an unlimited code point span (spanLength!=0),
1094	// not after a string match.
1095	// The only position where spanLength==0 after a span is pos==0.
1096	// Otherwise, an unlimited code point span is only tried again when no
1097	// strings match, and if such a non-initial span fails we stop.
1098	if(offsets.isEmpty()) {
1099	return pos; // No strings matched after a span.
1100	}
1101	// Match strings from after the next string match.
1102	} else {
1103	// The position is after a string match (or a single code point).
1104	if(offsets.isEmpty()) {
1105	// No more strings matched after a previous string match.
1106	// Try another code point span from after the last string match.
1107	spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1108	if( spanLength==rest \|\| // Reached the end of the string, or
1109	spanLength==`0` // neither strings nor span progressed.
1110	) {
1111	return pos+spanLength;
1112	}
1113	pos+=spanLength;
1114	rest-=spanLength;
1115	continue; // spanLength>0: Match strings from after a span.
1116	} else {
1117	// Try to match only one code point from after a string match if some
1118	// string matched beyond it, so that we try all possible positions
1119	// and don't overshoot.
1120	spanLength=spanOneUTF8(spanSet, s+pos, rest);
1121	if(spanLength>`0`) {
1122	if(spanLength==rest) {
1123	return length; // Reached the end of the string.
1124	}
1125	// Match strings after this code point.
1126	// There cannot be any increments below it because UnicodeSet strings
1127	// contain multiple code points.
1128	pos+=spanLength;
1129	rest-=spanLength;
1130	offsets.shift(spanLength);
1131	spanLength=`0`;
1132	continue; // Match strings from after a single code point.
1133	}
1134	// Match strings from after the next string match.
1135	}
1136	}
1137	int32_t minOffset=offsets.popMinimum();
1138	pos+=minOffset;
1139	rest-=minOffset;
1140	spanLength=`0`; // Match strings from after a string match.
1141	}
1142	}
1143
1144	int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t s, int32_t length, USetSpanCondition spanCondition) const* {
1145	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1146	return spanNotBackUTF8(s, length);
1147	}
1148	int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1149	if(pos==`0`) {
1150	return `0`;
1151	}
1152	int32_t spanLength=length-pos;
1153
1154	// Consider strings; they may overlap with the span.
1155	OffsetList offsets;
1156	if(spanCondition==USET_SPAN_CONTAINED) {
1157	// Use offset list to try all possibilities.
1158	offsets.setMaxLength(maxLength8);
1159	}
1160	int32_t i, stringsLength=strings.size();
1161	uint8_t *spanBackUTF8Lengths=spanLengths;
1162	if(all) {
1163	spanBackUTF8Lengths+=`3`*stringsLength;
1164	}
1165	for(;;) {
1166	const uint8_t *s8=utf8;
1167	int32_t length8;
1168	if(spanCondition==USET_SPAN_CONTAINED) {
1169	for(i=`0`; i<stringsLength; ++i) {
1170	length8=utf8Lengths[i];
1171	if(length8==`0`) {
1172	continue; // String not representable in UTF-8.
1173	}
1174	int32_t overlap=spanBackUTF8Lengths[i];
1175	if(overlap==ALL_CP_CONTAINED) {
1176	s8+=length8;
1177	continue; // Irrelevant string.
1178	}
1179
1180	// Try to match this string at pos-(length8-overlap)..pos-length8.
1181	if(overlap>=LONG_SPAN) {
1182	overlap=length8;
1183	// While contained: No point matching fully inside the code point span.
1184	int32_t len1=`0`;
1185	U8_FWD_1(s8, len1, overlap);
1186	overlap-=len1; // Length of the string minus the first code point.
1187	}
1188	if(overlap>spanLength) {
1189	overlap=spanLength;
1190	}
1191	int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1192	for(;;) {
1193	if(dec>pos) {
1194	break;
1195	}
1196	// Try to match if the decrement is not listed already.
1197	// Match at code point boundaries. (The UTF-8 strings were converted
1198	// from UTF-16 and are guaranteed to be well-formed.)
1199	if( !U8_IS_TRAIL(s[pos-dec]) &&
1200	!offsets.containsOffset(dec) &&
1201	matches8(s+pos-dec, s8, length8)
1202	) {
1203	if(dec==pos) {
1204	return `0`; // Reached the start of the string.
1205	}
1206	offsets.addOffset(dec);
1207	}
1208	if(overlap==`0`) {
1209	break;
1210	}
1211	--overlap;
1212	++dec;
1213	}
1214	s8+=length8;
1215	}
1216	} else / USET_SPAN_SIMPLE / {
1217	int32_t maxDec=`0`, maxOverlap=`0`;
1218	for(i=`0`; i<stringsLength; ++i) {
1219	length8=utf8Lengths[i];
1220	if(length8==`0`) {
1221	continue; // String not representable in UTF-8.
1222	}
1223	int32_t overlap=spanBackUTF8Lengths[i];
1224	// For longest match, we do need to try to match even an all-contained string
1225	// to find the match from the latest end.
1226
1227	// Try to match this string at pos-(length8-overlap)..pos-length8.
1228	if(overlap>=LONG_SPAN) {
1229	overlap=length8;
1230	// Longest match: Need to match fully inside the code point span
1231	// to find the match from the latest end.
1232	}
1233	if(overlap>spanLength) {
1234	overlap=spanLength;
1235	}
1236	int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1237	for(;;) {
1238	if(dec>pos \|\| overlap<maxOverlap) {
1239	break;
1240	}
1241	// Try to match if the string is longer or ends later.
1242	// Match at code point boundaries. (The UTF-8 strings were converted
1243	// from UTF-16 and are guaranteed to be well-formed.)
1244	if( !U8_IS_TRAIL(s[pos-dec]) &&
1245	(overlap>maxOverlap \|\| / redundant overlap==maxOverlap && / dec>maxDec) &&
1246	matches8(s+pos-dec, s8, length8)
1247	) {
1248	maxDec=dec; // Longest match from latest end.
1249	maxOverlap=overlap;
1250	break;
1251	}
1252	--overlap;
1253	++dec;
1254	}
1255	s8+=length8;
1256	}
1257
1258	if(maxDec!=`0` \|\| maxOverlap!=`0`) {
1259	// Longest-match algorithm, and there was a string match.
1260	// Simply continue before it.
1261	pos-=maxDec;
1262	if(pos==`0`) {
1263	return `0`; // Reached the start of the string.
1264	}
1265	spanLength=`0`; // Match strings from before a string match.
1266	continue;
1267	}
1268	}
1269	// Finished trying to match all strings at pos.
1270
1271	if(spanLength!=`0` \|\| pos==length) {
1272	// The position is before an unlimited code point span (spanLength!=0),
1273	// not before a string match.
1274	// The only position where spanLength==0 before a span is pos==length.
1275	// Otherwise, an unlimited code point span is only tried again when no
1276	// strings match, and if such a non-initial span fails we stop.
1277	if(offsets.isEmpty()) {
1278	return pos; // No strings matched before a span.
1279	}
1280	// Match strings from before the next string match.
1281	} else {
1282	// The position is before a string match (or a single code point).
1283	if(offsets.isEmpty()) {
1284	// No more strings matched before a previous string match.
1285	// Try another code point span from before the last string match.
1286	int32_t oldPos=pos;
1287	pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1288	spanLength=oldPos-pos;
1289	if( pos==`0` \|\| // Reached the start of the string, or
1290	spanLength==`0` // neither strings nor span progressed.
1291	) {
1292	return pos;
1293	}
1294	continue; // spanLength>0: Match strings from before a span.
1295	} else {
1296	// Try to match only one code point from before a string match if some
1297	// string matched beyond it, so that we try all possible positions
1298	// and don't overshoot.
1299	spanLength=spanOneBackUTF8(spanSet, s, pos);
1300	if(spanLength>`0`) {
1301	if(spanLength==pos) {
1302	return `0`; // Reached the start of the string.
1303	}
1304	// Match strings before this code point.
1305	// There cannot be any decrements below it because UnicodeSet strings
1306	// contain multiple code points.
1307	pos-=spanLength;
1308	offsets.shift(spanLength);
1309	spanLength=`0`;
1310	continue; // Match strings from before a single code point.
1311	}
1312	// Match strings from before the next string match.
1313	}
1314	}
1315	pos-=offsets.popMinimum();
1316	spanLength=`0`; // Match strings from before a string match.
1317	}
1318	}
1319
1320	/*
1321	* Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1322	*
1323	* Theoretical algorithm:
1324	* - Iterate through the string, and at each code point boundary:
1325	* + If the code point there is in the set, then return with the current position.
1326	* + If a set string matches at the current position, then return with the current position.
1327	*
1328	* Optimized implementation:
1329	*
1330	* (Same assumption as for span() above.)
1331	*
1332	* Create and cache a spanNotSet which contains all of the single code points
1333	* of the original set but none of its strings.
1334	* For each set string add its initial code point to the spanNotSet.
1335	* (Also add its final code point for spanNotBack().)
1336	*
1337	* - Loop:
1338	* + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1339	* + If the current code point is in the original set, then
1340	* return the current position.
1341	* + If any set string matches at the current position, then
1342	* return the current position.
1343	* + If there is no match at the current position, neither for the code point there
1344	* nor for any set string, then skip this code point and continue the loop.
1345	* This happens for set-string-initial code points that were added to spanNotSet
1346	* when there is not actually a match for such a set string.
1347	*/
1348
1349	int32_t UnicodeSetStringSpan::spanNot(const char16_t s, int32_t length) const* {
1350	int32_t pos=`0`, rest=length;
1351	int32_t i, stringsLength=strings.size();
1352	do {
1353	// Span until we find a code point from the set,
1354	// or a code point that starts or ends some string.
1355	i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1356	if(i==rest) {
1357	return length; // Reached the end of the string.
1358	}
1359	pos+=i;
1360	rest-=i;
1361
1362	// Check whether the current code point is in the original set,
1363	// without the string starts and ends.
1364	int32_t cpLength=spanOne(spanSet, s+pos, rest);
1365	if(cpLength>`0`) {
1366	return pos; // There is a set element at pos.
1367	}
1368
1369	// Try to match the strings at pos.
1370	for(i=`0`; i<stringsLength; ++i) {
1371	if(spanLengths[i]==ALL_CP_CONTAINED) {
1372	continue; // Irrelevant string. (Also the empty string.)
1373	}
1374	const UnicodeString &string=(const* UnicodeString *)strings.elementAt(i);
1375	const char16_t *s16=string.getBuffer();
1376	int32_t length16=string.length();
1377	U_ASSERT(length>`0`);
1378	if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1379	return pos; // There is a set element at pos.
1380	}
1381	}
1382
1383	// The span(while not contained) ended on a string start/end which is
1384	// not in the original set. Skip this code point and continue.
1385	// cpLength<0
1386	pos-=cpLength;
1387	rest+=cpLength;
1388	} while(rest!=`0`);
1389	return length; // Reached the end of the string.
1390	}
1391
1392	int32_t UnicodeSetStringSpan::spanNotBack(const char16_t s, int32_t length) const* {
1393	int32_t pos=length;
1394	int32_t i, stringsLength=strings.size();
1395	do {
1396	// Span until we find a code point from the set,
1397	// or a code point that starts or ends some string.
1398	pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1399	if(pos==`0`) {
1400	return `0`; // Reached the start of the string.
1401	}
1402
1403	// Check whether the current code point is in the original set,
1404	// without the string starts and ends.
1405	int32_t cpLength=spanOneBack(spanSet, s, pos);
1406	if(cpLength>`0`) {
1407	return pos; // There is a set element at pos.
1408	}
1409
1410	// Try to match the strings at pos.
1411	for(i=`0`; i<stringsLength; ++i) {
1412	// Use spanLengths rather than a spanBackLengths pointer because
1413	// it is easier and we only need to know whether the string is irrelevant
1414	// which is the same in either array.
1415	if(spanLengths[i]==ALL_CP_CONTAINED) {
1416	continue; // Irrelevant string. (Also the empty string.)
1417	}
1418	const UnicodeString &string=(const* UnicodeString *)strings.elementAt(i);
1419	const char16_t *s16=string.getBuffer();
1420	int32_t length16=string.length();
1421	U_ASSERT(length>`0`);
1422	if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1423	return pos; // There is a set element at pos.
1424	}
1425	}
1426
1427	// The span(while not contained) ended on a string start/end which is
1428	// not in the original set. Skip this code point and continue.
1429	// cpLength<0
1430	pos+=cpLength;
1431	} while(pos!=`0`);
1432	return `0`; // Reached the start of the string.
1433	}
1434
1435	int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t s, int32_t length) const* {
1436	int32_t pos=`0`, rest=length;
1437	int32_t i, stringsLength=strings.size();
1438	uint8_t *spanUTF8Lengths=spanLengths;
1439	if(all) {
1440	spanUTF8Lengths+=`2`*stringsLength;
1441	}
1442	do {
1443	// Span until we find a code point from the set,
1444	// or a code point that starts or ends some string.
1445	i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1446	if(i==rest) {
1447	return length; // Reached the end of the string.
1448	}
1449	pos+=i;
1450	rest-=i;
1451
1452	// Check whether the current code point is in the original set,
1453	// without the string starts and ends.
1454	int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1455	if(cpLength>`0`) {
1456	return pos; // There is a set element at pos.
1457	}
1458
1459	// Try to match the strings at pos.
1460	const uint8_t *s8=utf8;
1461	int32_t length8;
1462	for(i=`0`; i<stringsLength; ++i) {
1463	length8=utf8Lengths[i];
1464	// ALL_CP_CONTAINED: Irrelevant string.
1465	if(length8!=`0` && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1466	return pos; // There is a set element at pos.
1467	}
1468	s8+=length8;
1469	}
1470
1471	// The span(while not contained) ended on a string start/end which is
1472	// not in the original set. Skip this code point and continue.
1473	// cpLength<0
1474	pos-=cpLength;
1475	rest+=cpLength;
1476	} while(rest!=`0`);
1477	return length; // Reached the end of the string.
1478	}
1479
1480	int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t s, int32_t length) const* {
1481	int32_t pos=length;
1482	int32_t i, stringsLength=strings.size();
1483	uint8_t *spanBackUTF8Lengths=spanLengths;
1484	if(all) {
1485	spanBackUTF8Lengths+=`3`*stringsLength;
1486	}
1487	do {
1488	// Span until we find a code point from the set,
1489	// or a code point that starts or ends some string.
1490	pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1491	if(pos==`0`) {
1492	return `0`; // Reached the start of the string.
1493	}
1494
1495	// Check whether the current code point is in the original set,
1496	// without the string starts and ends.
1497	int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1498	if(cpLength>`0`) {
1499	return pos; // There is a set element at pos.
1500	}
1501
1502	// Try to match the strings at pos.
1503	const uint8_t *s8=utf8;
1504	int32_t length8;
1505	for(i=`0`; i<stringsLength; ++i) {
1506	length8=utf8Lengths[i];
1507	// ALL_CP_CONTAINED: Irrelevant string.
1508	if(length8!=`0` && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1509	return pos; // There is a set element at pos.
1510	}
1511	s8+=length8;
1512	}
1513
1514	// The span(while not contained) ended on a string start/end which is
1515	// not in the original set. Skip this code point and continue.
1516	// cpLength<0
1517	pos+=cpLength;
1518	} while(pos!=`0`);
1519	return `0`; // Reached the start of the string.
1520	}
1521
1522	U_NAMESPACE_END
1523

Browse the source code of Godot/thirdparty/icu4c/common/unisetspan.cpp