collationfcd.h source code [ClickHouse/contrib/icu/icu4c/source/i18n/collationfcd.h]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	* Copyright (C) 2012-2014, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	*******************************************************************************
8	* collationfcd.h
9	*
10	* created on: 2012aug18
11	* created by: Markus W. Scherer
12	*/
13
14	#ifndef __COLLATIONFCD_H__
15	#define __COLLATIONFCD_H__
16
17	#include "unicode/utypes.h"
18
19	#if !UCONFIG_NO_COLLATION
20
21	#include "unicode/utf16.h"
22
23	U_NAMESPACE_BEGIN
24
25	/**
26	* Data and functions for the FCD check fast path.
27	*
28	* The fast path looks at a pair of 16-bit code units and checks
29	* whether there is an FCD boundary between them;
30	* there is if the first unit has a trailing ccc=0 (!hasTccc(first))
31	* or the second unit has a leading ccc=0 (!hasLccc(second)),
32	* or both.
33	* When the fast path finds a possible non-boundary,
34	* then the FCD check slow path looks at the actual sequence of FCD values.
35	*
36	* This is a pure optimization.
37	* The fast path must at least find all possible non-boundaries.
38	* If the fast path is too pessimistic, it costs performance.
39	*
40	* For a pair of BMP characters, the fast path tests are precise (1 bit per character).
41	*
42	* For a supplementary code point, the two units are its lead and trail surrogates.
43	* We set hasTccc(lead)=true if any of its 1024 associated supplementary code points
44	* has lccc!=0 or tccc!=0.
45	* We set hasLccc(trail)=true for all trail surrogates.
46	* As a result, we leave the fast path if the lead surrogate might start a
47	* supplementary code point that is not FCD-inert.
48	* (So the fast path need not detect that there is a surrogate pair,
49	* nor look ahead to the next full code point.)
50	*
51	* hasLccc(lead)=true if any of its 1024 associated supplementary code points
52	* has lccc!=0, for fast boundary checking between BMP & supplementary.
53	*
54	* hasTccc(trail)=false:
55	* It should only be tested for unpaired trail surrogates which are FCD-inert.
56	*/
57	class U_I18N_API CollationFCD {
58	public:
59	static inline UBool hasLccc(UChar32 c) {
60	// assert c <= 0xffff
61	// c can be negative, e.g., U_SENTINEL from UCharIterator;
62	// that is handled in the first test.
63	int32_t i;
64	return
65	// U+0300 is the first character with lccc!=0.
66	c >= `0x300` &&
67	(i = lcccIndex[c >> `5`]) != `0` &&
68	(lcccBits[i] & ((uint32_t)`1` << (c & `0x1f`))) != `0`;
69	}
70
71	static inline UBool hasTccc(UChar32 c) {
72	// assert c <= 0xffff
73	// c can be negative, e.g., U_SENTINEL from UCharIterator;
74	// that is handled in the first test.
75	int32_t i;
76	return
77	// U+00C0 is the first character with tccc!=0.
78	c >= `0xc0` &&
79	(i = tcccIndex[c >> `5`]) != `0` &&
80	(tcccBits[i] & ((uint32_t)`1` << (c & `0x1f`))) != `0`;
81	}
82
83	static inline UBool mayHaveLccc(UChar32 c) {
84	// Handles all of Unicode 0..10FFFF.
85	// c can be negative, e.g., U_SENTINEL.
86	// U+0300 is the first character with lccc!=0.
87	if(c < `0x300`) { return FALSE; }
88	if(c > `0xffff`) { c = U16_LEAD(c); }
89	int32_t i;
90	return
91	(i = lcccIndex[c >> `5`]) != `0` &&
92	(lcccBits[i] & ((uint32_t)`1` << (c & `0x1f`))) != `0`;
93	}
94
95	/**
96	* Tibetan composite vowel signs (U+0F73, U+0F75, U+0F81)
97	* must be decomposed before reaching the core collation code,
98	* or else some sequences including them, even ones passing the FCD check,
99	* do not yield canonically equivalent results.
100	*
101	* This is a fast and imprecise test.
102	*
103	* @param c a code point
104	* @return TRUE if c is U+0F73, U+0F75 or U+0F81 or one of several other Tibetan characters
105	*/
106	static inline UBool maybeTibetanCompositeVowel(UChar32 c) {
107	return (c & `0x1fff01`) == `0xf01`;
108	}
109
110	/**
111	* Tibetan composite vowel signs (U+0F73, U+0F75, U+0F81)
112	* must be decomposed before reaching the core collation code,
113	* or else some sequences including them, even ones passing the FCD check,
114	* do not yield canonically equivalent results.
115	*
116	* They have distinct lccc/tccc combinations: 129/130 or 129/132.
117	*
118	* @param fcd16 the FCD value (lccc/tccc combination) of a code point
119	* @return TRUE if fcd16 is from U+0F73, U+0F75 or U+0F81
120	*/
121	static inline UBool isFCD16OfTibetanCompositeVowel(uint16_t fcd16) {
122	return fcd16 == `0x8182` \|\| fcd16 == `0x8184`;
123	}
124
125	private:
126	CollationFCD(); // No instantiation.
127
128	static const uint8_t lcccIndex[`2048`];
129	static const uint8_t tcccIndex[`2048`];
130	static const uint32_t lcccBits[];
131	static const uint32_t tcccBits[];
132	};
133
134	U_NAMESPACE_END
135
136	#endif // !UCONFIG_NO_COLLATION
137	#endif // __COLLATIONFCD_H__
138

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/collationfcd.h