Unicode.h source code [ClickHouse/contrib/poco/Foundation/include/Poco/Unicode.h]

1	//
2	// Unicode.h
3	//
4	// Library: Foundation
5	// Package: Text
6	// Module: Unicode
7	//
8	// Definition of the Unicode class.
9	//
10	// Copyright (c) 2007, Applied Informatics Software Engineering GmbH.
11	// and Contributors.
12	//
13	// SPDX-License-Identifier: BSL-1.0
14	//
15
16
17	#ifndef Foundation_Unicode_INCLUDED
18	#define Foundation_Unicode_INCLUDED
19
20
21	#include "Poco/Foundation.h"
22
23
24	namespace Poco {
25
26
27	class Foundation_API Unicode
28	/// This class contains enumerations and static
29	/// utility functions for dealing with Unicode characters
30	/// and their properties.
31	///
32	/// For more information on Unicode, see <http://www.unicode.org>.
33	///
34	/// The implementation is based on the Unicode support
35	/// functions in PCRE.
36	{
37	public:
38	// Implementation note: the following definitions must be kept
39	// in sync with those from ucp.h (PCRE).
40	enum CharacterCategory
41	/// Unicode character categories.
42	{
43	UCP_OTHER,
44	UCP_LETTER,
45	UCP_MARK,
46	UCP_NUMBER,
47	UCP_PUNCTUATION,
48	UCP_SYMBOL,
49	UCP_SEPARATOR
50	};
51
52	enum CharacterType
53	/// Unicode character types.
54	{
55	UCP_CONTROL,
56	UCP_FORMAT,
57	UCP_UNASSIGNED,
58	UCP_PRIVATE_USE,
59	UCP_SURROGATE,
60	UCP_LOWER_CASE_LETTER,
61	UCP_MODIFIER_LETTER,
62	UCP_OTHER_LETTER,
63	UCP_TITLE_CASE_LETTER,
64	UCP_UPPER_CASE_LETTER,
65	UCP_SPACING_MARK,
66	UCP_ENCLOSING_MARK,
67	UCP_NON_SPACING_MARK,
68	UCP_DECIMAL_NUMBER,
69	UCP_LETTER_NUMBER,
70	UCP_OTHER_NUMBER,
71	UCP_CONNECTOR_PUNCTUATION,
72	UCP_DASH_PUNCTUATION,
73	UCP_CLOSE_PUNCTUATION,
74	UCP_FINAL_PUNCTUATION,
75	UCP_INITIAL_PUNCTUATION,
76	UCP_OTHER_PUNCTUATION,
77	UCP_OPEN_PUNCTUATION,
78	UCP_CURRENCY_SYMBOL,
79	UCP_MODIFIER_SYMBOL,
80	UCP_MATHEMATICAL_SYMBOL,
81	UCP_OTHER_SYMBOL,
82	UCP_LINE_SEPARATOR,
83	UCP_PARAGRAPH_SEPARATOR,
84	UCP_SPACE_SEPARATOR
85	};
86
87	enum Script
88	/// Unicode 7.0 script identifiers.
89	{
90	UCP_ARABIC,
91	UCP_ARMENIAN,
92	UCP_BENGALI,
93	UCP_BOPOMOFO,
94	UCP_BRAILLE,
95	UCP_BUGINESE,
96	UCP_BUHID,
97	UCP_CANADIAN_ABORIGINAL,
98	UCP_CHEROKEE,
99	UCP_COMMON,
100	UCP_COPTIC,
101	UCP_CYPRIOT,
102	UCP_CYRILLIC,
103	UCP_DESERET,
104	UCP_DEVANAGARI,
105	UCP_ETHIOPIC,
106	UCP_GEORGIAN,
107	UCP_GLAGOLITIC,
108	UCP_GOTHIC,
109	UCP_GREEK,
110	UCP_GUJARATI,
111	UCP_GURMUKHI,
112	UCP_HAN,
113	UCP_HANGUL,
114	UCP_HANUNOO,
115	UCP_HEBREW,
116	UCP_HIRAGANA,
117	UCP_INHERITED,
118	UCP_KANNADA,
119	UCP_KATAKANA,
120	UCP_KHAROSHTHI,
121	UCP_KHMER,
122	UCP_LAO,
123	UCP_LATIN,
124	UCP_LIMBU,
125	UCP_LINEAR_B,
126	UCP_MALAYALAM,
127	UCP_MONGOLIAN,
128	UCP_MYANMAR,
129	UCP_NEW_TAI_LUE,
130	UCP_OGHAM,
131	UCP_OLD_ITALIC,
132	UCP_OLD_PERSIAN,
133	UCP_ORIYA,
134	UCP_OSMANYA,
135	UCP_RUNIC,
136	UCP_SHAVIAN,
137	UCP_SINHALA,
138	UCP_SYLOTI_NAGRI,
139	UCP_SYRIAC,
140	UCP_TAGALOG,
141	UCP_TAGBANWA,
142	UCP_TAI_LE,
143	UCP_TAMIL,
144	UCP_TELUGU,
145	UCP_THAANA,
146	UCP_THAI,
147	UCP_TIBETAN,
148	UCP_TIFINAGH,
149	UCP_UGARITIC,
150	UCP_YI,
151	// Unicode 5.0
152	UCP_BALINESE,
153	UCP_CUNEIFORM,
154	UCP_NKO,
155	UCP_PHAGS_PA,
156	UCP_PHOENICIAN,
157	// Unicode 5.1
158	UCP_CARIAN,
159	UCP_CHAM,
160	UCP_KAYAH_LI,
161	UCP_LEPCHA,
162	UCP_LYCIAN,
163	UCP_LYDIAN,
164	UCP_OL_CHIKI,
165	UCP_REJANG,
166	UCP_SAURASHTRA,
167	UCP_SUNDANESE,
168	UCP_VAI,
169	// Unicode 5.2
170	UCP_AVESTAN,
171	UCP_BAMUM,
172	UCP_EGYPTIAN_HIEROGLYPHS,
173	UCP_IMPERIAL_ARAMAIC,
174	UCP_INSCRIPTIONAL_PAHLAVI,
175	UCP_INSCRIPTIONAL_PARTHIAN,
176	UCP_JAVANESE,
177	UCP_KAITHI,
178	UCP_LISU,
179	UCP_MEETEI_MAYEK,
180	UCP_OLD_SOUTH_ARABIAN,
181	UCP_OLD_TURKIC,
182	UCP_SAMARITAN,
183	UCP_TAI_THAM,
184	UCP_TAI_VIET,
185	// Unicode 6.0
186	UCP_BATAK,
187	UCP_BRAHMI,
188	UCP_MANDAIC,
189	// Unicode 6.1
190	UCP_CHAKMA,
191	UCP_MEROITIC_CURSIVE,
192	UCP_MEROITIC_HIEROGLYPHS,
193	UCP_MIAO,
194	UCP_SHARADA,
195	UCP_SORA_SOMPENG,
196	UCP_TAKRI,
197	// Unicode 7.0
198	UCP_BASSA_VAH,
199	UCP_CAUCASIAN_ALBANIAN,
200	UCP_DUPLOYAN,
201	UCP_ELBASAN,
202	UCP_GRANTHA,
203	UCP_KHOJKI,
204	UCP_KHUDAWADI,
205	UCP_LINEAR_A,
206	UCP_MAHAJANI,
207	UCP_MANICHAEAN,
208	UCP_MENDE_KIKAKUI,
209	UCP_MODI,
210	UCP_MRO,
211	UCP_NABATAEAN,
212	UCP_OLD_NORTH_ARABIAN,
213	UCP_OLD_PERMIC,
214	UCP_PAHAWH_HMONG,
215	UCP_PALMYRENE,
216	UCP_PSALTER_PAHLAVI,
217	UCP_PAU_CIN_HAU,
218	UCP_SIDDHAM,
219	UCP_TIRHUTA,
220	UCP_WARANG_CITI
221	};
222
223	enum
224	{
225	UCP_MAX_CODEPOINT = `0x10FFFF`
226	};
227
228	struct CharacterProperties
229	/// This structure holds the character properties
230	/// of an Unicode character.
231	{
232	CharacterCategory category;
233	CharacterType type;
234	Script script;
235	};
236
237	static void properties(int ch, CharacterProperties& props);
238	/// Return the Unicode character properties for the
239	/// character with the given Unicode value.
240
241	static bool isSpace(int ch);
242	/// Returns true iff the given character is a separator.
243
244	static bool isDigit(int ch);
245	/// Returns true iff the given character is a numeric character.
246
247	static bool isPunct(int ch);
248	/// Returns true iff the given character is a punctuation character.
249
250	static bool isAlpha(int ch);
251	/// Returns true iff the given character is a letter.
252
253	static bool isLower(int ch);
254	/// Returns true iff the given character is a lowercase
255	/// character.
256
257	static bool isUpper(int ch);
258	/// Returns true iff the given character is an uppercase
259	/// character.
260
261	static int toLower(int ch);
262	/// If the given character is an uppercase character,
263	/// return its lowercase counterpart, otherwise return
264	/// the character.
265
266	static int toUpper(int ch);
267	/// If the given character is a lowercase character,
268	/// return its uppercase counterpart, otherwise return
269	/// the character.
270	};
271
272
273	//
274	// inlines
275	//
276	inline bool Unicode::isSpace(int ch)
277	{
278	CharacterProperties props;
279	properties(ch, props);
280	return props.category == UCP_SEPARATOR;
281	}
282
283
284	inline bool Unicode::isDigit(int ch)
285	{
286	CharacterProperties props;
287	properties(ch, props);
288	return props.category == UCP_NUMBER;
289	}
290
291
292	inline bool Unicode::isPunct(int ch)
293	{
294	CharacterProperties props;
295	properties(ch, props);
296	return props.category == UCP_PUNCTUATION;
297	}
298
299
300	inline bool Unicode::isAlpha(int ch)
301	{
302	CharacterProperties props;
303	properties(ch, props);
304	return props.category == UCP_LETTER;
305	}
306
307
308	inline bool Unicode::isLower(int ch)
309	{
310	CharacterProperties props;
311	properties(ch, props);
312	return props.category == UCP_LETTER && props.type == UCP_LOWER_CASE_LETTER;
313	}
314
315
316	inline bool Unicode::isUpper(int ch)
317	{
318	CharacterProperties props;
319	properties(ch, props);
320	return props.category == UCP_LETTER && props.type == UCP_UPPER_CASE_LETTER;
321	}
322
323
324	} // namespace Poco
325
326
327	#endif // Foundation_Unicode_INCLUDED
328

Browse the source code of ClickHouse/contrib/poco/Foundation/include/Poco/Unicode.h