csdetect.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/csdetect.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2005-2016, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	*/
9
10	#include "unicode/utypes.h"
11
12	#if !UCONFIG_NO_CONVERSION
13
14	#include "unicode/ucsdet.h"
15
16	#include "csdetect.h"
17	#include "csmatch.h"
18	#include "uenumimp.h"
19
20	#include "cmemory.h"
21	#include "cstring.h"
22	#include "umutex.h"
23	#include "ucln_in.h"
24	#include "uarrsort.h"
25	#include "inputext.h"
26	#include "csrsbcs.h"
27	#include "csrmbcs.h"
28	#include "csrutf8.h"
29	#include "csrucode.h"
30	#include "csr2022.h"
31
// Allocate raw storage for `count` objects of `type` through uprv_malloc.
// No constructors are run; callers in this file compare the result with NULL.
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
// Hand storage obtained through NEW_ARRAY back to uprv_free.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34
35	U_NAMESPACE_BEGIN
36
37	struct CSRecognizerInfo : public UMemory {
38	CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
39	: recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}
40
41	~CSRecognizerInfo() {delete recognizer;}
42
43	CharsetRecognizer *recognizer;
44	UBool isDefaultEnabled;
45	};
46
47	U_NAMESPACE_END
48
49	static icu::CSRecognizerInfo **fCSRecognizers = NULL;
50	static icu::UInitOnce gCSRecognizersInitOnce = U_INITONCE_INITIALIZER;
51	static int32_t fCSRecognizers_size = `0`;
52
53	U_CDECL_BEGIN
54	static UBool U_CALLCONV csdet_cleanup(void)
55	{
56	U_NAMESPACE_USE
57	if (fCSRecognizers != NULL) {
58	for(int32_t r = `0`; r < fCSRecognizers_size; r += `1`) {
59	delete fCSRecognizers[r];
60	fCSRecognizers[r] = NULL;
61	}
62
63	DELETE_ARRAY(fCSRecognizers);
64	fCSRecognizers = NULL;
65	fCSRecognizers_size = `0`;
66	}
67	gCSRecognizersInitOnce.reset();
68
69	return TRUE;
70	}
71
72	static int32_t U_CALLCONV
73	charsetMatchComparator(const void * /context/, const void left, const* void *right)
74	{
75	U_NAMESPACE_USE
76
77	const CharsetMatch *csm_l = (const* CharsetMatch **) left;
78	const CharsetMatch *csm_r = (const* CharsetMatch **) right;
79
80	// NOTE: compare is backwards to sort from highest to lowest.
81	return (csm_r)->getConfidence() - (csm_l)->getConfidence();
82	}
83
84	static void U_CALLCONV initRecognizers(UErrorCode &status) {
85	U_NAMESPACE_USE
86	ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
87	CSRecognizerInfo *tempArray[] = {
88	new CSRecognizerInfo (new CharsetRecog_UTF8 (), TRUE),
89
90	new CSRecognizerInfo (new CharsetRecog_UTF_16_BE (), TRUE),
91	new CSRecognizerInfo (new CharsetRecog_UTF_16_LE (), TRUE),
92	new CSRecognizerInfo (new CharsetRecog_UTF_32_BE (), TRUE),
93	new CSRecognizerInfo (new CharsetRecog_UTF_32_LE (), TRUE),
94
95	new CSRecognizerInfo (new CharsetRecog_8859_1 (), TRUE),
96	new CSRecognizerInfo (new CharsetRecog_8859_2 (), TRUE),
97	new CSRecognizerInfo (new CharsetRecog_8859_5_ru (), TRUE),
98	new CSRecognizerInfo (new CharsetRecog_8859_6_ar (), TRUE),
99	new CSRecognizerInfo (new CharsetRecog_8859_7_el (), TRUE),
100	new CSRecognizerInfo (new CharsetRecog_8859_8_I_he (), TRUE),
101	new CSRecognizerInfo (new CharsetRecog_8859_8_he (), TRUE),
102	new CSRecognizerInfo (new CharsetRecog_windows_1251 (), TRUE),
103	new CSRecognizerInfo (new CharsetRecog_windows_1256 (), TRUE),
104	new CSRecognizerInfo (new CharsetRecog_KOI8_R (), TRUE),
105	new CSRecognizerInfo (new CharsetRecog_8859_9_tr (), TRUE),
106	new CSRecognizerInfo (new CharsetRecog_sjis (), TRUE),
107	new CSRecognizerInfo (new CharsetRecog_gb_18030 (), TRUE),
108	new CSRecognizerInfo (new CharsetRecog_euc_jp (), TRUE),
109	new CSRecognizerInfo (new CharsetRecog_euc_kr (), TRUE),
110	new CSRecognizerInfo (new CharsetRecog_big5 (), TRUE),
111
112	new CSRecognizerInfo (new CharsetRecog_2022JP (), TRUE),
113	#if !UCONFIG_ONLY_HTML_CONVERSION
114	new CSRecognizerInfo (new CharsetRecog_2022KR (), TRUE),
115	new CSRecognizerInfo (new CharsetRecog_2022CN (), TRUE),
116
117	new CSRecognizerInfo (new CharsetRecog_IBM424_he_rtl (), FALSE),
118	new CSRecognizerInfo (new CharsetRecog_IBM424_he_ltr (), FALSE),
119	new CSRecognizerInfo (new CharsetRecog_IBM420_ar_rtl (), FALSE),
120	new CSRecognizerInfo (new CharsetRecog_IBM420_ar_ltr (), FALSE)
121	#endif
122	};
123	int32_t rCount = UPRV_LENGTHOF(tempArray);
124
125	fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
126
127	if (fCSRecognizers == NULL) {
128	status = U_MEMORY_ALLOCATION_ERROR;
129	}
130	else {
131	fCSRecognizers_size = rCount;
132	for (int32_t r = `0`; r < rCount; r += `1`) {
133	fCSRecognizers[r] = tempArray[r];
134	if (fCSRecognizers[r] == NULL) {
135	status = U_MEMORY_ALLOCATION_ERROR;
136	}
137	}
138	}
139	}
140
141	U_CDECL_END
142
143	U_NAMESPACE_BEGIN
144
145	void CharsetDetector::setRecognizers(UErrorCode &status)
146	{
147	umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
148	}
149
150	CharsetDetector::CharsetDetector(UErrorCode &status)
151	: textIn(new InputText (status)), resultArray(NULL),
152	resultCount(`0`), fStripTags(FALSE), fFreshTextSet(FALSE),
153	fEnabledRecognizers(NULL)
154	{
155	if (U_FAILURE(status)) {
156	return;
157	}
158
159	setRecognizers(status);
160
161	if (U_FAILURE(status)) {
162	return;
163	}
164
165	resultArray = (CharsetMatch )uprv_malloc(sizeof*(CharsetMatch )*fCSRecognizers_size);
166
167	if (resultArray == NULL) {
168	status = U_MEMORY_ALLOCATION_ERROR;
169	return;
170	}
171
172	for(int32_t i = `0`; i < fCSRecognizers_size; i += `1`) {
173	resultArray[i] = new CharsetMatch ();
174
175	if (resultArray[i] == NULL) {
176	status = U_MEMORY_ALLOCATION_ERROR;
177	break;
178	}
179	}
180	}
181
182	CharsetDetector::~CharsetDetector()
183	{
184	delete textIn;
185
186	for(int32_t i = `0`; i < fCSRecognizers_size; i += `1`) {
187	delete resultArray[i];
188	}
189
190	uprv_free(resultArray);
191
192	if (fEnabledRecognizers) {
193	uprv_free(fEnabledRecognizers);
194	}
195	}
196
197	void CharsetDetector::setText(const char *in, int32_t len)
198	{
199	textIn->setText(in, len);
200	fFreshTextSet = TRUE;
201	}
202
203	UBool CharsetDetector::setStripTagsFlag(UBool flag)
204	{
205	UBool temp = fStripTags;
206	fStripTags = flag;
207	fFreshTextSet = TRUE;
208	return temp;
209	}
210
211	UBool CharsetDetector::getStripTagsFlag() const
212	{
213	return fStripTags;
214	}
215
216	void CharsetDetector::setDeclaredEncoding(const char encoding, int32_t len) const*
217	{
218	textIn->setDeclaredEncoding(encoding,len);
219	}
220
221	int32_t CharsetDetector::getDetectableCount()
222	{
223	UErrorCode status = U_ZERO_ERROR;
224
225	setRecognizers(status);
226
227	return fCSRecognizers_size;
228	}
229
230	const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
231	{
232	int32_t maxMatchesFound = `0`;
233
234	detectAll(maxMatchesFound, status);
235
236	if(maxMatchesFound > `0`) {
237	return resultArray[`0`];
238	} else {
239	return NULL;
240	}
241	}
242
243	const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
244	{
245	if(!textIn->isSet()) {
246	status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
247
248	return NULL;
249	} else if (fFreshTextSet) {
250	CharsetRecognizer *csr;
251	int32_t i;
252
253	textIn->MungeInput(fStripTags);
254
255	// Iterate over all possible charsets, remember all that
256	// give a match quality > 0.
257	resultCount = `0`;
258	for (i = `0`; i < fCSRecognizers_size; i += `1`) {
259	csr = fCSRecognizers[i]->recognizer;
260	if (csr->match(textIn, resultArray[resultCount])) {
261	resultCount++;
262	}
263	}
264
265	if (resultCount > `1`) {
266	uprv_sortArray(resultArray, resultCount, sizeof resultArray[`0`], charsetMatchComparator, NULL, TRUE, &status);
267	}
268	fFreshTextSet = FALSE;
269	}
270
271	maxMatchesFound = resultCount;
272
273	return resultArray;
274	}
275
276	void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
277	{
278	if (U_FAILURE(status)) {
279	return;
280	}
281
282	int32_t modIdx = -`1`;
283	UBool isDefaultVal = FALSE;
284	for (int32_t i = `0`; i < fCSRecognizers_size; i++) {
285	CSRecognizerInfo *csrinfo = fCSRecognizers[i];
286	if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == `0`) {
287	modIdx = i;
288	isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
289	break;
290	}
291	}
292	if (modIdx < `0`) {
293	// No matching encoding found
294	status = U_ILLEGAL_ARGUMENT_ERROR;
295	return;
296	}
297
298	if (fEnabledRecognizers == NULL && !isDefaultVal) {
299	// Create an array storing the non default setting
300	fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
301	if (fEnabledRecognizers == NULL) {
302	status = U_MEMORY_ALLOCATION_ERROR;
303	return;
304	}
305	// Initialize the array with default info
306	for (int32_t i = `0`; i < fCSRecognizers_size; i++) {
307	fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
308	}
309	}
310
311	if (fEnabledRecognizers != NULL) {
312	fEnabledRecognizers[modIdx] = enabled;
313	}
314	}
315
/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
{
    if( index > fCSRecognizers_size-1 || index < 0) {
        status = U_INDEX_OUTOFBOUNDS_ERROR;

        return 0;
    } else {
        return fCSRecognizers[index]->getName();
    }
}*/
326
327	U_NAMESPACE_END
328
329	U_CDECL_BEGIN
330	typedef struct {
331	int32_t currIndex;
332	UBool all;
333	UBool *enabledRecognizers;
334	} Context;
335
336
337
338	static void U_CALLCONV
339	enumClose(UEnumeration *en) {
340	if(en->context != NULL) {
341	DELETE_ARRAY(en->context);
342	}
343
344	DELETE_ARRAY(en);
345	}
346
347	static int32_t U_CALLCONV
348	enumCount(UEnumeration en, UErrorCode ) {
349	if (((Context *)en->context)->all) {
350	// ucsdet_getAllDetectableCharsets, all charset detector names
351	return fCSRecognizers_size;
352	}
353
354	// Otherwise, ucsdet_getDetectableCharsets - only enabled ones
355	int32_t count = `0`;
356	UBool enabledArray = ((Context )en->context)->enabledRecognizers;
357	if (enabledArray != NULL) {
358	// custom set
359	for (int32_t i = `0`; i < fCSRecognizers_size; i++) {
360	if (enabledArray[i]) {
361	count++;
362	}
363	}
364	} else {
365	// default set
366	for (int32_t i = `0`; i < fCSRecognizers_size; i++) {
367	if (fCSRecognizers[i]->isDefaultEnabled) {
368	count++;
369	}
370	}
371	}
372	return count;
373	}
374
375	static const char* U_CALLCONV
376	enumNext(UEnumeration en, int32_t resultLength, UErrorCode * /status/) {
377	const char *currName = NULL;
378
379	if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
380	if (((Context *)en->context)->all) {
381	// ucsdet_getAllDetectableCharsets, all charset detector names
382	currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
383	((Context *)en->context)->currIndex++;
384	} else {
385	// ucsdet_getDetectableCharsets
386	UBool enabledArray = ((Context )en->context)->enabledRecognizers;
387	if (enabledArray != NULL) {
388	// custome set
389	while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
390	if (enabledArray[((Context *)en->context)->currIndex]) {
391	currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
392	}
393	((Context *)en->context)->currIndex++;
394	}
395	} else {
396	// default set
397	while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
398	if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
399	currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
400	}
401	((Context *)en->context)->currIndex++;
402	}
403	}
404	}
405	}
406
407	if(resultLength != NULL) {
408	*resultLength = currName == NULL ? `0` : (int32_t)uprv_strlen(currName);
409	}
410
411	return currName;
412	}
413
414
415	static void U_CALLCONV
416	enumReset(UEnumeration en, UErrorCode ) {
417	((Context *)en->context)->currIndex = `0`;
418	}
419
420	static const UEnumeration gCSDetEnumeration = {
421	NULL,
422	NULL,
423	enumClose,
424	enumCount,
425	uenum_unextDefault,
426	enumNext,
427	enumReset
428	};
429
430	U_CDECL_END
431
432	U_NAMESPACE_BEGIN
433
434	UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
435	{
436
437	/ Initialize recognized charsets. /
438	setRecognizers(status);
439
440	if(U_FAILURE(status)) {
441	return `0`;
442	}
443
444	UEnumeration *en = NEW_ARRAY(UEnumeration, `1`);
445	if (en == NULL) {
446	status = U_MEMORY_ALLOCATION_ERROR;
447	return `0`;
448	}
449	memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
450	en->context = (void*)NEW_ARRAY(Context, `1`);
451	if (en->context == NULL) {
452	status = U_MEMORY_ALLOCATION_ERROR;
453	DELETE_ARRAY(en);
454	return `0`;
455	}
456	uprv_memset(en->context, `0`, sizeof(Context));
457	((Context*)en->context)->all = TRUE;
458	return en;
459	}
460
461	UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
462	{
463	if(U_FAILURE(status)) {
464	return `0`;
465	}
466
467	UEnumeration *en = NEW_ARRAY(UEnumeration, `1`);
468	if (en == NULL) {
469	status = U_MEMORY_ALLOCATION_ERROR;
470	return `0`;
471	}
472	memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
473	en->context = (void*)NEW_ARRAY(Context, `1`);
474	if (en->context == NULL) {
475	status = U_MEMORY_ALLOCATION_ERROR;
476	DELETE_ARRAY(en);
477	return `0`;
478	}
479	uprv_memset(en->context, `0`, sizeof(Context));
480	((Context*)en->context)->all = FALSE;
481	((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;
482	return en;
483	}
484
485	U_NAMESPACE_END
486
487	#endif
488

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/csdetect.cpp