csrucode.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/csrucode.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2005-2013, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	*/
9
10	#include "unicode/utypes.h"
11
12	#if !UCONFIG_NO_CONVERSION
13
14	#include "csrucode.h"
15	#include "csmatch.h"
16
17	U_NAMESPACE_BEGIN
18
19	CharsetRecog_Unicode::~CharsetRecog_Unicode()
20	{
21	// nothing to do
22	}
23
24	CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE()
25	{
26	// nothing to do
27	}
28
29	const char CharsetRecog_UTF_16_BE::getName() const*
30	{
31	return "UTF-16BE";
32	}
33
// UTF-16 confidence calculation. Very simple-minded, but better than nothing.
//   Any 8-bit non-control characters bump the confidence up. These have a zero high byte,
//     and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
//   NULs are a contra-indication; they will appear commonly if the actual encoding is UTF-32.
//   NULs should be rare in actual text.
39
40	static int32_t adjustConfidence(UChar codeUnit, int32_t confidence) {
41	if (codeUnit == `0`) {
42	confidence -= `10`;
43	} else if ((codeUnit >= `0x20` && codeUnit <= `0xff`) \|\| codeUnit == `0x0a`) {
44	confidence += `10`;
45	}
46	if (confidence < `0`) {
47	confidence = `0`;
48	} else if (confidence > `100`) {
49	confidence = `100`;
50	}
51	return confidence;
52	}
53
54
55	UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch results) const*
56	{
57	const uint8_t *input = textIn->fRawInput;
58	int32_t confidence = `10`;
59	int32_t length = textIn->fRawLength;
60
61	int32_t bytesToCheck = (length > `30`) ? `30` : length;
62	for (int32_t charIndex=`0`; charIndex<bytesToCheck-`1`; charIndex+=`2`) {
63	UChar codeUnit = (input[charIndex] << `8`) \| input[charIndex + `1`];
64	if (charIndex == `0` && codeUnit == `0xFEFF`) {
65	confidence = `100`;
66	break;
67	}
68	confidence = adjustConfidence(codeUnit, confidence);
69	if (confidence == `0` \|\| confidence == `100`) {
70	break;
71	}
72	}
73	if (bytesToCheck < `4` && confidence < `100`) {
74	confidence = `0`;
75	}
76	results->set(textIn, this, confidence);
77	return (confidence > `0`);
78	}
79
80	CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE()
81	{
82	// nothing to do
83	}
84
85	const char CharsetRecog_UTF_16_LE::getName() const*
86	{
87	return "UTF-16LE";
88	}
89
90	UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch results) const*
91	{
92	const uint8_t *input = textIn->fRawInput;
93	int32_t confidence = `10`;
94	int32_t length = textIn->fRawLength;
95
96	int32_t bytesToCheck = (length > `30`) ? `30` : length;
97	for (int32_t charIndex=`0`; charIndex<bytesToCheck-`1`; charIndex+=`2`) {
98	UChar codeUnit = input[charIndex] \| (input[charIndex + `1`] << `8`);
99	if (charIndex == `0` && codeUnit == `0xFEFF`) {
100	confidence = `100`; // UTF-16 BOM
101	if (length >= `4` && input[`2`] == `0` && input[`3`] == `0`) {
102	confidence = `0`; // UTF-32 BOM
103	}
104	break;
105	}
106	confidence = adjustConfidence(codeUnit, confidence);
107	if (confidence == `0` \|\| confidence == `100`) {
108	break;
109	}
110	}
111	if (bytesToCheck < `4` && confidence < `100`) {
112	confidence = `0`;
113	}
114	results->set(textIn, this, confidence);
115	return (confidence > `0`);
116	}
117
118	CharsetRecog_UTF_32::~CharsetRecog_UTF_32()
119	{
120	// nothing to do
121	}
122
123	UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch results) const*
124	{
125	const uint8_t *input = textIn->fRawInput;
126	int32_t limit = (textIn->fRawLength / `4`) * `4`;
127	int32_t numValid = `0`;
128	int32_t numInvalid = `0`;
129	bool hasBOM = FALSE;
130	int32_t confidence = `0`;
131
132	if (limit > `0` && getChar(input, `0`) == `0x0000FEFFUL`) {
133	hasBOM = TRUE;
134	}
135
136	for(int32_t i = `0`; i < limit; i += `4`) {
137	int32_t ch = getChar(input, i);
138
139	if (ch < `0` \|\| ch >= `0x10FFFF` \|\| (ch >= `0xD800` && ch <= `0xDFFF`)) {
140	numInvalid += `1`;
141	} else {
142	numValid += `1`;
143	}
144	}
145
146
147	// Cook up some sort of confidence score, based on presense of a BOM
148	// and the existence of valid and/or invalid multi-byte sequences.
149	if (hasBOM && numInvalid==`0`) {
150	confidence = `100`;
151	} else if (hasBOM && numValid > numInvalid*`10`) {
152	confidence = `80`;
153	} else if (numValid > `3` && numInvalid == `0`) {
154	confidence = `100`;
155	} else if (numValid > `0` && numInvalid == `0`) {
156	confidence = `80`;
157	} else if (numValid > numInvalid*`10`) {
158	// Probably corruput UTF-32BE data. Valid sequences aren't likely by chance.
159	confidence = `25`;
160	}
161
162	results->set(textIn, this, confidence);
163	return (confidence > `0`);
164	}
165
166	CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE()
167	{
168	// nothing to do
169	}
170
171	const char CharsetRecog_UTF_32_BE::getName() const*
172	{
173	return "UTF-32BE";
174	}
175
176	int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t input, int32_t index) const*
177	{
178	return input[index + `0`] << `24` \| input[index + `1`] << `16` \|
179	input[index + `2`] << `8` \| input[index + `3`];
180	}
181
182	CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE()
183	{
184	// nothing to do
185	}
186
187	const char CharsetRecog_UTF_32_LE::getName() const*
188	{
189	return "UTF-32LE";
190	}
191
192	int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t input, int32_t index) const*
193	{
194	return input[index + `3`] << `24` \| input[index + `2`] << `16` \|
195	input[index + `1`] << `8` \| input[index + `0`];
196	}
197
198	U_NAMESPACE_END
199	#endif
200
201

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/csrucode.cpp