csrmbcs.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/csrmbcs.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2005-2016, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	*/
9
10	#include "unicode/utypes.h"
11
12	#if !UCONFIG_NO_CONVERSION
13
14	#include "cmemory.h"
15	#include "csmatch.h"
16	#include "csrmbcs.h"
17
18	#include <math.h>
19
20	U_NAMESPACE_BEGIN
21
22	#define min(x,y) (((x)<(y))?(x):(y))
23
// Double-byte values treated as "common" when scoring Shift_JIS input.
// The table must stay sorted in ascending order: it is looked up with
// binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_sjis [] = {
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
34
// Double-byte values treated as "common" when scoring EUC-JP input.
// The table must stay sorted in ascending order: it is looked up with
// binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_euc_jp[] = {
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
49
// Double-byte values treated as "common" when scoring EUC-KR input.
// The table must stay sorted in ascending order: it is looked up with
// binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_euc_kr[] = {
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
64
// Double-byte values treated as "common" when scoring Big5 input.
// The table must stay sorted in ascending order: it is looked up with
// binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_big5[] = {
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
79
// Double-byte values treated as "common" when scoring GB18030 input.
// The table must stay sorted in ascending order: it is looked up with
// binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_gb_18030[] = {
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
94
/**
 * Binary search over an ascending array of 16-bit values.
 *
 * @param array  values sorted in ascending order; not dereferenced when len <= 0
 * @param len    number of elements in array
 * @param value  value to look for
 * @return the index of value within array, or -1 if it is not present
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t start = 0;
    int32_t end = len - 1;

    while (start <= end) {
        // Written as start + (end - start) / 2 so the midpoint cannot overflow.
        const int32_t mid = start + (end - start) / 2;

        if (array[mid] == value) {
            return mid;
        }

        if (array[mid] < value) {
            start = mid + 1;
        } else {
            end = mid - 1;
        }
    }

    return -1;
}
116
117	IteratedChar::IteratedChar() :
118	charValue(`0`), index(-`1`), nextIndex(`0`), error(FALSE), done(FALSE)
119	{
120	// nothing else to do.
121	}
122
/*void IteratedChar::reset()
{
    charValue = 0;
    index = -1;
    nextIndex = 0;
    error = FALSE;
    done = FALSE;
}*/
131
132	int32_t IteratedChar::nextByte(InputText *det)
133	{
134	if (nextIndex >= det->fRawLength) {
135	done = TRUE;
136
137	return -`1`;
138	}
139
140	return det->fRawInput[nextIndex++];
141	}
142
143	CharsetRecog_mbcs::~CharsetRecog_mbcs()
144	{
145	// nothing to do.
146	}
147
148	int32_t CharsetRecog_mbcs::match_mbcs(InputText det, const* uint16_t commonChars[], int32_t commonCharsLen) const {
149	int32_t singleByteCharCount = `0`;
150	int32_t doubleByteCharCount = `0`;
151	int32_t commonCharCount = `0`;
152	int32_t badCharCount = `0`;
153	int32_t totalCharCount = `0`;
154	int32_t confidence = `0`;
155	IteratedChar iter;
156
157	while (nextChar(&iter, det)) {
158	totalCharCount++;
159
160	if (iter.error) {
161	badCharCount++;
162	} else {
163	if (iter.charValue <= `0xFF`) {
164	singleByteCharCount++;
165	} else {
166	doubleByteCharCount++;
167
168	if (commonChars != `0`) {
169	if (binarySearch(commonChars, commonCharsLen, static_cast<uint16_t>(iter.charValue)) >= `0`){
170	commonCharCount += `1`;
171	}
172	}
173	}
174	}
175
176
177	if (badCharCount >= `2` && badCharCount*`5` >= doubleByteCharCount) {
178	// Bail out early if the byte data is not matching the encoding scheme.
179	// break detectBlock;
180	return confidence;
181	}
182	}
183
184	if (doubleByteCharCount <= `10` && badCharCount == `0`) {
185	// Not many multi-byte chars.
186	if (doubleByteCharCount == `0` && totalCharCount < `10`) {
187	// There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
188	// We don't have enough data to have any confidence.
189	// Statistical analysis of single byte non-ASCII charcters would probably help here.
190	confidence = `0`;
191	}
192	else {
193	// ASCII or ISO file? It's probably not our encoding,
194	// but is not incompatible with our encoding, so don't give it a zero.
195	confidence = `10`;
196	}
197
198	return confidence;
199	}
200
201	//
202	// No match if there are too many characters that don't fit the encoding scheme.
203	// (should we have zero tolerance for these?)
204	//
205	if (doubleByteCharCount < `20`*badCharCount) {
206	confidence = `0`;
207
208	return confidence;
209	}
210
211	if (commonChars == `0`) {
212	// We have no statistics on frequently occuring characters.
213	// Assess confidence purely on having a reasonable number of
214	// multi-byte characters (the more the better)
215	confidence = `30` + doubleByteCharCount - `20`*badCharCount;
216
217	if (confidence > `100`) {
218	confidence = `100`;
219	}
220	} else {
221	//
222	// Frequency of occurence statistics exist.
223	//
224
225	double maxVal = log((double)doubleByteCharCount / `4`); /(float)?/
226	double scaleFactor = `90.0` / maxVal;
227	confidence = (int32_t)(log((double)commonCharCount+`1`) * scaleFactor + `10.0`);
228
229	confidence = min(confidence, `100`);
230	}
231
232	if (confidence < `0`) {
233	confidence = `0`;
234	}
235
236	return confidence;
237	}
238
239	CharsetRecog_sjis::~CharsetRecog_sjis()
240	{
241	// nothing to do
242	}
243
244	UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
245	it->index = it->nextIndex;
246	it->error = FALSE;
247
248	int32_t firstByte = it->charValue = it->nextByte(det);
249
250	if (firstByte < `0`) {
251	return FALSE;
252	}
253
254	if (firstByte <= `0x7F` \|\| (firstByte > `0xA0` && firstByte <= `0xDF`)) {
255	return TRUE;
256	}
257
258	int32_t secondByte = it->nextByte(det);
259	if (secondByte >= `0`) {
260	it->charValue = (firstByte << `8`) \| secondByte;
261	}
262	// else we'll handle the error later.
263
264	if (! ((secondByte >= `0x40` && secondByte <= `0x7F`) \|\| (secondByte >= `0x80` && secondByte <= `0xFE`))) {
265	// Illegal second byte value.
266	it->error = TRUE;
267	}
268
269	return TRUE;
270	}
271
272	UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch results) const* {
273	int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
274	results->set(det, this, confidence);
275	return (confidence > `0`);
276	}
277
278	const char CharsetRecog_sjis::getName() const*
279	{
280	return "Shift_JIS";
281	}
282
283	const char CharsetRecog_sjis::getLanguage() const*
284	{
285	return "ja";
286	}
287
288	CharsetRecog_euc::~CharsetRecog_euc()
289	{
290	// nothing to do
291	}
292
293	UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
294	int32_t firstByte = `0`;
295	int32_t secondByte = `0`;
296	int32_t thirdByte = `0`;
297
298	it->index = it->nextIndex;
299	it->error = FALSE;
300	firstByte = it->charValue = it->nextByte(det);
301
302	if (firstByte < `0`) {
303	// Ran off the end of the input data
304	return FALSE;
305	}
306
307	if (firstByte <= `0x8D`) {
308	// single byte char
309	return TRUE;
310	}
311
312	secondByte = it->nextByte(det);
313	if (secondByte >= `0`) {
314	it->charValue = (it->charValue << `8`) \| secondByte;
315	}
316	// else we'll handle the error later.
317
318	if (firstByte >= `0xA1` && firstByte <= `0xFE`) {
319	// Two byte Char
320	if (secondByte < `0xA1`) {
321	it->error = TRUE;
322	}
323
324	return TRUE;
325	}
326
327	if (firstByte == `0x8E`) {
328	// Code Set 2.
329	// In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
330	// In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
331	// We don't know which we've got.
332	// Treat it like EUC-JP. If the data really was EUC-TW, the following two
333	// bytes will look like a well formed 2 byte char.
334	if (secondByte < `0xA1`) {
335	it->error = TRUE;
336	}
337
338	return TRUE;
339	}
340
341	if (firstByte == `0x8F`) {
342	// Code set 3.
343	// Three byte total char size, two bytes of actual char value.
344	thirdByte = it->nextByte(det);
345	it->charValue = (it->charValue << `8`) \| thirdByte;
346
347	if (thirdByte < `0xa1`) {
348	// Bad second byte or ran off the end of the input data with a non-ASCII first byte.
349	it->error = TRUE;
350	}
351	}
352
353	return TRUE;
354
355	}
356
357	CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
358	{
359	// nothing to do
360	}
361
362	const char CharsetRecog_euc_jp::getName() const*
363	{
364	return "EUC-JP";
365	}
366
367	const char CharsetRecog_euc_jp::getLanguage() const*
368	{
369	return "ja";
370	}
371
372	UBool CharsetRecog_euc_jp::match(InputText det, CharsetMatch results) const
373	{
374	int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
375	results->set(det, this, confidence);
376	return (confidence > `0`);
377	}
378
379	CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
380	{
381	// nothing to do
382	}
383
384	const char CharsetRecog_euc_kr::getName() const*
385	{
386	return "EUC-KR";
387	}
388
389	const char CharsetRecog_euc_kr::getLanguage() const*
390	{
391	return "ko";
392	}
393
394	UBool CharsetRecog_euc_kr::match(InputText det, CharsetMatch results) const
395	{
396	int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
397	results->set(det, this, confidence);
398	return (confidence > `0`);
399	}
400
401	CharsetRecog_big5::~CharsetRecog_big5()
402	{
403	// nothing to do
404	}
405
406	UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
407	{
408	int32_t firstByte;
409
410	it->index = it->nextIndex;
411	it->error = FALSE;
412	firstByte = it->charValue = it->nextByte(det);
413
414	if (firstByte < `0`) {
415	return FALSE;
416	}
417
418	if (firstByte <= `0x7F` \|\| firstByte == `0xFF`) {
419	// single byte character.
420	return TRUE;
421	}
422
423	int32_t secondByte = it->nextByte(det);
424	if (secondByte >= `0`) {
425	it->charValue = (it->charValue << `8`) \| secondByte;
426	}
427	// else we'll handle the error later.
428
429	if (secondByte < `0x40` \|\| secondByte == `0x7F` \|\| secondByte == `0xFF`) {
430	it->error = TRUE;
431	}
432
433	return TRUE;
434	}
435
436	const char CharsetRecog_big5::getName() const*
437	{
438	return "Big5";
439	}
440
441	const char CharsetRecog_big5::getLanguage() const*
442	{
443	return "zh";
444	}
445
446	UBool CharsetRecog_big5::match(InputText det, CharsetMatch results) const
447	{
448	int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
449	results->set(det, this, confidence);
450	return (confidence > `0`);
451	}
452
453	CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
454	{
455	// nothing to do
456	}
457
458	UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
459	int32_t firstByte = `0`;
460	int32_t secondByte = `0`;
461	int32_t thirdByte = `0`;
462	int32_t fourthByte = `0`;
463
464	it->index = it->nextIndex;
465	it->error = FALSE;
466	firstByte = it->charValue = it->nextByte(det);
467
468	if (firstByte < `0`) {
469	// Ran off the end of the input data
470	return FALSE;
471	}
472
473	if (firstByte <= `0x80`) {
474	// single byte char
475	return TRUE;
476	}
477
478	secondByte = it->nextByte(det);
479	if (secondByte >= `0`) {
480	it->charValue = (it->charValue << `8`) \| secondByte;
481	}
482	// else we'll handle the error later.
483
484	if (firstByte >= `0x81` && firstByte <= `0xFE`) {
485	// Two byte Char
486	if ((secondByte >= `0x40` && secondByte <= `0x7E`) \|\| (secondByte >=`80` && secondByte <= `0xFE`)) {
487	return TRUE;
488	}
489
490	// Four byte char
491	if (secondByte >= `0x30` && secondByte <= `0x39`) {
492	thirdByte = it->nextByte(det);
493
494	if (thirdByte >= `0x81` && thirdByte <= `0xFE`) {
495	fourthByte = it->nextByte(det);
496
497	if (fourthByte >= `0x30` && fourthByte <= `0x39`) {
498	it->charValue = (it->charValue << `16`) \| (thirdByte << `8`) \| fourthByte;
499
500	return TRUE;
501	}
502	}
503	}
504
505	// Something wasn't valid, or we ran out of data (-1).
506	it->error = TRUE;
507	}
508
509	return TRUE;
510	}
511
512	const char CharsetRecog_gb_18030::getName() const*
513	{
514	return "GB18030";
515	}
516
517	const char CharsetRecog_gb_18030::getLanguage() const*
518	{
519	return "zh";
520	}
521
522	UBool CharsetRecog_gb_18030::match(InputText det, CharsetMatch results) const
523	{
524	int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
525	results->set(det, this, confidence);
526	return (confidence > `0`);
527	}
528
529	U_NAMESPACE_END
530	#endif
531

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/csrmbcs.cpp