csr2022.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/csr2022.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2005-2016, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	*/
9
10	#include "unicode/utypes.h"
11
12	#if !UCONFIG_NO_CONVERSION
13
14	#include "cmemory.h"
15	#include "cstring.h"
16
17	#include "csr2022.h"
18	#include "csmatch.h"
19
20	U_NAMESPACE_BEGIN
21
22	/**
23	* Matching function shared among the 2022 detectors JP, CN and KR
24	* Counts up the number of legal and unrecognized escape sequences in
25	* the sample of text, and computes a score based on the total number &
26	* the proportion that fit the encoding.
27	*
28	*
29	* @param text the byte buffer containing text to analyse
30	* @param textLen the size of the text in the byte.
31	* @param escapeSequences the byte escape sequences to test for.
32	* @return match quality, in the range of 0-100.
33	*/
34	int32_t CharsetRecog_2022::match_2022(const uint8_t text, int32_t textLen, const* uint8_t escapeSequences[][`5`], int32_t escapeSequences_length) const
35	{
36	int32_t i, j;
37	int32_t escN;
38	int32_t hits = `0`;
39	int32_t misses = `0`;
40	int32_t shifts = `0`;
41	int32_t quality;
42
43	i = `0`;
44	while(i < textLen) {
45	if(text[i] == `0x1B`) {
46	escN = `0`;
47	while(escN < escapeSequences_length) {
48	const uint8_t *seq = escapeSequences[escN];
49	int32_t seq_length = (int32_t)uprv_strlen((const char *) seq);
50
51	if (textLen-i >= seq_length) {
52	j = `1`;
53	while(j < seq_length) {
54	if(seq[j] != text[i+j]) {
55	goto checkEscapes;
56	}
57
58	j += `1`;
59	}
60
61	hits += `1`;
62	i += seq_length-`1`;
63	goto scanInput;
64	}
65	// else we ran out of string to compare this time.
66	checkEscapes:
67	escN += `1`;
68	}
69
70	misses += `1`;
71	}
72
73	if( text[i]== `0x0e` \|\| text[i] == `0x0f`){
74	shifts += `1`;
75	}
76
77	scanInput:
78	i += `1`;
79	}
80
81	if (hits == `0`) {
82	return `0`;
83	}
84
85	//
86	// Initial quality is based on relative proportion of recongized vs.
87	// unrecognized escape sequences.
88	// All good: quality = 100;
89	// half or less good: quality = 0;
90	// linear inbetween.
91	quality = (`100`hits - `100`misses) / (hits + misses);
92
93	// Back off quality if there were too few escape sequences seen.
94	// Include shifts in this computation, so that KR does not get penalized
95	// for having only a single Escape sequence, but many shifts.
96	if (hits+shifts < `5`) {
97	quality -= (`5`-(hits+shifts))*`10`;
98	}
99
100	if (quality < `0`) {
101	quality = `0`;
102	}
103
104	return quality;
105	}
106
107
// Escape sequences counted as hits by the ISO-2022-JP recognizer.
// Each row is NUL-terminated; rows shorter than four bytes are zero-padded.
static const uint8_t escapeSequences_2022JP[][5] = {
    {0x1b, 0x24, 0x28, 0x43, 0x00},   // KS X 1001:1992
    {0x1b, 0x24, 0x28, 0x44, 0x00},   // JIS X 0212-1990
    {0x1b, 0x24, 0x40, 0x00, 0x00},   // JIS C 6226-1978
    {0x1b, 0x24, 0x41, 0x00, 0x00},   // GB 2312-80
    {0x1b, 0x24, 0x42, 0x00, 0x00},   // JIS X 0208-1983
    {0x1b, 0x26, 0x40, 0x00, 0x00},   // JIS X 0208 1990, 1997
    {0x1b, 0x28, 0x42, 0x00, 0x00},   // ASCII
    {0x1b, 0x28, 0x48, 0x00, 0x00},   // JIS-Roman
    {0x1b, 0x28, 0x49, 0x00, 0x00},   // Half-width katakana
    {0x1b, 0x28, 0x4a, 0x00, 0x00},   // JIS-Roman
    {0x1b, 0x2e, 0x41, 0x00, 0x00},   // ISO 8859-1
    {0x1b, 0x2e, 0x46, 0x00, 0x00}    // ISO 8859-7
};
122
123	#if !UCONFIG_ONLY_HTML_CONVERSION
// The single escape sequence counted as a hit by the ISO-2022-KR recognizer
// (NUL-terminated).
static const uint8_t escapeSequences_2022KR[][5] = {
    {0x1b, 0x24, 0x29, 0x43, 0x00}
};
127
// Escape sequences counted as hits by the ISO-2022-CN recognizer.
// Each row is NUL-terminated; the two-byte SS2/SS3 rows are zero-padded.
static const uint8_t escapeSequences_2022CN[][5] = {
    {0x1b, 0x24, 0x29, 0x41, 0x00},   // GB 2312-80
    {0x1b, 0x24, 0x29, 0x47, 0x00},   // CNS 11643-1992 Plane 1
    {0x1b, 0x24, 0x2A, 0x48, 0x00},   // CNS 11643-1992 Plane 2
    {0x1b, 0x24, 0x29, 0x45, 0x00},   // ISO-IR-165
    {0x1b, 0x24, 0x2B, 0x49, 0x00},   // CNS 11643-1992 Plane 3
    {0x1b, 0x24, 0x2B, 0x4A, 0x00},   // CNS 11643-1992 Plane 4
    {0x1b, 0x24, 0x2B, 0x4B, 0x00},   // CNS 11643-1992 Plane 5
    {0x1b, 0x24, 0x2B, 0x4C, 0x00},   // CNS 11643-1992 Plane 6
    {0x1b, 0x24, 0x2B, 0x4D, 0x00},   // CNS 11643-1992 Plane 7
    {0x1b, 0x4e, 0x00, 0x00, 0x00},   // SS2
    {0x1b, 0x4f, 0x00, 0x00, 0x00},   // SS3
};
141	#endif
142
143	CharsetRecog_2022JP::~CharsetRecog_2022JP() {}
144
145	const char CharsetRecog_2022JP::getName() const* {
146	return "ISO-2022-JP";
147	}
148
149	UBool CharsetRecog_2022JP::match(InputText textIn, CharsetMatch results) const {
150	int32_t confidence = match_2022(textIn->fInputBytes,
151	textIn->fInputLen,
152	escapeSequences_2022JP,
153	UPRV_LENGTHOF(escapeSequences_2022JP));
154	results->set(textIn, this, confidence);
155	return (confidence > `0`);
156	}
157
158	#if !UCONFIG_ONLY_HTML_CONVERSION
159	CharsetRecog_2022KR::~CharsetRecog_2022KR() {}
160
161	const char CharsetRecog_2022KR::getName() const* {
162	return "ISO-2022-KR";
163	}
164
165	UBool CharsetRecog_2022KR::match(InputText textIn, CharsetMatch results) const {
166	int32_t confidence = match_2022(textIn->fInputBytes,
167	textIn->fInputLen,
168	escapeSequences_2022KR,
169	UPRV_LENGTHOF(escapeSequences_2022KR));
170	results->set(textIn, this, confidence);
171	return (confidence > `0`);
172	}
173
174	CharsetRecog_2022CN::~CharsetRecog_2022CN() {}
175
176	const char CharsetRecog_2022CN::getName() const* {
177	return "ISO-2022-CN";
178	}
179
180	UBool CharsetRecog_2022CN::match(InputText textIn, CharsetMatch results) const {
181	int32_t confidence = match_2022(textIn->fInputBytes,
182	textIn->fInputLen,
183	escapeSequences_2022CN,
184	UPRV_LENGTHOF(escapeSequences_2022CN));
185	results->set(textIn, this, confidence);
186	return (confidence > `0`);
187	}
188	#endif
189
190	CharsetRecog_2022::~CharsetRecog_2022() {
191	// nothing to do
192	}
193
194	U_NAMESPACE_END
195	#endif
196

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/csr2022.cpp