csrmbcs.h source code [ClickHouse/contrib/icu/icu4c/source/i18n/csrmbcs.h]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2005-2012, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	*/
9
10	#ifndef __CSRMBCS_H
11	#define __CSRMBCS_H
12
13	#include "unicode/utypes.h"
14
15	#if !UCONFIG_NO_CONVERSION
16
17	#include "csrecog.h"
18
19	U_NAMESPACE_BEGIN
20
21	// "Character" iterated character class.
22	// Recognizers for specific mbcs encodings make their "characters" available
23	// by providing a nextChar() function that fills in an instance of IteratedChar
24	// with the next char from the input.
25	// The returned characters are not converted to Unicode, but remain as the raw
26	// bytes (concatenated into an int) from the codepage data.
27	//
28	// For Asian charsets, use the raw input rather than the input that has been
29	// stripped of markup. Detection only considers multi-byte chars, effectively
30	// stripping markup anyway, and double byte chars do occur in markup too.
31	//
32	class IteratedChar : public UMemory
33	{
34	public:
35	uint32_t charValue; // 1-4 bytes from the raw input data
36	int32_t index;
37	int32_t nextIndex;
38	UBool error;
39	UBool done;
40
41	public:
42	IteratedChar();
43	//void reset();
44	int32_t nextByte(InputText* det);
45	};
46
47
48	class CharsetRecog_mbcs : public CharsetRecognizer {
49
50	protected:
51	/**
52	* Test the match of this charset with the input text data
53	* which is obtained via the CharsetDetector object.
54	*
55	* @param det The CharsetDetector, which contains the input text
56	* to be checked for being in this charset.
57	* @return Two values packed into one int (Damn java, anyhow)
58	* <br/>
59	* bits 0-7: the match confidence, ranging from 0-100
60	* <br/>
61	* bits 8-15: The match reason, an enum-like value.
62	*/
63	int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const;
64
65	public:
66
67	virtual ~CharsetRecog_mbcs();
68
69	/**
70	* Get the IANA name of this charset.
71	* @return the charset name.
72	*/
73
74	const char getName() const* = `0`;
75	const char getLanguage() const* = `0`;
76	UBool match(InputText* input, CharsetMatch results) const* = `0`;
77
78	/**
79	* Get the next character (however many bytes it is) from the input data
80	* Subclasses for specific charset encodings must implement this function
81	* to get characters according to the rules of their encoding scheme.
82	*
83	* This function is not a method of class IteratedChar only because
84	* that would require a lot of extra derived classes, which is awkward.
85	* @param it The IteratedChar "struct" into which the returned char is placed.
86	* @param det The charset detector, which is needed to get at the input byte data
87	* being iterated over.
88	* @return True if a character was returned, false at end of input.
89	*/
90	virtual UBool nextChar(IteratedChar it, InputText textIn) const = `0`;
91
92	};
93
94
95	/**
96	* Shift-JIS charset recognizer.
97	*
98	*/
99	class CharsetRecog_sjis : public CharsetRecog_mbcs {
100	public:
101	virtual ~CharsetRecog_sjis();
102
103	UBool nextChar(IteratedChar it, InputText det) const;
104
105	UBool match(InputText* input, CharsetMatch results) const*;
106
107	const char getName() const*;
108	const char getLanguage() const*;
109
110	};
111
112
113	/**
114	* EUC charset recognizers. One abstract class that provides the common function
115	* for getting the next character according to the EUC encoding scheme,
116	* and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
117	*
118	*/
119	class CharsetRecog_euc : public CharsetRecog_mbcs
120	{
121	public:
122	virtual ~CharsetRecog_euc();
123
124	const char getName() const* = `0`;
125	const char getLanguage() const* = `0`;
126
127	UBool match(InputText* input, CharsetMatch results) const* = `0`;
128	/*
129	* (non-Javadoc)
130	* Get the next character value for EUC based encodings.
131	* Character "value" is simply the raw bytes that make up the character
132	* packed into an int.
133	*/
134	UBool nextChar(IteratedChar it, InputText det) const;
135	};
136
137	/**
138	* The charset recognize for EUC-JP. A singleton instance of this class
139	* is created and kept by the public CharsetDetector class
140	*/
141	class CharsetRecog_euc_jp : public CharsetRecog_euc
142	{
143	public:
144	virtual ~CharsetRecog_euc_jp();
145
146	const char getName() const*;
147	const char getLanguage() const*;
148
149	UBool match(InputText* input, CharsetMatch results) const*;
150	};
151
152	/**
153	* The charset recognize for EUC-KR. A singleton instance of this class
154	* is created and kept by the public CharsetDetector class
155	*/
156	class CharsetRecog_euc_kr : public CharsetRecog_euc
157	{
158	public:
159	virtual ~CharsetRecog_euc_kr();
160
161	const char getName() const*;
162	const char getLanguage() const*;
163
164	UBool match(InputText* input, CharsetMatch results) const*;
165	};
166
167	/**
168	*
169	* Big5 charset recognizer.
170	*
171	*/
172	class CharsetRecog_big5 : public CharsetRecog_mbcs
173	{
174	public:
175	virtual ~CharsetRecog_big5();
176
177	UBool nextChar(IteratedChar* it, InputText* det) const;
178
179	const char getName() const*;
180	const char getLanguage() const*;
181
182	UBool match(InputText* input, CharsetMatch results) const*;
183	};
184
185
186	/**
187	*
188	* GB-18030 recognizer. Uses simplified Chinese statistics.
189	*
190	*/
191	class CharsetRecog_gb_18030 : public CharsetRecog_mbcs
192	{
193	public:
194	virtual ~CharsetRecog_gb_18030();
195
196	UBool nextChar(IteratedChar* it, InputText* det) const;
197
198	const char getName() const*;
199	const char getLanguage() const*;
200
201	UBool match(InputText* input, CharsetMatch results) const*;
202	};
203
204	U_NAMESPACE_END
205
206	#endif
207	#endif /* __CSRMBCS_H */
208

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/csrmbcs.h