ucsdet.h source code [ClickHouse/contrib/icu/icu4c/source/i18n/unicode/ucsdet.h]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2005-2013, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	* file name: ucsdet.h
9	* encoding: UTF-8
10	* indentation:4
11	*
12	* created on: 2005Aug04
13	* created by: Andy Heninger
14	*
15	* ICU Character Set Detection, API for C
16	*
17	* Draft version 18 Oct 2005
18	*
19	*/
20
21	#ifndef __UCSDET_H
22	#define __UCSDET_H
23
24	#include "unicode/utypes.h"
25
26	#if !UCONFIG_NO_CONVERSION
27
28	#include "unicode/localpointer.h"
29	#include "unicode/uenum.h"
30
31	/**
32	* \file
33	* \brief C API: Charset Detection API
34	*
35	* This API provides a facility for detecting the
36	* charset or encoding of character data in an unknown text format.
37	* The input data can be from an array of bytes.
38	* <p>
39	* Character set detection is at best an imprecise operation. The detection
40	* process will attempt to identify the charset that best matches the characteristics
41	* of the byte data, but the process is partly statistical in nature, and
42	* the results can not be guaranteed to always be correct.
43	* <p>
44	* For best accuracy in charset detection, the input data should be primarily
45	* in a single language, and a minimum of a few hundred bytes worth of plain text
46	* in the language are needed. The detection process will attempt to
47	* ignore html or xml style markup that could otherwise obscure the content.
48	* <p>
49	* An alternative to the ICU Charset Detector is the
50	* Compact Encoding Detector, https://github.com/google/compact_enc_det.
51	* It often gives more accurate results, especially with short input samples.
52	*/
53
54
55	struct UCharsetDetector;
56	/**
57	* Opaque structure representing a charset detector (only declared, not defined, here).
58	* @stable ICU 3.6
59	*/
60	typedef struct UCharsetDetector UCharsetDetector;
61
62	struct UCharsetMatch;
63	/**
64	* Opaque structure representing a match that was identified by a charset
65	* detection operation; query it via ucsdet_getName(), ucsdet_getConfidence(), etc.
66	* @stable ICU 3.6
67	*/
68	typedef struct UCharsetMatch UCharsetMatch;
69
70	/**
71	* Open a charset detector.
72	*
73	* @param status Any error conditions occurring during the open
74	* operation are reported back in this variable.
75	* @return the newly opened charset detector; release it with ucsdet_close().
76	* @stable ICU 3.6
77	*/
78	U_STABLE UCharsetDetector * U_EXPORT2
79	ucsdet_open(UErrorCode *status);
80
81	/**
82	* Close a charset detector. All storage and any other resources
83	* owned by this charset detector will be released. Failure to
84	* close a charset detector when finished with it can result in
85	* memory leaks in the application.
86	* In C++, LocalUCharsetDetectorPointer closes a detector via this function.
87	* @param ucsd The charset detector to be closed.
88	* @stable ICU 3.6
89	*/
90	U_STABLE void U_EXPORT2
91	ucsdet_close(UCharsetDetector *ucsd);
92
93	#if U_SHOW_CPLUSPLUS_API
94
95	U_NAMESPACE_BEGIN
96
97	/**
98	* \class LocalUCharsetDetectorPointer
99	* "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().
100	* For most methods see the LocalPointerBase base class.
101	*
102	* @see LocalPointerBase
103	* @see LocalPointer
104	* @stable ICU 4.4
105	*/
106	U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);
107
108	U_NAMESPACE_END
109
110	#endif /* U_SHOW_CPLUSPLUS_API */
111
112	/**
113	* Set the input byte data whose charset is to be detected.
114	*
115	* Ownership of the input text byte array remains with the caller.
116	* The input string must not be altered or deleted until the charset
117	* detector is either closed or reset to refer to different input text.
118	*
119	* @param ucsd the charset detector to be used.
120	* @param textIn the input text of unknown encoding.
121	* @param len the length of the input text, or -1 if the text
122	* is NUL terminated.
123	* @param status any error conditions are reported back in this variable.
124	*
125	* @stable ICU 3.6
126	*/
127	U_STABLE void U_EXPORT2
128	ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
129
130
131	/** Set the declared encoding for charset detection.
132	* The declared encoding of an input text is an encoding obtained
133	* by the user from an http header or xml declaration or similar source that
134	* can be provided as an additional hint to the charset detector.
135	*
136	* How and whether the declared encoding will be used during the
137	* detection process is TBD.
138	*
139	* @param ucsd the charset detector to be used.
140	* @param encoding an encoding for the current data obtained from
141	* a header or declaration or other source outside
142	* of the byte data itself.
143	* @param length the length of the encoding name, or -1 if the name string
144	* is NUL terminated.
145	* @param status any error conditions are reported back in this variable.
146	*
147	* @stable ICU 3.6
148	*/
149	U_STABLE void U_EXPORT2
150	ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);
151
152
153	/**
154	* Return the charset that best matches the supplied input data.
155	*
156	* Note though, that because the detection
157	* only looks at the start of the input data,
158	* there is a possibility that the returned charset will fail to handle
159	* the full set of input data.
160	* <p>
161	* The returned UCharsetMatch object is owned by the UCharsetDetector.
162	* It will remain valid until the detector input is reset, or until
163	* the detector is closed.
164	* <p>
165	* The function will fail if
166	* <ul>
167	* <li>no charset appears to match the data.</li>
168	* <li>no input text has been provided</li>
169	* </ul>
170	*
171	* @param ucsd the charset detector to be used.
172	* @param status any error conditions are reported back in this variable.
173	* @return a UCharsetMatch representing the best matching charset,
174	* or NULL if no charset matches the byte data.
175	*
176	* @stable ICU 3.6
177	*/
178	U_STABLE const UCharsetMatch * U_EXPORT2
179	ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);
180
181
182	/**
183	* Find all charset matches that appear to be consistent with the input,
184	* returning an array of results. The results are ordered with the
185	* best quality match first.
186	*
187	* Because the detection only looks at a limited amount of the
188	* input byte data, some of the returned charsets may fail to handle
189	* all of the input data.
190	* <p>
191	* The returned UCharsetMatch objects are owned by the UCharsetDetector.
192	* They will remain valid until the detector is closed or modified.
193	*
194	* <p>
195	* Return an error if
196	* <ul>
197	* <li>no charsets appear to match the input data.</li>
198	* <li>no input text has been provided</li>
199	* </ul>
200	*
201	* @param ucsd the charset detector to be used.
202	* @param matchesFound pointer to a variable that will be set to the
203	* number of charsets identified that are consistent with
204	* the input data. Output only.
205	* @param status any error conditions are reported back in this variable.
206	* @return A pointer to an array of pointers to UCharsetMatch objects.
207	* This array, and the UCharsetMatch instances to which it refers,
208	* are owned by the UCharsetDetector, and will remain valid until
209	* the detector is closed or modified.
210	* @stable ICU 3.6
211	*/
212	U_STABLE const UCharsetMatch ** U_EXPORT2
213	ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);
214
215
216
217	/**
218	* Get the name of the charset represented by a UCharsetMatch.
219	*
220	* The storage for the returned name string is owned by the
221	* UCharsetMatch, and will remain valid while the UCharsetMatch
222	* is valid.
223	*
224	* The name returned is suitable for use with the ICU conversion APIs.
225	*
226	* @param ucsm The charset match object.
227	* @param status Any error conditions are reported back in this variable.
228	* @return The name of the matching charset.
229	*
230	* @stable ICU 3.6
231	*/
232	U_STABLE const char * U_EXPORT2
233	ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);
234
235	/**
236	* Get a confidence number for the quality of the match of the byte
237	* data with the charset. Confidence numbers range from zero to 100,
238	* with 100 representing complete confidence and zero representing
239	* no confidence.
240	*
241	* The confidence values are somewhat arbitrary. They define
242	* an ordering within the results for any single detection operation
243	* but are not generally comparable between the results for different input.
244	*
245	* A confidence value of ten does have a general meaning - it is used
246	* for charsets that can represent the input data, but for which there
247	* is no other indication that suggests that the charset is the correct one.
248	* Pure 7 bit ASCII data, for example, is compatible with a
249	* great many charsets, most of which will appear as possible matches
250	* with a confidence of 10.
251	*
252	* @param ucsm The charset match object.
253	* @param status Any error conditions are reported back in this variable.
254	* @return A confidence number for the charset match.
255	*
256	* @stable ICU 3.6
257	*/
258	U_STABLE int32_t U_EXPORT2
259	ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);
260
261	/**
262	* Get the RFC 3066 code for the language of the input data.
263	*
264	* The Charset Detection service is intended primarily for detecting
265	* charsets, not language. For some, but not all, charsets, a language is
266	* identified as a byproduct of the detection process, and that is what
267	* is returned by this function.
268	*
269	* CAUTION:
270	* 1. Language information is not available for input data encoded in
271	* all charsets. In particular, no language is identified
272	* for UTF-8 input data.
273	*
274	* 2. Closely related languages may sometimes be confused.
275	*
276	* If more accurate language detection is required, a linguistic
277	* analysis package should be used.
278	*
279	* The storage for the returned name string is owned by the
280	* UCharsetMatch, and will remain valid while the UCharsetMatch
281	* is valid.
282	*
283	* @param ucsm The charset match object.
284	* @param status Any error conditions are reported back in this variable.
285	* @return The RFC 3066 code for the language of the input data, or
286	* an empty string if the language could not be determined.
287	*
288	* @stable ICU 3.6
289	*/
290	U_STABLE const char * U_EXPORT2
291	ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);
292
293
294	/**
295	* Get the entire input text as a UChar string, placing it into
296	* a caller-supplied buffer. A terminating
297	* NUL character will be appended to the buffer if space is available.
298	*
299	* The number of UChars in the output string, not including the terminating
300	* NUL, is returned.
301	*
302	* If the supplied buffer is smaller than required to hold the output,
303	* the contents of the buffer are undefined. The full output string length
304	* (in UChars) is returned as always, and can be used to allocate a buffer
305	* of the correct size.
306	*
307	*
308	* @param ucsm The charset match object.
309	* @param buf A UChar buffer to be filled with the converted text data.
310	* @param cap The capacity of the buffer in UChars.
311	* @param status Any error conditions are reported back in this variable.
312	* @return The number of UChars in the output string.
313	*
314	* @stable ICU 3.6
315	*/
316	U_STABLE int32_t U_EXPORT2
317	ucsdet_getUChars(const UCharsetMatch *ucsm,
318	                 UChar *buf, int32_t cap, UErrorCode *status);
319
320
321
322	/**
323	* Get an iterator over the set of all detectable charsets -
324	* over the charsets that are known to the charset detection
325	* service.
326	*
327	* The returned UEnumeration provides access to the names of
328	* the charsets.
329	*
330	* <p>
331	* The state of the Charset detector that is passed in does not
332	* affect the result of this function, but requiring a valid, open
333	* charset detector as a parameter ensures that the charset detection
334	* service has been safely initialized and that the required detection
335	* data is available.
336	*
337	* <p>
338	* <b>Note:</b> Multiple different charset encodings in a same family may use
339	* a single shared name in this implementation. For example, this method returns
340	* an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
341	* (Windows Latin 1). However, actual detection result could be "windows-1252"
342	* when the input data matches Latin 1 code points with any points only available
343	* in "windows-1252".
344	*
345	* @param ucsd a Charset detector.
346	* @param status Any error conditions are reported back in this variable.
347	* @return an iterator providing access to the detectable charset names.
348	* @stable ICU 3.6
349	*/
350	U_STABLE UEnumeration * U_EXPORT2
351	ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);
352
353	/**
354	* Test whether input filtering is enabled for this charset detector.
355	* Input filtering removes text that appears to be HTML or XML
356	* markup from the input before applying the code page detection
357	* heuristics.
358	*
359	* @param ucsd The charset detector to check.
360	* @return TRUE if filtering is enabled.
361	* @stable ICU 3.6
362	*/
363
364	U_STABLE UBool U_EXPORT2
365	ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
366
367
368	/**
369	* Enable filtering of input text. If filtering is enabled,
370	* text within angle brackets ("<" and ">") will be removed
371	* before detection, which will remove most HTML or XML markup.
372	*
373	* @param ucsd the charset detector to be modified.
374	* @param filter <code>true</code> to enable input text filtering.
375	* @return The previous setting.
376	*
377	* @stable ICU 3.6
378	*/
379	U_STABLE UBool U_EXPORT2
380	ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
381
382	#ifndef U_HIDE_INTERNAL_API
383	/**
384	* Get an iterator over the set of detectable charsets -
385	* over the charsets that are enabled by the specified charset detector.
386	*
387	* The returned UEnumeration provides access to the names of
388	* the charsets.
389	*
390	* @param ucsd a Charset detector.
391	* @param status Any error conditions are reported back in this variable.
392	* @return an iterator providing access to the detectable charset names by
393	* the specified charset detector.
394	* @internal
395	*/
396	U_INTERNAL UEnumeration * U_EXPORT2
397	ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);
398
399	/**
400	* Enable or disable individual charset encoding.
401	* A name of charset encoding must be included in the names returned by
402	* {@link #ucsdet_getAllDetectableCharsets()}.
403	*
404	* @param ucsd a Charset detector.
405	* @param encoding the name of charset encoding.
406	* @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the
407	* charset encoding.
408	* @param status receives the return status. When the name of charset encoding
409	* is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
410	* @internal
411	*/
412	U_INTERNAL void U_EXPORT2
413	ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
414	#endif /* U_HIDE_INTERNAL_API */
415
416	#endif
417	#endif /* __UCSDET_H */
418
419
420

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/unicode/ucsdet.h