TextEncoding.h source code [ClickHouse/contrib/poco/Foundation/include/Poco/TextEncoding.h]

1	//
2	// TextEncoding.h
3	//
4	// Library: Foundation
5	// Package: Text
6	// Module: TextEncoding
7	//
8	// Definition of the abstract TextEncoding class.
9	//
10	// Copyright (c) 2004-2007, Applied Informatics Software Engineering GmbH.
11	// and Contributors.
12	//
13	// SPDX-License-Identifier: BSL-1.0
14	//
15
16
17	#ifndef Foundation_TextEncoding_INCLUDED
18	#define Foundation_TextEncoding_INCLUDED
19
20
21	#include "Poco/Foundation.h"
22	#include "Poco/SharedPtr.h"
23
24
25	namespace Poco {
26
27
28	class TextEncodingManager; // Forward declaration; only needed for the return type of TextEncoding::manager().
29
30
31	class Foundation_API TextEncoding
32	/// An abstract base class for implementing text encodings
33	/// like UTF-8 or ISO 8859-1.
34	///
35	/// Subclasses must override the canonicalName(), isA(),
36	/// characterMap() and convert() methods and need to be
37	/// thread safe and stateless.
38	///
39	/// TextEncoding also provides static member functions
40	/// for managing mappings from encoding names to
41	/// TextEncoding objects.
42	{
43	public:
44	typedef SharedPtr<TextEncoding> Ptr;
45
46	enum
47	{
48	MAX_SEQUENCE_LENGTH = `6` /// The maximum length, in bytes, of a single character's byte sequence.
49	};
50
51	typedef int CharacterMap[`256`];
52	/// The map[b] member gives information about byte sequences
53	/// whose first byte is b.
54	/// If map[b] is c where c is >= 0, then b by itself encodes the Unicode scalar value c.
55	/// If map[b] is -1, then the byte sequence is malformed.
56	/// If map[b] is -n, where n >= 2, then b is the first byte of an n-byte
57	/// sequence that encodes a single Unicode scalar value. Byte sequences up
58	/// to MAX_SEQUENCE_LENGTH (6) bytes in length are supported.
59
60	virtual ~TextEncoding();
61	/// Destroys the encoding.
62
63	virtual const char* canonicalName() const = `0`;
64	/// Returns the canonical name of this encoding,
65	/// e.g. "ISO-8859-1". Encoding name comparisons are case
66	/// insensitive.
67
68	virtual bool isA(const std::string& encodingName) const = `0`;
69	/// Returns true if the given name is one of the names of this encoding.
70	/// For example, the "ISO-8859-1" encoding is also known as "Latin-1".
71	///
72	/// Encoding name comparisons are case insensitive.
73
74	virtual const CharacterMap& characterMap() const = `0`;
75	/// Returns the CharacterMap for the encoding.
76	/// The CharacterMap should be kept in a static member. As
77	/// characterMap() can be called frequently, it should be
78	/// implemented in such a way that it just returns a static
79	/// map. If the map is built at runtime, this should be
80	/// done in the constructor.
81
82	virtual int convert(const unsigned char* bytes) const;
83	/// The convert function is used to convert multibyte sequences;
84	/// bytes will point to a byte sequence of n bytes where
85	/// sequenceLength(bytes, length) == -n, with length >= n.
86	///
87	/// The convert function must return the Unicode scalar value
88	/// represented by this byte sequence or -1 if the byte sequence is malformed.
89	/// The default implementation returns (int) bytes[0].
// NOTE(review): sequenceLength() is documented in this header as returning the positive
// length n once enough bytes are supplied; the "-n" above appears to follow the
// CharacterMap convention instead -- confirm which is intended.
90
91	virtual int queryConvert(const unsigned char* bytes, int length) const;
92	/// The queryConvert function is used to convert single byte characters
93	/// or multibyte sequences;
94	/// bytes will point to a byte sequence of length bytes.
95	///
96	/// The queryConvert function must return the Unicode scalar value
97	/// represented by this byte sequence, or -1 if the byte sequence is malformed,
98	/// or -n, where n is the number of bytes required for the sequence, if length is
99	/// shorter than the sequence.
100	/// The length of the sequence might not be determined by the first byte,
101	/// in which case the conversion becomes an iterative process:
102	/// The first call with length == 1 might return -2,
103	/// then a second call with length == 2 might return -4.
104	/// Eventually, the third call with length == 4 should return either a
105	/// Unicode scalar value, or -1 if the byte sequence is malformed.
106	/// The default implementation returns (int) bytes[0].
107
108	virtual int sequenceLength(const unsigned char* bytes, int length) const;
109	/// The sequenceLength function is used to get the length of the sequence pointed
110	/// to by bytes. The length parameter should be greater than or equal to the length of
111	/// the sequence.
112	///
113	/// The sequenceLength function must return the length of the sequence
114	/// represented by this byte sequence or a negative value -n if length is
115	/// shorter than the sequence, where n is the number of bytes required
116	/// to determine the length of the sequence.
117	/// The length of the sequence might not be determined by the first byte,
118	/// in which case the conversion becomes an iterative process as long as the
119	/// result is negative:
120	/// The first call with length == 1 might return -2,
121	/// then a second call with length == 2 might return -4.
122	/// Eventually, the third call with length == 4 should return 4.
123	/// The default implementation returns 1.
124
125	virtual int convert(int ch, unsigned char* bytes, int length) const;
126	/// Transforms the Unicode character ch into the encoding's
127	/// byte sequence. The method returns the number of bytes
128	/// used. The method must not write more than length bytes.
129	/// bytes and length can also be null (a null pointer and 0) - in this case only the number
130	/// of bytes required to represent ch is returned.
131	/// If the character cannot be converted, 0 is returned and
132	/// the byte sequence remains unchanged.
133	/// The default implementation simply returns 0.
134
135	static TextEncoding& byName(const std::string& encodingName);
136	/// Returns the TextEncoding object for the given encoding name.
137	///
138	/// Throws a NotFoundException if the encoding with the given name is not available.
139
140	static TextEncoding::Ptr find(const std::string& encodingName);
141	/// Returns a pointer to the TextEncoding object for the given encodingName,
142	/// or a null pointer if no such TextEncoding object exists.
143
144	static void add(TextEncoding::Ptr encoding);
145	/// Adds the given TextEncoding to the table of text encodings,
146	/// under the encoding's canonical name.
147	///
148	/// If an encoding with the given name is already registered,
149	/// it is replaced.
150
151	static void add(TextEncoding::Ptr encoding, const std::string& name);
152	/// Adds the given TextEncoding to the table of text encodings,
153	/// under the given name.
154	///
155	/// If an encoding with the given name is already registered,
156	/// it is replaced.
157
158	static void remove(const std::string& encodingName);
159	/// Removes the encoding with the given name from the table
160	/// of text encodings.
161
162	static TextEncoding::Ptr global(TextEncoding::Ptr encoding);
163	/// Sets the global TextEncoding object.
164	///
165	/// This function sets the global encoding to the argument and returns
166	/// the previous global encoding.
167
168	static TextEncoding& global();
169	/// Returns the current global TextEncoding object.
170
171	static const std::string GLOBAL;
172	/// Name of the global TextEncoding, which is the empty string.
173
174	protected:
175	static TextEncodingManager& manager();
176	/// Returns the TextEncodingManager.
177	};
178
179
180	} // namespace Poco
181
182
183	#endif // Foundation_TextEncoding_INCLUDED
184

Browse the source code of ClickHouse/contrib/poco/Foundation/include/Poco/TextEncoding.h