TextEncoding.h source code [POCO/Foundation/include/Poco/TextEncoding.h]

1	//
2	// TextEncoding.h
3	//
4	// Library: Foundation
5	// Package: Text
6	// Module: TextEncoding
7	//
8	// Definition of the abstract TextEncoding class.
9	//
10	// Copyright (c) 2004-2007, Applied Informatics Software Engineering GmbH.
11	// and Contributors.
12	//
13	// SPDX-License-Identifier: BSL-1.0
14	//
15
16
17	#ifndef Foundation_TextEncoding_INCLUDED
18	#define Foundation_TextEncoding_INCLUDED
19
20
21	#include "Poco/Foundation.h"
22	#include "Poco/SharedPtr.h"
23	#include "Poco/String.h"
24	#include "Poco/RWLock.h"
25	#include <map>
26
27
28	namespace Poco {
29
30
31	class Foundation_API TextEncodingRegistry;
32
33
34	class Foundation_API TextEncoding
35	/// An abstract base class for implementing text encodings
36	/// like UTF-8 or ISO 8859-1.
37	///
38	/// Subclasses must override the canonicalName(), isA(),
39	/// characterMap() and convert() methods and need to be
40	/// thread safe and stateless.
41	///
42	/// TextEncoding also provides static member functions
43	/// for managing mappings from encoding names to
44	/// TextEncoding objects.
45	{
46	public:
47	typedef SharedPtr<TextEncoding> Ptr;
48
49	enum
50	{
51	MAX_SEQUENCE_LENGTH = `4` /// The maximum character byte sequence length supported.
52	};
53
54	typedef int CharacterMap[`256`];
55	/// The map[b] member gives information about byte sequences
56	/// whose first byte is b.
57	/// If map[b] is c where c is >= 0, then b by itself encodes the Unicode scalar value c.
58	/// If map[b] is -1, then the byte sequence is malformed.
59	/// If map[b] is -n, where n >= 2, then b is the first byte of an n-byte
60	/// sequence that encodes a single Unicode scalar value. Byte sequences up
61	/// to 4 bytes in length are supported.
62
63	virtual ~TextEncoding();
64	/// Destroys the encoding.
65
66	virtual const char* canonicalName() const = `0`;
67	/// Returns the canonical name of this encoding,
68	/// e.g. "ISO-8859-1". Encoding name comparisons are case
69	/// insensitive.
70
71	virtual bool isA(const std::string& encodingName) const = `0`;
72	/// Returns true if the given name is one of the names of this encoding.
73	/// For example, the "ISO-8859-1" encoding is also known as "Latin-1".
74	///
75	/// Encoding name comparisons are case insensitive.
76
77	virtual const CharacterMap& characterMap() const = `0`;
78	/// Returns the CharacterMap for the encoding.
79	/// The CharacterMap should be kept in a static member. As
80	/// characterMap() can be called frequently, it should be
81	/// implemented in such a way that it just returns a static
82	/// map. If the map is built at runtime, this should be
83	/// done in the constructor.
84
85	virtual int convert(const unsigned char* bytes) const;
86	/// The convert function is used to convert multibyte sequences;
87	/// bytes will point to a byte sequence of n bytes where
88	/// sequenceLength(bytes, length) == -n, with length >= n.
89	///
90	/// The convert function must return the Unicode scalar value
91	/// represented by this byte sequence or -1 if the byte sequence is malformed.
92	///
93	/// The default implementation returns (int) bytes[0].
94
95	virtual int queryConvert(const unsigned char* bytes, int length) const;
96	/// The queryConvert function is used to convert single byte characters
97	/// or multibyte sequences;
98	/// bytes will point to a byte sequence of length bytes.
99	///
100	/// The queryConvert function must return the Unicode scalar value
101	/// represented by this byte sequence or -1 if the byte sequence is malformed
102	/// or -n where n is number of bytes requested for the sequence, if length is
103	/// shorter than the sequence.
104	/// The length of the sequence might not be determined by the first byte,
105	/// in which case the conversion becomes an iterative process:
106	/// First call with length == 1 might return -2,
107	/// Then a second call with length == 2 might return -4
108	/// Eventually, the third call with length == 4 should return either a
109	/// Unicode scalar value, or -1 if the byte sequence is malformed.
110	///
111	/// The default implementation returns (int) bytes[0].
112
113	virtual int sequenceLength(const unsigned char* bytes, int length) const;
114	/// The sequenceLength function is used to get the lenth of the sequence pointed
115	/// by bytes. The length parameter should be greater or equal to the length of
116	/// the sequence.
117	///
118	/// The sequenceLength function must return the length of the sequence
119	/// represented by this byte sequence or a negative value -n if length is
120	/// shorter than the sequence, where n is the number of byte requested
121	/// to determine the length of the sequence.
122	/// The length of the sequence might not be determined by the first byte,
123	/// in which case the conversion becomes an iterative process as long as the
124	/// result is negative:
125	/// First call with length == 1 might return -2,
126	/// Then a second call with length == 2 might return -4
127	/// Eventually, the third call with length == 4 should return 4.
128	/// The default implementation returns 1.
129
130	virtual int convert(int ch, unsigned char* bytes, int length) const;
131	/// Transform the Unicode character ch into the encoding's
132	/// byte sequence. The method returns the number of bytes
133	/// used. The method must not use more than length characters.
134	/// Bytes and length can also be null - in this case only the number
135	/// of bytes required to represent ch is returned.
136	/// If the character cannot be converted, 0 is returned and
137	/// the byte sequence remains unchanged.
138	/// The default implementation simply returns 0.
139
140	static TextEncoding& byName(const std::string& encodingName);
141	/// Returns the TextEncoding object for the given encoding name.
142	///
143	/// Throws a NotFoundException if the encoding with given name is not available.
144
145	static TextEncoding::Ptr find(const std::string& encodingName);
146	/// Returns a pointer to the TextEncoding object for the given encodingName,
147	/// or NULL if no such TextEncoding object exists.
148
149	static void add(TextEncoding::Ptr encoding);
150	/// Adds the given TextEncoding to the table of text encodings,
151	/// under the encoding's canonical name.
152	///
153	/// If an encoding with the given name is already registered,
154	/// it is replaced.
155
156	static void add(TextEncoding::Ptr encoding, const std::string& name);
157	/// Adds the given TextEncoding to the table of text encodings,
158	/// under the given name.
159	///
160	/// If an encoding with the given name is already registered,
161	/// it is replaced.
162
163	static void remove(const std::string& encodingName);
164	/// Removes the encoding with the given name from the table
165	/// of text encodings.
166
167	static TextEncoding::Ptr global(TextEncoding::Ptr encoding);
168	/// Sets global TextEncoding object.
169	///
170	/// This function sets the global encoding to the argument and returns a
171	/// reference of the previous global encoding.
172
173	static TextEncoding& global();
174	/// Return the current global TextEncoding object
175
176	static const std::string GLOBAL;
177	/// Name of the global TextEncoding, which is the empty string.
178
179	static const TextEncodingRegistry& registry();
180	/// Returns the TextEncodingRegistry.
181
182	protected:
183	static TextEncodingRegistry* registry(int);
184	/// Returns the TextEncodingRegistry.
185	};
186
187
188	class Foundation_API TextEncodingRegistry
189	/// This class serves as the main registry for all
190	/// supported TextEncoding's.
191	{
192	public:
193	TextEncodingRegistry();
194	/// Constructs TextEncodingRegistry
195
196	~TextEncodingRegistry();
197	/// Destroys TextEncodingRegistry
198
199	bool has(const std::string& name) const;
200	// Returns true if requested encoding is found.
201	// it will eturn true for both canonical and
202	// alternative encoding name.
203
204	void add(TextEncoding::Ptr pEncoding);
205	/// Adds encoding to the registry under its canonnical name.
206
207	void add(TextEncoding::Ptr pEncoding, const std::string& name);
208	/// Adds encoding to the registry under the specified name.
209
210	void remove(const std::string& name);
211	/// Removes the specified encoding from the registry.
212
213	TextEncoding::Ptr find(const std::string& name) const;
214	/// Returns Ptr to the enconding registerd under the speciied
215	/// name or having the name as an alias.
216	///
217	/// If encoding is not found, the returned Ptr points to nothing.
218
219	private:
220	TextEncodingRegistry(const TextEncodingRegistry&);
221	TextEncodingRegistry& operator = (const TextEncodingRegistry&);
222
223	typedef std::map<std::string, TextEncoding::Ptr, CILess> EncodingMap;
224
225	EncodingMap _encodings;
226	mutable RWLock _lock;
227	};
228
229
230	} // namespace Poco
231
232
233	#endif // Foundation_TextEncoding_INCLUDED
234

Browse the source code of POCO/Foundation/include/Poco/TextEncoding.h