1 | // |
2 | // TextEncoding.h |
3 | // |
4 | // Library: Foundation |
5 | // Package: Text |
6 | // Module: TextEncoding |
7 | // |
8 | // Definition of the abstract TextEncoding class. |
9 | // |
10 | // Copyright (c) 2004-2007, Applied Informatics Software Engineering GmbH. |
11 | // and Contributors. |
12 | // |
13 | // SPDX-License-Identifier: BSL-1.0 |
14 | // |
15 | |
16 | |
17 | #ifndef Foundation_TextEncoding_INCLUDED |
18 | #define Foundation_TextEncoding_INCLUDED |
19 | |
20 | |
21 | #include "Poco/Foundation.h" |
22 | #include "Poco/SharedPtr.h" |
23 | #include "Poco/String.h" |
24 | #include "Poco/RWLock.h" |
25 | #include <map> |
26 | |
27 | |
28 | namespace Poco { |
29 | |
30 | |
31 | class Foundation_API TextEncodingRegistry; |
32 | |
33 | |
34 | class Foundation_API TextEncoding |
35 | /// An abstract base class for implementing text encodings |
36 | /// like UTF-8 or ISO 8859-1. |
37 | /// |
38 | /// Subclasses must override the canonicalName(), isA(), |
39 | /// characterMap() and convert() methods and need to be |
40 | /// thread safe and stateless. |
41 | /// |
42 | /// TextEncoding also provides static member functions |
43 | /// for managing mappings from encoding names to |
44 | /// TextEncoding objects. |
45 | { |
46 | public: |
47 | typedef SharedPtr<TextEncoding> Ptr; |
48 | |
49 | enum |
50 | { |
51 | MAX_SEQUENCE_LENGTH = 4 /// The maximum character byte sequence length supported. |
52 | }; |
53 | |
54 | typedef int CharacterMap[256]; |
55 | /// The map[b] member gives information about byte sequences |
56 | /// whose first byte is b. |
57 | /// If map[b] is c where c is >= 0, then b by itself encodes the Unicode scalar value c. |
58 | /// If map[b] is -1, then the byte sequence is malformed. |
59 | /// If map[b] is -n, where n >= 2, then b is the first byte of an n-byte |
60 | /// sequence that encodes a single Unicode scalar value. Byte sequences up |
61 | /// to 4 bytes in length are supported. |
62 | |
63 | virtual ~TextEncoding(); |
64 | /// Destroys the encoding. |
65 | |
66 | virtual const char* canonicalName() const = 0; |
67 | /// Returns the canonical name of this encoding, |
68 | /// e.g. "ISO-8859-1". Encoding name comparisons are case |
69 | /// insensitive. |
70 | |
71 | virtual bool isA(const std::string& encodingName) const = 0; |
72 | /// Returns true if the given name is one of the names of this encoding. |
73 | /// For example, the "ISO-8859-1" encoding is also known as "Latin-1". |
74 | /// |
75 | /// Encoding name comparisons are case insensitive. |
76 | |
77 | virtual const CharacterMap& characterMap() const = 0; |
78 | /// Returns the CharacterMap for the encoding. |
79 | /// The CharacterMap should be kept in a static member. As |
80 | /// characterMap() can be called frequently, it should be |
81 | /// implemented in such a way that it just returns a static |
82 | /// map. If the map is built at runtime, this should be |
83 | /// done in the constructor. |
84 | |
85 | virtual int convert(const unsigned char* bytes) const; |
86 | /// The convert function is used to convert multibyte sequences; |
87 | /// bytes will point to a byte sequence of n bytes where |
88 | /// sequenceLength(bytes, length) == -n, with length >= n. |
89 | /// |
90 | /// The convert function must return the Unicode scalar value |
91 | /// represented by this byte sequence or -1 if the byte sequence is malformed. |
92 | /// |
93 | /// The default implementation returns (int) bytes[0]. |
94 | |
95 | virtual int queryConvert(const unsigned char* bytes, int length) const; |
96 | /// The queryConvert function is used to convert single byte characters |
97 | /// or multibyte sequences; |
98 | /// bytes will point to a byte sequence of length bytes. |
99 | /// |
100 | /// The queryConvert function must return the Unicode scalar value |
101 | /// represented by this byte sequence or -1 if the byte sequence is malformed |
102 | /// or -n where n is number of bytes requested for the sequence, if length is |
103 | /// shorter than the sequence. |
104 | /// The length of the sequence might not be determined by the first byte, |
105 | /// in which case the conversion becomes an iterative process: |
106 | /// First call with length == 1 might return -2, |
107 | /// Then a second call with length == 2 might return -4 |
108 | /// Eventually, the third call with length == 4 should return either a |
109 | /// Unicode scalar value, or -1 if the byte sequence is malformed. |
110 | /// |
111 | /// The default implementation returns (int) bytes[0]. |
112 | |
113 | virtual int sequenceLength(const unsigned char* bytes, int length) const; |
114 | /// The sequenceLength function is used to get the lenth of the sequence pointed |
115 | /// by bytes. The length parameter should be greater or equal to the length of |
116 | /// the sequence. |
117 | /// |
118 | /// The sequenceLength function must return the length of the sequence |
119 | /// represented by this byte sequence or a negative value -n if length is |
120 | /// shorter than the sequence, where n is the number of byte requested |
121 | /// to determine the length of the sequence. |
122 | /// The length of the sequence might not be determined by the first byte, |
123 | /// in which case the conversion becomes an iterative process as long as the |
124 | /// result is negative: |
125 | /// First call with length == 1 might return -2, |
126 | /// Then a second call with length == 2 might return -4 |
127 | /// Eventually, the third call with length == 4 should return 4. |
128 | /// The default implementation returns 1. |
129 | |
130 | virtual int convert(int ch, unsigned char* bytes, int length) const; |
131 | /// Transform the Unicode character ch into the encoding's |
132 | /// byte sequence. The method returns the number of bytes |
133 | /// used. The method must not use more than length characters. |
134 | /// Bytes and length can also be null - in this case only the number |
135 | /// of bytes required to represent ch is returned. |
136 | /// If the character cannot be converted, 0 is returned and |
137 | /// the byte sequence remains unchanged. |
138 | /// The default implementation simply returns 0. |
139 | |
140 | static TextEncoding& byName(const std::string& encodingName); |
141 | /// Returns the TextEncoding object for the given encoding name. |
142 | /// |
143 | /// Throws a NotFoundException if the encoding with given name is not available. |
144 | |
145 | static TextEncoding::Ptr find(const std::string& encodingName); |
146 | /// Returns a pointer to the TextEncoding object for the given encodingName, |
147 | /// or NULL if no such TextEncoding object exists. |
148 | |
149 | static void add(TextEncoding::Ptr encoding); |
150 | /// Adds the given TextEncoding to the table of text encodings, |
151 | /// under the encoding's canonical name. |
152 | /// |
153 | /// If an encoding with the given name is already registered, |
154 | /// it is replaced. |
155 | |
156 | static void add(TextEncoding::Ptr encoding, const std::string& name); |
157 | /// Adds the given TextEncoding to the table of text encodings, |
158 | /// under the given name. |
159 | /// |
160 | /// If an encoding with the given name is already registered, |
161 | /// it is replaced. |
162 | |
163 | static void remove(const std::string& encodingName); |
164 | /// Removes the encoding with the given name from the table |
165 | /// of text encodings. |
166 | |
167 | static TextEncoding::Ptr global(TextEncoding::Ptr encoding); |
168 | /// Sets global TextEncoding object. |
169 | /// |
170 | /// This function sets the global encoding to the argument and returns a |
171 | /// reference of the previous global encoding. |
172 | |
173 | static TextEncoding& global(); |
174 | /// Return the current global TextEncoding object |
175 | |
176 | static const std::string GLOBAL; |
177 | /// Name of the global TextEncoding, which is the empty string. |
178 | |
179 | static const TextEncodingRegistry& registry(); |
180 | /// Returns the TextEncodingRegistry. |
181 | |
182 | protected: |
183 | static TextEncodingRegistry* registry(int); |
184 | /// Returns the TextEncodingRegistry. |
185 | }; |
186 | |
187 | |
188 | class Foundation_API TextEncodingRegistry |
189 | /// This class serves as the main registry for all |
190 | /// supported TextEncoding's. |
191 | { |
192 | public: |
193 | TextEncodingRegistry(); |
194 | /// Constructs TextEncodingRegistry |
195 | |
196 | ~TextEncodingRegistry(); |
197 | /// Destroys TextEncodingRegistry |
198 | |
199 | bool has(const std::string& name) const; |
200 | // Returns true if requested encoding is found. |
201 | // it will eturn true for both canonical and |
202 | // alternative encoding name. |
203 | |
204 | void add(TextEncoding::Ptr pEncoding); |
205 | /// Adds encoding to the registry under its canonnical name. |
206 | |
207 | void add(TextEncoding::Ptr pEncoding, const std::string& name); |
208 | /// Adds encoding to the registry under the specified name. |
209 | |
210 | void remove(const std::string& name); |
211 | /// Removes the specified encoding from the registry. |
212 | |
213 | TextEncoding::Ptr find(const std::string& name) const; |
214 | /// Returns Ptr to the enconding registerd under the speciied |
215 | /// name or having the name as an alias. |
216 | /// |
217 | /// If encoding is not found, the returned Ptr points to nothing. |
218 | |
219 | private: |
220 | TextEncodingRegistry(const TextEncodingRegistry&); |
221 | TextEncodingRegistry& operator = (const TextEncodingRegistry&); |
222 | |
223 | typedef std::map<std::string, TextEncoding::Ptr, CILess> EncodingMap; |
224 | |
225 | EncodingMap _encodings; |
226 | mutable RWLock _lock; |
227 | }; |
228 | |
229 | |
230 | } // namespace Poco |
231 | |
232 | |
233 | #endif // Foundation_TextEncoding_INCLUDED |
234 | |