//
// TextEncoding.h
//
// Library: Foundation
// Package: Text
// Module: TextEncoding
//
// Definition of the abstract TextEncoding class.
//
// Copyright (c) 2004-2007, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier: BSL-1.0
//
15
16
17#ifndef Foundation_TextEncoding_INCLUDED
18#define Foundation_TextEncoding_INCLUDED
19
20
21#include "Poco/Foundation.h"
22#include "Poco/SharedPtr.h"
23#include "Poco/String.h"
24#include "Poco/RWLock.h"
25#include <map>
26
27
28namespace Poco {
29
30
31class Foundation_API TextEncodingRegistry;
32
33
34class Foundation_API TextEncoding
35 /// An abstract base class for implementing text encodings
36 /// like UTF-8 or ISO 8859-1.
37 ///
38 /// Subclasses must override the canonicalName(), isA(),
39 /// characterMap() and convert() methods and need to be
40 /// thread safe and stateless.
41 ///
42 /// TextEncoding also provides static member functions
43 /// for managing mappings from encoding names to
44 /// TextEncoding objects.
45{
46public:
47 typedef SharedPtr<TextEncoding> Ptr;
48
49 enum
50 {
51 MAX_SEQUENCE_LENGTH = 4 /// The maximum character byte sequence length supported.
52 };
53
54 typedef int CharacterMap[256];
55 /// The map[b] member gives information about byte sequences
56 /// whose first byte is b.
57 /// If map[b] is c where c is >= 0, then b by itself encodes the Unicode scalar value c.
58 /// If map[b] is -1, then the byte sequence is malformed.
59 /// If map[b] is -n, where n >= 2, then b is the first byte of an n-byte
60 /// sequence that encodes a single Unicode scalar value. Byte sequences up
61 /// to 4 bytes in length are supported.
62
63 virtual ~TextEncoding();
64 /// Destroys the encoding.
65
66 virtual const char* canonicalName() const = 0;
67 /// Returns the canonical name of this encoding,
68 /// e.g. "ISO-8859-1". Encoding name comparisons are case
69 /// insensitive.
70
71 virtual bool isA(const std::string& encodingName) const = 0;
72 /// Returns true if the given name is one of the names of this encoding.
73 /// For example, the "ISO-8859-1" encoding is also known as "Latin-1".
74 ///
75 /// Encoding name comparisons are case insensitive.
76
77 virtual const CharacterMap& characterMap() const = 0;
78 /// Returns the CharacterMap for the encoding.
79 /// The CharacterMap should be kept in a static member. As
80 /// characterMap() can be called frequently, it should be
81 /// implemented in such a way that it just returns a static
82 /// map. If the map is built at runtime, this should be
83 /// done in the constructor.
84
85 virtual int convert(const unsigned char* bytes) const;
86 /// The convert function is used to convert multibyte sequences;
87 /// bytes will point to a byte sequence of n bytes where
88 /// sequenceLength(bytes, length) == -n, with length >= n.
89 ///
90 /// The convert function must return the Unicode scalar value
91 /// represented by this byte sequence or -1 if the byte sequence is malformed.
92 ///
93 /// The default implementation returns (int) bytes[0].
94
95 virtual int queryConvert(const unsigned char* bytes, int length) const;
96 /// The queryConvert function is used to convert single byte characters
97 /// or multibyte sequences;
98 /// bytes will point to a byte sequence of length bytes.
99 ///
100 /// The queryConvert function must return the Unicode scalar value
101 /// represented by this byte sequence or -1 if the byte sequence is malformed
102 /// or -n where n is number of bytes requested for the sequence, if length is
103 /// shorter than the sequence.
104 /// The length of the sequence might not be determined by the first byte,
105 /// in which case the conversion becomes an iterative process:
106 /// First call with length == 1 might return -2,
107 /// Then a second call with length == 2 might return -4
108 /// Eventually, the third call with length == 4 should return either a
109 /// Unicode scalar value, or -1 if the byte sequence is malformed.
110 ///
111 /// The default implementation returns (int) bytes[0].
112
113 virtual int sequenceLength(const unsigned char* bytes, int length) const;
114 /// The sequenceLength function is used to get the lenth of the sequence pointed
115 /// by bytes. The length parameter should be greater or equal to the length of
116 /// the sequence.
117 ///
118 /// The sequenceLength function must return the length of the sequence
119 /// represented by this byte sequence or a negative value -n if length is
120 /// shorter than the sequence, where n is the number of byte requested
121 /// to determine the length of the sequence.
122 /// The length of the sequence might not be determined by the first byte,
123 /// in which case the conversion becomes an iterative process as long as the
124 /// result is negative:
125 /// First call with length == 1 might return -2,
126 /// Then a second call with length == 2 might return -4
127 /// Eventually, the third call with length == 4 should return 4.
128 /// The default implementation returns 1.
129
130 virtual int convert(int ch, unsigned char* bytes, int length) const;
131 /// Transform the Unicode character ch into the encoding's
132 /// byte sequence. The method returns the number of bytes
133 /// used. The method must not use more than length characters.
134 /// Bytes and length can also be null - in this case only the number
135 /// of bytes required to represent ch is returned.
136 /// If the character cannot be converted, 0 is returned and
137 /// the byte sequence remains unchanged.
138 /// The default implementation simply returns 0.
139
140 static TextEncoding& byName(const std::string& encodingName);
141 /// Returns the TextEncoding object for the given encoding name.
142 ///
143 /// Throws a NotFoundException if the encoding with given name is not available.
144
145 static TextEncoding::Ptr find(const std::string& encodingName);
146 /// Returns a pointer to the TextEncoding object for the given encodingName,
147 /// or NULL if no such TextEncoding object exists.
148
149 static void add(TextEncoding::Ptr encoding);
150 /// Adds the given TextEncoding to the table of text encodings,
151 /// under the encoding's canonical name.
152 ///
153 /// If an encoding with the given name is already registered,
154 /// it is replaced.
155
156 static void add(TextEncoding::Ptr encoding, const std::string& name);
157 /// Adds the given TextEncoding to the table of text encodings,
158 /// under the given name.
159 ///
160 /// If an encoding with the given name is already registered,
161 /// it is replaced.
162
163 static void remove(const std::string& encodingName);
164 /// Removes the encoding with the given name from the table
165 /// of text encodings.
166
167 static TextEncoding::Ptr global(TextEncoding::Ptr encoding);
168 /// Sets global TextEncoding object.
169 ///
170 /// This function sets the global encoding to the argument and returns a
171 /// reference of the previous global encoding.
172
173 static TextEncoding& global();
174 /// Return the current global TextEncoding object
175
176 static const std::string GLOBAL;
177 /// Name of the global TextEncoding, which is the empty string.
178
179 static const TextEncodingRegistry& registry();
180 /// Returns the TextEncodingRegistry.
181
182protected:
183 static TextEncodingRegistry* registry(int);
184 /// Returns the TextEncodingRegistry.
185};
186
187
188class Foundation_API TextEncodingRegistry
189 /// This class serves as the main registry for all
190 /// supported TextEncoding's.
191{
192public:
193 TextEncodingRegistry();
194 /// Constructs TextEncodingRegistry
195
196 ~TextEncodingRegistry();
197 /// Destroys TextEncodingRegistry
198
199 bool has(const std::string& name) const;
200 // Returns true if requested encoding is found.
201 // it will eturn true for both canonical and
202 // alternative encoding name.
203
204 void add(TextEncoding::Ptr pEncoding);
205 /// Adds encoding to the registry under its canonnical name.
206
207 void add(TextEncoding::Ptr pEncoding, const std::string& name);
208 /// Adds encoding to the registry under the specified name.
209
210 void remove(const std::string& name);
211 /// Removes the specified encoding from the registry.
212
213 TextEncoding::Ptr find(const std::string& name) const;
214 /// Returns Ptr to the enconding registerd under the speciied
215 /// name or having the name as an alias.
216 ///
217 /// If encoding is not found, the returned Ptr points to nothing.
218
219private:
220 TextEncodingRegistry(const TextEncodingRegistry&);
221 TextEncodingRegistry& operator = (const TextEncodingRegistry&);
222
223 typedef std::map<std::string, TextEncoding::Ptr, CILess> EncodingMap;
224
225 EncodingMap _encodings;
226 mutable RWLock _lock;
227};
228
229
230} // namespace Poco
231
232
233#endif // Foundation_TextEncoding_INCLUDED
234