1 | // |
2 | // TextEncoding.h |
3 | // |
4 | // Library: Foundation |
5 | // Package: Text |
6 | // Module: TextEncoding |
7 | // |
8 | // Definition of the abstract TextEncoding class. |
9 | // |
10 | // Copyright (c) 2004-2007, Applied Informatics Software Engineering GmbH. |
11 | // and Contributors. |
12 | // |
13 | // SPDX-License-Identifier: BSL-1.0 |
14 | // |
15 | |
16 | |
17 | #ifndef Foundation_TextEncoding_INCLUDED |
18 | #define Foundation_TextEncoding_INCLUDED |
19 | |
20 | |
21 | #include "Poco/Foundation.h" |
22 | #include "Poco/SharedPtr.h" |
23 | |
24 | |
25 | namespace Poco { |
26 | |
27 | |
28 | class TextEncodingManager; |
29 | |
30 | |
31 | class Foundation_API TextEncoding |
32 | /// An abstract base class for implementing text encodings |
33 | /// like UTF-8 or ISO 8859-1. |
34 | /// |
35 | /// Subclasses must override the canonicalName(), isA(), |
36 | /// characterMap() and convert() methods and need to be |
37 | /// thread safe and stateless. |
38 | /// |
39 | /// TextEncoding also provides static member functions |
40 | /// for managing mappings from encoding names to |
41 | /// TextEncoding objects. |
42 | { |
43 | public: |
44 | typedef SharedPtr<TextEncoding> Ptr; |
45 | |
46 | enum |
47 | { |
48 | MAX_SEQUENCE_LENGTH = 6 /// The maximum character byte sequence length supported. |
49 | }; |
50 | |
51 | typedef int CharacterMap[256]; |
52 | /// The map[b] member gives information about byte sequences |
53 | /// whose first byte is b. |
54 | /// If map[b] is c where c is >= 0, then b by itself encodes the Unicode scalar value c. |
55 | /// If map[b] is -1, then the byte sequence is malformed. |
56 | /// If map[b] is -n, where n >= 2, then b is the first byte of an n-byte |
57 | /// sequence that encodes a single Unicode scalar value. Byte sequences up |
58 | /// to 6 bytes in length are supported. |
59 | |
60 | virtual ~TextEncoding(); |
61 | /// Destroys the encoding. |
62 | |
63 | virtual const char* canonicalName() const = 0; |
64 | /// Returns the canonical name of this encoding, |
65 | /// e.g. "ISO-8859-1". Encoding name comparisons are case |
66 | /// insensitive. |
67 | |
68 | virtual bool isA(const std::string& encodingName) const = 0; |
69 | /// Returns true if the given name is one of the names of this encoding. |
70 | /// For example, the "ISO-8859-1" encoding is also known as "Latin-1". |
71 | /// |
72 | /// Encoding name comparisons are case insensitive. |
73 | |
74 | virtual const CharacterMap& characterMap() const = 0; |
75 | /// Returns the CharacterMap for the encoding. |
76 | /// The CharacterMap should be kept in a static member. As |
77 | /// characterMap() can be called frequently, it should be |
78 | /// implemented in such a way that it just returns a static |
79 | /// map. If the map is built at runtime, this should be |
80 | /// done in the constructor. |
81 | |
82 | virtual int convert(const unsigned char* bytes) const; |
83 | /// The convert function is used to convert multibyte sequences; |
84 | /// bytes will point to a byte sequence of n bytes where |
85 | /// sequenceLength(bytes, length) == -n, with length >= n. |
86 | /// |
87 | /// The convert function must return the Unicode scalar value |
88 | /// represented by this byte sequence or -1 if the byte sequence is malformed. |
89 | /// The default implementation returns (int) bytes[0]. |
90 | |
91 | virtual int queryConvert(const unsigned char* bytes, int length) const; |
92 | /// The queryConvert function is used to convert single byte characters |
93 | /// or multibyte sequences; |
94 | /// bytes will point to a byte sequence of length bytes. |
95 | /// |
96 | /// The queryConvert function must return the Unicode scalar value |
97 | /// represented by this byte sequence or -1 if the byte sequence is malformed |
98 | /// or -n where n is number of bytes requested for the sequence, if length is |
99 | /// shorter than the sequence. |
100 | /// The length of the sequence might not be determined by the first byte, |
101 | /// in which case the conversion becomes an iterative process: |
102 | /// First call with length == 1 might return -2, |
103 | /// Then a second call with length == 2 might return -4 |
104 | /// Eventually, the third call with length == 4 should return either a |
105 | /// Unicode scalar value, or -1 if the byte sequence is malformed. |
106 | /// The default implementation returns (int) bytes[0]. |
107 | |
108 | virtual int sequenceLength(const unsigned char* bytes, int length) const; |
109 | /// The sequenceLength function is used to get the lenth of the sequence pointed |
110 | /// by bytes. The length parameter should be greater or equal to the length of |
111 | /// the sequence. |
112 | /// |
113 | /// The sequenceLength function must return the length of the sequence |
114 | /// represented by this byte sequence or a negative value -n if length is |
115 | /// shorter than the sequence, where n is the number of byte requested |
116 | /// to determine the length of the sequence. |
117 | /// The length of the sequence might not be determined by the first byte, |
118 | /// in which case the conversion becomes an iterative process as long as the |
119 | /// result is negative: |
120 | /// First call with length == 1 might return -2, |
121 | /// Then a second call with length == 2 might return -4 |
122 | /// Eventually, the third call with length == 4 should return 4. |
123 | /// The default implementation returns 1. |
124 | |
125 | virtual int convert(int ch, unsigned char* bytes, int length) const; |
126 | /// Transform the Unicode character ch into the encoding's |
127 | /// byte sequence. The method returns the number of bytes |
128 | /// used. The method must not use more than length characters. |
129 | /// Bytes and length can also be null - in this case only the number |
130 | /// of bytes required to represent ch is returned. |
131 | /// If the character cannot be converted, 0 is returned and |
132 | /// the byte sequence remains unchanged. |
133 | /// The default implementation simply returns 0. |
134 | |
135 | static TextEncoding& byName(const std::string& encodingName); |
136 | /// Returns the TextEncoding object for the given encoding name. |
137 | /// |
138 | /// Throws a NotFoundException if the encoding with given name is not available. |
139 | |
140 | static TextEncoding::Ptr find(const std::string& encodingName); |
141 | /// Returns a pointer to the TextEncoding object for the given encodingName, |
142 | /// or NULL if no such TextEncoding object exists. |
143 | |
144 | static void add(TextEncoding::Ptr encoding); |
145 | /// Adds the given TextEncoding to the table of text encodings, |
146 | /// under the encoding's canonical name. |
147 | /// |
148 | /// If an encoding with the given name is already registered, |
149 | /// it is replaced. |
150 | |
151 | static void add(TextEncoding::Ptr encoding, const std::string& name); |
152 | /// Adds the given TextEncoding to the table of text encodings, |
153 | /// under the given name. |
154 | /// |
155 | /// If an encoding with the given name is already registered, |
156 | /// it is replaced. |
157 | |
158 | static void remove(const std::string& encodingName); |
159 | /// Removes the encoding with the given name from the table |
160 | /// of text encodings. |
161 | |
162 | static TextEncoding::Ptr global(TextEncoding::Ptr encoding); |
163 | /// Sets global TextEncoding object. |
164 | /// |
165 | /// This function sets the global encoding to the argument and returns a |
166 | /// reference of the previous global encoding. |
167 | |
168 | static TextEncoding& global(); |
169 | /// Return the current global TextEncoding object |
170 | |
171 | static const std::string GLOBAL; |
172 | /// Name of the global TextEncoding, which is the empty string. |
173 | |
174 | protected: |
175 | static TextEncodingManager& manager(); |
176 | /// Returns the TextEncodingManager. |
177 | }; |
178 | |
179 | |
180 | } // namespace Poco |
181 | |
182 | |
183 | #endif // Foundation_TextEncoding_INCLUDED |
184 | |