//
// TextEncoding.h
//
// Library: Foundation
// Package: Text
// Module: TextEncoding
//
// Definition of the abstract TextEncoding class.
//
// Copyright (c) 2004-2007, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier: BSL-1.0
//
15
16
17#ifndef Foundation_TextEncoding_INCLUDED
18#define Foundation_TextEncoding_INCLUDED
19
20
#include "Poco/Foundation.h"
#include "Poco/SharedPtr.h"
#include <string>
23
24
25namespace Poco {
26
27
// Forward declaration: this header only needs the name, because
// TextEncoding::manager() returns a TextEncodingManager by reference.
class TextEncodingManager;
29
30
31class Foundation_API TextEncoding
32 /// An abstract base class for implementing text encodings
33 /// like UTF-8 or ISO 8859-1.
34 ///
35 /// Subclasses must override the canonicalName(), isA(),
36 /// characterMap() and convert() methods and need to be
37 /// thread safe and stateless.
38 ///
39 /// TextEncoding also provides static member functions
40 /// for managing mappings from encoding names to
41 /// TextEncoding objects.
42{
43public:
44 typedef SharedPtr<TextEncoding> Ptr;
45
46 enum
47 {
48 MAX_SEQUENCE_LENGTH = 6 /// The maximum character byte sequence length supported.
49 };
50
51 typedef int CharacterMap[256];
52 /// The map[b] member gives information about byte sequences
53 /// whose first byte is b.
54 /// If map[b] is c where c is >= 0, then b by itself encodes the Unicode scalar value c.
55 /// If map[b] is -1, then the byte sequence is malformed.
56 /// If map[b] is -n, where n >= 2, then b is the first byte of an n-byte
57 /// sequence that encodes a single Unicode scalar value. Byte sequences up
58 /// to 6 bytes in length are supported.
59
60 virtual ~TextEncoding();
61 /// Destroys the encoding.
62
63 virtual const char* canonicalName() const = 0;
64 /// Returns the canonical name of this encoding,
65 /// e.g. "ISO-8859-1". Encoding name comparisons are case
66 /// insensitive.
67
68 virtual bool isA(const std::string& encodingName) const = 0;
69 /// Returns true if the given name is one of the names of this encoding.
70 /// For example, the "ISO-8859-1" encoding is also known as "Latin-1".
71 ///
72 /// Encoding name comparisons are case insensitive.
73
74 virtual const CharacterMap& characterMap() const = 0;
75 /// Returns the CharacterMap for the encoding.
76 /// The CharacterMap should be kept in a static member. As
77 /// characterMap() can be called frequently, it should be
78 /// implemented in such a way that it just returns a static
79 /// map. If the map is built at runtime, this should be
80 /// done in the constructor.
81
82 virtual int convert(const unsigned char* bytes) const;
83 /// The convert function is used to convert multibyte sequences;
84 /// bytes will point to a byte sequence of n bytes where
85 /// sequenceLength(bytes, length) == -n, with length >= n.
86 ///
87 /// The convert function must return the Unicode scalar value
88 /// represented by this byte sequence or -1 if the byte sequence is malformed.
89 /// The default implementation returns (int) bytes[0].
90
91 virtual int queryConvert(const unsigned char* bytes, int length) const;
92 /// The queryConvert function is used to convert single byte characters
93 /// or multibyte sequences;
94 /// bytes will point to a byte sequence of length bytes.
95 ///
96 /// The queryConvert function must return the Unicode scalar value
97 /// represented by this byte sequence or -1 if the byte sequence is malformed
98 /// or -n where n is number of bytes requested for the sequence, if length is
99 /// shorter than the sequence.
100 /// The length of the sequence might not be determined by the first byte,
101 /// in which case the conversion becomes an iterative process:
102 /// First call with length == 1 might return -2,
103 /// Then a second call with length == 2 might return -4
104 /// Eventually, the third call with length == 4 should return either a
105 /// Unicode scalar value, or -1 if the byte sequence is malformed.
106 /// The default implementation returns (int) bytes[0].
107
108 virtual int sequenceLength(const unsigned char* bytes, int length) const;
109 /// The sequenceLength function is used to get the lenth of the sequence pointed
110 /// by bytes. The length parameter should be greater or equal to the length of
111 /// the sequence.
112 ///
113 /// The sequenceLength function must return the length of the sequence
114 /// represented by this byte sequence or a negative value -n if length is
115 /// shorter than the sequence, where n is the number of byte requested
116 /// to determine the length of the sequence.
117 /// The length of the sequence might not be determined by the first byte,
118 /// in which case the conversion becomes an iterative process as long as the
119 /// result is negative:
120 /// First call with length == 1 might return -2,
121 /// Then a second call with length == 2 might return -4
122 /// Eventually, the third call with length == 4 should return 4.
123 /// The default implementation returns 1.
124
125 virtual int convert(int ch, unsigned char* bytes, int length) const;
126 /// Transform the Unicode character ch into the encoding's
127 /// byte sequence. The method returns the number of bytes
128 /// used. The method must not use more than length characters.
129 /// Bytes and length can also be null - in this case only the number
130 /// of bytes required to represent ch is returned.
131 /// If the character cannot be converted, 0 is returned and
132 /// the byte sequence remains unchanged.
133 /// The default implementation simply returns 0.
134
135 static TextEncoding& byName(const std::string& encodingName);
136 /// Returns the TextEncoding object for the given encoding name.
137 ///
138 /// Throws a NotFoundException if the encoding with given name is not available.
139
140 static TextEncoding::Ptr find(const std::string& encodingName);
141 /// Returns a pointer to the TextEncoding object for the given encodingName,
142 /// or NULL if no such TextEncoding object exists.
143
144 static void add(TextEncoding::Ptr encoding);
145 /// Adds the given TextEncoding to the table of text encodings,
146 /// under the encoding's canonical name.
147 ///
148 /// If an encoding with the given name is already registered,
149 /// it is replaced.
150
151 static void add(TextEncoding::Ptr encoding, const std::string& name);
152 /// Adds the given TextEncoding to the table of text encodings,
153 /// under the given name.
154 ///
155 /// If an encoding with the given name is already registered,
156 /// it is replaced.
157
158 static void remove(const std::string& encodingName);
159 /// Removes the encoding with the given name from the table
160 /// of text encodings.
161
162 static TextEncoding::Ptr global(TextEncoding::Ptr encoding);
163 /// Sets global TextEncoding object.
164 ///
165 /// This function sets the global encoding to the argument and returns a
166 /// reference of the previous global encoding.
167
168 static TextEncoding& global();
169 /// Return the current global TextEncoding object
170
171 static const std::string GLOBAL;
172 /// Name of the global TextEncoding, which is the empty string.
173
174protected:
175 static TextEncodingManager& manager();
176 /// Returns the TextEncodingManager.
177};
178
179
180} // namespace Poco
181
182
183#endif // Foundation_TextEncoding_INCLUDED
184