1 | // |
---|
2 | // TextEncoding.h |
---|
3 | // |
---|
4 | // $Id: //poco/1.3/Foundation/include/Poco/TextEncoding.h#4 $ |
---|
5 | // |
---|
6 | // Library: Foundation |
---|
7 | // Package: Text |
---|
8 | // Module: TextEncoding |
---|
9 | // |
---|
10 | // Definition of the abstract TextEncoding class. |
---|
11 | // |
---|
12 | // Copyright (c) 2004-2007, Applied Informatics Software Engineering GmbH. |
---|
13 | // and Contributors. |
---|
14 | // |
---|
15 | // Permission is hereby granted, free of charge, to any person or organization |
---|
16 | // obtaining a copy of the software and accompanying documentation covered by |
---|
17 | // this license (the "Software") to use, reproduce, display, distribute, |
---|
18 | // execute, and transmit the Software, and to prepare derivative works of the |
---|
19 | // Software, and to permit third-parties to whom the Software is furnished to |
---|
20 | // do so, all subject to the following: |
---|
21 | // |
---|
22 | // The copyright notices in the Software and this entire statement, including |
---|
23 | // the above license grant, this restriction and the following disclaimer, |
---|
24 | // must be included in all copies of the Software, in whole or in part, and |
---|
25 | // all derivative works of the Software, unless such copies or derivative |
---|
26 | // works are solely in the form of machine-executable object code generated by |
---|
27 | // a source language processor. |
---|
28 | // |
---|
29 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
---|
30 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
---|
31 | // FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT |
---|
32 | // SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE |
---|
33 | // FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, |
---|
34 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
---|
35 | // DEALINGS IN THE SOFTWARE. |
---|
36 | // |
---|
37 | |
---|
38 | |
---|
39 | #ifndef Foundation_TextEncoding_INCLUDED |
---|
40 | #define Foundation_TextEncoding_INCLUDED |
---|
41 | |
---|
42 | |
---|
43 | #include "Poco/Foundation.h" |
---|
44 | #include "Poco/SharedPtr.h" |
---|
45 | |
---|
46 | |
---|
47 | namespace Poco { |
---|
48 | |
---|
49 | |
---|
50 | class TextEncodingManager; |
---|
51 | |
---|
52 | |
---|
53 | class Foundation_API TextEncoding |
---|
54 | /// An abstract base class for implementing text encodings |
---|
55 | /// such as UTF-8 or ISO 8859-1. |
---|
56 | /// |
---|
57 | /// Subclasses must override the canonicalName(), isA(), |
---|
58 | /// characterMap() and convert() methods and must be |
---|
59 | /// thread safe and stateless. |
---|
60 | /// |
---|
61 | /// TextEncoding also provides static member functions |
---|
62 | /// for managing the mappings from encoding names to |
---|
63 | /// TextEncoding objects. |
---|
64 | { |
---|
65 | public: |
---|
66 | typedef SharedPtr<TextEncoding> Ptr; |
---|
67 | |
---|
68 | enum |
---|
69 | { |
---|
70 | MAX_SEQUENCE_LENGTH = 6 /// The maximum supported length, in bytes, of a character byte sequence. |
---|
71 | }; |
---|
72 | |
---|
73 | typedef int CharacterMap[256]; |
---|
74 | /// The map[b] member gives information about byte sequences |
---|
75 | /// whose first byte is b. |
---|
76 | /// If map[b] is c, where c >= 0, then b by itself encodes the Unicode scalar value c. |
---|
77 | /// If map[b] is -1, then the byte sequence is malformed. |
---|
78 | /// If map[b] is -n, where n >= 2, then b is the first byte of an n-byte |
---|
79 | /// sequence that encodes a single Unicode scalar value. Byte sequences of up |
---|
80 | /// to MAX_SEQUENCE_LENGTH (6) bytes in length are supported. |
---|
81 | |
---|
82 | virtual ~TextEncoding(); |
---|
83 | /// Destroys the encoding. |
---|
84 | |
---|
85 | virtual const char* canonicalName() const = 0; |
---|
86 | /// Returns the canonical name of this encoding, |
---|
87 | /// e.g. "ISO-8859-1". Encoding name comparisons are case |
---|
88 | /// insensitive. |
---|
89 | |
---|
90 | virtual bool isA(const std::string& encodingName) const = 0; |
---|
91 | /// Returns true if the given name is one of the names of this encoding. |
---|
92 | /// For example, the "ISO-8859-1" encoding is also known as "Latin-1". |
---|
93 | /// |
---|
94 | /// Encoding name comparisons are case insensitive. |
---|
95 | |
---|
96 | virtual const CharacterMap& characterMap() const = 0; |
---|
97 | /// Returns the CharacterMap for the encoding. |
---|
98 | /// The CharacterMap should be kept in a static member. As |
---|
99 | /// characterMap() can be called frequently, it should be |
---|
100 | /// implemented in such a way that it just returns a static |
---|
101 | /// map. If the map is built at runtime, this should be |
---|
102 | /// done in the constructor. |
---|
103 | |
---|
104 | virtual int convert(const unsigned char* bytes) const; |
---|
105 | /// The convert function is used to convert multibyte sequences; |
---|
106 | /// bytes will point to a byte sequence of n bytes where |
---|
107 | /// sequenceLength(bytes, length) == -n, with length >= n. |
---|
108 | /// |
---|
109 | /// The convert function must return the Unicode scalar value |
---|
110 | /// represented by this byte sequence or -1 if the byte sequence is malformed. |
---|
111 | /// The default implementation returns (int) bytes[0]. |
---|
112 | |
---|
113 | virtual int queryConvert(const unsigned char* bytes, int length) const; |
---|
114 | /// The queryConvert function is used to convert single byte characters |
---|
115 | /// or multibyte sequences; |
---|
116 | /// bytes will point to a byte sequence of length bytes. |
---|
117 | /// |
---|
118 | /// The queryConvert function must return the Unicode scalar value |
---|
119 | /// represented by this byte sequence, or -1 if the byte sequence is malformed, |
---|
120 | /// or -n, where n is the number of bytes requested for the sequence, if length is |
---|
121 | /// shorter than the sequence. |
---|
122 | /// The length of the sequence might not be determined by the first byte, |
---|
123 | /// in which case the conversion becomes an iterative process: |
---|
124 | /// A first call with length == 1 might return -2. |
---|
125 | /// Then a second call with length == 2 might return -4. |
---|
126 | /// Eventually, the third call with length == 4 should return either a |
---|
127 | /// Unicode scalar value, or -1 if the byte sequence is malformed. |
---|
128 | /// The default implementation returns (int) bytes[0]. |
---|
129 | |
---|
130 | virtual int sequenceLength(const unsigned char* bytes, int length) const; |
---|
131 | /// The sequenceLength function is used to get the length of the sequence pointed |
---|
132 | /// to by bytes. The length parameter should be greater than or equal to the length of |
---|
133 | /// the sequence. |
---|
134 | /// |
---|
135 | /// The sequenceLength function must return the length of the sequence |
---|
136 | /// represented by this byte sequence or a negative value -n if length is |
---|
137 | /// shorter than the sequence, where n is the number of bytes requested |
---|
138 | /// to determine the length of the sequence. |
---|
139 | /// The length of the sequence might not be determined by the first byte, |
---|
140 | /// in which case the conversion becomes an iterative process as long as the |
---|
141 | /// result is negative: |
---|
142 | /// A first call with length == 1 might return -2. |
---|
143 | /// Then a second call with length == 2 might return -4. |
---|
144 | /// Eventually, the third call with length == 4 should return 4. |
---|
145 | /// The default implementation returns 1. |
---|
146 | |
---|
147 | virtual int convert(int ch, unsigned char* bytes, int length) const; |
---|
148 | /// Transforms the Unicode character ch into the encoding's |
---|
149 | /// byte sequence. The method returns the number of bytes |
---|
150 | /// used. The method must not use more than length bytes. |
---|
151 | /// Bytes and length can also be null - in this case only the number |
---|
152 | /// of bytes required to represent ch is returned. |
---|
153 | /// If the character cannot be converted, 0 is returned and |
---|
154 | /// the byte sequence remains unchanged. |
---|
155 | /// The default implementation simply returns 0. |
---|
156 | |
---|
157 | static TextEncoding& byName(const std::string& encodingName); |
---|
158 | /// Returns the TextEncoding object for the given encoding name. |
---|
159 | /// |
---|
160 | /// Throws a NotFoundException if the encoding with the given name is not available. |
---|
161 | |
---|
162 | static TextEncoding::Ptr find(const std::string& encodingName); |
---|
163 | /// Returns a pointer to the TextEncoding object for the given encodingName, |
---|
164 | /// or NULL if no such TextEncoding object exists. |
---|
165 | |
---|
166 | static void add(TextEncoding::Ptr encoding); |
---|
167 | /// Adds the given TextEncoding to the table of text encodings, |
---|
168 | /// under the encoding's canonical name. |
---|
169 | /// |
---|
170 | /// If an encoding with the given name is already registered, |
---|
171 | /// it is replaced. |
---|
172 | |
---|
173 | static void add(TextEncoding::Ptr encoding, const std::string& name); |
---|
174 | /// Adds the given TextEncoding to the table of text encodings, |
---|
175 | /// under the given name. |
---|
176 | /// |
---|
177 | /// If an encoding with the given name is already registered, |
---|
178 | /// it is replaced. |
---|
179 | |
---|
180 | static void remove(const std::string& encodingName); |
---|
181 | /// Removes the encoding with the given name from the table |
---|
182 | /// of text encodings. |
---|
183 | |
---|
184 | static TextEncoding::Ptr global(TextEncoding::Ptr encoding); |
---|
185 | /// Sets the global TextEncoding object. |
---|
186 | /// |
---|
187 | /// This function sets the global encoding to the argument and returns a |
---|
188 | /// pointer to the previous global encoding. |
---|
189 | |
---|
190 | static TextEncoding& global(); |
---|
191 | /// Returns the current global TextEncoding object. |
---|
192 | |
---|
193 | static const std::string GLOBAL; |
---|
194 | /// Name of the global TextEncoding, which is the empty string. |
---|
195 | |
---|
196 | protected: |
---|
197 | static TextEncodingManager& manager(); |
---|
198 | /// Returns the TextEncodingManager. |
---|
199 | }; |
---|
200 | |
---|
201 | |
---|
202 | } // namespace Poco |
---|
203 | |
---|
204 | |
---|
205 | #endif // Foundation_TextEncoding_INCLUDED |
---|