1 | // |
---|
2 | // Unicode.h |
---|
3 | // |
---|
4 | // $Id: //poco/1.3/Foundation/include/Poco/Unicode.h#2 $ |
---|
5 | // |
---|
6 | // Library: Foundation |
---|
7 | // Package: Text |
---|
8 | // Module: Unicode |
---|
9 | // |
---|
10 | // Definition of the Unicode class. |
---|
11 | // |
---|
12 | // Copyright (c) 2007, Applied Informatics Software Engineering GmbH. |
---|
13 | // and Contributors. |
---|
14 | // |
---|
15 | // Permission is hereby granted, free of charge, to any person or organization |
---|
16 | // obtaining a copy of the software and accompanying documentation covered by |
---|
17 | // this license (the "Software") to use, reproduce, display, distribute, |
---|
18 | // execute, and transmit the Software, and to prepare derivative works of the |
---|
19 | // Software, and to permit third-parties to whom the Software is furnished to |
---|
20 | // do so, all subject to the following: |
---|
21 | // |
---|
22 | // The copyright notices in the Software and this entire statement, including |
---|
23 | // the above license grant, this restriction and the following disclaimer, |
---|
24 | // must be included in all copies of the Software, in whole or in part, and |
---|
25 | // all derivative works of the Software, unless such copies or derivative |
---|
26 | // works are solely in the form of machine-executable object code generated by |
---|
27 | // a source language processor. |
---|
28 | // |
---|
29 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
---|
30 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
---|
31 | // FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT |
---|
32 | // SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE |
---|
33 | // FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, |
---|
34 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
---|
35 | // DEALINGS IN THE SOFTWARE. |
---|
36 | // |
---|
37 | |
---|
38 | |
---|
39 | #ifndef Foundation_Unicode_INCLUDED |
---|
40 | #define Foundation_Unicode_INCLUDED |
---|
41 | |
---|
42 | |
---|
43 | #include "Poco/Foundation.h" |
---|
44 | |
---|
45 | |
---|
46 | namespace Poco { |
---|
47 | |
---|
48 | |
---|
49 | class Foundation_API Unicode |
---|
50 | /// This class contains enumerations and static |
---|
51 | /// utility functions for dealing with Unicode characters |
---|
52 | /// and their properties. |
---|
53 | /// |
---|
54 | /// For more information on Unicode, see <http://www.unicode.org>. |
---|
55 | /// |
---|
56 | /// The implementation is based on the Unicode support |
---|
57 | /// functions in PCRE. |
---|
58 | { |
---|
59 | public: |
---|
60 | // Implementation note: the following definitions must be kept |
---|
61 | // in sync with those from ucp.h (PCRE). |
---|
62 | enum CharacterCategory |
---|
63 | /// Unicode 5.0 character categories. |
---|
64 | { |
---|
65 | UCP_OTHER, |
---|
66 | UCP_LETTER, |
---|
67 | UCP_MARK, |
---|
68 | UCP_NUMBER, |
---|
69 | UCP_PUNCTUATION, |
---|
70 | UCP_SYMBOL, |
---|
71 | UCP_SEPARATOR |
---|
72 | }; |
---|
73 | |
---|
74 | enum CharacterType |
---|
75 | /// Unicode 5.0 character types. |
---|
76 | { |
---|
77 | UCP_CONTROL, |
---|
78 | UCP_FORMAT, |
---|
79 | UCP_UNASSIGNED, |
---|
80 | UCP_PRIVATE_USE, |
---|
81 | UCP_SURROGATE, |
---|
82 | UCP_LOWER_CASE_LETTER, |
---|
83 | UCP_MODIFIER_LETTER, |
---|
84 | UCP_OTHER_LETTER, |
---|
85 | UCP_TITLE_CASE_LETTER, |
---|
86 | UCP_UPPER_CASE_LETTER, |
---|
87 | UCP_SPACING_MARK, |
---|
88 | UCP_ENCLOSING_MARK, |
---|
89 | UCP_NON_SPACING_MARK, |
---|
90 | UCP_DECIMAL_NUMBER, |
---|
91 | UCP_LETTER_NUMBER, |
---|
92 | UCP_OTHER_NUMBER, |
---|
93 | UCP_CONNECTOR_PUNCTUATION, |
---|
94 | UCP_DASH_PUNCTUATION, |
---|
95 | UCP_CLOSE_PUNCTUATION, |
---|
96 | UCP_FINAL_PUNCTUATION, |
---|
97 | UCP_INITIAL_PUNCTUATION, |
---|
98 | UCP_OTHER_PUNCTUATION, |
---|
99 | UCP_OPEN_PUNCTUATION, |
---|
100 | UCP_CURRENCY_SYMBOL, |
---|
101 | UCP_MODIFIER_SYMBOL, |
---|
102 | UCP_MATHEMATICAL_SYMBOL, |
---|
103 | UCP_OTHER_SYMBOL, |
---|
104 | UCP_LINE_SEPARATOR, |
---|
105 | UCP_PARAGRAPH_SEPARATOR, |
---|
106 | UCP_SPACE_SEPARATOR |
---|
107 | }; |
---|
108 | |
---|
109 | enum Script |
---|
110 | /// Unicode 5.0 scripts. |
---|
111 | { |
---|
112 | UCP_ARABIC, |
---|
113 | UCP_ARMENIAN, |
---|
114 | UCP_BENGALI, |
---|
115 | UCP_BOPOMOFO, |
---|
116 | UCP_BRAILLE, |
---|
117 | UCP_BUGINESE, |
---|
118 | UCP_BUHID, |
---|
119 | UCP_CANADIAN_ABORIGINAL, |
---|
120 | UCP_CHEROKEE, |
---|
121 | UCP_COMMON, |
---|
122 | UCP_COPTIC, |
---|
123 | UCP_CYPRIOT, |
---|
124 | UCP_CYRILLIC, |
---|
125 | UCP_DESERET, |
---|
126 | UCP_DEVANAGARI, |
---|
127 | UCP_ETHIOPIC, |
---|
128 | UCP_GEORGIAN, |
---|
129 | UCP_GLAGOLITIC, |
---|
130 | UCP_GOTHIC, |
---|
131 | UCP_GREEK, |
---|
132 | UCP_GUJARATI, |
---|
133 | UCP_GURMUKHI, |
---|
134 | UCP_HAN, |
---|
135 | UCP_HANGUL, |
---|
136 | UCP_HANUNOO, |
---|
137 | UCP_HEBREW, |
---|
138 | UCP_HIRAGANA, |
---|
139 | UCP_INHERITED, |
---|
140 | UCP_KANNADA, |
---|
141 | UCP_KATAKANA, |
---|
142 | UCP_KHAROSHTHI, |
---|
143 | UCP_KHMER, |
---|
144 | UCP_LAO, |
---|
145 | UCP_LATIN, |
---|
146 | UCP_LIMBU, |
---|
147 | UCP_LINEAR_B, |
---|
148 | UCP_MALAYALAM, |
---|
149 | UCP_MONGOLIAN, |
---|
150 | UCP_MYANMAR, |
---|
151 | UCP_NEW_TAI_LUE, |
---|
152 | UCP_OGHAM, |
---|
153 | UCP_OLD_ITALIC, |
---|
154 | UCP_OLD_PERSIAN, |
---|
155 | UCP_ORIYA, |
---|
156 | UCP_OSMANYA, |
---|
157 | UCP_RUNIC, |
---|
158 | UCP_SHAVIAN, |
---|
159 | UCP_SINHALA, |
---|
160 | UCP_SYLOTI_NAGRI, |
---|
161 | UCP_SYRIAC, |
---|
162 | UCP_TAGALOG, |
---|
163 | UCP_TAGBANWA, |
---|
164 | UCP_TAI_LE, |
---|
165 | UCP_TAMIL, |
---|
166 | UCP_TELUGU, |
---|
167 | UCP_THAANA, |
---|
168 | UCP_THAI, |
---|
169 | UCP_TIBETAN, |
---|
170 | UCP_TIFINAGH, |
---|
171 | UCP_UGARITIC, |
---|
172 | UCP_YI, |
---|
173 | UCP_BALINESE, |
---|
174 | UCP_CUNEIFORM, |
---|
175 | UCP_NKO, |
---|
176 | UCP_PHAGS_PA, |
---|
177 | UCP_PHOENICIAN, |
---|
178 | UCP_CARIAN, |
---|
179 | UCP_CHAM, |
---|
180 | UCP_KAYAH_LI, |
---|
181 | UCP_LEPCHA, |
---|
182 | UCP_LYCIAN, |
---|
183 | UCP_LYDIAN, |
---|
184 | UCP_OL_CHIKI, |
---|
185 | UCP_REJANG, |
---|
186 | UCP_SAURASHTRA, |
---|
187 | UCP_SUNDANESE, |
---|
188 | UCP_VAI |
---|
189 | }; |
---|
190 | |
---|
191 | struct CharacterProperties |
---|
192 | /// This structure holds the character properties |
---|
193 | /// of an Unicode character. |
---|
194 | { |
---|
195 | CharacterCategory category; |
---|
196 | CharacterType type; |
---|
197 | Script script; |
---|
198 | }; |
---|
199 | |
---|
200 | static void properties(int ch, CharacterProperties& props); |
---|
201 | /// Return the Unicode character properties for the |
---|
202 | /// character with the given Unicode value. |
---|
203 | |
---|
204 | static bool isLower(int ch); |
---|
205 | /// Returns true iff the given character is a lowercase |
---|
206 | /// character. |
---|
207 | |
---|
208 | static bool isUpper(int ch); |
---|
209 | /// Returns true iff the given character is an uppercase |
---|
210 | /// character. |
---|
211 | |
---|
212 | static int toLower(int ch); |
---|
213 | /// If the given character is an uppercase character, |
---|
214 | /// return its lowercase counterpart, otherwise return |
---|
215 | /// the character. |
---|
216 | |
---|
217 | static int toUpper(int ch); |
---|
218 | /// If the given character is a lowercase character, |
---|
219 | /// return its uppercase counterpart, otherwise return |
---|
220 | /// the character. |
---|
221 | }; |
---|
222 | |
---|
223 | |
---|
224 | } // namespace Poco |
---|
225 | |
---|
226 | |
---|
227 | #endif // Foundation_Unicode_INCLUDED |
---|