1 | /* |
---|
2 | * Copyright (c) 2006-2007 Jan Behrens, FlexiGuided GmbH, Berlin |
---|
3 | * |
---|
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
---|
5 | * copy of this software and associated documentation files (the "Software"), |
---|
6 | * to deal in the Software without restriction, including without limitation |
---|
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
---|
8 | * and/or sell copies of the Software, and to permit persons to whom the |
---|
9 | * Software is furnished to do so, subject to the following conditions: |
---|
10 | * |
---|
11 | * The above copyright notice and this permission notice shall be included in |
---|
12 | * all copies or substantial portions of the Software. |
---|
13 | * |
---|
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
---|
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
---|
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
---|
17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
---|
18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
---|
19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
---|
20 | * DEALINGS IN THE SOFTWARE. |
---|
21 | */ |
---|
22 | |
---|
23 | /* |
---|
24 | * This library contains derived data from a modified version of the |
---|
25 | * Unicode data files. |
---|
26 | * |
---|
27 | * The original data files are available at |
---|
28 | * http://www.unicode.org/Public/UNIDATA/ |
---|
29 | * |
---|
30 | * Please notice the copyright statement in the file "utf8proc_data.c". |
---|
31 | */ |
---|
32 | |
---|
33 | |
---|
34 | /* |
---|
35 | * File name: utf8proc.c |
---|
36 | * Version: 1.1.1 |
---|
37 | * Last changed: 2007-07-22 |
---|
38 | * |
---|
39 | * Description: |
---|
40 | * Implementation of libutf8proc. |
---|
41 | */ |
---|
42 | |
---|
43 | |
---|
44 | #include "utf8proc.h" |
---|
45 | #include "utf8proc_data.h" |
---|
46 | |
---|
/*
 * Sequence length implied by the first byte of a UTF-8 sequence, indexed by
 * the byte value. An entry of 0 marks a byte that may not start a sequence
 * (continuation bytes 0x80..0xBF and the bytes 0xF8..0xFF).
 */
const int8_t utf8proc_utf8class[256] = {
  /* 0x00..0x7F: one-byte sequences */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80..0xBF: not valid as a first byte */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0..0xDF: two-byte sequences */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0..0xEF: three-byte sequences */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0..0xF7: four-byte sequences; 0xF8..0xFF: not valid */
  4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
};
---|
64 | |
---|
/* Base code points and counts used for arithmetic Hangul (de)composition. */
#define UTF8PROC_HANGUL_SBASE  0xAC00
#define UTF8PROC_HANGUL_LBASE  0x1100
#define UTF8PROC_HANGUL_VBASE  0x1161
#define UTF8PROC_HANGUL_TBASE  0x11A7
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
/* NCOUNT = VCOUNT * TCOUNT = 588 */
#define UTF8PROC_HANGUL_NCOUNT \
  (UTF8PROC_HANGUL_VCOUNT * UTF8PROC_HANGUL_TCOUNT)
/* SCOUNT = LCOUNT * NCOUNT = 11172 */
#define UTF8PROC_HANGUL_SCOUNT \
  (UTF8PROC_HANGUL_LCOUNT * UTF8PROC_HANGUL_NCOUNT)

/* Code point ranges used for boundary classification; each END is exclusive. */
#define UTF8PROC_HANGUL_L_START  UTF8PROC_HANGUL_LBASE
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  UTF8PROC_HANGUL_SBASE
/* S_END = SBASE + SCOUNT = 0xD7A4 */
#define UTF8PROC_HANGUL_S_END \
  (UTF8PROC_HANGUL_SBASE + UTF8PROC_HANGUL_SCOUNT)
---|
84 | |
---|
85 | |
---|
/*
 * Boundary classes assigned to code points when UTF8PROC_CHARBOUND is
 * processed. START is the initial state before any code point was seen.
 */
enum {
  UTF8PROC_BOUNDCLASS_START   = 0,
  UTF8PROC_BOUNDCLASS_OTHER   = 1,
  UTF8PROC_BOUNDCLASS_CR      = 2,
  UTF8PROC_BOUNDCLASS_LF      = 3,
  UTF8PROC_BOUNDCLASS_CONTROL = 4,
  UTF8PROC_BOUNDCLASS_EXTEND  = 5,
  UTF8PROC_BOUNDCLASS_L       = 6,
  UTF8PROC_BOUNDCLASS_V       = 7,
  UTF8PROC_BOUNDCLASS_T       = 8,
  UTF8PROC_BOUNDCLASS_LV      = 9,
  UTF8PROC_BOUNDCLASS_LVT     = 10
};
---|
97 | |
---|
98 | |
---|
99 | const char *utf8proc_errmsg(ssize_t errcode) { |
---|
100 | switch (errcode) { |
---|
101 | case UTF8PROC_ERROR_NOMEM: |
---|
102 | return "Memory for processing UTF-8 data could not be allocated."; |
---|
103 | case UTF8PROC_ERROR_OVERFLOW: |
---|
104 | return "UTF-8 string is too long to be processed."; |
---|
105 | case UTF8PROC_ERROR_INVALIDUTF8: |
---|
106 | return "Invalid UTF-8 string"; |
---|
107 | case UTF8PROC_ERROR_NOTASSIGNED: |
---|
108 | return "Unassigned Unicode code point found in UTF-8 string."; |
---|
109 | case UTF8PROC_ERROR_INVALIDOPTS: |
---|
110 | return "Invalid options for UTF-8 processing chosen."; |
---|
111 | default: |
---|
112 | return "An unknown error occured while processing UTF-8 data."; |
---|
113 | } |
---|
114 | } |
---|
115 | |
---|
116 | ssize_t utf8proc_iterate( |
---|
117 | const uint8_t *str, ssize_t slen, int32_t *dst |
---|
118 | ) { |
---|
119 | int length; |
---|
120 | int i; |
---|
121 | int32_t uc = -1; |
---|
122 | *dst = -1; |
---|
123 | if (!slen) return 0; |
---|
124 | length = utf8proc_utf8class[str[0]]; |
---|
125 | if (!length) return UTF8PROC_ERROR_INVALIDUTF8; |
---|
126 | if (slen >= 0 && length > slen) return UTF8PROC_ERROR_INVALIDUTF8; |
---|
127 | for (i=1; i<length; i++) { |
---|
128 | if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8; |
---|
129 | } |
---|
130 | switch (length) { |
---|
131 | case 1: |
---|
132 | uc = str[0]; |
---|
133 | break; |
---|
134 | case 2: |
---|
135 | uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F); |
---|
136 | if (uc < 0x80) uc = -1; |
---|
137 | break; |
---|
138 | case 3: |
---|
139 | uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6) |
---|
140 | + (str[2] & 0x3F); |
---|
141 | if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) || |
---|
142 | (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1; |
---|
143 | break; |
---|
144 | case 4: |
---|
145 | uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12) |
---|
146 | + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F); |
---|
147 | if (uc < 0x10000 || uc >= 0x110000) uc = -1; |
---|
148 | break; |
---|
149 | } |
---|
150 | if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE)) |
---|
151 | return UTF8PROC_ERROR_INVALIDUTF8; |
---|
152 | *dst = uc; |
---|
153 | return length; |
---|
154 | } |
---|
155 | |
---|
/*
 * Tells whether 'uc' is accepted as a code point by this library.
 * Rejected are: negative values, values of 0x110000 and above, the
 * surrogate range 0xD800..0xDFFF, the range 0xFDD0..0xFDEF, and every value
 * whose low 16 bits are 0xFFFE or 0xFFFF.
 */
bool utf8proc_codepoint_valid(int32_t uc) {
  if (uc < 0) return false;
  if (uc >= 0x110000) return false;
  if ((uc & 0xFFFF) >= 0xFFFE) return false;
  if (uc >= 0xD800 && uc < 0xE000) return false;
  if (uc >= 0xFDD0 && uc < 0xFDF0) return false;
  return true;
}
---|
162 | |
---|
/*
 * Encodes the code point 'uc' as UTF-8 into 'dst' and returns the number of
 * bytes written (1 to 4). 'dst' must have room for up to 4 bytes.
 *
 * Special cases: 0xFFFF and 0xFFFE are written as the single bytes 0xFF and
 * 0xFE respectively; negative values and values of 0x110000 and above write
 * nothing and return 0.
 */
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
  if (uc < 0x00) return 0;

  if (uc < 0x80) {
    dst[0] = (uint8_t)uc;
    return 1;
  }
  if (uc < 0x800) {
    dst[0] = (uint8_t)(0xC0 | (uc >> 6));
    dst[1] = (uint8_t)(0x80 | (uc & 0x3F));
    return 2;
  }
  /* the two markers must be tested before the generic three-byte case */
  if (uc == 0xFFFF) {
    dst[0] = 0xFF;
    return 1;
  }
  if (uc == 0xFFFE) {
    dst[0] = 0xFE;
    return 1;
  }
  if (uc < 0x10000) {
    dst[0] = (uint8_t)(0xE0 | (uc >> 12));
    dst[1] = (uint8_t)(0x80 | ((uc >> 6) & 0x3F));
    dst[2] = (uint8_t)(0x80 | (uc & 0x3F));
    return 3;
  }
  if (uc < 0x110000) {
    dst[0] = (uint8_t)(0xF0 | (uc >> 18));
    dst[1] = (uint8_t)(0x80 | ((uc >> 12) & 0x3F));
    dst[2] = (uint8_t)(0x80 | ((uc >> 6) & 0x3F));
    dst[3] = (uint8_t)(0x80 | (uc & 0x3F));
    return 4;
  }
  return 0;
}
---|
192 | |
---|
193 | const utf8proc_property_t *utf8proc_get_property(int32_t uc) { |
---|
194 | /* // ASSERT: uc >= 0 && uc < 0x110000*/ |
---|
195 | return utf8proc_properties + ( |
---|
196 | utf8proc_stage2table[ |
---|
197 | utf8proc_stage1table[uc >> 8] + (uc & 0xFF) |
---|
198 | ] |
---|
199 | ); |
---|
200 | } |
---|
201 | |
---|
202 | #define utf8proc_decompose_lump(replacement_uc) \ |
---|
203 | return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ |
---|
204 | options & ~UTF8PROC_LUMP, last_boundclass) |
---|
205 | |
---|
206 | ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize, |
---|
207 | int options, int *last_boundclass) { |
---|
208 | /*// ASSERT: uc >= 0 && uc < 0x110000*/ |
---|
209 | const utf8proc_property_t *property; |
---|
210 | utf8proc_propval_t category; |
---|
211 | int32_t hangul_sindex; |
---|
212 | property = utf8proc_get_property(uc); |
---|
213 | category = property->category; |
---|
214 | hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; |
---|
215 | if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { |
---|
216 | if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { |
---|
217 | int32_t hangul_tindex; |
---|
218 | if (bufsize >= 1) { |
---|
219 | dst[0] = UTF8PROC_HANGUL_LBASE + |
---|
220 | hangul_sindex / UTF8PROC_HANGUL_NCOUNT; |
---|
221 | if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + |
---|
222 | (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; |
---|
223 | } |
---|
224 | hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; |
---|
225 | if (!hangul_tindex) return 2; |
---|
226 | if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex; |
---|
227 | return 3; |
---|
228 | } |
---|
229 | } |
---|
230 | if (options & UTF8PROC_REJECTNA) { |
---|
231 | if (!category) return UTF8PROC_ERROR_NOTASSIGNED; |
---|
232 | } |
---|
233 | if (options & UTF8PROC_IGNORE) { |
---|
234 | if (property->ignorable) return 0; |
---|
235 | } |
---|
236 | if (options & UTF8PROC_LUMP) { |
---|
237 | if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020); |
---|
238 | if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8) |
---|
239 | utf8proc_decompose_lump(0x0027); |
---|
240 | if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212) |
---|
241 | utf8proc_decompose_lump(0x002D); |
---|
242 | if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F); |
---|
243 | if (uc == 0x2236) utf8proc_decompose_lump(0x003A); |
---|
244 | if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008) |
---|
245 | utf8proc_decompose_lump(0x003C); |
---|
246 | if (uc == 0x203A || uc == 0x232A || uc == 0x3009) |
---|
247 | utf8proc_decompose_lump(0x003E); |
---|
248 | if (uc == 0x2216) utf8proc_decompose_lump(0x005C); |
---|
249 | if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303) |
---|
250 | utf8proc_decompose_lump(0x005E); |
---|
251 | if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD) |
---|
252 | utf8proc_decompose_lump(0x005F); |
---|
253 | if (uc == 0x02CB) utf8proc_decompose_lump(0x0060); |
---|
254 | if (uc == 0x2223) utf8proc_decompose_lump(0x007C); |
---|
255 | if (uc == 0x223C) utf8proc_decompose_lump(0x007E); |
---|
256 | if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) { |
---|
257 | if (category == UTF8PROC_CATEGORY_ZL || |
---|
258 | category == UTF8PROC_CATEGORY_ZP) |
---|
259 | utf8proc_decompose_lump(0x000A); |
---|
260 | } |
---|
261 | } |
---|
262 | if (options & UTF8PROC_STRIPMARK) { |
---|
263 | if (category == UTF8PROC_CATEGORY_MN || |
---|
264 | category == UTF8PROC_CATEGORY_MC || |
---|
265 | category == UTF8PROC_CATEGORY_ME) return 0; |
---|
266 | } |
---|
267 | if (options & UTF8PROC_CASEFOLD) { |
---|
268 | if (property->casefold_mapping) { |
---|
269 | const int32_t *casefold_entry; |
---|
270 | ssize_t written = 0; |
---|
271 | for (casefold_entry = property->casefold_mapping; |
---|
272 | *casefold_entry >= 0; casefold_entry++) { |
---|
273 | written += utf8proc_decompose_char(*casefold_entry, dst+written, |
---|
274 | (bufsize > written) ? (bufsize - written) : 0, options, |
---|
275 | last_boundclass); |
---|
276 | if (written < 0) return UTF8PROC_ERROR_OVERFLOW; |
---|
277 | } |
---|
278 | return written; |
---|
279 | } |
---|
280 | } |
---|
281 | if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { |
---|
282 | if (property->decomp_mapping && |
---|
283 | (!property->decomp_type || (options & UTF8PROC_COMPAT))) { |
---|
284 | const int32_t *decomp_entry; |
---|
285 | ssize_t written = 0; |
---|
286 | for (decomp_entry = property->decomp_mapping; |
---|
287 | *decomp_entry >= 0; decomp_entry++) { |
---|
288 | written += utf8proc_decompose_char(*decomp_entry, dst+written, |
---|
289 | (bufsize > written) ? (bufsize - written) : 0, options, |
---|
290 | last_boundclass); |
---|
291 | if (written < 0) return UTF8PROC_ERROR_OVERFLOW; |
---|
292 | } |
---|
293 | return written; |
---|
294 | } |
---|
295 | } |
---|
296 | if (options & UTF8PROC_CHARBOUND) { |
---|
297 | bool boundary; |
---|
298 | int tbc, lbc; |
---|
299 | tbc = |
---|
300 | (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR : |
---|
301 | (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF : |
---|
302 | ((category == UTF8PROC_CATEGORY_ZL || |
---|
303 | category == UTF8PROC_CATEGORY_ZP || |
---|
304 | category == UTF8PROC_CATEGORY_CC || |
---|
305 | category == UTF8PROC_CATEGORY_CF) && |
---|
306 | !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL : |
---|
307 | property->extend ? UTF8PROC_BOUNDCLASS_EXTEND : |
---|
308 | ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) || |
---|
309 | uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L : |
---|
310 | (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ? |
---|
311 | UTF8PROC_BOUNDCLASS_V : |
---|
312 | (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ? |
---|
313 | UTF8PROC_BOUNDCLASS_T : |
---|
314 | (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? ( |
---|
315 | ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ? |
---|
316 | UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT |
---|
317 | ) : |
---|
318 | UTF8PROC_BOUNDCLASS_OTHER; |
---|
319 | lbc = *last_boundclass; |
---|
320 | boundary = |
---|
321 | (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false : |
---|
322 | (lbc == UTF8PROC_BOUNDCLASS_START) ? true : |
---|
323 | (lbc == UTF8PROC_BOUNDCLASS_CR && |
---|
324 | tbc == UTF8PROC_BOUNDCLASS_LF) ? false : |
---|
325 | (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true : |
---|
326 | (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true : |
---|
327 | (lbc == UTF8PROC_BOUNDCLASS_L && |
---|
328 | (tbc == UTF8PROC_BOUNDCLASS_L || |
---|
329 | tbc == UTF8PROC_BOUNDCLASS_V || |
---|
330 | tbc == UTF8PROC_BOUNDCLASS_LV || |
---|
331 | tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : |
---|
332 | ((lbc == UTF8PROC_BOUNDCLASS_LV || |
---|
333 | lbc == UTF8PROC_BOUNDCLASS_V) && |
---|
334 | (tbc == UTF8PROC_BOUNDCLASS_V || |
---|
335 | tbc == UTF8PROC_BOUNDCLASS_T)) ? false : |
---|
336 | ((lbc == UTF8PROC_BOUNDCLASS_LVT || |
---|
337 | lbc == UTF8PROC_BOUNDCLASS_T) && |
---|
338 | tbc == UTF8PROC_BOUNDCLASS_T) ? false : |
---|
339 | true; |
---|
340 | *last_boundclass = tbc; |
---|
341 | if (boundary) { |
---|
342 | if (bufsize >= 1) dst[0] = 0xFFFF; |
---|
343 | if (bufsize >= 2) dst[1] = uc; |
---|
344 | return 2; |
---|
345 | } |
---|
346 | } |
---|
347 | if (bufsize >= 1) *dst = uc; |
---|
348 | return 1; |
---|
349 | } |
---|
350 | |
---|
351 | ssize_t utf8proc_decompose( |
---|
352 | const uint8_t *str, ssize_t slen, |
---|
353 | int32_t *buffer, ssize_t bufsize, int options |
---|
354 | ) { |
---|
355 | /*// slen will be ignored, if UTF8PROC_NULLTERM is set in options*/ |
---|
356 | ssize_t wpos = 0; |
---|
357 | if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) |
---|
358 | return UTF8PROC_ERROR_INVALIDOPTS; |
---|
359 | if ((options & UTF8PROC_STRIPMARK) && |
---|
360 | !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE)) |
---|
361 | return UTF8PROC_ERROR_INVALIDOPTS; |
---|
362 | { |
---|
363 | int32_t uc; |
---|
364 | ssize_t rpos = 0; |
---|
365 | ssize_t decomp_result; |
---|
366 | int boundclass = UTF8PROC_BOUNDCLASS_START; |
---|
367 | while (1) { |
---|
368 | if (options & UTF8PROC_NULLTERM) { |
---|
369 | rpos += utf8proc_iterate(str + rpos, -1, &uc); |
---|
370 | /* checking of return value is not neccessary, |
---|
371 | as 'uc' is < 0 in case of error. */ |
---|
372 | if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; |
---|
373 | if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW; |
---|
374 | if (uc == 0) break; |
---|
375 | } else { |
---|
376 | if (rpos >= slen) break; |
---|
377 | rpos += utf8proc_iterate(str + rpos, slen - rpos, &uc); |
---|
378 | if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; |
---|
379 | } |
---|
380 | decomp_result = utf8proc_decompose_char( |
---|
381 | uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options, |
---|
382 | &boundclass |
---|
383 | ); |
---|
384 | if (decomp_result < 0) return decomp_result; |
---|
385 | wpos += decomp_result; |
---|
386 | /* // prohibiting integer overflows due to too long strings:*/ |
---|
387 | if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2) |
---|
388 | return UTF8PROC_ERROR_OVERFLOW; |
---|
389 | } |
---|
390 | } |
---|
391 | if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) { |
---|
392 | ssize_t pos = 0; |
---|
393 | while (pos < wpos-1) { |
---|
394 | int32_t uc1, uc2; |
---|
395 | const utf8proc_property_t *property1, *property2; |
---|
396 | uc1 = buffer[pos]; |
---|
397 | uc2 = buffer[pos+1]; |
---|
398 | property1 = utf8proc_get_property(uc1); |
---|
399 | property2 = utf8proc_get_property(uc2); |
---|
400 | if (property1->combining_class > property2->combining_class && |
---|
401 | property2->combining_class > 0) { |
---|
402 | buffer[pos] = uc2; |
---|
403 | buffer[pos+1] = uc1; |
---|
404 | if (pos > 0) pos--; else pos++; |
---|
405 | } else { |
---|
406 | pos++; |
---|
407 | } |
---|
408 | } |
---|
409 | } |
---|
410 | return wpos; |
---|
411 | } |
---|
412 | |
---|
413 | ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) { |
---|
414 | /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored |
---|
415 | ASSERT: 'buffer' has one spare byte of free space at the end! */ |
---|
416 | if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { |
---|
417 | ssize_t rpos; |
---|
418 | ssize_t wpos = 0; |
---|
419 | int32_t uc; |
---|
420 | for (rpos = 0; rpos < length; rpos++) { |
---|
421 | uc = buffer[rpos]; |
---|
422 | if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++; |
---|
423 | if (uc == 0x000A || uc == 0x000D || uc == 0x0085 || |
---|
424 | ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) { |
---|
425 | if (options & UTF8PROC_NLF2LS) { |
---|
426 | if (options & UTF8PROC_NLF2PS) { |
---|
427 | buffer[wpos++] = 0x000A; |
---|
428 | } else { |
---|
429 | buffer[wpos++] = 0x2028; |
---|
430 | } |
---|
431 | } else { |
---|
432 | if (options & UTF8PROC_NLF2PS) { |
---|
433 | buffer[wpos++] = 0x2029; |
---|
434 | } else { |
---|
435 | buffer[wpos++] = 0x0020; |
---|
436 | } |
---|
437 | } |
---|
438 | } else if ((options & UTF8PROC_STRIPCC) && |
---|
439 | (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) { |
---|
440 | if (uc == 0x0009) buffer[wpos++] = 0x0020; |
---|
441 | } else { |
---|
442 | buffer[wpos++] = uc; |
---|
443 | } |
---|
444 | } |
---|
445 | length = wpos; |
---|
446 | } |
---|
447 | if (options & UTF8PROC_COMPOSE) { |
---|
448 | int32_t *starter = NULL; |
---|
449 | int32_t current_char; |
---|
450 | const utf8proc_property_t *starter_property = NULL, *current_property; |
---|
451 | utf8proc_propval_t max_combining_class = -1; |
---|
452 | ssize_t rpos; |
---|
453 | ssize_t wpos = 0; |
---|
454 | int32_t composition; |
---|
455 | for (rpos = 0; rpos < length; rpos++) { |
---|
456 | current_char = buffer[rpos]; |
---|
457 | current_property = utf8proc_get_property(current_char); |
---|
458 | if (starter && current_property->combining_class > max_combining_class) { |
---|
459 | /* // combination perhaps possible*/ |
---|
460 | int32_t hangul_lindex; |
---|
461 | int32_t hangul_sindex; |
---|
462 | hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE; |
---|
463 | if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) { |
---|
464 | int32_t hangul_vindex; |
---|
465 | hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE; |
---|
466 | if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) { |
---|
467 | *starter = UTF8PROC_HANGUL_SBASE + |
---|
468 | (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) * |
---|
469 | UTF8PROC_HANGUL_TCOUNT; |
---|
470 | starter_property = NULL; |
---|
471 | continue; |
---|
472 | } |
---|
473 | } |
---|
474 | hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE; |
---|
475 | if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT && |
---|
476 | (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { |
---|
477 | int32_t hangul_tindex; |
---|
478 | hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; |
---|
479 | if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { |
---|
480 | *starter += hangul_tindex; |
---|
481 | starter_property = NULL; |
---|
482 | continue; |
---|
483 | } |
---|
484 | } |
---|
485 | if (!starter_property) { |
---|
486 | starter_property = utf8proc_get_property(*starter); |
---|
487 | } |
---|
488 | if (starter_property->comb1st_index >= 0 && |
---|
489 | current_property->comb2nd_index >= 0) { |
---|
490 | composition = utf8proc_combinations[ |
---|
491 | starter_property->comb1st_index + |
---|
492 | current_property->comb2nd_index |
---|
493 | ]; |
---|
494 | if (composition >= 0 && (!(options & UTF8PROC_STABLE) || |
---|
495 | !(utf8proc_get_property(composition)->comp_exclusion))) { |
---|
496 | *starter = composition; |
---|
497 | starter_property = NULL; |
---|
498 | continue; |
---|
499 | } |
---|
500 | } |
---|
501 | } |
---|
502 | buffer[wpos] = current_char; |
---|
503 | if (current_property->combining_class) { |
---|
504 | if (current_property->combining_class > max_combining_class) { |
---|
505 | max_combining_class = current_property->combining_class; |
---|
506 | } |
---|
507 | } else { |
---|
508 | starter = buffer + wpos; |
---|
509 | starter_property = NULL; |
---|
510 | max_combining_class = -1; |
---|
511 | } |
---|
512 | wpos++; |
---|
513 | } |
---|
514 | length = wpos; |
---|
515 | } |
---|
516 | { |
---|
517 | ssize_t rpos, wpos = 0; |
---|
518 | int32_t uc; |
---|
519 | for (rpos = 0; rpos < length; rpos++) { |
---|
520 | uc = buffer[rpos]; |
---|
521 | wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos); |
---|
522 | } |
---|
523 | ((uint8_t *)buffer)[wpos] = 0; |
---|
524 | return wpos; |
---|
525 | } |
---|
526 | } |
---|
527 | |
---|
528 | ssize_t utf8proc_map( |
---|
529 | const uint8_t *str, ssize_t slen, uint8_t **dstptr, int options |
---|
530 | ) { |
---|
531 | int32_t *buffer; |
---|
532 | ssize_t result; |
---|
533 | *dstptr = NULL; |
---|
534 | result = utf8proc_decompose(str, slen, NULL, 0, options); |
---|
535 | if (result < 0) return result; |
---|
536 | buffer = malloc(result * sizeof(int32_t) + 1); |
---|
537 | if (!buffer) return UTF8PROC_ERROR_NOMEM; |
---|
538 | result = utf8proc_decompose(str, slen, buffer, result, options); |
---|
539 | if (result < 0) { |
---|
540 | free(buffer); |
---|
541 | return result; |
---|
542 | } |
---|
543 | result = utf8proc_reencode(buffer, result, options); |
---|
544 | if (result < 0) { |
---|
545 | free(buffer); |
---|
546 | return result; |
---|
547 | } |
---|
548 | { |
---|
549 | int32_t *newptr; |
---|
550 | newptr = realloc(buffer, result+1); |
---|
551 | if (newptr) buffer = newptr; |
---|
552 | } |
---|
553 | *dstptr = (uint8_t *)buffer; |
---|
554 | return result; |
---|
555 | } |
---|
556 | |
---|
557 | uint8_t *utf8proc_NFD(const uint8_t *str) { |
---|
558 | uint8_t *retval; |
---|
559 | utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
---|
560 | UTF8PROC_DECOMPOSE); |
---|
561 | return retval; |
---|
562 | } |
---|
563 | |
---|
564 | uint8_t *utf8proc_NFC(const uint8_t *str) { |
---|
565 | uint8_t *retval; |
---|
566 | utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
---|
567 | UTF8PROC_COMPOSE); |
---|
568 | return retval; |
---|
569 | } |
---|
570 | |
---|
571 | uint8_t *utf8proc_NFKD(const uint8_t *str) { |
---|
572 | uint8_t *retval; |
---|
573 | utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
---|
574 | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT); |
---|
575 | return retval; |
---|
576 | } |
---|
577 | |
---|
578 | uint8_t *utf8proc_NFKC(const uint8_t *str) { |
---|
579 | uint8_t *retval; |
---|
580 | utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
---|
581 | UTF8PROC_COMPOSE | UTF8PROC_COMPAT); |
---|
582 | return retval; |
---|
583 | } |
---|
584 | |
---|
585 | ssize_t utf8proc_check(const uint8_t *str) { |
---|
586 | ssize_t result; |
---|
587 | result = utf8proc_decompose(str, 0, NULL, 0, |
---|
588 | UTF8PROC_NULLTERM | UTF8PROC_STABLE); |
---|
589 | return result; |
---|
590 | } |
---|