[409] | 1 | /* |
---|
| 2 | * Copyright (c) 2006-2007 Jan Behrens, FlexiGuided GmbH, Berlin |
---|
| 3 | * |
---|
| 4 | * Permission is hereby granted, free of charge, to any person obtaining a |
---|
| 5 | * copy of this software and associated documentation files (the "Software"), |
---|
| 6 | * to deal in the Software without restriction, including without limitation |
---|
| 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
---|
| 8 | * and/or sell copies of the Software, and to permit persons to whom the |
---|
| 9 | * Software is furnished to do so, subject to the following conditions: |
---|
| 10 | * |
---|
| 11 | * The above copyright notice and this permission notice shall be included in |
---|
| 12 | * all copies or substantial portions of the Software. |
---|
| 13 | * |
---|
| 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
---|
| 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
---|
| 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
---|
| 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
---|
| 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
---|
| 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
---|
| 20 | * DEALINGS IN THE SOFTWARE. |
---|
| 21 | */ |
---|
| 22 | |
---|
| 23 | /* |
---|
| 24 | * This library contains derived data from a modified version of the |
---|
| 25 | * Unicode data files. |
---|
| 26 | * |
---|
| 27 | * The original data files are available at |
---|
| 28 | * http://www.unicode.org/Public/UNIDATA/ |
---|
| 29 | * |
---|
| 30 | * Please notice the copyright statement in the file "utf8proc_data.c". |
---|
| 31 | */ |
---|
| 32 | |
---|
| 33 | |
---|
| 34 | /* |
---|
| 35 | * File name: utf8proc.c |
---|
| 36 | * Version: 1.1.1 |
---|
| 37 | * Last changed: 2007-07-22 |
---|
| 38 | * |
---|
| 39 | * Description: |
---|
| 40 | * Implementation of libutf8proc. |
---|
| 41 | */ |
---|
| 42 | |
---|
| 43 | |
---|
| 44 | #include "utf8proc.h" |
---|
| 45 | #include "utf8proc_data.h" |
---|
| 46 | |
---|
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence.
 * Each entry is the total length (in bytes) of the sequence introduced by
 * that byte; 0 marks a byte that cannot start a sequence (continuation
 * bytes 0x80-0xBF and the lead bytes 0xF8-0xFF).
 */
const int8_t utf8proc_utf8class[256] = {
  /* 0x00 - 0x7F: single byte sequences */
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 - 0xBF: not a valid first byte */
  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0 - 0xDF: two byte sequences */
  2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 - 0xEF: three byte sequences */
  3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0 - 0xF7: four byte sequences; 0xF8 - 0xFF: not a valid first byte */
  4, 4, 4, 4, 4, 4, 4, 4,  0, 0, 0, 0, 0, 0, 0, 0
};
---|
| 64 | |
---|
/*
 * Constants for the arithmetic Hangul syllable (de)composition:
 * base code points of the syllable block and of the L/V/T jamo used by the
 * algorithm, followed by the number of items in each group.
 */
#define UTF8PROC_HANGUL_SBASE  0xAC00
#define UTF8PROC_HANGUL_LBASE  0x1100
#define UTF8PROC_HANGUL_VBASE  0x1161
#define UTF8PROC_HANGUL_TBASE  0x11A7
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
/* 21 * 28 = 588 syllables per leading consonant */
#define UTF8PROC_HANGUL_NCOUNT \
  (UTF8PROC_HANGUL_VCOUNT * UTF8PROC_HANGUL_TCOUNT)
/* 19 * 588 = 11172 precomposed syllables in total */
#define UTF8PROC_HANGUL_SCOUNT \
  (UTF8PROC_HANGUL_LCOUNT * UTF8PROC_HANGUL_NCOUNT)

/*
 * Code point ranges used for boundary classification.
 * Every *_END value is exclusive.
 */
#define UTF8PROC_HANGUL_L_START  0x1100
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  0xAC00
#define UTF8PROC_HANGUL_S_END    0xD7A4
---|
| 84 | |
---|
| 85 | |
---|
/*
 * Boundary classes assigned to code points when UTF8PROC_CHARBOUND is
 * requested. The class of the previously processed code point is carried
 * between calls of utf8proc_decompose_char() via 'last_boundclass'.
 */
#define UTF8PROC_BOUNDCLASS_START    0  /* nothing processed yet */
#define UTF8PROC_BOUNDCLASS_OTHER    1  /* none of the classes below */
#define UTF8PROC_BOUNDCLASS_CR       2  /* U+000D */
#define UTF8PROC_BOUNDCLASS_LF       3  /* U+000A */
#define UTF8PROC_BOUNDCLASS_CONTROL  4  /* Zl, Zp, Cc, Cf (not U+200C/D) */
#define UTF8PROC_BOUNDCLASS_EXTEND   5  /* property 'extend' is set */
#define UTF8PROC_BOUNDCLASS_L        6  /* Hangul leading jamo */
#define UTF8PROC_BOUNDCLASS_V        7  /* Hangul vowel jamo */
#define UTF8PROC_BOUNDCLASS_T        8  /* Hangul trailing jamo */
#define UTF8PROC_BOUNDCLASS_LV       9  /* Hangul syllable without trail */
#define UTF8PROC_BOUNDCLASS_LVT     10  /* Hangul syllable with trail */
---|
| 97 | |
---|
| 98 | |
---|
| 99 | const char *utf8proc_errmsg(ssize_t errcode) { |
---|
| 100 | switch (errcode) { |
---|
| 101 | case UTF8PROC_ERROR_NOMEM: |
---|
| 102 | return "Memory for processing UTF-8 data could not be allocated."; |
---|
| 103 | case UTF8PROC_ERROR_OVERFLOW: |
---|
| 104 | return "UTF-8 string is too long to be processed."; |
---|
| 105 | case UTF8PROC_ERROR_INVALIDUTF8: |
---|
| 106 | return "Invalid UTF-8 string"; |
---|
| 107 | case UTF8PROC_ERROR_NOTASSIGNED: |
---|
| 108 | return "Unassigned Unicode code point found in UTF-8 string."; |
---|
| 109 | case UTF8PROC_ERROR_INVALIDOPTS: |
---|
| 110 | return "Invalid options for UTF-8 processing chosen."; |
---|
| 111 | default: |
---|
| 112 | return "An unknown error occured while processing UTF-8 data."; |
---|
| 113 | } |
---|
| 114 | } |
---|
| 115 | |
---|
| 116 | ssize_t utf8proc_iterate( |
---|
| 117 | const uint8_t *str, ssize_t slen, int32_t *dst |
---|
| 118 | ) { |
---|
| 119 | int length; |
---|
| 120 | int i; |
---|
| 121 | int32_t uc = -1; |
---|
| 122 | *dst = -1; |
---|
| 123 | if (!slen) return 0; |
---|
| 124 | length = utf8proc_utf8class[str[0]]; |
---|
| 125 | if (!length) return UTF8PROC_ERROR_INVALIDUTF8; |
---|
| 126 | if (slen >= 0 && length > slen) return UTF8PROC_ERROR_INVALIDUTF8; |
---|
| 127 | for (i=1; i<length; i++) { |
---|
| 128 | if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8; |
---|
| 129 | } |
---|
| 130 | switch (length) { |
---|
| 131 | case 1: |
---|
| 132 | uc = str[0]; |
---|
| 133 | break; |
---|
| 134 | case 2: |
---|
| 135 | uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F); |
---|
| 136 | if (uc < 0x80) uc = -1; |
---|
| 137 | break; |
---|
| 138 | case 3: |
---|
| 139 | uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6) |
---|
| 140 | + (str[2] & 0x3F); |
---|
| 141 | if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) || |
---|
| 142 | (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1; |
---|
| 143 | break; |
---|
| 144 | case 4: |
---|
| 145 | uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12) |
---|
| 146 | + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F); |
---|
| 147 | if (uc < 0x10000 || uc >= 0x110000) uc = -1; |
---|
| 148 | break; |
---|
| 149 | } |
---|
| 150 | if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE)) |
---|
| 151 | return UTF8PROC_ERROR_INVALIDUTF8; |
---|
| 152 | *dst = uc; |
---|
| 153 | return length; |
---|
| 154 | } |
---|
| 155 | |
---|
/*
 * Tells whether 'uc' is accepted as a code point by this library.
 *
 * Returns false for values outside 0..0x10FFFF, for the surrogate range
 * U+D800..U+DFFF, for U+FDD0..U+FDEF and for every value whose lower
 * 16 bits are 0xFFFE or 0xFFFF; returns true otherwise.
 */
bool utf8proc_codepoint_valid(int32_t uc) {
  if (uc < 0 || uc >= 0x110000) return false;
  if (uc >= 0xD800 && uc < 0xE000) return false;
  if (uc >= 0xFDD0 && uc < 0xFDF0) return false;
  if ((uc & 0xFFFF) >= 0xFFFE) return false;
  return true;
}
---|
| 162 | |
---|
/*
 * Writes the UTF-8 encoding of 'uc' to 'dst'.
 *
 * uc:  code point to encode.
 * dst: output buffer; up to 4 bytes are written.
 *
 * Returns the number of bytes written (1 to 4), or 0 if 'uc' is negative
 * or not below 0x110000 (nothing is written then).
 * The values 0xFFFF and 0xFFFE are not encoded as UTF-8: they are written
 * as the single bytes 0xFF and 0xFE respectively.
 */
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
  if (uc < 0x00) return 0;

  if (uc < 0x80) {
    dst[0] = (uint8_t)uc;
    return 1;
  }
  if (uc < 0x800) {
    dst[0] = (uint8_t)(0xC0 | (uc >> 6));
    dst[1] = (uint8_t)(0x80 | (uc & 0x3F));
    return 2;
  }
  /* the two special values must be tested before the 3 byte range */
  if (uc == 0xFFFF) {
    dst[0] = 0xFF;
    return 1;
  }
  if (uc == 0xFFFE) {
    dst[0] = 0xFE;
    return 1;
  }
  if (uc < 0x10000) {
    dst[0] = (uint8_t)(0xE0 | (uc >> 12));
    dst[1] = (uint8_t)(0x80 | ((uc >> 6) & 0x3F));
    dst[2] = (uint8_t)(0x80 | (uc & 0x3F));
    return 3;
  }
  if (uc < 0x110000) {
    dst[0] = (uint8_t)(0xF0 | (uc >> 18));
    dst[1] = (uint8_t)(0x80 | ((uc >> 12) & 0x3F));
    dst[2] = (uint8_t)(0x80 | ((uc >> 6) & 0x3F));
    dst[3] = (uint8_t)(0x80 | (uc & 0x3F));
    return 4;
  }
  return 0;
}
---|
| 192 | |
---|
| 193 | const utf8proc_property_t *utf8proc_get_property(int32_t uc) { |
---|
| 194 | /* // ASSERT: uc >= 0 && uc < 0x110000*/ |
---|
| 195 | return utf8proc_properties + ( |
---|
| 196 | utf8proc_stage2table[ |
---|
| 197 | utf8proc_stage1table[uc >> 8] + (uc & 0xFF) |
---|
| 198 | ] |
---|
| 199 | ); |
---|
| 200 | } |
---|
| 201 | |
---|
| 202 | #define utf8proc_decompose_lump(replacement_uc) \ |
---|
| 203 | return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ |
---|
| 204 | options & ~UTF8PROC_LUMP, last_boundclass) |
---|
| 205 | |
---|
| 206 | ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize, |
---|
| 207 | int options, int *last_boundclass) { |
---|
| 208 | /*// ASSERT: uc >= 0 && uc < 0x110000*/ |
---|
| 209 | const utf8proc_property_t *property; |
---|
| 210 | utf8proc_propval_t category; |
---|
| 211 | int32_t hangul_sindex; |
---|
| 212 | property = utf8proc_get_property(uc); |
---|
| 213 | category = property->category; |
---|
| 214 | hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; |
---|
| 215 | if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { |
---|
| 216 | if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { |
---|
| 217 | int32_t hangul_tindex; |
---|
| 218 | if (bufsize >= 1) { |
---|
| 219 | dst[0] = UTF8PROC_HANGUL_LBASE + |
---|
| 220 | hangul_sindex / UTF8PROC_HANGUL_NCOUNT; |
---|
| 221 | if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + |
---|
| 222 | (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; |
---|
| 223 | } |
---|
| 224 | hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; |
---|
| 225 | if (!hangul_tindex) return 2; |
---|
| 226 | if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex; |
---|
| 227 | return 3; |
---|
| 228 | } |
---|
| 229 | } |
---|
| 230 | if (options & UTF8PROC_REJECTNA) { |
---|
| 231 | if (!category) return UTF8PROC_ERROR_NOTASSIGNED; |
---|
| 232 | } |
---|
| 233 | if (options & UTF8PROC_IGNORE) { |
---|
| 234 | if (property->ignorable) return 0; |
---|
| 235 | } |
---|
| 236 | if (options & UTF8PROC_LUMP) { |
---|
| 237 | if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020); |
---|
| 238 | if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8) |
---|
| 239 | utf8proc_decompose_lump(0x0027); |
---|
| 240 | if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212) |
---|
| 241 | utf8proc_decompose_lump(0x002D); |
---|
| 242 | if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F); |
---|
| 243 | if (uc == 0x2236) utf8proc_decompose_lump(0x003A); |
---|
| 244 | if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008) |
---|
| 245 | utf8proc_decompose_lump(0x003C); |
---|
| 246 | if (uc == 0x203A || uc == 0x232A || uc == 0x3009) |
---|
| 247 | utf8proc_decompose_lump(0x003E); |
---|
| 248 | if (uc == 0x2216) utf8proc_decompose_lump(0x005C); |
---|
| 249 | if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303) |
---|
| 250 | utf8proc_decompose_lump(0x005E); |
---|
| 251 | if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD) |
---|
| 252 | utf8proc_decompose_lump(0x005F); |
---|
| 253 | if (uc == 0x02CB) utf8proc_decompose_lump(0x0060); |
---|
| 254 | if (uc == 0x2223) utf8proc_decompose_lump(0x007C); |
---|
| 255 | if (uc == 0x223C) utf8proc_decompose_lump(0x007E); |
---|
| 256 | if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) { |
---|
| 257 | if (category == UTF8PROC_CATEGORY_ZL || |
---|
| 258 | category == UTF8PROC_CATEGORY_ZP) |
---|
| 259 | utf8proc_decompose_lump(0x000A); |
---|
| 260 | } |
---|
| 261 | } |
---|
| 262 | if (options & UTF8PROC_STRIPMARK) { |
---|
| 263 | if (category == UTF8PROC_CATEGORY_MN || |
---|
| 264 | category == UTF8PROC_CATEGORY_MC || |
---|
| 265 | category == UTF8PROC_CATEGORY_ME) return 0; |
---|
| 266 | } |
---|
| 267 | if (options & UTF8PROC_CASEFOLD) { |
---|
| 268 | if (property->casefold_mapping) { |
---|
| 269 | const int32_t *casefold_entry; |
---|
| 270 | ssize_t written = 0; |
---|
| 271 | for (casefold_entry = property->casefold_mapping; |
---|
| 272 | *casefold_entry >= 0; casefold_entry++) { |
---|
| 273 | written += utf8proc_decompose_char(*casefold_entry, dst+written, |
---|
| 274 | (bufsize > written) ? (bufsize - written) : 0, options, |
---|
| 275 | last_boundclass); |
---|
| 276 | if (written < 0) return UTF8PROC_ERROR_OVERFLOW; |
---|
| 277 | } |
---|
| 278 | return written; |
---|
| 279 | } |
---|
| 280 | } |
---|
| 281 | if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { |
---|
| 282 | if (property->decomp_mapping && |
---|
| 283 | (!property->decomp_type || (options & UTF8PROC_COMPAT))) { |
---|
| 284 | const int32_t *decomp_entry; |
---|
| 285 | ssize_t written = 0; |
---|
| 286 | for (decomp_entry = property->decomp_mapping; |
---|
| 287 | *decomp_entry >= 0; decomp_entry++) { |
---|
| 288 | written += utf8proc_decompose_char(*decomp_entry, dst+written, |
---|
| 289 | (bufsize > written) ? (bufsize - written) : 0, options, |
---|
| 290 | last_boundclass); |
---|
| 291 | if (written < 0) return UTF8PROC_ERROR_OVERFLOW; |
---|
| 292 | } |
---|
| 293 | return written; |
---|
| 294 | } |
---|
| 295 | } |
---|
| 296 | if (options & UTF8PROC_CHARBOUND) { |
---|
| 297 | bool boundary; |
---|
| 298 | int tbc, lbc; |
---|
| 299 | tbc = |
---|
| 300 | (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR : |
---|
| 301 | (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF : |
---|
| 302 | ((category == UTF8PROC_CATEGORY_ZL || |
---|
| 303 | category == UTF8PROC_CATEGORY_ZP || |
---|
| 304 | category == UTF8PROC_CATEGORY_CC || |
---|
| 305 | category == UTF8PROC_CATEGORY_CF) && |
---|
| 306 | !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL : |
---|
| 307 | property->extend ? UTF8PROC_BOUNDCLASS_EXTEND : |
---|
| 308 | ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) || |
---|
| 309 | uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L : |
---|
| 310 | (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ? |
---|
| 311 | UTF8PROC_BOUNDCLASS_V : |
---|
| 312 | (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ? |
---|
| 313 | UTF8PROC_BOUNDCLASS_T : |
---|
| 314 | (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? ( |
---|
| 315 | ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ? |
---|
| 316 | UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT |
---|
| 317 | ) : |
---|
| 318 | UTF8PROC_BOUNDCLASS_OTHER; |
---|
| 319 | lbc = *last_boundclass; |
---|
| 320 | boundary = |
---|
| 321 | (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false : |
---|
| 322 | (lbc == UTF8PROC_BOUNDCLASS_START) ? true : |
---|
| 323 | (lbc == UTF8PROC_BOUNDCLASS_CR && |
---|
| 324 | tbc == UTF8PROC_BOUNDCLASS_LF) ? false : |
---|
| 325 | (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true : |
---|
| 326 | (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true : |
---|
| 327 | (lbc == UTF8PROC_BOUNDCLASS_L && |
---|
| 328 | (tbc == UTF8PROC_BOUNDCLASS_L || |
---|
| 329 | tbc == UTF8PROC_BOUNDCLASS_V || |
---|
| 330 | tbc == UTF8PROC_BOUNDCLASS_LV || |
---|
| 331 | tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : |
---|
| 332 | ((lbc == UTF8PROC_BOUNDCLASS_LV || |
---|
| 333 | lbc == UTF8PROC_BOUNDCLASS_V) && |
---|
| 334 | (tbc == UTF8PROC_BOUNDCLASS_V || |
---|
| 335 | tbc == UTF8PROC_BOUNDCLASS_T)) ? false : |
---|
| 336 | ((lbc == UTF8PROC_BOUNDCLASS_LVT || |
---|
| 337 | lbc == UTF8PROC_BOUNDCLASS_T) && |
---|
| 338 | tbc == UTF8PROC_BOUNDCLASS_T) ? false : |
---|
| 339 | true; |
---|
| 340 | *last_boundclass = tbc; |
---|
| 341 | if (boundary) { |
---|
| 342 | if (bufsize >= 1) dst[0] = 0xFFFF; |
---|
| 343 | if (bufsize >= 2) dst[1] = uc; |
---|
| 344 | return 2; |
---|
| 345 | } |
---|
| 346 | } |
---|
| 347 | if (bufsize >= 1) *dst = uc; |
---|
| 348 | return 1; |
---|
| 349 | } |
---|
| 350 | |
---|
| 351 | ssize_t utf8proc_decompose( |
---|
| 352 | const uint8_t *str, ssize_t slen, |
---|
| 353 | int32_t *buffer, ssize_t bufsize, int options |
---|
| 354 | ) { |
---|
| 355 | /*// slen will be ignored, if UTF8PROC_NULLTERM is set in options*/ |
---|
| 356 | ssize_t wpos = 0; |
---|
| 357 | if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) |
---|
| 358 | return UTF8PROC_ERROR_INVALIDOPTS; |
---|
| 359 | if ((options & UTF8PROC_STRIPMARK) && |
---|
| 360 | !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE)) |
---|
| 361 | return UTF8PROC_ERROR_INVALIDOPTS; |
---|
| 362 | { |
---|
| 363 | int32_t uc; |
---|
| 364 | ssize_t rpos = 0; |
---|
| 365 | ssize_t decomp_result; |
---|
| 366 | int boundclass = UTF8PROC_BOUNDCLASS_START; |
---|
| 367 | while (1) { |
---|
| 368 | if (options & UTF8PROC_NULLTERM) { |
---|
| 369 | rpos += utf8proc_iterate(str + rpos, -1, &uc); |
---|
| 370 | /* checking of return value is not neccessary, |
---|
| 371 | as 'uc' is < 0 in case of error. */ |
---|
| 372 | if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; |
---|
| 373 | if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW; |
---|
| 374 | if (uc == 0) break; |
---|
| 375 | } else { |
---|
| 376 | if (rpos >= slen) break; |
---|
| 377 | rpos += utf8proc_iterate(str + rpos, slen - rpos, &uc); |
---|
| 378 | if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; |
---|
| 379 | } |
---|
| 380 | decomp_result = utf8proc_decompose_char( |
---|
| 381 | uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options, |
---|
| 382 | &boundclass |
---|
| 383 | ); |
---|
| 384 | if (decomp_result < 0) return decomp_result; |
---|
| 385 | wpos += decomp_result; |
---|
| 386 | /* // prohibiting integer overflows due to too long strings:*/ |
---|
| 387 | if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2) |
---|
| 388 | return UTF8PROC_ERROR_OVERFLOW; |
---|
| 389 | } |
---|
| 390 | } |
---|
| 391 | if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) { |
---|
| 392 | ssize_t pos = 0; |
---|
| 393 | while (pos < wpos-1) { |
---|
| 394 | int32_t uc1, uc2; |
---|
| 395 | const utf8proc_property_t *property1, *property2; |
---|
| 396 | uc1 = buffer[pos]; |
---|
| 397 | uc2 = buffer[pos+1]; |
---|
| 398 | property1 = utf8proc_get_property(uc1); |
---|
| 399 | property2 = utf8proc_get_property(uc2); |
---|
| 400 | if (property1->combining_class > property2->combining_class && |
---|
| 401 | property2->combining_class > 0) { |
---|
| 402 | buffer[pos] = uc2; |
---|
| 403 | buffer[pos+1] = uc1; |
---|
| 404 | if (pos > 0) pos--; else pos++; |
---|
| 405 | } else { |
---|
| 406 | pos++; |
---|
| 407 | } |
---|
| 408 | } |
---|
| 409 | } |
---|
| 410 | return wpos; |
---|
| 411 | } |
---|
| 412 | |
---|
| 413 | ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) { |
---|
| 414 | /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored |
---|
| 415 | ASSERT: 'buffer' has one spare byte of free space at the end! */ |
---|
| 416 | if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { |
---|
| 417 | ssize_t rpos; |
---|
| 418 | ssize_t wpos = 0; |
---|
| 419 | int32_t uc; |
---|
| 420 | for (rpos = 0; rpos < length; rpos++) { |
---|
| 421 | uc = buffer[rpos]; |
---|
| 422 | if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++; |
---|
| 423 | if (uc == 0x000A || uc == 0x000D || uc == 0x0085 || |
---|
| 424 | ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) { |
---|
| 425 | if (options & UTF8PROC_NLF2LS) { |
---|
| 426 | if (options & UTF8PROC_NLF2PS) { |
---|
| 427 | buffer[wpos++] = 0x000A; |
---|
| 428 | } else { |
---|
| 429 | buffer[wpos++] = 0x2028; |
---|
| 430 | } |
---|
| 431 | } else { |
---|
| 432 | if (options & UTF8PROC_NLF2PS) { |
---|
| 433 | buffer[wpos++] = 0x2029; |
---|
| 434 | } else { |
---|
| 435 | buffer[wpos++] = 0x0020; |
---|
| 436 | } |
---|
| 437 | } |
---|
| 438 | } else if ((options & UTF8PROC_STRIPCC) && |
---|
| 439 | (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) { |
---|
| 440 | if (uc == 0x0009) buffer[wpos++] = 0x0020; |
---|
| 441 | } else { |
---|
| 442 | buffer[wpos++] = uc; |
---|
| 443 | } |
---|
| 444 | } |
---|
| 445 | length = wpos; |
---|
| 446 | } |
---|
| 447 | if (options & UTF8PROC_COMPOSE) { |
---|
| 448 | int32_t *starter = NULL; |
---|
| 449 | int32_t current_char; |
---|
| 450 | const utf8proc_property_t *starter_property = NULL, *current_property; |
---|
| 451 | utf8proc_propval_t max_combining_class = -1; |
---|
| 452 | ssize_t rpos; |
---|
| 453 | ssize_t wpos = 0; |
---|
| 454 | int32_t composition; |
---|
| 455 | for (rpos = 0; rpos < length; rpos++) { |
---|
| 456 | current_char = buffer[rpos]; |
---|
| 457 | current_property = utf8proc_get_property(current_char); |
---|
| 458 | if (starter && current_property->combining_class > max_combining_class) { |
---|
| 459 | /* // combination perhaps possible*/ |
---|
| 460 | int32_t hangul_lindex; |
---|
| 461 | int32_t hangul_sindex; |
---|
| 462 | hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE; |
---|
| 463 | if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) { |
---|
| 464 | int32_t hangul_vindex; |
---|
| 465 | hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE; |
---|
| 466 | if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) { |
---|
| 467 | *starter = UTF8PROC_HANGUL_SBASE + |
---|
| 468 | (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) * |
---|
| 469 | UTF8PROC_HANGUL_TCOUNT; |
---|
| 470 | starter_property = NULL; |
---|
| 471 | continue; |
---|
| 472 | } |
---|
| 473 | } |
---|
| 474 | hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE; |
---|
| 475 | if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT && |
---|
| 476 | (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { |
---|
| 477 | int32_t hangul_tindex; |
---|
| 478 | hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; |
---|
| 479 | if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { |
---|
| 480 | *starter += hangul_tindex; |
---|
| 481 | starter_property = NULL; |
---|
| 482 | continue; |
---|
| 483 | } |
---|
| 484 | } |
---|
| 485 | if (!starter_property) { |
---|
| 486 | starter_property = utf8proc_get_property(*starter); |
---|
| 487 | } |
---|
| 488 | if (starter_property->comb1st_index >= 0 && |
---|
| 489 | current_property->comb2nd_index >= 0) { |
---|
| 490 | composition = utf8proc_combinations[ |
---|
| 491 | starter_property->comb1st_index + |
---|
| 492 | current_property->comb2nd_index |
---|
| 493 | ]; |
---|
| 494 | if (composition >= 0 && (!(options & UTF8PROC_STABLE) || |
---|
| 495 | !(utf8proc_get_property(composition)->comp_exclusion))) { |
---|
| 496 | *starter = composition; |
---|
| 497 | starter_property = NULL; |
---|
| 498 | continue; |
---|
| 499 | } |
---|
| 500 | } |
---|
| 501 | } |
---|
| 502 | buffer[wpos] = current_char; |
---|
| 503 | if (current_property->combining_class) { |
---|
| 504 | if (current_property->combining_class > max_combining_class) { |
---|
| 505 | max_combining_class = current_property->combining_class; |
---|
| 506 | } |
---|
| 507 | } else { |
---|
| 508 | starter = buffer + wpos; |
---|
| 509 | starter_property = NULL; |
---|
| 510 | max_combining_class = -1; |
---|
| 511 | } |
---|
| 512 | wpos++; |
---|
| 513 | } |
---|
| 514 | length = wpos; |
---|
| 515 | } |
---|
| 516 | { |
---|
| 517 | ssize_t rpos, wpos = 0; |
---|
| 518 | int32_t uc; |
---|
| 519 | for (rpos = 0; rpos < length; rpos++) { |
---|
| 520 | uc = buffer[rpos]; |
---|
| 521 | wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos); |
---|
| 522 | } |
---|
| 523 | ((uint8_t *)buffer)[wpos] = 0; |
---|
| 524 | return wpos; |
---|
| 525 | } |
---|
| 526 | } |
---|
| 527 | |
---|
| 528 | ssize_t utf8proc_map( |
---|
| 529 | const uint8_t *str, ssize_t slen, uint8_t **dstptr, int options |
---|
| 530 | ) { |
---|
| 531 | int32_t *buffer; |
---|
| 532 | ssize_t result; |
---|
| 533 | *dstptr = NULL; |
---|
| 534 | result = utf8proc_decompose(str, slen, NULL, 0, options); |
---|
| 535 | if (result < 0) return result; |
---|
| 536 | buffer = malloc(result * sizeof(int32_t) + 1); |
---|
| 537 | if (!buffer) return UTF8PROC_ERROR_NOMEM; |
---|
| 538 | result = utf8proc_decompose(str, slen, buffer, result, options); |
---|
| 539 | if (result < 0) { |
---|
| 540 | free(buffer); |
---|
| 541 | return result; |
---|
| 542 | } |
---|
| 543 | result = utf8proc_reencode(buffer, result, options); |
---|
| 544 | if (result < 0) { |
---|
| 545 | free(buffer); |
---|
| 546 | return result; |
---|
| 547 | } |
---|
| 548 | { |
---|
| 549 | int32_t *newptr; |
---|
| 550 | newptr = realloc(buffer, result+1); |
---|
| 551 | if (newptr) buffer = newptr; |
---|
| 552 | } |
---|
| 553 | *dstptr = (uint8_t *)buffer; |
---|
| 554 | return result; |
---|
| 555 | } |
---|
| 556 | |
---|
| 557 | uint8_t *utf8proc_NFD(const uint8_t *str) { |
---|
| 558 | uint8_t *retval; |
---|
| 559 | utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
---|
| 560 | UTF8PROC_DECOMPOSE); |
---|
| 561 | return retval; |
---|
| 562 | } |
---|
| 563 | |
---|
| 564 | uint8_t *utf8proc_NFC(const uint8_t *str) { |
---|
| 565 | uint8_t *retval; |
---|
| 566 | utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
---|
| 567 | UTF8PROC_COMPOSE); |
---|
| 568 | return retval; |
---|
| 569 | } |
---|
| 570 | |
---|
| 571 | uint8_t *utf8proc_NFKD(const uint8_t *str) { |
---|
| 572 | uint8_t *retval; |
---|
| 573 | utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
---|
| 574 | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT); |
---|
| 575 | return retval; |
---|
| 576 | } |
---|
| 577 | |
---|
| 578 | uint8_t *utf8proc_NFKC(const uint8_t *str) { |
---|
| 579 | uint8_t *retval; |
---|
| 580 | utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
---|
| 581 | UTF8PROC_COMPOSE | UTF8PROC_COMPAT); |
---|
| 582 | return retval; |
---|
| 583 | } |
---|
| 584 | |
---|
| 585 | ssize_t utf8proc_check(const uint8_t *str) { |
---|
| 586 | ssize_t result; |
---|
| 587 | result = utf8proc_decompose(str, 0, NULL, 0, |
---|
| 588 | UTF8PROC_NULLTERM | UTF8PROC_STABLE); |
---|
| 589 | return result; |
---|
| 590 | } |
---|