Context Navigation

source: XIOS/trunk/extern/src_netcdf4/utf8proc.h @ 2371

Last change on this file since 2371 was 409, checked in by ymipsl, 12 years ago

Add improved nectdf internal library src

YM

Property svn:eol-style set to native

File size: 15.4 KB

Line
1	/*
2	* Copyright (c) 2006-2007 Jan Behrens, FlexiGuided GmbH, Berlin
3	*
4	* Permission is hereby granted, free of charge, to any person obtaining a
5	* copy of this software and associated documentation files (the "Software"),
6	* to deal in the Software without restriction, including without limitation
7	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8	* and/or sell copies of the Software, and to permit persons to whom the
9	* Software is furnished to do so, subject to the following conditions:
10	*
11	* The above copyright notice and this permission notice shall be included in
12	* all copies or substantial portions of the Software.
13	*
14	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20	* DEALINGS IN THE SOFTWARE.
21	*/
22
23
24	/*
25	* File name: utf8proc.h
26	* Version: 1.1.1
27	* Last changed: 2007-07-22
28	* Changed 2008-05-16 by rkr to add config.h and replacement for stdbool.h
29	* for pre-C99 compilers that don't support bool.
30	* Changed 2008-06-05 by rkr to add utf8proc_check(str, options) function for
31	* just checking UTF-8 validity
32	* Description:
33	* Header files for libutf8proc, which is a mapping tool for UTF-8 strings
34	* with following features:
35	* - decomposing and composing of strings
36	* - replacing compatibility characters with their equivalents
37	* - stripping of "default ignorable characters"
38	* like SOFT-HYPHEN or ZERO-WIDTH-SPACE
39	* - folding of certain characters for string comparison
40	* (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-")
41	* (see "LUMP" option)
42	* - optional rejection of strings containing non-assigned code points
43	* - stripping of control characters
44	* - stripping of character marks (accents, etc.)
45	* - transformation of LF, CRLF, CR and NEL to line-feed (LF)
46	* or to the unicode characters for paragraph separation (PS)
47	* or line separation (LS).
48	* - unicode case folding (for case insensitive string comparisons)
49	* - rejection of illegal UTF-8 data
50	* (i.e. UTF-8 encoded UTF-16 surrogates)
51	* - support for korean hangul characters
52	* Unicode Version 5.0.0 is supported.
53	*/
54
55
56	#ifndef UTF8PROC_H
57	#define UTF8PROC_H
58
59
60	#include "config.h"
61
62	#include <stdlib.h>
63	#ifdef HAVE_STDBOOL_H
64	#include <stdbool.h>
65	#else
66	# if ! HAVE__BOOL
67	# ifdef __cplusplus
68	typedef bool _Bool;
69	# else
70	typedef unsigned char _Bool;
71	# endif
72	# endif
73	# define bool _Bool
74	# define false 0
75	# define true 1
76	# define __bool_true_false_are_defined 1
77	#endif
78	#include <sys/types.h>
79	#ifdef HAVE_INTTYPES_H
80	#include <inttypes.h>
81	#else /* HAVE_INTTYPES_H */
82	#include <pstdint.h>
83	#endif /* HAVE_INTTYPES_H */
84	#include <limits.h>
85
/*
 * Fallback for builds where config.h does not define HAVE_SSIZE_T:
 * the signed size type used throughout this API becomes plain int.
 * In that case the matching upper bound is INT_MAX, so that range checks
 * written against SSIZE_MAX agree with the type actually in use.
 */
#ifndef HAVE_SSIZE_T
#define ssize_t int
#ifndef SSIZE_MAX
#define SSIZE_MAX INT_MAX
#endif
#endif

/*
 * A native ssize_t exists but <limits.h> did not provide SSIZE_MAX:
 * approximate it as half of SIZE_MAX.
 */
#ifndef SSIZE_MAX
#define SSIZE_MAX (SIZE_MAX/2)
#endif
93
#define UTF8PROC_NULLTERM  (1<<0)
#define UTF8PROC_STABLE    (1<<1)
#define UTF8PROC_COMPAT    (1<<2)
#define UTF8PROC_COMPOSE   (1<<3)
#define UTF8PROC_DECOMPOSE (1<<4)
#define UTF8PROC_IGNORE    (1<<5)
#define UTF8PROC_REJECTNA  (1<<6)
#define UTF8PROC_NLF2LS    (1<<7)
#define UTF8PROC_NLF2PS    (1<<8)
#define UTF8PROC_NLF2LF    (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS)
#define UTF8PROC_STRIPCC   (1<<9)
#define UTF8PROC_CASEFOLD  (1<<10)
#define UTF8PROC_CHARBOUND (1<<11)
#define UTF8PROC_LUMP      (1<<12)
#define UTF8PROC_STRIPMARK (1<<13)
/*
 * Flags being regarded by several functions in the library. Each flag is a
 * single bit, so flags are combined with bitwise OR; NLF2LF is the
 * combination of NLF2LS and NLF2PS.
 *  NULLTERM:  The given UTF-8 input is NULL terminated.
 *  STABLE:    Unicode Versioning Stability has to be respected.
 *  COMPAT:    Compatibility decomposition
 *             (i.e. formatting information is lost)
 *  COMPOSE:   Return a result with composed characters.
 *  DECOMPOSE: Return a result with decomposed characters.
 *  IGNORE:    Strip "default ignorable characters"
 *  REJECTNA:  Return an error, if the input contains unassigned
 *             code points.
 *  NLF2LS:    Indicating that NLF-sequences (LF, CRLF, CR, NEL) are
 *             representing a line break, and should be converted to the
 *             unicode character for line separation (LS).
 *  NLF2PS:    Indicating that NLF-sequences are representing a paragraph
 *             break, and should be converted to the unicode character for
 *             paragraph separation (PS).
 *  NLF2LF:    Indicating that the meaning of NLF-sequences is unknown.
 *  STRIPCC:   Strips and/or converts control characters.
 *             NLF-sequences are transformed into space, except if one of
 *             the NLF2LS/PS/LF options is given.
 *             HorizontalTab (HT) and FormFeed (FF) are treated as a
 *             NLF-sequence in this case.
 *             All other control characters are simply removed.
 *  CASEFOLD:  Performs unicode case folding, to be able to do a
 *             case-insensitive string comparison.
 *  CHARBOUND: Inserts 0xFF bytes at the beginning of each sequence which
 *             is representing a single grapheme cluster (see UAX#29).
 *  LUMP:      Lumps certain characters together
 *             (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-").
 *             (See lump.txt for details.)
 *             If NLF2LF is set, this includes a transformation of
 *             paragraph and line separators to ASCII line-feed (LF).
 *  STRIPMARK: Strips all character markings
 *             (non-spacing, spacing and enclosing) (i.e. accents)
 *             NOTE: this option works only with COMPOSE or DECOMPOSE
 */
146
/* Negative values are parenthesized so they stay intact inside expressions. */
#define UTF8PROC_ERROR_NOMEM       (-1)
#define UTF8PROC_ERROR_OVERFLOW    (-2)
#define UTF8PROC_ERROR_INVALIDUTF8 (-3)
#define UTF8PROC_ERROR_NOTASSIGNED (-4)
#define UTF8PROC_ERROR_INVALIDOPTS (-5)
/*
 * Error codes being returned by almost all functions:
 *  ERROR_NOMEM:       Memory could not be allocated.
 *  ERROR_OVERFLOW:    The given string is too long to be processed.
 *  ERROR_INVALIDUTF8: The given string is not a legal UTF-8 string.
 *  ERROR_NOTASSIGNED: The REJECTNA flag was set,
 *                     and an unassigned code point was found.
 *  ERROR_INVALIDOPTS: Invalid options have been used.
 */
161
/* Storage type for the small enumerated property values held below. */
typedef int16_t utf8proc_propval_t;

/*
 * Property record describing one unicode code point; a pointer to such a
 * record is what utf8proc_get_property() returns.
 * NOTE(review): the member order is presumably relied upon by positional
 * initialisers in the generated data tables - confirm before reordering.
 */
typedef struct utf8proc_property_struct {
  utf8proc_propval_t category;        /* presumably a UTF8PROC_CATEGORY_* value */
  utf8proc_propval_t combining_class;
  utf8proc_propval_t bidi_class;      /* presumably a UTF8PROC_BIDI_CLASS_* value */
  utf8proc_propval_t decomp_type;     /* presumably a UTF8PROC_DECOMP_TYPE_* value */
  const int32_t *decomp_mapping;
  unsigned bidi_mirrored:1;
  int32_t uppercase_mapping;
  int32_t lowercase_mapping;
  int32_t titlecase_mapping;
  int32_t comb1st_index;
  int32_t comb2nd_index;
  unsigned comp_exclusion:1;
  unsigned ignorable:1;
  unsigned control_boundary:1;
  unsigned extend:1;
  const int32_t *casefold_mapping;
} utf8proc_property_t;
181
/*
 * General category values. The suffixes follow the two-letter unicode
 * general category abbreviations (Lu, Ll, Lt, ...).
 */
#define UTF8PROC_CATEGORY_LU  1
#define UTF8PROC_CATEGORY_LL  2
#define UTF8PROC_CATEGORY_LT  3
#define UTF8PROC_CATEGORY_LM  4
#define UTF8PROC_CATEGORY_LO  5
#define UTF8PROC_CATEGORY_MN  6
#define UTF8PROC_CATEGORY_MC  7
#define UTF8PROC_CATEGORY_ME  8
#define UTF8PROC_CATEGORY_ND  9
#define UTF8PROC_CATEGORY_NL 10
#define UTF8PROC_CATEGORY_NO 11
#define UTF8PROC_CATEGORY_PC 12
#define UTF8PROC_CATEGORY_PD 13
#define UTF8PROC_CATEGORY_PS 14
#define UTF8PROC_CATEGORY_PE 15
#define UTF8PROC_CATEGORY_PI 16
#define UTF8PROC_CATEGORY_PF 17
#define UTF8PROC_CATEGORY_PO 18
#define UTF8PROC_CATEGORY_SM 19
#define UTF8PROC_CATEGORY_SC 20
#define UTF8PROC_CATEGORY_SK 21
#define UTF8PROC_CATEGORY_SO 22
#define UTF8PROC_CATEGORY_ZS 23
#define UTF8PROC_CATEGORY_ZL 24
#define UTF8PROC_CATEGORY_ZP 25
#define UTF8PROC_CATEGORY_CC 26
#define UTF8PROC_CATEGORY_CF 27
#define UTF8PROC_CATEGORY_CS 28
#define UTF8PROC_CATEGORY_CO 29
#define UTF8PROC_CATEGORY_CN 30

/* Bidirectional class values. */
#define UTF8PROC_BIDI_CLASS_L    1
#define UTF8PROC_BIDI_CLASS_LRE  2
#define UTF8PROC_BIDI_CLASS_LRO  3
#define UTF8PROC_BIDI_CLASS_R    4
#define UTF8PROC_BIDI_CLASS_AL   5
#define UTF8PROC_BIDI_CLASS_RLE  6
#define UTF8PROC_BIDI_CLASS_RLO  7
#define UTF8PROC_BIDI_CLASS_PDF  8
#define UTF8PROC_BIDI_CLASS_EN   9
#define UTF8PROC_BIDI_CLASS_ES  10
#define UTF8PROC_BIDI_CLASS_ET  11
#define UTF8PROC_BIDI_CLASS_AN  12
#define UTF8PROC_BIDI_CLASS_CS  13
#define UTF8PROC_BIDI_CLASS_NSM 14
#define UTF8PROC_BIDI_CLASS_BN  15
#define UTF8PROC_BIDI_CLASS_B   16
#define UTF8PROC_BIDI_CLASS_S   17
#define UTF8PROC_BIDI_CLASS_WS  18
#define UTF8PROC_BIDI_CLASS_ON  19

/* Decomposition type values. */
#define UTF8PROC_DECOMP_TYPE_FONT      1
#define UTF8PROC_DECOMP_TYPE_NOBREAK   2
#define UTF8PROC_DECOMP_TYPE_INITIAL   3
#define UTF8PROC_DECOMP_TYPE_MEDIAL    4
#define UTF8PROC_DECOMP_TYPE_FINAL     5
#define UTF8PROC_DECOMP_TYPE_ISOLATED  6
#define UTF8PROC_DECOMP_TYPE_CIRCLE    7
#define UTF8PROC_DECOMP_TYPE_SUPER     8
#define UTF8PROC_DECOMP_TYPE_SUB       9
#define UTF8PROC_DECOMP_TYPE_VERTICAL 10
#define UTF8PROC_DECOMP_TYPE_WIDE     11
#define UTF8PROC_DECOMP_TYPE_NARROW   12
#define UTF8PROC_DECOMP_TYPE_SMALL    13
#define UTF8PROC_DECOMP_TYPE_SQUARE   14
#define UTF8PROC_DECOMP_TYPE_FRACTION 15
#define UTF8PROC_DECOMP_TYPE_COMPAT   16
247
/*
 * Read-only table with 256 entries, defined in the implementation file.
 * NOTE(review): presumably indexed by a byte value of the UTF-8 input; the
 * meaning of the entries is not visible from this header - confirm there.
 */
extern const int8_t utf8proc_utf8class[256];
249
const char *utf8proc_errmsg(ssize_t errcode);
/*
 * Returns a static error string for the given error code 'errcode'.
 * The string is static, so the caller must not free it.
 */
254
ssize_t utf8proc_iterate(const uint8_t *str, ssize_t strlen, int32_t *dst);
/*
 * Reads a single char from the UTF-8 sequence being pointed to by 'str'.
 * The maximum number of bytes read is 'strlen', unless 'strlen' is
 * negative.
 * If a valid unicode char could be read, it is stored in the variable
 * being pointed to by 'dst', otherwise that variable will be set to -1.
 * In case of success the number of bytes read is returned, otherwise a
 * negative error code is returned.
 */
265
bool utf8proc_codepoint_valid(int32_t uc);
/*
 * Returns 1 (true), if the given unicode code-point 'uc' is valid,
 * otherwise 0 (false).
 */
270
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst);
/*
 * Encodes the unicode char with the code point 'uc' as a UTF-8 string in
 * the byte array being pointed to by 'dst'. This array has to be at least
 * 4 bytes long.
 * In case of success the number of bytes written is returned,
 * otherwise 0.
 * This function does not check if 'uc' is a valid unicode code point.
 */
280
281	const utf8proc_property_t *utf8proc_get_property(int32_t uc);
282	/*
283	* Returns a pointer to a (constant) struct containing information about
284	* the unicode char with the given code point 'uc'.
285	* If the character is not existent a pointer to a special struct is
286	* returned, where 'category' is a NULL pointer.
287	* WARNING: The parameter 'uc' has to be in the range of 0x0000 to
288	* 0x10FFFF, otherwise the program might crash!
289	*/
290
ssize_t utf8proc_decompose_char(
  int32_t uc, int32_t *dst, ssize_t bufsize,
  int options, int *last_boundclass
);
/*
 * Writes a decomposition of the unicode char 'uc' into the array being
 * pointed to by 'dst'.
 * Following flags in the 'options' field are regarded:
 *  REJECTNA:  an unassigned unicode code point leads to an error
 *  IGNORE:    "default ignorable" chars are stripped
 *  CASEFOLD:  unicode casefolding is applied
 *  COMPAT:    replace certain characters with their
 *             compatibility decomposition
 *  CHARBOUND: Inserts 0xFF bytes before each grapheme cluster
 *  LUMP:      lumps certain different characters together
 *  STRIPMARK: removes all character marks
 * The pointer 'last_boundclass' has to point to an integer variable which
 * is storing the last character boundary class, if the CHARBOUND option
 * is used.
 * In case of success the number of chars written is returned,
 * in case of an error, a negative error code is returned.
 * If the number of written chars would be bigger than 'bufsize',
 * the buffer (up to 'bufsize') has unpredictable data, and the needed
 * buffer size is returned.
 * WARNING: The parameter 'uc' has to be in the range of 0x0000 to
 *          0x10FFFF, otherwise the program might crash!
 */
318
ssize_t utf8proc_decompose(
  const uint8_t *str, ssize_t strlen,
  int32_t *buffer, ssize_t bufsize, int options
);
/*
 * Does the same as 'utf8proc_decompose_char', but acts on a whole UTF-8
 * string, and orders the decomposed sequences correctly.
 * If the NULLTERM flag in 'options' is set, processing will be stopped,
 * when a NULL byte is encountered, otherwise 'strlen' bytes are processed.
 * The result in form of unicode code points is written into the buffer
 * being pointed to by 'buffer', having the length of 'bufsize' entries.
 * In case of success the number of chars written is returned,
 * in case of an error, a negative error code is returned.
 * If the number of written chars would be bigger than 'bufsize',
 * the buffer (up to 'bufsize') has unpredictable data, and the needed
 * buffer size is returned.
 */
336
ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options);
/*
 * Reencodes the sequence of unicode characters given by the pointer
 * 'buffer' and 'length' as UTF-8.
 * The result is stored in the same memory area where the data is read.
 * Following flags in the 'options' field are regarded:
 *  NLF2LS:  converts LF, CRLF, CR and NEL into LS
 *  NLF2PS:  converts LF, CRLF, CR and NEL into PS
 *  NLF2LF:  converts LF, CRLF, CR and NEL into LF
 *  STRIPCC: strips or converts all non-affected control characters
 *  COMPOSE: tries to combine decomposed characters into composite
 *           characters
 *  STABLE:  prohibits combining characters which would violate
 *           the unicode versioning stability
 * In case of success the length of the resulting UTF-8 string is
 * returned, otherwise a negative error code is returned.
 * WARNING: The amount of free space being pointed to by 'buffer', has to
 *          exceed the amount of the input data by one byte, and the
 *          entries of the array pointed to by 'buffer' have to be in the
 *          range of 0x0000 to 0x10FFFF, otherwise the program might
 *          crash!
 */
359
ssize_t utf8proc_map(
  const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
);
/*
 * Maps the given UTF-8 string being pointed to by 'str' to a new UTF-8
 * string, which is allocated dynamically, and afterwards pointed to by
 * the pointer being pointed to by 'dstptr'.
 * If the NULLTERM flag in the 'options' field is set, the length is
 * determined by a NULL terminator, otherwise the parameter 'strlen' is
 * evaluated to determine the string length, but in any case the result
 * will be NULL terminated (though it might contain NULL characters
 * before). Other flags in the 'options' field are passed to the functions
 * defined above, and regarded as described.
 * In case of success the length of the new string is returned,
 * otherwise a negative error code is returned.
 * NOTICE: The memory of the new UTF-8 string will have been allocated with
 *         'malloc', and has therefore to be freed with 'free'.
 */
378
uint8_t *utf8proc_NFD(const uint8_t *str);
uint8_t *utf8proc_NFC(const uint8_t *str);
uint8_t *utf8proc_NFKD(const uint8_t *str);
uint8_t *utf8proc_NFKC(const uint8_t *str);
/*
 * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
 * normalized version of the null-terminated string 'str'.
 */
387
ssize_t utf8proc_check(const uint8_t *str);
/*
 * Only checks the UTF-8 string 'str' for validity. Returns 0 if it is
 * valid, or one of the negative UTF8PROC_ERROR_* codes if it is invalid
 * or memory was exhausted while checking.
 * Assumes that 'str' is null-terminated, and assumes the UTF8PROC_STABLE
 * option.
 */
395
396	#endif
397

Note: See TracBrowser for help on using the repository browser.

Download in other formats: