Context Navigation

source: XIOS/trunk/extern/src_netcdf4/utf8proc.h @ 1119

Last change on this file since 1119 was 409, checked in by ymipsl, 12 years ago

Add improved nectdf internal library src

YM

Property svn:eol-style set to native

File size: 15.4 KB

Rev	Line
[409]	1	/*
	2	* Copyright (c) 2006-2007 Jan Behrens, FlexiGuided GmbH, Berlin
	3	*
	4	* Permission is hereby granted, free of charge, to any person obtaining a
	5	* copy of this software and associated documentation files (the "Software"),
	6	* to deal in the Software without restriction, including without limitation
	7	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	8	* and/or sell copies of the Software, and to permit persons to whom the
	9	* Software is furnished to do so, subject to the following conditions:
	10	*
	11	* The above copyright notice and this permission notice shall be included in
	12	* all copies or substantial portions of the Software.
	13	*
	14	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	15	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	16	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	17	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	18	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	19	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
	20	* DEALINGS IN THE SOFTWARE.
	21	*/
	22
	23
	24	/*
	25	* File name: utf8proc.h
	26	* Version: 1.1.1
	27	* Last changed: 2007-07-22
	28	* Changed 2008-05-16 by rkr to add config.h and replacement for stdbool.h
	29	* for pre-C99 compilers that don't support bool.
	30	* Changed 2008-06-05 by rkr to add utf8proc_check(str, options) function
	31	* for just checking UTF-8 validity
	32	* Description:
	33	* Header files for libutf8proc, which is a mapping tool for UTF-8 strings
	34	* with following features:
	35	* - decomposing and composing of strings
	36	* - replacing compatibility characters with their equivalents
	37	* - stripping of "default ignorable characters"
	38	* like SOFT-HYPHEN or ZERO-WIDTH-SPACE
	39	* - folding of certain characters for string comparison
	40	* (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-")
	41	* (see "LUMP" option)
	42	* - optional rejection of strings containing non-assigned code points
	43	* - stripping of control characters
	44	* - stripping of character marks (accents, etc.)
	45	* - transformation of LF, CRLF, CR and NEL to line-feed (LF)
	46	* or to the unicode characters for paragraph separation (PS)
	47	* or line separation (LS).
	48	* - unicode case folding (for case insensitive string comparisons)
	49	* - rejection of illegal UTF-8 data
	50	* (i.e. UTF-8 encoded UTF-16 surrogates)
	51	* - support for korean hangul characters
	52	* Unicode Version 5.0.0 is supported.
	53	*/
	54
	55
	56	#ifndef UTF8PROC_H
	57	#define UTF8PROC_H
	58
	59
	60	#include "config.h"
	61
	62	#include <stdlib.h>
	63	#ifdef HAVE_STDBOOL_H
	64	#include <stdbool.h>
	65	#else /* no <stdbool.h>: provide bool/true/false ourselves (pre-C99 compilers) */
	66	# if ! HAVE__BOOL /* the compiler has no built-in _Bool either */
	67	# ifdef __cplusplus
	68	typedef bool _Bool; /* C++: alias _Bool to the built-in bool */
	69	# else
	70	typedef unsigned char _Bool; /* C: emulate _Bool with an unsigned char */
	71	# endif
	72	# endif
	73	# define bool _Bool
	74	# define false 0
	75	# define true 1
	76	# define __bool_true_false_are_defined 1 /* the marker macro <stdbool.h> would define */
	77	#endif
	78	#include <sys/types.h>
	79	#ifdef HAVE_INTTYPES_H
	80	#include <inttypes.h>
	81	#else /* HAVE_INTTYPES_H */
	82	#include <pstdint.h>
	83	#endif /* HAVE_INTTYPES_H */
	84	#include <limits.h>
	85
	86	#ifndef HAVE_SSIZE_T /* config.h did not report a native ssize_t */
	87	#define ssize_t int /* fallback; NOTE(review): a macro rather than a typedef, and only int-wide - confirm this is intended on 64-bit targets */
	88	#endif
	89
	90	#ifndef SSIZE_MAX /* only defined here when <limits.h> did not already provide it */
	91	#define SSIZE_MAX (SIZE_MAX/2) /* NOTE(review): does not match the 'int' fallback for ssize_t when that fallback is active - confirm */
	92	#endif
	93
	#define UTF8PROC_NULLTERM  (1<<0)
	#define UTF8PROC_STABLE    (1<<1)
	#define UTF8PROC_COMPAT    (1<<2)
	#define UTF8PROC_COMPOSE   (1<<3)
	#define UTF8PROC_DECOMPOSE (1<<4)
	#define UTF8PROC_IGNORE    (1<<5)
	#define UTF8PROC_REJECTNA  (1<<6)
	#define UTF8PROC_NLF2LS    (1<<7)
	#define UTF8PROC_NLF2PS    (1<<8)
	#define UTF8PROC_NLF2LF    (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS)
	#define UTF8PROC_STRIPCC   (1<<9)
	#define UTF8PROC_CASEFOLD  (1<<10)
	#define UTF8PROC_CHARBOUND (1<<11)
	#define UTF8PROC_LUMP      (1<<12)
	#define UTF8PROC_STRIPMARK (1<<13)
	/*
	 * Flags being regarded by several functions in the library. Each flag is
	 * a distinct bit, so flags can be OR-ed together into an 'options' value;
	 * NLF2LF is the combination of the NLF2LS and NLF2PS bits.
	 *  NULLTERM:  The given UTF-8 input is NULL terminated.
	 *  STABLE:    Unicode Versioning Stability has to be respected.
	 *  COMPAT:    Compatibility decomposition
	 *             (i.e. formatting information is lost)
	 *  COMPOSE:   Return a result with composed characters.
	 *  DECOMPOSE: Return a result with decomposed characters.
	 *  IGNORE:    Strip "default ignorable characters"
	 *  REJECTNA:  Return an error, if the input contains unassigned
	 *             code points.
	 *  NLF2LS:    Indicating that NLF-sequences (LF, CRLF, CR, NEL) are
	 *             representing a line break, and should be converted to the
	 *             unicode character for line separation (LS).
	 *  NLF2PS:    Indicating that NLF-sequences are representing a paragraph
	 *             break, and should be converted to the unicode character for
	 *             paragraph separation (PS).
	 *  NLF2LF:    Indicating that the meaning of NLF-sequences is unknown.
	 *  STRIPCC:   Strips and/or converts control characters.
	 *             NLF-sequences are transformed into space, except if one of
	 *             the NLF2LS/PS/LF options is given.
	 *             HorizontalTab (HT) and FormFeed (FF) are treated as a
	 *             NLF-sequence in this case.
	 *             All other control characters are simply removed.
	 *  CASEFOLD:  Performs unicode case folding, to be able to do a
	 *             case-insensitive string comparison.
	 *  CHARBOUND: Inserts 0xFF bytes at the beginning of each sequence which
	 *             is representing a single grapheme cluster (see UAX#29).
	 *  LUMP:      Lumps certain characters together
	 *             (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-").
	 *             (See lump.txt for details.)
	 *             If NLF2LF is set, this includes a transformation of
	 *             paragraph and line separators to ASCII line-feed (LF).
	 *  STRIPMARK: Strips all character markings
	 *             (non-spacing, spacing and enclosing) (i.e. accents)
	 *             NOTE: this option works only with COMPOSE or DECOMPOSE
	 */
	146
	#define UTF8PROC_ERROR_NOMEM       (-1)
	#define UTF8PROC_ERROR_OVERFLOW    (-2)
	#define UTF8PROC_ERROR_INVALIDUTF8 (-3)
	#define UTF8PROC_ERROR_NOTASSIGNED (-4)
	#define UTF8PROC_ERROR_INVALIDOPTS (-5)
	/*
	 * Error codes being returned by almost all functions. The values are
	 * parenthesized so that the leading minus sign cannot bind to
	 * neighbouring tokens at the point of use.
	 *  ERROR_NOMEM:       Memory could not be allocated.
	 *  ERROR_OVERFLOW:    The given string is too long to be processed.
	 *  ERROR_INVALIDUTF8: The given string is not a legal UTF-8 string.
	 *  ERROR_NOTASSIGNED: The REJECTNA flag was set,
	 *                     and an unassigned code point was found.
	 *  ERROR_INVALIDOPTS: Invalid options have been used.
	 */
	161
	162	typedef int16_t utf8proc_propval_t; /* integer type of the enumerated property fields below */
	163	typedef struct utf8proc_property_struct { /* property record for one code point; see utf8proc_get_property() */
	164	utf8proc_propval_t category; /* presumably one of UTF8PROC_CATEGORY_* - confirm */
	165	utf8proc_propval_t combining_class;
	166	utf8proc_propval_t bidi_class; /* presumably one of UTF8PROC_BIDI_CLASS_* - confirm */
	167	utf8proc_propval_t decomp_type; /* presumably one of UTF8PROC_DECOMP_TYPE_* - confirm */
	168	const int32_t *decomp_mapping; /* read-only; NOTE(review): length/terminator convention is not visible in this header */
	169	unsigned bidi_mirrored:1; /* one-bit flag */
	170	int32_t uppercase_mapping;
	171	int32_t lowercase_mapping;
	172	int32_t titlecase_mapping;
	173	int32_t comb1st_index;
	174	int32_t comb2nd_index;
	175	unsigned comp_exclusion:1; /* one-bit flag */
	176	unsigned ignorable:1; /* one-bit flag */
	177	unsigned control_boundary:1; /* one-bit flag */
	178	unsigned extend:1; /* one-bit flag */
	179	const int32_t *casefold_mapping; /* read-only; NOTE(review): length/terminator convention is not visible in this header */
	180	} utf8proc_property_t;
	181
	182	#define UTF8PROC_CATEGORY_LU 1 /* UTF8PROC_CATEGORY_*: numbered 1..30; presumably the values of utf8proc_property_t.category - confirm */
	183	#define UTF8PROC_CATEGORY_LL 2
	184	#define UTF8PROC_CATEGORY_LT 3
	185	#define UTF8PROC_CATEGORY_LM 4
	186	#define UTF8PROC_CATEGORY_LO 5
	187	#define UTF8PROC_CATEGORY_MN 6
	188	#define UTF8PROC_CATEGORY_MC 7
	189	#define UTF8PROC_CATEGORY_ME 8
	190	#define UTF8PROC_CATEGORY_ND 9
	191	#define UTF8PROC_CATEGORY_NL 10
	192	#define UTF8PROC_CATEGORY_NO 11
	193	#define UTF8PROC_CATEGORY_PC 12
	194	#define UTF8PROC_CATEGORY_PD 13
	195	#define UTF8PROC_CATEGORY_PS 14
	196	#define UTF8PROC_CATEGORY_PE 15
	197	#define UTF8PROC_CATEGORY_PI 16
	198	#define UTF8PROC_CATEGORY_PF 17
	199	#define UTF8PROC_CATEGORY_PO 18
	200	#define UTF8PROC_CATEGORY_SM 19
	201	#define UTF8PROC_CATEGORY_SC 20
	202	#define UTF8PROC_CATEGORY_SK 21
	203	#define UTF8PROC_CATEGORY_SO 22
	204	#define UTF8PROC_CATEGORY_ZS 23
	205	#define UTF8PROC_CATEGORY_ZL 24
	206	#define UTF8PROC_CATEGORY_ZP 25
	207	#define UTF8PROC_CATEGORY_CC 26
	208	#define UTF8PROC_CATEGORY_CF 27
	209	#define UTF8PROC_CATEGORY_CS 28
	210	#define UTF8PROC_CATEGORY_CO 29
	211	#define UTF8PROC_CATEGORY_CN 30
	212	#define UTF8PROC_BIDI_CLASS_L 1 /* UTF8PROC_BIDI_CLASS_*: numbered 1..19; presumably the values of utf8proc_property_t.bidi_class - confirm */
	213	#define UTF8PROC_BIDI_CLASS_LRE 2
	214	#define UTF8PROC_BIDI_CLASS_LRO 3
	215	#define UTF8PROC_BIDI_CLASS_R 4
	216	#define UTF8PROC_BIDI_CLASS_AL 5
	217	#define UTF8PROC_BIDI_CLASS_RLE 6
	218	#define UTF8PROC_BIDI_CLASS_RLO 7
	219	#define UTF8PROC_BIDI_CLASS_PDF 8
	220	#define UTF8PROC_BIDI_CLASS_EN 9
	221	#define UTF8PROC_BIDI_CLASS_ES 10
	222	#define UTF8PROC_BIDI_CLASS_ET 11
	223	#define UTF8PROC_BIDI_CLASS_AN 12
	224	#define UTF8PROC_BIDI_CLASS_CS 13
	225	#define UTF8PROC_BIDI_CLASS_NSM 14
	226	#define UTF8PROC_BIDI_CLASS_BN 15
	227	#define UTF8PROC_BIDI_CLASS_B 16
	228	#define UTF8PROC_BIDI_CLASS_S 17
	229	#define UTF8PROC_BIDI_CLASS_WS 18
	230	#define UTF8PROC_BIDI_CLASS_ON 19
	231	#define UTF8PROC_DECOMP_TYPE_FONT 1 /* UTF8PROC_DECOMP_TYPE_*: numbered 1..16; presumably the values of utf8proc_property_t.decomp_type - confirm */
	232	#define UTF8PROC_DECOMP_TYPE_NOBREAK 2
	233	#define UTF8PROC_DECOMP_TYPE_INITIAL 3
	234	#define UTF8PROC_DECOMP_TYPE_MEDIAL 4
	235	#define UTF8PROC_DECOMP_TYPE_FINAL 5
	236	#define UTF8PROC_DECOMP_TYPE_ISOLATED 6
	237	#define UTF8PROC_DECOMP_TYPE_CIRCLE 7
	238	#define UTF8PROC_DECOMP_TYPE_SUPER 8
	239	#define UTF8PROC_DECOMP_TYPE_SUB 9
	240	#define UTF8PROC_DECOMP_TYPE_VERTICAL 10
	241	#define UTF8PROC_DECOMP_TYPE_WIDE 11
	242	#define UTF8PROC_DECOMP_TYPE_NARROW 12
	243	#define UTF8PROC_DECOMP_TYPE_SMALL 13
	244	#define UTF8PROC_DECOMP_TYPE_SQUARE 14
	245	#define UTF8PROC_DECOMP_TYPE_FRACTION 15
	246	#define UTF8PROC_DECOMP_TYPE_COMPAT 16
	247
	248	extern const int8_t utf8proc_utf8class[256]; /* read-only 256-entry table, declared here and defined elsewhere; presumably indexed by byte value - NOTE(review): entry meaning is not documented in this header */
	249
	250	const char *utf8proc_errmsg(ssize_t errcode);
	251	/*
	252	* Returns a static error string for the given error code (presumably one of the UTF8PROC_ERROR_* values - confirm); being static, the string must not be freed by the caller.
	253	*/
	254
	ssize_t utf8proc_iterate(const uint8_t *str, ssize_t strlen, int32_t *dst);
	/*
	 * Reads a single char from the UTF-8 sequence being pointed to by 'str'.
	 * The maximum number of bytes read is 'strlen', unless 'strlen' is
	 * negative.
	 * If a valid unicode char could be read, it is stored in the variable
	 * being pointed to by 'dst', otherwise that variable will be set to -1.
	 * In case of success the number of bytes read is returned, otherwise a
	 * negative error code is returned.
	 *
	 * Parameters:
	 *  str:    pointer to the UTF-8 bytes to read (not modified).
	 *  strlen: upper bound on the number of bytes read; a negative value
	 *          lifts the bound.
	 *  dst:    output; receives the decoded code point, or -1 on failure.
	 */
	265
	266	bool utf8proc_codepoint_valid(int32_t uc);
	267	/*
	268	* Returns true (1) if the given unicode code point 'uc' is valid, otherwise false (0).
	269	*/
	270
	271	ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst);
	272	/*
	273	* Encodes the unicode char with the code point 'uc' as a UTF-8 string in
	274	* the byte array being pointed to by 'dst'. This array has to be at least
	275	* 4 bytes long.
	276	* In case of success the number of bytes written is returned,
	277	* otherwise 0.
	278	* This function does not check if 'uc' is a valid unicode code point.
	279	*/
	280
	281	const utf8proc_property_t *utf8proc_get_property(int32_t uc);
	282	/*
	283	* Returns a pointer to a (constant) struct containing information about
	284	* the unicode char with the given code point 'uc'.
	285	* If the character does not exist, a pointer to a special struct is
	286	* returned, where 'category' is 0 (it is an integer field, not a pointer).
	287	* WARNING: The parameter 'uc' has to be in the range of 0x0000 to
	288	* 0x10FFFF, otherwise the program might crash!
	289	*/
	290
	291	ssize_t utf8proc_decompose_char(
	292	int32_t uc, int32_t *dst, ssize_t bufsize,
	293	int options, int *last_boundclass
	294	);
	295	/*
	296	* Writes a decomposition of the unicode char 'uc' into the array being
	297	* pointed to by 'dst'.
	298	* Following flags in the 'options' field are regarded:
	299	* REJECTNA: an unassigned unicode code point leads to an error
	300	* IGNORE: "default ignorable" chars are stripped
	301	* CASEFOLD: unicode casefolding is applied
	302	* COMPAT: replace certain characters with their
	303	* compatibility decomposition
	304	* CHARBOUND: Inserts 0xFF bytes before each grapheme cluster
	305	* LUMP: lumps certain different characters together
	306	* STRIPMARK: removes all character marks
	307	* The pointer 'last_boundclass' has to point to an integer variable which
	308	* is storing the last character boundary class, if the CHARBOUND option
	309	* is used.
	310	* In case of success the number of chars written is returned,
	311	* in case of an error, a negative error code is returned.
	312	* If the number of written chars would be bigger than 'bufsize',
	313	* the buffer (up to 'bufsize') has unpredictable data, and the needed
	314	* buffer size is returned.
	315	* WARNING: The parameter 'uc' has to be in the range of 0x0000 to
	316	* 0x10FFFF, otherwise the program might crash!
	317	*/
	318
	319	ssize_t utf8proc_decompose(
	320	const uint8_t *str, ssize_t strlen,
	321	int32_t *buffer, ssize_t bufsize, int options
	322	);
	323	/*
	324	* Does the same as 'utf8proc_decompose_char', but acts on a whole UTF-8
	325	* string, and orders the decomposed sequences correctly.
	326	* If the NULLTERM flag in 'options' is set, processing will be stopped,
	327	* when a NULL byte is encountered, otherwise 'strlen' bytes are processed.
	328	* The result in form of unicode code points is written into the buffer
	329	* being pointed to by 'buffer', having the length of 'bufsize' entries.
	330	* In case of success the number of chars written is returned,
	331	* in case of an error, a negative error code is returned.
	332	* If the number of written chars would be bigger than 'bufsize',
	333	* the buffer (up to 'bufsize') has unpredictable data, and the needed
	334	* buffer size is returned.
	335	*/
	336
	337	ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options);
	338	/*
	339	* Reencodes the sequence of unicode characters given by the pointer
	340	* 'buffer' and 'length' as UTF-8.
	341	* The result is stored in the same memory area where the data is read.
	342	* Following flags in the 'options' field are regarded:
	343	* NLF2LS: converts LF, CRLF, CR and NEL into LS
	344	* NLF2PS: converts LF, CRLF, CR and NEL into PS
	345	* NLF2LF: converts LF, CRLF, CR and NEL into LF
	346	* STRIPCC: strips or converts all non-affected control characters
	347	* COMPOSE: tries to combine decomposed characters into composite
	348	* characters
	349	* STABLE: prohibits combining characters which would violate
	350	* the unicode versioning stability
	351	* In case of success the length of the resulting UTF-8 string is
	352	* returned, otherwise a negative error code is returned.
	353	* WARNING: The amount of free space being pointed to by 'buffer', has to
	354	* exceed the amount of the input data by one byte, and the
	355	* entries of the array pointed to by 'buffer' have to be in the
	356	* range of 0x0000 to 0x10FFFF, otherwise the program might
	357	* crash!
	358	*/
	359
	ssize_t utf8proc_map(
	    const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
	);
	/*
	 * Maps the given UTF-8 string being pointed to by 'str' to a new UTF-8
	 * string, which is allocated dynamically, and afterwards pointed to by
	 * the pointer being pointed to by 'dstptr'.
	 * If the NULLTERM flag in the 'options' field is set, the length is
	 * determined by a NULL terminator, otherwise the parameter 'strlen' is
	 * evaluated to determine the string length, but in any case the result
	 * will be NULL terminated (though it might contain NULL characters
	 * before). Other flags in the 'options' field are passed to the functions
	 * defined above, and regarded as described.
	 * In case of success the length of the new string is returned,
	 * otherwise a negative error code is returned.
	 * NOTICE: The memory of the new UTF-8 string will have been allocated with
	 *         'malloc', and has therefore to be freed with 'free'.
	 *
	 * Parameters:
	 *  str:     pointer to the input UTF-8 bytes (not modified).
	 *  strlen:  input length in bytes; used when NULLTERM is not set.
	 *  dstptr:  output; address of the caller's pointer that receives the
	 *           newly allocated result string.
	 *  options: combination of the UTF8PROC_* option flags.
	 */
	378
	uint8_t *utf8proc_NFD(const uint8_t *str);
	uint8_t *utf8proc_NFC(const uint8_t *str);
	uint8_t *utf8proc_NFKD(const uint8_t *str);
	uint8_t *utf8proc_NFKC(const uint8_t *str);
	/*
	 * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
	 * normalized version of the null-terminated string 'str'.
	 * NOTE(review): the result is presumably to be released with 'free', as
	 * documented for utf8proc_map, and the return value on failure is not
	 * documented here - confirm both against the implementation.
	 */
	387
	388	ssize_t utf8proc_check(const uint8_t *str);
	389	/*
	390	* Just checks UTF-8 string for validity, returns 0 if valid or one of
	391	* the negative UTF8PROC_ERROR_* codes if invalid or if memory was exhausted
	392	* while checking. Assumes null-terminated string str and UTF8PROC_STABLE
	393	* option.
	394	*/
	395
	396	#endif
	397

Note: See TracBrowser for help on using the repository browser.

Download in other formats: