Context Navigation

source: XIOS/trunk/extern/src_netcdf4/dstring.c @ 1120

Last change on this file since 1120 was 409, checked in by ymipsl, 12 years ago

Add improved netcdf internal library src

YM

Property svn:eol-style set to native

File size: 7.9 KB

Rev	Line
[409]	1	/*
	2	* Copyright 1996, University Corporation for Atmospheric Research
	3	* See netcdf/COPYRIGHT file for copying and redistribution conditions.
	4	*/
	5	/* $Id: string.c,v 1.76 2010/05/26 21:43:33 dmh Exp $ */
	6
	7	#include "config.h"
	8	#include <stdio.h>
	9	#include <stdlib.h>
	10	#include <string.h>
	11	#include <ctype.h>
	12	#include <assert.h>
	13	#include "nc.h"
	14	#include "rnd.h"
	15	#include "utf8proc.h"
	16
	17
	18	/* There are 3 levels of UTF8 checking: 1=> (exact)validating 2=>relaxed
	19	and 3=>very relaxed
	20	*/
	21	/* Use semi-relaxed check */
	22	#define UTF8_CHECK 2
	23
	24	/*
	25	* Free string, and, if needed, its values.
	26	* Formerly
	27	NC_free_string()
	28	*/
	29	void
	30	free_NC_string(NC_string *ncstrp)
	31	{
	32	if(ncstrp==NULL)
	33	return;
	34	free(ncstrp);
	35	}
	36
	37
	38	int
	39	nextUTF8(const char* cp)
	40	{
	41	/* The goal here is to recognize the length of each
	42	multibyte utf8 character sequence and skip it.
	43	Again, we assume that every non-ascii character is legal.
	44	We can define three possible tests of decreasing correctness
	45	(in the sense that the least correct will allow some sequences that
	46	are technically illegal UTF8).
	47	As Regular expressions they are as follows:
	48	1. most correct:
	49	UTF8 ([\xC2-\xDF][\x80-\xBF]) \
	50	\| (\xE0[\xA0-\xBF][\x80-\xBF]) \
	51	\| ([\xE1-\xEC][\x80-\xBF][\x80-\xBF]) \
	52	\| (\xED[\x80-\x9F][\x80-\xBF]) \
	53	\| ([\xEE-\xEF][\x80-\xBF][\x80-\xBF]) \
	54	\| (\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]) \
	55	\| ([\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]) \
	56	\| (\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF]) \
	57
	58	2. partially relaxed:
	59	UTF8 ([\xC0-\xDF][\x80-\xBF])
	60	\|([\xE0-\xEF][\x80-\xBF][\x80-\xBF])
	61	\|([\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])
	62
	63	3. The most relaxed version of UTF8:
	64	UTF8 ([\xC0-\xD6].)\|([\xE0-\xEF]..)\|([\xF0-\xF7]...)
	65
	66	We use #2 here.
	67
	68	The tests are derived from the table at
	69	http://www.w3.org/2005/03/23-lex-U
	70	*/
	71
	72	/* Define a test macro to test against a range */
	73	#define RANGE(c,lo,hi) (((uchar)c) >= lo && ((uchar)c) <= hi)
	74	/* Define a common RANGE */
	75	#define RANGE0(c) RANGE(c,0x80,0xBF)
	76
	77	int ch0;
	78
	79	int skip = -1; /* assume failed */
	80
	81	ch0 = (uchar)*cp;
	82	if(ch0 <= 0x7f) skip = 1; /* remove ascii case */
	83	else
	84
	85	#if UTF8_CHECK == 2
	86	/* Do relaxed validation check */
	87	if(RANGE(ch0,0xC0,0XDF)) {/* 2-bytes, but check */
	88	if(cp[1] != 0 && RANGE0(cp[1]))
	89	skip = 2; /* two bytes */
	90	} else if(RANGE(ch0,0xE0,0XEF)) {/* 3-bytes, but check */
	91	if(cp[1] != 0 && RANGE0(cp[1]) && cp[2] != 0 && RANGE0(cp[1]))
	92	skip = 3; /* three bytes */
	93	} else if(RANGE(ch0,0xF0,0XF7)) {/* 3-bytes, but check */
	94	if(cp[1] != 0 && RANGE0(cp[1]) && cp[2] != 0
	95	&& RANGE0(cp[1]) && cp[3] != 0 && RANGE0(cp[1]))
	96	skip = 4; /* four bytes*/
	97	}
	98	#elif UTF8_CHECK == 1
	99	/* Do exact validation check */
	100	if(RANGE(ch0,0xC2,0xDF)) {/* non-overlong 2-bytes */
	101	int ch1 = (uchar)cp[1];
	102	if(ch1 != 0 && RANGE0(ch1)) skip = 2;
	103	} else if((ch0 == 0xE0)) {/* 3-bytes, not overlong */
	104	int ch1 = (uchar)cp[1];
	105	if(ch1 != 0 && RANGE(ch1,0xA0,0xBF)) {
	106	int ch2 = (uchar)cp[2];
	107	if(ch2 != 0 && RANGE0(ch2)) skip = 3;
	108	} else if((ch0 == 0xED)) {/* 3-bytes minus surrogates */
	109	int ch1 = (uchar)cp[1];
	110	if(ch1 != 0 && RANGE(ch1,0x80,0x9f)) {
	111	int ch2 = (uchar)cp[2];
	112	if(ch2 != 0 && RANGE0(ch2)) skip = 3;
	113	} else if(RANGE(ch0,0xE1,0xEC) \|\| ch0 == 0xEE \|\| ch0 == 0xEF)
	114	int ch1 = (uchar)cp[1];
	115	if(ch1 != 0 && RANGE0(ch1)) {
	116	int ch2 = (uchar)cp[2];
	117	if(ch2 != 0 && RANGE0(ch2)) skip = 3;
	118	}
	119	} else if((ch0 == 0xF0)) {/* planes 1-3 */
	120	int ch1 = (uchar)cp[1];
	121	if(ch1 != 0 && RANGE(ch1,0x90,0xBF) {
	122	int ch2 = (uchar)cp[2];
	123	if(ch2 != 0 && RANGE0(ch2)) {
	124	int ch3 = (uchar)cp[3];
	125	if(ch3 != 0 && RANGE0(ch3)) skip = 4;
	126	}
	127	}
	128	} else if((ch0 == 0xF4)) {/* plane 16 */
	129	int ch1 = (uchar)cp[1];
	130	if(ch1 != 0 && RANGE0(ch1)) {
	131	int ch2 = (uchar)cp[2];
	132	if(ch2 != 0 && RANGE0(ch2)) {
	133	int ch3 = (uchar)cp[3];
	134	if(ch3 != 0 && RANGE0(ch3)) skip = 4;
	135	}
	136	}
	137	} else if(RANGE(ch0,0xF1,0xF3) { /* planes 4-15 */
	138	int ch1 = (uchar)cp[1];
	139	if(ch1 != 0 && RANGE0(ch1)) {
	140	int ch2 = (uchar)cp[2];
	141	if(ch2 != 0 && RANGE0(ch2)) {
	142	int ch3 = (uchar)cp[3];
	143	if(ch3 != 0 && RANGE0(ch3)) skip = 4;
	144	}
	145	}
	146	}
	147	#else
	148	#error "Must Define UTF8_CHECK as 1 or 2"
	149	#endif
	150	return skip;
	151	}
	152
	153
	154	/*
	155	* Verify that a name string is valid syntax. The allowed name
	156	* syntax (in RE form) is:
	157	*
	158	* ([a-zA-Z0-9_]\|{UTF8})([^\x00-\x1F\x7F/]\|{UTF8})*
	159	*
	160	* where UTF8 represents a multibyte UTF-8 encoding. Also, no
	161	* trailing spaces are permitted in names. This definition
	162	* must be consistent with the one in ncgen.l. We do not allow '/'
	163	* because HDF5 does not permit slashes in names as slash is used as a
	164	* group separator. If UTF-8 is supported, then a multi-byte UTF-8
	165	* character can occur anywhere within an identifier. We later
	166	* normalize UTF-8 strings to NFC to facilitate matching and queries.
	167	*/
	168	int
	169	NC_check_name(const char *name)
	170	{
	171	int skip;
	172	int ch;
	173	const char *cp = name;
	174	ssize_t utf8_stat;
	175
	176	assert(name != NULL);
	177
	178	if(name == 0 / empty names disallowed */
	179	\|\| strchr(cp, '/')) /* '/' can't be in a name */
	180	goto fail;
	181
	182	/* check validity of any UTF-8 */
	183	utf8_stat = utf8proc_check((const unsigned char *)name);
	184	if (utf8_stat < 0)
	185	goto fail;
	186
	187	/* First char must be [a-z][A-Z][0-9]_ \| UTF8 */
	188	ch = (uchar)*cp;
	189	if(ch <= 0x7f) {
	190	if( !('A' <= ch && ch <= 'Z')
	191	&& !('a' <= ch && ch <= 'z')
	192	&& !('0' <= ch && ch <= '9')
	193	&& ch != '_' )
	194	goto fail;
	195	cp++;
	196	} else {
	197	if((skip = nextUTF8(cp)) < 0)
	198	goto fail;
	199	cp += skip;
	200	}
	201
	202	while(*cp != 0) {
	203	ch = (uchar)*cp;
	204	/* handle simple 0x00-0x7f characters here */
	205	if(ch <= 0x7f) {
	206	if( ch < ' ' \|\| ch > 0x7E) /* control char or DEL */
	207	goto fail;
	208	cp++;
	209	} else {
	210	if((skip = nextUTF8(cp)) < 0) goto fail;
	211	cp += skip;
	212	}
	213	if(cp - name > NC_MAX_NAME)
	214	return NC_EMAXNAME;
	215	}
	216	if(ch <= 0x7f && isspace(ch)) /* trailing spaces disallowed */
	217	goto fail;
	218	return NC_NOERR;
	219	fail:
	220	return NC_EBADNAME;
	221	}
	222
	223
	224	/*
	225	* Allocate a NC_string structure large enough
	226	* to hold slen characters.
	227	* Formerly
	228	NC_new_string(count, str)
	229	*/
	230	NC_string *
	231	new_NC_string(size_t slen, const char *str)
	232	{
	233	NC_string *ncstrp;
	234	size_t sz = M_RNDUP(sizeof(NC_string)) + slen + 1;
	235
	236	#if 0
	237	sz = _RNDUP(sz, X_ALIGN);
	238	#endif
	239
	240	ncstrp = (NC_string *)malloc(sz);
	241	if( ncstrp == NULL )
	242	return NULL;
	243	(void) memset(ncstrp, 0, sz);
	244
	245	ncstrp->nchars = sz - M_RNDUP(sizeof(NC_string)) - 1;
	246	assert(ncstrp->nchars + 1 > slen);
	247	ncstrp->cp = (char *)ncstrp + M_RNDUP(sizeof(NC_string));
	248
	249	if(str != NULL && *str != 0)
	250	{
	251	(void) strncpy(ncstrp->cp, str, ncstrp->nchars +1);
	252	ncstrp->cp[ncstrp->nchars] = 0;
	253	}
	254
	255	return(ncstrp);
	256	}
	257
	258
	259	/*
	260	* If possible, change the value of an NC_string to 'str'.
	261	*
	262	* Formerly
	263	NC_re_string()
	264	*/
	265	int
	266	set_NC_string(NC_string ncstrp, const char str)
	267	{
	268	size_t slen;
	269
	270	assert(str != NULL && *str != 0);
	271
	272	slen = strlen(str);
	273
	274	if(ncstrp->nchars < slen)
	275	return NC_ENOTINDEFINE;
	276
	277	strncpy(ncstrp->cp, str, ncstrp->nchars);
	278	/* Don't adjust ncstrp->nchars, it includes extra space in the
	279	* header for potential later expansion of string. */
	280
	281	return NC_NOERR;
	282	}
	283
	284	/**************************************************/
	285	/* Provide local alternatives for unix functions
	286	not available on all machines. Place here so that
	287	all subsequent code modules can use them.
	288	*/
	289
	290	#ifndef HAVE_STRDUP
	291	char*
	292	strdup(const char* s)
	293	{
	294	char* dup;
	295	if(s == NULL) return NULL;
	296	dup = malloc(strlen(s)+1);
	297	strcpy(dup,s);
	298	return dup;
	299	}
	300	#endif
	301
	302	/**************************************************/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: