Context Navigation

source: XIOS/dev/branch_yushan/extern/src_netcdf4/dstring.c @ 1126

Last change on this file since 1126 was 409, checked in by ymipsl, 12 years ago

Add improved netcdf internal library src

YM

Property svn:eol-style set to native

File size: 7.9 KB

Line
1	/*
2	* Copyright 1996, University Corporation for Atmospheric Research
3	* See netcdf/COPYRIGHT file for copying and redistribution conditions.
4	*/
5	/* $Id: string.c,v 1.76 2010/05/26 21:43:33 dmh Exp $ */
6
7	#include "config.h"
8	#include <stdio.h>
9	#include <stdlib.h>
10	#include <string.h>
11	#include <ctype.h>
12	#include <assert.h>
13	#include "nc.h"
14	#include "rnd.h"
15	#include "utf8proc.h"
16
17
18	/* There are 3 levels of UTF8 checking: 1 => exact (validating), 2 => relaxed,
19	and 3 => very relaxed. Only levels 1 and 2 are implemented by nextUTF8().
20	*/
21	/* Use semi-relaxed check */
22	#define UTF8_CHECK 2
23
24	/*
25	* Free string, and, if needed, its values.
26	* Formerly
27	NC_free_string()
28	*/
29	void
30	free_NC_string(NC_string *ncstrp)
31	{
32	if(ncstrp==NULL)
33	return;
34	free(ncstrp);
35	}
36
37
/* Strictness used by nextUTF8(): 1 => exact validation, 2 => relaxed.
 * This file sets it near the top; fall back to the relaxed check
 * when it has not been defined. */
#ifndef UTF8_CHECK
#define UTF8_CHECK 2
#endif

/* True when byte c, viewed as unsigned, lies in the closed range [lo,hi] */
#define RANGE(c,lo,hi) (((unsigned char)(c)) >= (lo) && ((unsigned char)(c)) <= (hi))
/* True when c is a UTF-8 continuation byte (0x80..0xBF) */
#define RANGE0(c) RANGE(c,0x80,0xBF)

/*
 * Return the length in bytes of the character that starts at cp,
 * or -1 if the bytes at cp do not form an acceptable sequence.
 *
 * cp must point into a NUL-terminated string.  A byte <= 0x7f
 * (including the terminating NUL itself) counts as a one-byte
 * character.  For multibyte sequences, each trailing byte is examined
 * only after the previous one was accepted as a continuation byte, so
 * the scan never reads beyond the terminating NUL.
 *
 * Three tests of decreasing correctness are possible (the least
 * correct allows some sequences that are technically illegal UTF8).
 * As regular expressions they are:
 *
 *   1. most correct:
 *      UTF8 ([\xC2-\xDF][\x80-\xBF])
 *         | (\xE0[\xA0-\xBF][\x80-\xBF])
 *         | ([\xE1-\xEC][\x80-\xBF][\x80-\xBF])
 *         | (\xED[\x80-\x9F][\x80-\xBF])
 *         | ([\xEE-\xEF][\x80-\xBF][\x80-\xBF])
 *         | (\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF])
 *         | ([\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF])
 *         | (\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF])
 *
 *   2. partially relaxed:
 *      UTF8 ([\xC0-\xDF][\x80-\xBF])
 *         | ([\xE0-\xEF][\x80-\xBF][\x80-\xBF])
 *         | ([\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])
 *
 *   3. the most relaxed version of UTF8:
 *      UTF8 ([\xC0-\xD6].)|([\xE0-\xEF]..)|([\xF0-\xF7]...)
 *
 * UTF8_CHECK selects test 1 or test 2; test 3 is not implemented.
 *
 * The tests are derived from the table at
 * http://www.w3.org/2005/03/23-lex-U
 */
int
nextUTF8(const char* cp)
{
    int ch0 = (unsigned char)*cp;

    if(ch0 <= 0x7f)
        return 1; /* single-byte (ascii) case */

#if UTF8_CHECK == 2
    /* Relaxed validation: lead byte selects the length, every
       trailing byte must be a continuation byte. */
    if(RANGE(ch0,0xC0,0xDF)) { /* 2-byte sequence */
        if(RANGE0(cp[1]))
            return 2;
    } else if(RANGE(ch0,0xE0,0xEF)) { /* 3-byte sequence */
        if(RANGE0(cp[1]) && RANGE0(cp[2]))
            return 3;
    } else if(RANGE(ch0,0xF0,0xF7)) { /* 4-byte sequence */
        if(RANGE0(cp[1]) && RANGE0(cp[2]) && RANGE0(cp[3]))
            return 4;
    }
#elif UTF8_CHECK == 1
    /* Exact validation, following table 1 in the comment above */
    if(RANGE(ch0,0xC2,0xDF)) { /* non-overlong 2-bytes */
        if(RANGE0(cp[1]))
            return 2;
    } else if(ch0 == 0xE0) { /* 3-bytes, not overlong */
        if(RANGE(cp[1],0xA0,0xBF) && RANGE0(cp[2]))
            return 3;
    } else if(ch0 == 0xED) { /* 3-bytes minus surrogates */
        if(RANGE(cp[1],0x80,0x9F) && RANGE0(cp[2]))
            return 3;
    } else if(RANGE(ch0,0xE1,0xEC) || ch0 == 0xEE || ch0 == 0xEF) {
        if(RANGE0(cp[1]) && RANGE0(cp[2]))
            return 3;
    } else if(ch0 == 0xF0) { /* planes 1-3 */
        if(RANGE(cp[1],0x90,0xBF) && RANGE0(cp[2]) && RANGE0(cp[3]))
            return 4;
    } else if(RANGE(ch0,0xF1,0xF3)) { /* planes 4-15 */
        if(RANGE0(cp[1]) && RANGE0(cp[2]) && RANGE0(cp[3]))
            return 4;
    } else if(ch0 == 0xF4) { /* plane 16: second byte limited to 0x80..0x8F */
        if(RANGE(cp[1],0x80,0x8F) && RANGE0(cp[2]) && RANGE0(cp[3]))
            return 4;
    }
#else
#error "Must Define UTF8_CHECK as 1 or 2"
#endif
    return -1; /* not an acceptable sequence */
}
152
153
154	/*
155	* Verify that a name string is valid syntax. The allowed name
156	* syntax (in RE form) is:
157	*
158	* ([a-zA-Z0-9_]\|{UTF8})([^\x00-\x1F\x7F/]\|{UTF8})*
159	*
160	* where UTF8 represents a multibyte UTF-8 encoding. Also, no
161	* trailing spaces are permitted in names. This definition
162	* must be consistent with the one in ncgen.l. We do not allow '/'
163	* because HDF5 does not permit slashes in names as slash is used as a
164	* group separator. If UTF-8 is supported, then a multi-byte UTF-8
165	* character can occur anywhere within an identifier. We later
166	* normalize UTF-8 strings to NFC to facilitate matching and queries.
167	*/
168	int
169	NC_check_name(const char *name)
170	{
171	int skip;
172	int ch;
173	const char *cp = name;
174	ssize_t utf8_stat;
175
176	assert(name != NULL);
177
178	if(name == 0 / empty names disallowed */
179	\|\| strchr(cp, '/')) /* '/' can't be in a name */
180	goto fail;
181
182	/* check validity of any UTF-8 */
183	utf8_stat = utf8proc_check((const unsigned char *)name);
184	if (utf8_stat < 0)
185	goto fail;
186
187	/* First char must be [a-z][A-Z][0-9]_ \| UTF8 */
188	ch = (uchar)*cp;
189	if(ch <= 0x7f) {
190	if( !('A' <= ch && ch <= 'Z')
191	&& !('a' <= ch && ch <= 'z')
192	&& !('0' <= ch && ch <= '9')
193	&& ch != '_' )
194	goto fail;
195	cp++;
196	} else {
197	if((skip = nextUTF8(cp)) < 0)
198	goto fail;
199	cp += skip;
200	}
201
202	while(*cp != 0) {
203	ch = (uchar)*cp;
204	/* handle simple 0x00-0x7f characters here */
205	if(ch <= 0x7f) {
206	if( ch < ' ' \|\| ch > 0x7E) /* control char or DEL */
207	goto fail;
208	cp++;
209	} else {
210	if((skip = nextUTF8(cp)) < 0) goto fail;
211	cp += skip;
212	}
213	if(cp - name > NC_MAX_NAME)
214	return NC_EMAXNAME;
215	}
216	if(ch <= 0x7f && isspace(ch)) /* trailing spaces disallowed */
217	goto fail;
218	return NC_NOERR;
219	fail:
220	return NC_EBADNAME;
221	}
222
223
224	/*
225	* Allocate a NC_string structure large enough
226	* to hold slen characters.
227	* Formerly
228	NC_new_string(count, str)
229	*/
230	NC_string *
231	new_NC_string(size_t slen, const char *str)
232	{
233	NC_string *ncstrp;
234	size_t sz = M_RNDUP(sizeof(NC_string)) + slen + 1;
235
236	#if 0
237	sz = _RNDUP(sz, X_ALIGN);
238	#endif
239
240	ncstrp = (NC_string *)malloc(sz);
241	if( ncstrp == NULL )
242	return NULL;
243	(void) memset(ncstrp, 0, sz);
244
245	ncstrp->nchars = sz - M_RNDUP(sizeof(NC_string)) - 1;
246	assert(ncstrp->nchars + 1 > slen);
247	ncstrp->cp = (char *)ncstrp + M_RNDUP(sizeof(NC_string));
248
249	if(str != NULL && *str != 0)
250	{
251	(void) strncpy(ncstrp->cp, str, ncstrp->nchars +1);
252	ncstrp->cp[ncstrp->nchars] = 0;
253	}
254
255	return(ncstrp);
256	}
257
258
259	/*
260	* If possible, change the value of an NC_string to 'str'.
261	*
262	* Formerly
263	NC_re_string()
264	*/
265	int
266	set_NC_string(NC_string ncstrp, const char str)
267	{
268	size_t slen;
269
270	assert(str != NULL && *str != 0);
271
272	slen = strlen(str);
273
274	if(ncstrp->nchars < slen)
275	return NC_ENOTINDEFINE;
276
277	strncpy(ncstrp->cp, str, ncstrp->nchars);
278	/* Don't adjust ncstrp->nchars, it includes extra space in the
279	* header for potential later expansion of string. */
280
281	return NC_NOERR;
282	}
283
284	/**************************************************/
/* Provide local alternatives for unix functions
   not available on all machines. Placed here so that
   all subsequent code modules can use them.
*/

#ifndef HAVE_STRDUP
/*
 * Return a newly malloc'ed copy of the NUL-terminated string s.
 * Returns NULL when s is NULL or when memory cannot be allocated.
 * The caller owns the result and releases it with free().
 */
char*
strdup(const char* s)
{
	char* copy;
	size_t nbytes;

	if(s == NULL) return NULL;
	nbytes = strlen(s) + 1; /* include the terminating NUL */
	copy = malloc(nbytes);
	if(copy == NULL) return NULL; /* do not write through a failed allocation */
	memcpy(copy, s, nbytes);
	return copy;
}
#endif
301
302	/**************************************************/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: