1 | /* |
---|
2 | * Copyright 1996, University Corporation for Atmospheric Research |
---|
3 | * See netcdf/COPYRIGHT file for copying and redistribution conditions. |
---|
4 | */ |
---|
5 | /* $Id: string.c,v 1.76 2010/05/26 21:43:33 dmh Exp $ */ |
---|
6 | |
---|
7 | #include "config.h" |
---|
8 | #include <stdio.h> |
---|
9 | #include <stdlib.h> |
---|
10 | #include <string.h> |
---|
11 | #include <ctype.h> |
---|
12 | #include <assert.h> |
---|
13 | #include "nc.h" |
---|
14 | #include "rnd.h" |
---|
15 | #include "utf8proc.h" |
---|
16 | |
---|
17 | |
---|
18 | /* There are 3 levels of UTF8 checking: 1 => exact (validating), 2 => relaxed, |
---|
19 | and 3 => very relaxed. Only levels 1 and 2 are implemented by nextUTF8(). |
---|
20 | */ |
---|
21 | /* Use semi-relaxed check */ |
---|
22 | #define UTF8_CHECK 2 |
---|
23 | |
---|
24 | /* |
---|
25 | * Free string, and, if needed, its values. |
---|
26 | * Formerly |
---|
27 | NC_free_string() |
---|
28 | */ |
---|
29 | void |
---|
30 | free_NC_string(NC_string *ncstrp) |
---|
31 | { |
---|
32 | if(ncstrp==NULL) |
---|
33 | return; |
---|
34 | free(ncstrp); |
---|
35 | } |
---|
36 | |
---|
37 | |
---|
/* Fall back to the relaxed check if no level was selected earlier in the file */
#ifndef UTF8_CHECK
#define UTF8_CHECK 2
#endif

/* Test whether byte c lies in the inclusive range [lo,hi] */
#define RANGE(c,lo,hi) (((unsigned char)(c)) >= (lo) && ((unsigned char)(c)) <= (hi))
/* Test whether byte c is a UTF-8 continuation byte (0x80..0xBF) */
#define RANGE0(c) RANGE(c,0x80,0xBF)

/*
 * Return the number of bytes occupied by the character that starts
 * at cp, or -1 if the bytes at cp do not form an acceptable sequence.
 *
 * cp must point into a NUL-terminated string.  Any byte <= 0x7f
 * (including the terminating NUL itself) counts as a one-byte
 * character.  A multibyte sequence that is cut short by the NUL
 * terminator is rejected; bytes after the terminator are never read.
 *
 * The goal is to recognize the length of each multibyte UTF-8
 * character sequence so the caller can skip it.  Three tests of
 * decreasing correctness can be defined (the least correct accepts
 * some sequences that are technically illegal UTF-8).  As regular
 * expressions:
 *
 * 1. most correct:
 *    UTF8  ([\xC2-\xDF][\x80-\xBF])
 *        | (\xE0[\xA0-\xBF][\x80-\xBF])
 *        | ([\xE1-\xEC][\x80-\xBF][\x80-\xBF])
 *        | (\xED[\x80-\x9F][\x80-\xBF])
 *        | ([\xEE-\xEF][\x80-\xBF][\x80-\xBF])
 *        | (\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF])
 *        | ([\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF])
 *        | (\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF])
 *
 * 2. partially relaxed:
 *    UTF8  ([\xC0-\xDF][\x80-\xBF])
 *        | ([\xE0-\xEF][\x80-\xBF][\x80-\xBF])
 *        | ([\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])
 *
 * 3. the most relaxed version (not implemented here):
 *    UTF8  ([\xC0-\xD6].)|([\xE0-\xEF]..)|([\xF0-\xF7]...)
 *
 * UTF8_CHECK selects test 1 or test 2 at compile time.
 *
 * The tests are derived from the table at
 *    http://www.w3.org/2005/03/23-lex-U
 */
int
nextUTF8(const char* cp)
{
    /* Work on unsigned bytes so values >= 0x80 compare as expected */
    const unsigned char *bp = (const unsigned char *)cp;
    int ch0 = bp[0];
    int skip = -1; /* assume failed */

    if(ch0 <= 0x7f)
        return 1; /* plain ascii */

    /*
     * In every test below the && chain stops at the first byte that is
     * not in range.  A NUL byte is never in range, so bytes beyond the
     * string terminator are never examined.
     */
#if UTF8_CHECK == 2
    /* Relaxed validation: lead byte class plus continuation bytes */
    if(RANGE(ch0,0xC0,0xDF)) { /* 2-byte sequence */
        if(RANGE0(bp[1]))
            skip = 2;
    } else if(RANGE(ch0,0xE0,0xEF)) { /* 3-byte sequence */
        if(RANGE0(bp[1]) && RANGE0(bp[2]))
            skip = 3;
    } else if(RANGE(ch0,0xF0,0xF7)) { /* 4-byte sequence */
        if(RANGE0(bp[1]) && RANGE0(bp[2]) && RANGE0(bp[3]))
            skip = 4;
    }
#elif UTF8_CHECK == 1
    /* Exact validation, following regular expression 1 above */
    if(RANGE(ch0,0xC2,0xDF)) { /* non-overlong 2-bytes */
        if(RANGE0(bp[1]))
            skip = 2;
    } else if(ch0 == 0xE0) { /* 3-bytes, not overlong */
        if(RANGE(bp[1],0xA0,0xBF) && RANGE0(bp[2]))
            skip = 3;
    } else if(ch0 == 0xED) { /* 3-bytes minus surrogates */
        if(RANGE(bp[1],0x80,0x9F) && RANGE0(bp[2]))
            skip = 3;
    } else if(RANGE(ch0,0xE1,0xEC) || ch0 == 0xEE || ch0 == 0xEF) {
        if(RANGE0(bp[1]) && RANGE0(bp[2]))
            skip = 3;
    } else if(ch0 == 0xF0) { /* planes 1-3 */
        if(RANGE(bp[1],0x90,0xBF) && RANGE0(bp[2]) && RANGE0(bp[3]))
            skip = 4;
    } else if(RANGE(ch0,0xF1,0xF3)) { /* planes 4-15 */
        if(RANGE0(bp[1]) && RANGE0(bp[2]) && RANGE0(bp[3]))
            skip = 4;
    } else if(ch0 == 0xF4) { /* plane 16 */
        if(RANGE(bp[1],0x80,0x8F) && RANGE0(bp[2]) && RANGE0(bp[3]))
            skip = 4;
    }
#else
#error "Must Define UTF8_CHECK as 1 or 2"
#endif
    return skip;
}
---|
152 | |
---|
153 | |
---|
154 | /* |
---|
155 | * Verify that a name string is valid syntax. The allowed name |
---|
156 | * syntax (in RE form) is: |
---|
157 | * |
---|
158 | * ([a-zA-Z0-9_]|{UTF8})([^\x00-\x1F\x7F/]|{UTF8})* |
---|
159 | * |
---|
160 | * where UTF8 represents a multibyte UTF-8 encoding. Also, no |
---|
161 | * trailing spaces are permitted in names. This definition |
---|
162 | * must be consistent with the one in ncgen.l. We do not allow '/' |
---|
163 | * because HDF5 does not permit slashes in names as slash is used as a |
---|
164 | * group separator. If UTF-8 is supported, then a multi-byte UTF-8 |
---|
165 | * character can occur anywhere within an identifier. We later |
---|
166 | * normalize UTF-8 strings to NFC to facilitate matching and queries. |
---|
167 | */ |
---|
168 | int |
---|
169 | NC_check_name(const char *name) |
---|
170 | { |
---|
171 | int skip; |
---|
172 | int ch; |
---|
173 | const char *cp = name; |
---|
174 | ssize_t utf8_stat; |
---|
175 | |
---|
176 | assert(name != NULL); |
---|
177 | |
---|
178 | if(*name == 0 /* empty names disallowed */ |
---|
179 | || strchr(cp, '/')) /* '/' can't be in a name */ |
---|
180 | goto fail; |
---|
181 | |
---|
182 | /* check validity of any UTF-8 */ |
---|
183 | utf8_stat = utf8proc_check((const unsigned char *)name); |
---|
184 | if (utf8_stat < 0) |
---|
185 | goto fail; |
---|
186 | |
---|
187 | /* First char must be [a-z][A-Z][0-9]_ | UTF8 */ |
---|
188 | ch = (uchar)*cp; |
---|
189 | if(ch <= 0x7f) { |
---|
190 | if( !('A' <= ch && ch <= 'Z') |
---|
191 | && !('a' <= ch && ch <= 'z') |
---|
192 | && !('0' <= ch && ch <= '9') |
---|
193 | && ch != '_' ) |
---|
194 | goto fail; |
---|
195 | cp++; |
---|
196 | } else { |
---|
197 | if((skip = nextUTF8(cp)) < 0) |
---|
198 | goto fail; |
---|
199 | cp += skip; |
---|
200 | } |
---|
201 | |
---|
202 | while(*cp != 0) { |
---|
203 | ch = (uchar)*cp; |
---|
204 | /* handle simple 0x00-0x7f characters here */ |
---|
205 | if(ch <= 0x7f) { |
---|
206 | if( ch < ' ' || ch > 0x7E) /* control char or DEL */ |
---|
207 | goto fail; |
---|
208 | cp++; |
---|
209 | } else { |
---|
210 | if((skip = nextUTF8(cp)) < 0) goto fail; |
---|
211 | cp += skip; |
---|
212 | } |
---|
213 | if(cp - name > NC_MAX_NAME) |
---|
214 | return NC_EMAXNAME; |
---|
215 | } |
---|
216 | if(ch <= 0x7f && isspace(ch)) /* trailing spaces disallowed */ |
---|
217 | goto fail; |
---|
218 | return NC_NOERR; |
---|
219 | fail: |
---|
220 | return NC_EBADNAME; |
---|
221 | } |
---|
222 | |
---|
223 | |
---|
224 | /* |
---|
225 | * Allocate a NC_string structure large enough |
---|
226 | * to hold slen characters. |
---|
227 | * Formerly |
---|
228 | NC_new_string(count, str) |
---|
229 | */ |
---|
230 | NC_string * |
---|
231 | new_NC_string(size_t slen, const char *str) |
---|
232 | { |
---|
233 | NC_string *ncstrp; |
---|
234 | size_t sz = M_RNDUP(sizeof(NC_string)) + slen + 1; |
---|
235 | |
---|
236 | #if 0 |
---|
237 | sz = _RNDUP(sz, X_ALIGN); |
---|
238 | #endif |
---|
239 | |
---|
240 | ncstrp = (NC_string *)malloc(sz); |
---|
241 | if( ncstrp == NULL ) |
---|
242 | return NULL; |
---|
243 | (void) memset(ncstrp, 0, sz); |
---|
244 | |
---|
245 | ncstrp->nchars = sz - M_RNDUP(sizeof(NC_string)) - 1; |
---|
246 | assert(ncstrp->nchars + 1 > slen); |
---|
247 | ncstrp->cp = (char *)ncstrp + M_RNDUP(sizeof(NC_string)); |
---|
248 | |
---|
249 | if(str != NULL && *str != 0) |
---|
250 | { |
---|
251 | (void) strncpy(ncstrp->cp, str, ncstrp->nchars +1); |
---|
252 | ncstrp->cp[ncstrp->nchars] = 0; |
---|
253 | } |
---|
254 | |
---|
255 | return(ncstrp); |
---|
256 | } |
---|
257 | |
---|
258 | |
---|
259 | /* |
---|
260 | * If possible, change the value of an NC_string to 'str'. |
---|
261 | * |
---|
262 | * Formerly |
---|
263 | NC_re_string() |
---|
264 | */ |
---|
265 | int |
---|
266 | set_NC_string(NC_string *ncstrp, const char *str) |
---|
267 | { |
---|
268 | size_t slen; |
---|
269 | |
---|
270 | assert(str != NULL && *str != 0); |
---|
271 | |
---|
272 | slen = strlen(str); |
---|
273 | |
---|
274 | if(ncstrp->nchars < slen) |
---|
275 | return NC_ENOTINDEFINE; |
---|
276 | |
---|
277 | strncpy(ncstrp->cp, str, ncstrp->nchars); |
---|
278 | /* Don't adjust ncstrp->nchars, it includes extra space in the |
---|
279 | * header for potential later expansion of string. */ |
---|
280 | |
---|
281 | return NC_NOERR; |
---|
282 | } |
---|
283 | |
---|
284 | /**************************************************/ |
---|
285 | /* Provide local alternatives for unix functions |
---|
286 | not available on all machines. Place here so that |
---|
287 | all subsequent code modules can use them. |
---|
288 | */ |
---|
289 | |
---|
#ifndef HAVE_STRDUP
/*
 * Local replacement for strdup(), compiled only when HAVE_STRDUP is
 * not defined.
 *
 * Returns a newly malloc'ed copy of s that the caller must free().
 * Returns NULL if s is NULL or if memory cannot be allocated.
 */
char*
strdup(const char* s)
{
    size_t nbytes;
    char* copy;

    if(s == NULL) return NULL;
    nbytes = strlen(s) + 1; /* include the terminating NUL */
    copy = (char*)malloc(nbytes);
    if(copy == NULL) return NULL; /* out of memory: never write through NULL */
    memcpy(copy, s, nbytes);
    return copy;
}
#endif
---|
301 | |
---|
302 | /**************************************************/ |
---|