1 | /* Copyright 2009, UCAR/Unidata and OPeNDAP, Inc. |
---|
2 | See the COPYRIGHT file for more information. */ |
---|
3 | |
---|
4 | #include "config.h" |
---|
5 | #include <strings.h> |
---|
6 | #include "dapparselex.h" |
---|
7 | |
---|
8 | #undef URLCVT /* NEVER turn this on */ |
---|
9 | #define DAP2ENCODE |
---|
10 | |
---|
11 | /* Forward */ |
---|
12 | static void dumptoken(DAPlexstate* lexstate); |
---|
13 | static void dapaddyytext(DAPlexstate* lex, int c); |
---|
14 | #ifndef DAP2ENCODE |
---|
15 | static int tohex(int c); |
---|
16 | #endif |
---|
17 | |
---|
18 | /****************************************************/ |
---|
19 | |
---|
20 | #if 0 /* Following definitions are for informational purposes */ |
---|
21 | /* Set of all non-alphanumeric ascii printable characters (space and punctuation) */ |
---|
22 | static char ascii[] = " !\"#$%&'()*+,-./:;<=>?@[]\\^_`|{}~"; |
---|
23 | |
---|
24 | /* Define the set of legal nonalphanum characters as specified in the DAP2 spec. */ |
---|
25 | static char* daplegal ="_!~*'-\""; |
---|
26 | #endif |
---|
27 | |
---|
/* Single-character tokens: each of these is returned by the lexer as itself */
static char* ddsworddelims =
    "{}[]:;=,";

/*
 * Legal word characters. The "...chars1" sets apply to the first character
 * of a word, the "...charsn" sets to every following character.
 * Each string is spelled as lowercase + uppercase + digits + punctuation.
 */
static char* ddswordchars1 =
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    "-+_/%\\.*";
static char* ddswordcharsn =
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    "-+_/%\\.*#";
static char* daswordcharsn =
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    "-+_/%\\.*#:";
static char* cewordchars1 =
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    "-+_/%\\";
static char* cewordcharsn =
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    "-+_/%\\";
---|
42 | |
---|
43 | /* Current sets of legal characters */ |
---|
44 | /* |
---|
45 | static char* wordchars1 = NULL; |
---|
46 | static char* wordcharsn = NULL; |
---|
47 | static char* worddelims = NULL; |
---|
48 | */ |
---|
49 | |
---|
/*
 * Keyword spellings, compared case-insensitively against each scanned word.
 * Entry i pairs with keytokens[i]; the order of the two tables must match.
 */
static char* keywords[] = {
    "alias",        /* SCAN_ALIAS */
    "array",        /* SCAN_ARRAY */
    "attributes",   /* SCAN_ATTR */
    "byte",         /* SCAN_BYTE */
    "dataset",      /* SCAN_DATASET */
    "error",        /* SCAN_ERROR */
    "float32",      /* SCAN_FLOAT32 */
    "float64",      /* SCAN_FLOAT64 */
    "grid",         /* SCAN_GRID */
    "int16",        /* SCAN_INT16 */
    "int32",        /* SCAN_INT32 */
    "maps",         /* SCAN_MAPS */
    "sequence",     /* SCAN_SEQUENCE */
    "string",       /* SCAN_STRING */
    "structure",    /* SCAN_STRUCTURE */
    "uint16",       /* SCAN_UINT16 */
    "uint32",       /* SCAN_UINT32 */
    "url",          /* SCAN_URL */
    "code",         /* SCAN_CODE */
    "message",      /* SCAN_MESSAGE */
    "program_type", /* SCAN_PTYPE */
    "program",      /* SCAN_PROG */
    NULL            /* mark end of the keywords list */
};
---|
75 | |
---|
76 | static int keytokens[] = { |
---|
77 | SCAN_ALIAS, |
---|
78 | SCAN_ARRAY, |
---|
79 | SCAN_ATTR, |
---|
80 | SCAN_BYTE, |
---|
81 | SCAN_DATASET, |
---|
82 | SCAN_ERROR, |
---|
83 | SCAN_FLOAT32, |
---|
84 | SCAN_FLOAT64, |
---|
85 | SCAN_GRID, |
---|
86 | SCAN_INT16, |
---|
87 | SCAN_INT32, |
---|
88 | SCAN_MAPS, |
---|
89 | SCAN_SEQUENCE, |
---|
90 | SCAN_STRING, |
---|
91 | SCAN_STRUCTURE, |
---|
92 | SCAN_UINT16, |
---|
93 | SCAN_UINT32, |
---|
94 | SCAN_URL, |
---|
95 | SCAN_CODE, |
---|
96 | SCAN_MESSAGE, |
---|
97 | SCAN_PTYPE, |
---|
98 | SCAN_PROG |
---|
99 | }; |
---|
100 | |
---|
101 | /**************************************************/ |
---|
102 | |
---|
103 | int |
---|
104 | daplex(YYSTYPE* lvalp, DAPparsestate* state) |
---|
105 | { |
---|
106 | DAPlexstate* lexstate = state->lexstate; |
---|
107 | int token; |
---|
108 | int c; |
---|
109 | unsigned int i; |
---|
110 | char* p; |
---|
111 | char* tmp; |
---|
112 | |
---|
113 | token = 0; |
---|
114 | ocbytesclear(lexstate->yytext); |
---|
115 | /* invariant: p always points to current char */ |
---|
116 | for(p=lexstate->next;token==0&&(c=*p);p++) { |
---|
117 | if(c == '\n') { |
---|
118 | lexstate->lineno++; |
---|
119 | } else if(c <= ' ' || c == '\177') { |
---|
120 | /* whitespace: ignore */ |
---|
121 | } else if(c == '#') { |
---|
122 | /* single line comment */ |
---|
123 | while((c=*(++p))) {if(c == '\n') break;} |
---|
124 | } else if(strchr(lexstate->worddelims,c) != NULL) { |
---|
125 | /* don't put in lexstate->yytext to avoid memory leak */ |
---|
126 | token = c; |
---|
127 | } else if(c == '"') { |
---|
128 | int more = 1; |
---|
129 | /* We have a string token; will be reported as WORD_STRING */ |
---|
130 | while(more && (c=*(++p))) { |
---|
131 | #ifdef DAP2ENCODE |
---|
132 | if(c == '"') |
---|
133 | more = 0; |
---|
134 | else if(c == '\\') { |
---|
135 | /* Remove spec ambiguity by convering \c to c |
---|
136 | for any character c */ |
---|
137 | c=*(++p); |
---|
138 | if(c == '\0') more = 0; |
---|
139 | } |
---|
140 | #else /*Non-standard*/ |
---|
141 | switch (c) { |
---|
142 | case '"': more=0; break; |
---|
143 | case '\\': |
---|
144 | c=*(++p); |
---|
145 | switch (c) { |
---|
146 | case 'r': c = '\r'; break; |
---|
147 | case 'n': c = '\n'; break; |
---|
148 | case 'f': c = '\f'; break; |
---|
149 | case 't': c = '\t'; break; |
---|
150 | case 'x': { |
---|
151 | int d1,d2; |
---|
152 | c = '?'; |
---|
153 | ++p; |
---|
154 | d1 = tohex(*p++); |
---|
155 | if(d1 < 0) { |
---|
156 | daperror(state,"Illegal \\xDD in TOKEN_STRING"); |
---|
157 | } else { |
---|
158 | d2 = tohex(*p++); |
---|
159 | if(d2 < 0) { |
---|
160 | daperror(state,"Illegal \\xDD in TOKEN_STRING"); |
---|
161 | } else { |
---|
162 | c=(((unsigned int)d1)<<4) | (unsigned int)d2; |
---|
163 | } |
---|
164 | } |
---|
165 | } break; |
---|
166 | default: break; |
---|
167 | } |
---|
168 | break; |
---|
169 | default: break; |
---|
170 | } |
---|
171 | #endif /*!DAP2ENCODE*/ |
---|
172 | if(more) dapaddyytext(lexstate,c); |
---|
173 | } |
---|
174 | token=WORD_STRING; |
---|
175 | } else if(strchr(lexstate->wordchars1,c) != NULL) { |
---|
176 | int isdatamark = 0; |
---|
177 | /* we have a WORD_WORD */ |
---|
178 | dapaddyytext(lexstate,c); |
---|
179 | while((c=*(++p))) { |
---|
180 | #ifdef URLCVT |
---|
181 | if(c == '%' && p[1] != 0 && p[2] != 0 |
---|
182 | && strchr(hexdigits,p[1]) != NULL |
---|
183 | && strchr(hexdigits,p[2]) != NULL) { |
---|
184 | int d1,d2; |
---|
185 | d1 = tohex(p[1]); |
---|
186 | d2 = tohex(p[2]); |
---|
187 | if(d1 >= 0 || d2 >= 0) { |
---|
188 | c=(((unsigned int)d1)<<4) | (unsigned int)d2; |
---|
189 | p+=2; |
---|
190 | } |
---|
191 | } else { |
---|
192 | if(strchr(lexstate->wordcharsn,c) == NULL) {p--; break;} |
---|
193 | } |
---|
194 | dapaddyytext(lexstate,c); |
---|
195 | #else |
---|
196 | if(strchr(lexstate->wordcharsn,c) == NULL) {p--; break;} |
---|
197 | dapaddyytext(lexstate,c); |
---|
198 | #endif |
---|
199 | } |
---|
200 | /* Special check for Data: */ |
---|
201 | tmp = ocbytescontents(lexstate->yytext); |
---|
202 | if(strcmp(tmp,"Data")==0 && *p == ':') { |
---|
203 | dapaddyytext(lexstate,*p); p++; |
---|
204 | if(p[0] == '\n') { |
---|
205 | token = SCAN_DATA; |
---|
206 | isdatamark = 1; |
---|
207 | p++; |
---|
208 | } else if(p[0] == '\r' && p[1] == '\n') { |
---|
209 | token = SCAN_DATA; |
---|
210 | isdatamark = 1; |
---|
211 | p+=2; |
---|
212 | } |
---|
213 | } |
---|
214 | if(!isdatamark) { |
---|
215 | /* check for keyword */ |
---|
216 | token=WORD_WORD; /* assume */ |
---|
217 | for(i=0;;i++) { |
---|
218 | if(keywords[i] == NULL) break; |
---|
219 | if(strcasecmp(keywords[i],tmp)==0) { |
---|
220 | token=keytokens[i]; |
---|
221 | break; |
---|
222 | } |
---|
223 | } |
---|
224 | } |
---|
225 | } else { /* illegal */ |
---|
226 | } |
---|
227 | } |
---|
228 | lexstate->next = p; |
---|
229 | strncpy(lexstate->lasttokentext,ocbytescontents(lexstate->yytext),MAX_TOKEN_LENGTH); |
---|
230 | lexstate->lasttoken = token; |
---|
231 | if(ocdebug >= 2) |
---|
232 | dumptoken(lexstate); |
---|
233 | |
---|
234 | /*Put return value onto Bison stack*/ |
---|
235 | |
---|
236 | if(ocbyteslength(lexstate->yytext) == 0) |
---|
237 | *lvalp = NULL; |
---|
238 | else { |
---|
239 | *lvalp = ocbytesdup(lexstate->yytext); |
---|
240 | oclistpush(lexstate->reclaim,(ocelem)*lvalp); |
---|
241 | } |
---|
242 | return token; /* Return the type of the token. */ |
---|
243 | } |
---|
244 | |
---|
245 | static void |
---|
246 | dapaddyytext(DAPlexstate* lex, int c) |
---|
247 | { |
---|
248 | ocbytesappend(lex->yytext,(char)c); |
---|
249 | } |
---|
250 | |
---|
251 | #ifndef DAP2ENCODE |
---|
/*
 * Convert one hexadecimal digit character to its value.
 * Returns 0..15 for '0'-'9', 'a'-'f' and 'A'-'F', and -1 for anything else.
 */
static int
tohex(int c)
{
    int digit = -1;

    if(c >= '0' && c <= '9') {
        digit = c - '0';
    } else if(c >= 'A' && c <= 'F') {
        digit = 0xa + (c - 'A');
    } else if(c >= 'a' && c <= 'f') {
        digit = 0xa + (c - 'a');
    }
    return digit;
}
---|
260 | #endif |
---|
261 | |
---|
262 | static void |
---|
263 | dumptoken(DAPlexstate* lexstate) |
---|
264 | { |
---|
265 | fprintf(stderr,"TOKEN = |%s|\n",ocbytescontents(lexstate->yytext)); |
---|
266 | } |
---|
267 | |
---|
268 | /* |
---|
269 | Simple lexer |
---|
270 | */ |
---|
271 | |
---|
272 | void |
---|
273 | dapsetwordchars(DAPlexstate* lexstate, int kind) |
---|
274 | { |
---|
275 | switch (kind) { |
---|
276 | case 0: |
---|
277 | lexstate->worddelims = ddsworddelims; |
---|
278 | lexstate->wordchars1 = ddswordchars1; |
---|
279 | lexstate->wordcharsn = ddswordcharsn; |
---|
280 | break; |
---|
281 | case 1: |
---|
282 | lexstate->worddelims = ddsworddelims; |
---|
283 | lexstate->wordchars1 = ddswordchars1; |
---|
284 | lexstate->wordcharsn = daswordcharsn; |
---|
285 | break; |
---|
286 | case 2: |
---|
287 | lexstate->worddelims = ddsworddelims; |
---|
288 | lexstate->wordchars1 = cewordchars1; |
---|
289 | lexstate->wordcharsn = cewordcharsn; |
---|
290 | break; |
---|
291 | default: break; |
---|
292 | } |
---|
293 | } |
---|
294 | |
---|
295 | void |
---|
296 | daplexinit(char* input, DAPlexstate** lexstatep) |
---|
297 | { |
---|
298 | DAPlexstate* lexstate = (DAPlexstate*)malloc(sizeof(DAPlexstate)); |
---|
299 | if(lexstatep) *lexstatep = lexstate; |
---|
300 | if(lexstate == NULL) return; |
---|
301 | memset((void*)lexstate,0,sizeof(DAPlexstate)); |
---|
302 | lexstate->input = strdup(input); |
---|
303 | lexstate->next = lexstate->input; |
---|
304 | lexstate->yytext = ocbytesnew(); |
---|
305 | lexstate->reclaim = oclistnew(); |
---|
306 | dapsetwordchars(lexstate,0); /* Assume DDS */ |
---|
307 | } |
---|
308 | |
---|
309 | void |
---|
310 | daplexcleanup(DAPlexstate** lexstatep) |
---|
311 | { |
---|
312 | DAPlexstate* lexstate = *lexstatep; |
---|
313 | if(lexstate == NULL) return; |
---|
314 | if(lexstate->input != NULL) ocfree(lexstate->input); |
---|
315 | if(lexstate->reclaim != NULL) { |
---|
316 | while(oclistlength(lexstate->reclaim) > 0) { |
---|
317 | char* word = (char*)oclistpop(lexstate->reclaim); |
---|
318 | if(word) free(word); |
---|
319 | } |
---|
320 | oclistfree(lexstate->reclaim); |
---|
321 | } |
---|
322 | ocbytesfree(lexstate->yytext); |
---|
323 | free(lexstate); |
---|
324 | *lexstatep = NULL; |
---|
325 | } |
---|
326 | |
---|
327 | /* Dap identifiers will come to us with some |
---|
328 | characters escaped using the URL notation of |
---|
329 | %HH. The assumption here is that any character |
---|
330 | that is encoded is left encoded, except as follows: |
---|
331 | 1. if the encoded character is in fact a legal DAP2 character |
---|
332 | (alphanum+"_!~*'-\"") then it is decoded, otherwise not. |
---|
333 | */ |
---|
#ifndef DECODE_IDENTIFIERS
/* Characters passed to ocuridecodeonly by dapdecode:
   lowercase + uppercase + digits + the punctuation _!~*'-" */
static char* decodelist =
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    "_!~*'-\"";
#endif
---|
338 | |
---|
339 | char* |
---|
340 | dapdecode(DAPlexstate* lexstate, char* name) |
---|
341 | { |
---|
342 | char* decoded; |
---|
343 | #ifdef DECODE_IDENTIFIERS |
---|
344 | decoded = ocuridecode(name); |
---|
345 | #else |
---|
346 | decoded = ocuridecodeonly(name,decodelist); |
---|
347 | #endif |
---|
348 | oclistpush(lexstate->reclaim,(ocelem)decoded); |
---|
349 | return decoded; |
---|
350 | } |
---|