[409] | 1 | /* Copyright 2009, UCAR/Unidata and OPeNDAP, Inc. |
---|
| 2 | See the COPYRIGHT file for more information. */ |
---|
| 3 | |
---|
| 4 | #include "config.h" |
---|
| 5 | #include <strings.h> |
---|
| 6 | #include "dapparselex.h" |
---|
| 7 | |
---|
/* URLCVT: when defined, daplex() converts %HH escapes found inside
   words into the corresponding character while scanning. */
#undef URLCVT /* NEVER turn this on */
/* DAP2ENCODE: when defined, a backslash inside a quoted string simply
   makes the following character literal (\c => c); when undefined,
   daplex() interprets \r \n \f \t and \xDD escapes instead and
   tohex() is compiled in. */
#define DAP2ENCODE

/* Forward */
static void dumptoken(DAPlexstate* lexstate);
static void dapaddyytext(DAPlexstate* lex, int c);
#ifndef DAP2ENCODE
static int tohex(int c);
#endif
---|
| 17 | |
---|
| 18 | /****************************************************/ |
---|
| 19 | |
---|
#if 0 /* Following definitions are for informational purposes */
/* Set of the non-alphanumeric ASCII printable characters
   (letters and digits are not listed here) */
static char ascii[] = " !\"#$%&'()*+,-./:;<=>?@[]\\^_`|{}~";

/* Define the set of legal nonalphanum characters as specified in the DAP2 spec. */
static char* daplegal ="_!~*'-\"";
#endif
---|
| 27 | |
---|
/* Characters that daplex() returns as single-character tokens
   (the token value is the character itself). */
static char* ddsworddelims =
"{}[]:;=,";

/* Legal characters for words: the "...chars1" sets list the characters
   that may start a word, the "...charsn" sets list the characters that
   may appear after the first one. */
/* DDS: first character */
static char* ddswordchars1 =
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-+_/%\\.*";
/* DDS: following characters (same as ddswordchars1 plus '#') */
static char* ddswordcharsn =
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-+_/%\\.*#";
/* DAS: following characters (same as ddswordcharsn plus ':') */
static char* daswordcharsn =
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-+_/%\\.*#:";
/* Constraint expressions: first character (no '.', '*', '#' or ':') */
static char* cewordchars1 =
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-+_/%\\";
/* Constraint expressions: following characters (same set as cewordchars1) */
static char* cewordcharsn =
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-+_/%\\";
---|
| 42 | |
---|
| 43 | /* Current sets of legal characters */ |
---|
| 44 | /* |
---|
| 45 | static char* wordchars1 = NULL; |
---|
| 46 | static char* wordcharsn = NULL; |
---|
| 47 | static char* worddelims = NULL; |
---|
| 48 | */ |
---|
| 49 | |
---|
/* Keyword spellings recognized by daplex(); the comparison there is
   case-insensitive (strcasecmp). Entry i of this table corresponds to
   entry i of keytokens[], so the two tables must be kept in the same
   order. The list is terminated by NULL. */
static char* keywords[] = {
"alias",
"array",
"attributes",
"byte",
"dataset",
"error",
"float32",
"float64",
"grid",
"int16",
"int32",
"maps",
"sequence",
"string",
"structure",
"uint16",
"uint32",
"url",
"code",
"message",
"program_type",
"program",
NULL /* mark end of the keywords list */
};
---|
| 75 | |
---|
/* Token codes returned by daplex() for keywords; entry i is the token
   for keywords[i], so the order here must match keywords[]. There is
   no terminator: the lookup loop stops on the NULL in keywords[]. */
static int keytokens[] = {
SCAN_ALIAS,
SCAN_ARRAY,
SCAN_ATTR,
SCAN_BYTE,
SCAN_DATASET,
SCAN_ERROR,
SCAN_FLOAT32,
SCAN_FLOAT64,
SCAN_GRID,
SCAN_INT16,
SCAN_INT32,
SCAN_MAPS,
SCAN_SEQUENCE,
SCAN_STRING,
SCAN_STRUCTURE,
SCAN_UINT16,
SCAN_UINT32,
SCAN_URL,
SCAN_CODE,
SCAN_MESSAGE,
SCAN_PTYPE,
SCAN_PROG
};
---|
| 100 | |
---|
| 101 | /**************************************************/ |
---|
| 102 | |
---|
| 103 | int |
---|
| 104 | daplex(YYSTYPE* lvalp, DAPparsestate* state) |
---|
| 105 | { |
---|
| 106 | DAPlexstate* lexstate = state->lexstate; |
---|
| 107 | int token; |
---|
| 108 | int c; |
---|
| 109 | unsigned int i; |
---|
| 110 | char* p; |
---|
| 111 | char* tmp; |
---|
| 112 | |
---|
| 113 | token = 0; |
---|
| 114 | ocbytesclear(lexstate->yytext); |
---|
| 115 | /* invariant: p always points to current char */ |
---|
| 116 | for(p=lexstate->next;token==0&&(c=*p);p++) { |
---|
| 117 | if(c == '\n') { |
---|
| 118 | lexstate->lineno++; |
---|
| 119 | } else if(c <= ' ' || c == '\177') { |
---|
| 120 | /* whitespace: ignore */ |
---|
| 121 | } else if(c == '#') { |
---|
| 122 | /* single line comment */ |
---|
| 123 | while((c=*(++p))) {if(c == '\n') break;} |
---|
| 124 | } else if(strchr(lexstate->worddelims,c) != NULL) { |
---|
| 125 | /* don't put in lexstate->yytext to avoid memory leak */ |
---|
| 126 | token = c; |
---|
| 127 | } else if(c == '"') { |
---|
| 128 | int more = 1; |
---|
| 129 | /* We have a string token; will be reported as WORD_STRING */ |
---|
| 130 | while(more && (c=*(++p))) { |
---|
| 131 | #ifdef DAP2ENCODE |
---|
| 132 | if(c == '"') |
---|
| 133 | more = 0; |
---|
| 134 | else if(c == '\\') { |
---|
| 135 | /* Remove spec ambiguity by convering \c to c |
---|
| 136 | for any character c */ |
---|
| 137 | c=*(++p); |
---|
| 138 | if(c == '\0') more = 0; |
---|
| 139 | } |
---|
| 140 | #else /*Non-standard*/ |
---|
| 141 | switch (c) { |
---|
| 142 | case '"': more=0; break; |
---|
| 143 | case '\\': |
---|
| 144 | c=*(++p); |
---|
| 145 | switch (c) { |
---|
| 146 | case 'r': c = '\r'; break; |
---|
| 147 | case 'n': c = '\n'; break; |
---|
| 148 | case 'f': c = '\f'; break; |
---|
| 149 | case 't': c = '\t'; break; |
---|
| 150 | case 'x': { |
---|
| 151 | int d1,d2; |
---|
| 152 | c = '?'; |
---|
| 153 | ++p; |
---|
| 154 | d1 = tohex(*p++); |
---|
| 155 | if(d1 < 0) { |
---|
| 156 | daperror(state,"Illegal \\xDD in TOKEN_STRING"); |
---|
| 157 | } else { |
---|
| 158 | d2 = tohex(*p++); |
---|
| 159 | if(d2 < 0) { |
---|
| 160 | daperror(state,"Illegal \\xDD in TOKEN_STRING"); |
---|
| 161 | } else { |
---|
| 162 | c=(((unsigned int)d1)<<4) | (unsigned int)d2; |
---|
| 163 | } |
---|
| 164 | } |
---|
| 165 | } break; |
---|
| 166 | default: break; |
---|
| 167 | } |
---|
| 168 | break; |
---|
| 169 | default: break; |
---|
| 170 | } |
---|
| 171 | #endif /*!DAP2ENCODE*/ |
---|
| 172 | if(more) dapaddyytext(lexstate,c); |
---|
| 173 | } |
---|
| 174 | token=WORD_STRING; |
---|
| 175 | } else if(strchr(lexstate->wordchars1,c) != NULL) { |
---|
| 176 | int isdatamark = 0; |
---|
| 177 | /* we have a WORD_WORD */ |
---|
| 178 | dapaddyytext(lexstate,c); |
---|
| 179 | while((c=*(++p))) { |
---|
| 180 | #ifdef URLCVT |
---|
| 181 | if(c == '%' && p[1] != 0 && p[2] != 0 |
---|
| 182 | && strchr(hexdigits,p[1]) != NULL |
---|
| 183 | && strchr(hexdigits,p[2]) != NULL) { |
---|
| 184 | int d1,d2; |
---|
| 185 | d1 = tohex(p[1]); |
---|
| 186 | d2 = tohex(p[2]); |
---|
| 187 | if(d1 >= 0 || d2 >= 0) { |
---|
| 188 | c=(((unsigned int)d1)<<4) | (unsigned int)d2; |
---|
| 189 | p+=2; |
---|
| 190 | } |
---|
| 191 | } else { |
---|
| 192 | if(strchr(lexstate->wordcharsn,c) == NULL) {p--; break;} |
---|
| 193 | } |
---|
| 194 | dapaddyytext(lexstate,c); |
---|
| 195 | #else |
---|
| 196 | if(strchr(lexstate->wordcharsn,c) == NULL) {p--; break;} |
---|
| 197 | dapaddyytext(lexstate,c); |
---|
| 198 | #endif |
---|
| 199 | } |
---|
| 200 | /* Special check for Data: */ |
---|
| 201 | tmp = ocbytescontents(lexstate->yytext); |
---|
| 202 | if(strcmp(tmp,"Data")==0 && *p == ':') { |
---|
| 203 | dapaddyytext(lexstate,*p); p++; |
---|
| 204 | if(p[0] == '\n') { |
---|
| 205 | token = SCAN_DATA; |
---|
| 206 | isdatamark = 1; |
---|
| 207 | p++; |
---|
| 208 | } else if(p[0] == '\r' && p[1] == '\n') { |
---|
| 209 | token = SCAN_DATA; |
---|
| 210 | isdatamark = 1; |
---|
| 211 | p+=2; |
---|
| 212 | } |
---|
| 213 | } |
---|
| 214 | if(!isdatamark) { |
---|
| 215 | /* check for keyword */ |
---|
| 216 | token=WORD_WORD; /* assume */ |
---|
| 217 | for(i=0;;i++) { |
---|
| 218 | if(keywords[i] == NULL) break; |
---|
| 219 | if(strcasecmp(keywords[i],tmp)==0) { |
---|
| 220 | token=keytokens[i]; |
---|
| 221 | break; |
---|
| 222 | } |
---|
| 223 | } |
---|
| 224 | } |
---|
| 225 | } else { /* illegal */ |
---|
| 226 | } |
---|
| 227 | } |
---|
| 228 | lexstate->next = p; |
---|
| 229 | strncpy(lexstate->lasttokentext,ocbytescontents(lexstate->yytext),MAX_TOKEN_LENGTH); |
---|
| 230 | lexstate->lasttoken = token; |
---|
| 231 | if(ocdebug >= 2) |
---|
| 232 | dumptoken(lexstate); |
---|
| 233 | |
---|
| 234 | /*Put return value onto Bison stack*/ |
---|
| 235 | |
---|
| 236 | if(ocbyteslength(lexstate->yytext) == 0) |
---|
| 237 | *lvalp = NULL; |
---|
| 238 | else { |
---|
| 239 | *lvalp = ocbytesdup(lexstate->yytext); |
---|
| 240 | oclistpush(lexstate->reclaim,(ocelem)*lvalp); |
---|
| 241 | } |
---|
| 242 | return token; /* Return the type of the token. */ |
---|
| 243 | } |
---|
| 244 | |
---|
| 245 | static void |
---|
| 246 | dapaddyytext(DAPlexstate* lex, int c) |
---|
| 247 | { |
---|
| 248 | ocbytesappend(lex->yytext,(char)c); |
---|
| 249 | } |
---|
| 250 | |
---|
#ifndef DAP2ENCODE
/* Convert a single hexadecimal digit character ('0'-'9', 'a'-'f',
   'A'-'F') to its value 0..15; return -1 for any other character. */
static int
tohex(int c)
{
    int value = -1;

    if(c >= '0' && c <= '9')
        value = c - '0';
    else if(c >= 'A' && c <= 'F')
        value = 0xa + (c - 'A');
    else if(c >= 'a' && c <= 'f')
        value = 0xa + (c - 'a');
    return value;
}
#endif
---|
| 261 | |
---|
| 262 | static void |
---|
| 263 | dumptoken(DAPlexstate* lexstate) |
---|
| 264 | { |
---|
| 265 | fprintf(stderr,"TOKEN = |%s|\n",ocbytescontents(lexstate->yytext)); |
---|
| 266 | } |
---|
| 267 | |
---|
| 268 | /* |
---|
| 269 | Simple lexer |
---|
| 270 | */ |
---|
| 271 | |
---|
| 272 | void |
---|
| 273 | dapsetwordchars(DAPlexstate* lexstate, int kind) |
---|
| 274 | { |
---|
| 275 | switch (kind) { |
---|
| 276 | case 0: |
---|
| 277 | lexstate->worddelims = ddsworddelims; |
---|
| 278 | lexstate->wordchars1 = ddswordchars1; |
---|
| 279 | lexstate->wordcharsn = ddswordcharsn; |
---|
| 280 | break; |
---|
| 281 | case 1: |
---|
| 282 | lexstate->worddelims = ddsworddelims; |
---|
| 283 | lexstate->wordchars1 = ddswordchars1; |
---|
| 284 | lexstate->wordcharsn = daswordcharsn; |
---|
| 285 | break; |
---|
| 286 | case 2: |
---|
| 287 | lexstate->worddelims = ddsworddelims; |
---|
| 288 | lexstate->wordchars1 = cewordchars1; |
---|
| 289 | lexstate->wordcharsn = cewordcharsn; |
---|
| 290 | break; |
---|
| 291 | default: break; |
---|
| 292 | } |
---|
| 293 | } |
---|
| 294 | |
---|
| 295 | void |
---|
| 296 | daplexinit(char* input, DAPlexstate** lexstatep) |
---|
| 297 | { |
---|
| 298 | DAPlexstate* lexstate = (DAPlexstate*)malloc(sizeof(DAPlexstate)); |
---|
| 299 | if(lexstatep) *lexstatep = lexstate; |
---|
| 300 | if(lexstate == NULL) return; |
---|
| 301 | memset((void*)lexstate,0,sizeof(DAPlexstate)); |
---|
| 302 | lexstate->input = strdup(input); |
---|
| 303 | lexstate->next = lexstate->input; |
---|
| 304 | lexstate->yytext = ocbytesnew(); |
---|
| 305 | lexstate->reclaim = oclistnew(); |
---|
| 306 | dapsetwordchars(lexstate,0); /* Assume DDS */ |
---|
| 307 | } |
---|
| 308 | |
---|
| 309 | void |
---|
| 310 | daplexcleanup(DAPlexstate** lexstatep) |
---|
| 311 | { |
---|
| 312 | DAPlexstate* lexstate = *lexstatep; |
---|
| 313 | if(lexstate == NULL) return; |
---|
| 314 | if(lexstate->input != NULL) ocfree(lexstate->input); |
---|
| 315 | if(lexstate->reclaim != NULL) { |
---|
| 316 | while(oclistlength(lexstate->reclaim) > 0) { |
---|
| 317 | char* word = (char*)oclistpop(lexstate->reclaim); |
---|
| 318 | if(word) free(word); |
---|
| 319 | } |
---|
| 320 | oclistfree(lexstate->reclaim); |
---|
| 321 | } |
---|
| 322 | ocbytesfree(lexstate->yytext); |
---|
| 323 | free(lexstate); |
---|
| 324 | *lexstatep = NULL; |
---|
| 325 | } |
---|
| 326 | |
---|
/* DAP identifiers will come to us with some
   characters escaped using the URL notation of
   %HH. The assumption here is that any character
   that is encoded is left encoded, with one exception:
   if the encoded character is in fact a legal DAP2 character
   (alphanum+"_!~*'-\"") then it is decoded.
*/
#ifndef DECODE_IDENTIFIERS
/* Passed by dapdecode() to ocuridecodeonly(); presumably the set of
   characters that function is allowed to decode - TODO confirm. */
static char* decodelist =
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_!~*'-\"";
#endif
---|
| 338 | |
---|
| 339 | char* |
---|
| 340 | dapdecode(DAPlexstate* lexstate, char* name) |
---|
| 341 | { |
---|
| 342 | char* decoded; |
---|
| 343 | #ifdef DECODE_IDENTIFIERS |
---|
| 344 | decoded = ocuridecode(name); |
---|
| 345 | #else |
---|
| 346 | decoded = ocuridecodeonly(name,decodelist); |
---|
| 347 | #endif |
---|
| 348 | oclistpush(lexstate->reclaim,(ocelem)decoded); |
---|
| 349 | return decoded; |
---|
| 350 | } |
---|