1 | /************************************************* |
---|
2 | * Perl-Compatible Regular Expressions * |
---|
3 | *************************************************/ |
---|
4 | |
---|
5 | |
---|
6 | /* PCRE is a library of functions to support regular expressions whose syntax |
---|
7 | and semantics are as close as possible to those of the Perl 5 language. |
---|
8 | |
---|
9 | Written by Philip Hazel |
---|
10 | Copyright (c) 1997-2008 University of Cambridge |
---|
11 | |
---|
12 | ----------------------------------------------------------------------------- |
---|
13 | Redistribution and use in source and binary forms, with or without |
---|
14 | modification, are permitted provided that the following conditions are met: |
---|
15 | |
---|
16 | * Redistributions of source code must retain the above copyright notice, |
---|
17 | this list of conditions and the following disclaimer. |
---|
18 | |
---|
19 | * Redistributions in binary form must reproduce the above copyright |
---|
20 | notice, this list of conditions and the following disclaimer in the |
---|
21 | documentation and/or other materials provided with the distribution. |
---|
22 | |
---|
23 | * Neither the name of the University of Cambridge nor the names of its |
---|
24 | contributors may be used to endorse or promote products derived from |
---|
25 | this software without specific prior written permission. |
---|
26 | |
---|
27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
---|
28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
---|
29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
---|
30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
---|
31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
---|
32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
---|
33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
---|
34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
---|
35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
---|
36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
---|
37 | POSSIBILITY OF SUCH DAMAGE. |
---|
38 | ----------------------------------------------------------------------------- |
---|
39 | */ |
---|
40 | |
---|
41 | /* This header contains definitions that are shared between the different |
---|
42 | modules, but which are not relevant to the exported API. This includes some |
---|
43 | functions whose names all begin with "_pcre_". */ |
---|
44 | |
---|
45 | #ifndef PCRE_INTERNAL_H |
---|
46 | #define PCRE_INTERNAL_H |
---|
47 | |
---|
48 | /* Define DEBUG to get debugging output on stdout. */ |
---|
49 | |
---|
50 | #if 0 |
---|
51 | #define DEBUG |
---|
52 | #endif |
---|
53 | |
---|
54 | /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef |
---|
55 | inline, and there are *still* stupid compilers about that don't like indented |
---|
56 | pre-processor statements, or at least there were when I first wrote this. After |
---|
57 | all, it had only been about 10 years then... |
---|
58 | |
---|
59 | It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so |
---|
60 | be absolutely sure we get our version. */ |
---|
61 | |
---|
/* Drop any DPRINTF that an earlier header supplied, then install our own. The
argument is a complete parenthesized printf argument list, so a call such as
DPRINTF(("n=%d\n", n)) becomes printf("n=%d\n", n) when DEBUG is defined and
expands to nothing at all (the argument is not evaluated) otherwise. */

#undef DPRINTF
#ifndef DEBUG
#define DPRINTF(p) /* Nothing */
#else
#define DPRINTF(p) printf p
#endif
---|
68 | |
---|
69 | |
---|
70 | /* Standard C headers plus the external interface definition. The only time |
---|
71 | setjmp and stdarg are used is when NO_RECURSE is set. */ |
---|
72 | |
---|
73 | #include <ctype.h> |
---|
74 | #include <limits.h> |
---|
75 | #include <setjmp.h> |
---|
76 | #include <stdarg.h> |
---|
77 | #include <stddef.h> |
---|
78 | #include <stdio.h> |
---|
79 | #include <stdlib.h> |
---|
80 | #include <string.h> |
---|
81 | |
---|
82 | /* When compiling a DLL for Windows, the exported symbols have to be declared |
---|
83 | using some MS magic. I found some useful information on this web page: |
---|
84 | http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx. According to the |
---|
85 | information there, using __declspec(dllexport) without "extern" we have a |
---|
86 | definition; with "extern" we have a declaration. The settings here override the |
---|
87 | setting in pcre.h (which is included below); it defines only PCRE_EXP_DECL, |
---|
88 | which is all that is needed for applications (they just import the symbols). We |
---|
89 | use: |
---|
90 | |
---|
91 | PCRE_EXP_DECL for declarations |
---|
92 | PCRE_EXP_DEFN for definitions of exported functions |
---|
93 | PCRE_EXP_DATA_DEFN for definitions of exported variables |
---|
94 | |
---|
95 | The reason for the two DEFN macros is that in non-Windows environments, one |
---|
96 | does not want to have "extern" before variable definitions because it leads to |
---|
97 | compiler warnings. So we distinguish between functions and variables. In |
---|
98 | Windows, the two should always be the same. |
---|
99 | |
---|
100 | The reason for wrapping this in #ifndef PCRE_EXP_DECL is so that pcretest, |
---|
101 | which is an application, but needs to import this file in order to "peek" at |
---|
102 | internals, can #include pcre.h first to get an application's-eye view. |
---|
103 | |
---|
104 | In principle, people compiling for non-Windows, non-Unix-like (i.e. uncommon, |
---|
105 | special-purpose environments) might want to stick other stuff in front of |
---|
106 | exported symbols. That's why, in the non-Windows case, we set PCRE_EXP_DEFN and |
---|
107 | PCRE_EXP_DATA_DEFN only if they are not already set. */ |
---|
108 | |
---|
/* Choose the decorations for exported symbols, unless the includer has already
defined PCRE_EXP_DECL. Three configurations are distinguished:

  Windows DLL build (no PCRE_STATIC): everything is __declspec(dllexport)
  Windows static build (PCRE_STATIC): plain "extern" declarations
  Anything else: "extern" (or extern "C" under C++); the two DEFN macros
    are set only if the builder has not supplied them */

#ifndef PCRE_EXP_DECL
#  if defined(_WIN32) && !defined(PCRE_STATIC)
#    define PCRE_EXP_DECL       extern __declspec(dllexport)
#    define PCRE_EXP_DEFN       __declspec(dllexport)
#    define PCRE_EXP_DATA_DEFN  __declspec(dllexport)
#  elif defined(_WIN32)
#    define PCRE_EXP_DECL       extern
#    define PCRE_EXP_DEFN
#    define PCRE_EXP_DATA_DEFN
#  else
#    if defined(__cplusplus)
#      define PCRE_EXP_DECL     extern "C"
#    else
#      define PCRE_EXP_DECL     extern
#    endif
#    if !defined(PCRE_EXP_DEFN)
#      define PCRE_EXP_DEFN     PCRE_EXP_DECL
#    endif
#    if !defined(PCRE_EXP_DATA_DEFN)
#      define PCRE_EXP_DATA_DEFN
#    endif
#  endif
#endif
---|
134 | |
---|
135 | /* When compiling with the MSVC compiler, it is sometimes necessary to include |
---|
136 | a "calling convention" before exported function names. (This is secondhand |
---|
137 | information; I know nothing about MSVC myself). For example, something like |
---|
138 | |
---|
139 | void __cdecl function(....) |
---|
140 | |
---|
141 | might be needed. In order to make this easy, all the exported functions have |
---|
142 | PCRE_CALL_CONVENTION just before their names. It is rarely needed; if not |
---|
143 | set, we ensure here that it has no effect. */ |
---|
144 | |
---|
145 | #ifndef PCRE_CALL_CONVENTION |
---|
146 | #define PCRE_CALL_CONVENTION |
---|
147 | #endif |
---|
148 | |
---|
149 | /* We need to have types that specify unsigned 16-bit and 32-bit integers. We |
---|
150 | cannot determine these outside the compilation (e.g. by running a program as |
---|
151 | part of "configure") because PCRE is often cross-compiled for use on other |
---|
152 | systems. Instead we make use of the maximum sizes that are available at |
---|
153 | preprocessor time in standard C environments. */ |
---|
154 | |
---|
/* 16-bit pair: use short when its unsigned range is exactly 16 bits, otherwise
fall back to int on systems where int itself is 16 bits wide. */

#if USHRT_MAX == 0xffff
typedef unsigned short pcre_uint16;
typedef short pcre_int16;
#elif UINT_MAX == 0xffff
typedef unsigned int pcre_uint16;
typedef int pcre_int16;
#else
#error Cannot determine a type for 16-bit unsigned integers
#endif

/* 32-bit pair: use int when its unsigned range is exactly 32 bits, otherwise
try long. */

#if UINT_MAX == 0xffffffff
typedef unsigned int pcre_uint32;
typedef int pcre_int32;
#elif ULONG_MAX == 0xffffffff
typedef unsigned long int pcre_uint32;
typedef long int pcre_int32;
#else
#error Cannot determine a type for 32-bit unsigned integers
#endif
---|
174 | |
---|
175 | /* All character handling must be done as unsigned characters. Otherwise there |
---|
176 | are problems with top-bit-set characters and functions such as isspace(). |
---|
177 | However, we leave the interface to the outside world as char *, because that |
---|
178 | should make things easier for callers. We define a short type for unsigned char |
---|
179 | to save lots of typing. I tried "uchar", but it causes problems on Digital |
---|
180 | Unix, where it is defined in sys/types, so use "uschar" instead. */ |
---|
181 | |
---|
182 | typedef unsigned char uschar; |
---|
183 | |
---|
184 | /* This is an unsigned int value that no character can ever have. UTF-8 |
---|
185 | characters only go up to 0x7fffffff (though Unicode doesn't go beyond |
---|
186 | 0x0010ffff). */ |
---|
187 | |
---|
188 | #define NOTACHAR 0xffffffff |
---|
189 | |
---|
190 | /* PCRE is able to support several different kinds of newline (CR, LF, CRLF, |
---|
191 | "any" and "anycrlf" at present). The following macros are used to package up |
---|
192 | testing for newlines. NLBLOCK, PSSTART, and PSEND are defined in the various |
---|
193 | modules to indicate in which datablock the parameters exist, and what the |
---|
194 | start/end of string field names are. */ |
---|
195 | |
---|
196 | #define NLTYPE_FIXED 0 /* Newline is a fixed length string */ |
---|
197 | #define NLTYPE_ANY 1 /* Newline is any Unicode line ending */ |
---|
198 | #define NLTYPE_ANYCRLF 2 /* Newline is CR, LF, or CRLF */ |
---|
199 | |
---|
/* Test for a newline starting at position p. The expansion needs NLBLOCK and
PSEND to be defined by the including module and a variable called utf8 to be
in scope. For a fixed newline (NLTYPE_FIXED) the one or two bytes in
NLBLOCK->nl are compared directly, provided at least nllen bytes remain before
PSEND. For the other newline types the decision is handed to
_pcre_is_newline(), which is also given the address of NLBLOCK->nllen. */

#define IS_NEWLINE(p) \
  ((NLBLOCK->nltype == NLTYPE_FIXED)? \
    ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
     (p)[0] == NLBLOCK->nl[0] && \
     (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1])) \
  : \
    ((p) < NLBLOCK->PSEND && \
     _pcre_is_newline((p), NLBLOCK->nltype, NLBLOCK->PSEND, \
       &(NLBLOCK->nllen), utf8)))
---|
213 | |
---|
/* Test for a newline ending immediately before position p. The expansion
needs NLBLOCK and PSSTART to be defined by the including module and a variable
called utf8 to be in scope. For a fixed newline (NLTYPE_FIXED) the nllen bytes
preceding p are compared with NLBLOCK->nl, provided p is at least nllen bytes
beyond PSSTART. For the other newline types the decision is handed to
_pcre_was_newline(), which is also given the address of NLBLOCK->nllen. */

#define WAS_NEWLINE(p) \
  ((NLBLOCK->nltype == NLTYPE_FIXED)? \
    ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
     (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \
     (NLBLOCK->nllen == 1 || (p)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1])) \
  : \
    ((p) > NLBLOCK->PSSTART && \
     _pcre_was_newline((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \
       &(NLBLOCK->nllen), utf8)))
---|
227 | |
---|
228 | /* When PCRE is compiled as a C++ library, the subject pointer can be replaced |
---|
229 | with a custom type. This makes it possible, for example, to allow pcre_exec() |
---|
230 | to process subject strings that are discontinuous by using a smart pointer |
---|
231 | class. It must always be possible to inspect all of the subject string in |
---|
232 | pcre_exec() because of the way it backtracks. Two macros are required in the |
---|
233 | normal case, for sign-unspecified and unsigned char pointers. The former is |
---|
234 | used for the external interface and appears in pcre.h, which is why its name |
---|
235 | must begin with PCRE_. */ |
---|
236 | |
---|
237 | #ifdef CUSTOM_SUBJECT_PTR |
---|
238 | #define PCRE_SPTR CUSTOM_SUBJECT_PTR |
---|
239 | #define USPTR CUSTOM_SUBJECT_PTR |
---|
240 | #else |
---|
241 | #define PCRE_SPTR const char * |
---|
242 | #define USPTR const unsigned char * |
---|
243 | #endif |
---|
244 | |
---|
245 | |
---|
246 | |
---|
247 | /* Include the public PCRE header and the definitions of UCP character property |
---|
248 | values. */ |
---|
249 | |
---|
250 | #include "pcre.h" |
---|
251 | #include "ucp.h" |
---|
252 | |
---|
253 | /* When compiling for use with the Virtual Pascal compiler, these functions |
---|
254 | need to have their names changed. PCRE must be compiled with the -DVPCOMPAT |
---|
255 | option on the command line. */ |
---|
256 | |
---|
257 | #ifdef VPCOMPAT |
---|
258 | #define strlen(s) _strlen(s) |
---|
259 | #define strncmp(s1,s2,m) _strncmp(s1,s2,m) |
---|
260 | #define memcmp(s,c,n) _memcmp(s,c,n) |
---|
261 | #define memcpy(d,s,n) _memcpy(d,s,n) |
---|
262 | #define memmove(d,s,n) _memmove(d,s,n) |
---|
263 | #define memset(s,c,n) _memset(s,c,n) |
---|
264 | #else /* VPCOMPAT */ |
---|
265 | |
---|
266 | /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(), |
---|
267 | define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY |
---|
268 | is set. Otherwise, include an emulating function for those systems that have |
---|
269 | neither (there are some non-Unix environments where this is the case). */ |
---|
270 | |
---|
271 | #ifndef HAVE_MEMMOVE |
---|
272 | #undef memmove /* some systems may have a macro */ |
---|
273 | #ifdef HAVE_BCOPY |
---|
274 | #define memmove(a, b, c) bcopy(b, a, c) |
---|
275 | #else /* HAVE_BCOPY */ |
---|
/* Stand-in for memmove() on systems that provide neither memmove() nor
bcopy(). Copies n bytes from s to d and copes with overlapping regions by
choosing the copy direction from the relative positions of the two pointers.

Arguments:
  d         destination
  s         source
  n         number of bytes to copy

Returns:    d
*/

static void *
pcre_memmove(void *d, const void *s, size_t n)
{
unsigned char *out = (unsigned char *)d;
const unsigned char *in = (const unsigned char *)s;

if (out > in)
  {
  /* The destination starts above the source, so work from the last byte
  downwards; that way no source byte is overwritten before it is read. */
  size_t left = n;
  while (left-- > 0) out[left] = in[left];
  }
else
  {
  /* Destination at or below the source: a plain ascending copy is safe. */
  size_t k;
  for (k = 0; k < n; k++) out[k] = in[k];
  }

return d;
}
---|
295 | #define memmove(a, b, c) pcre_memmove(a, b, c) |
---|
296 | #endif /* not HAVE_BCOPY */ |
---|
297 | #endif /* not HAVE_MEMMOVE */ |
---|
298 | #endif /* not VPCOMPAT */ |
---|
299 | |
---|
300 | |
---|
301 | /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored |
---|
302 | in big-endian order) by default. These are used, for example, to link from the |
---|
303 | start of a subpattern to its alternatives and its end. The use of 2 bytes per |
---|
304 | offset limits the size of the compiled regex to around 64K, which is big enough |
---|
305 | for almost everybody. However, I received a request for an even bigger limit. |
---|
306 | For this reason, and also to make the code easier to maintain, the storing and |
---|
307 | loading of offsets from the byte string is now handled by the macros that are |
---|
308 | defined here. |
---|
309 | |
---|
310 | The macros are controlled by the value of LINK_SIZE. This defaults to 2 in |
---|
311 | the config.h file, but can be overridden by using -D on the command line. This |
---|
312 | is automated on Unix systems via the "configure" command. */ |
---|
313 | |
---|
314 | #if LINK_SIZE == 2 |
---|
315 | |
---|
316 | #define PUT(a,n,d) \ |
---|
317 | (a[n] = (d) >> 8), \ |
---|
318 | (a[(n)+1] = (d) & 255) |
---|
319 | |
---|
320 | #define GET(a,n) \ |
---|
321 | (((a)[n] << 8) | (a)[(n)+1]) |
---|
322 | |
---|
323 | #define MAX_PATTERN_SIZE (1 << 16) |
---|
324 | |
---|
325 | |
---|
326 | #elif LINK_SIZE == 3 |
---|
327 | |
---|
328 | #define PUT(a,n,d) \ |
---|
329 | (a[n] = (d) >> 16), \ |
---|
330 | (a[(n)+1] = (d) >> 8), \ |
---|
331 | (a[(n)+2] = (d) & 255) |
---|
332 | |
---|
333 | #define GET(a,n) \ |
---|
334 | (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2]) |
---|
335 | |
---|
336 | #define MAX_PATTERN_SIZE (1 << 24) |
---|
337 | |
---|
338 | |
---|
339 | #elif LINK_SIZE == 4 |
---|
340 | |
---|
341 | #define PUT(a,n,d) \ |
---|
342 | (a[n] = (d) >> 24), \ |
---|
343 | (a[(n)+1] = (d) >> 16), \ |
---|
344 | (a[(n)+2] = (d) >> 8), \ |
---|
345 | (a[(n)+3] = (d) & 255) |
---|
346 | |
---|
347 | #define GET(a,n) \ |
---|
348 | (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3]) |
---|
349 | |
---|
350 | #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ |
---|
351 | |
---|
352 | |
---|
353 | #else |
---|
354 | #error LINK_SIZE must be either 2, 3, or 4 |
---|
355 | #endif |
---|
356 | |
---|
357 | |
---|
358 | /* Convenience macro defined in terms of the others */ |
---|
359 | |
---|
360 | #define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE |
---|
361 | |
---|
362 | |
---|
363 | /* PCRE uses some other 2-byte quantities that do not change when the size of |
---|
364 | offsets changes. These are used for repeat counts and for other things such as |
---|
365 | capturing parenthesis numbers in back references. */ |
---|
366 | |
---|
/* Fixed 2-byte big-endian quantities, independent of LINK_SIZE.

  PUT2(a,n,d)     stores the high byte of d in a[n] and the low byte in a[n+1]
  GET2(a,n)       reads the two bytes back as one value
  PUT2INC(a,n,d)  as PUT2, then advances the pointer a by 2

PUT2 is a single parenthesized comma expression (the same shape as PUT), not
two statements separated by a semicolon. It can therefore be the body of an
unbraced if/else/loop without the second store escaping the condition. All
parameters are parenthesized so that expression arguments expand safely. */

#define PUT2(a,n,d)   \
  ((a)[(n)] = (d) >> 8, \
   (a)[(n)+1] = (d) & 255)

#define GET2(a,n) \
  (((a)[(n)] << 8) | (a)[(n)+1])

#define PUT2INC(a,n,d)  (PUT2(a,n,d), (a) += 2)
---|
375 | |
---|
376 | |
---|
377 | /* When UTF-8 encoding is being used, a character is no longer just a single |
---|
378 | byte. The macros for character handling generate simple sequences when used in |
---|
379 | byte-mode, and more complicated ones for UTF-8 characters. BACKCHAR should |
---|
380 | never be called in byte mode. To make sure it can never even appear when UTF-8 |
---|
381 | support is omitted, we don't even define it. */ |
---|
382 | |
---|
#ifndef SUPPORT_UTF8

/* Byte-mode versions: every character is a single byte, so each macro is just
a load through eptr (with a post-increment in the INC forms). GETCHARLEN does
not touch len. Note that each definition already ends with a semicolon. */

#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARTEST(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARINCTEST(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
/* #define BACKCHAR(eptr) */

#else /* SUPPORT_UTF8 */

/* All the UTF-8 versions below share one decoding scheme. The first byte is
loaded into c; if it is 0xc0 or greater it starts a multibyte character. The
low six bits of that byte index _pcre_utf8_table4 to give the number of
additional bytes (gcaa), the byte is masked with _pcre_utf8_table3[gcaa] and
shifted into position, and the low six bits of each following byte are ORed in
below it. Each macro expands to an assignment followed by an "if" block, i.e.
to more than one statement, and the arguments are used unparenthesized. */

/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */

#define GETCHAR(c, eptr) \
  c = *eptr; \
  if (c >= 0xc0) \
    { \
    int gcii; \
    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= (eptr[gcii] & 0x3f) << gcss; \
      } \
    }

/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
pointer. The test reads a variable called utf8, which must be in scope where
the macro is used. */

#define GETCHARTEST(c, eptr) \
  c = *eptr; \
  if (utf8 && c >= 0xc0) \
    { \
    int gcii; \
    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= (eptr[gcii] & 0x3f) << gcss; \
      } \
    }

/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. On exit eptr points past the last byte that was
consumed. */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if (c >= 0xc0) \
    { \
    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
    while (gcaa-- > 0) \
      { \
      gcss -= 6; \
      c |= (*eptr++ & 0x3f) << gcss; \
      } \
    }

/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
The test reads a variable called utf8, which must be in scope where the macro
is used. */

#define GETCHARINCTEST(c, eptr) \
  c = *eptr++; \
  if (utf8 && c >= 0xc0) \
    { \
    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
    while (gcaa-- > 0) \
      { \
      gcss -= 6; \
      c |= (*eptr++ & 0x3f) << gcss; \
      } \
    }

/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode. The
number of additional bytes is added to len; len is left alone for a one-byte
character. */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  if (c >= 0xc0) \
    { \
    int gcii; \
    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= (eptr[gcii] & 0x3f) << gcss; \
      } \
    len += gcaa; \
    }

/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-8 mode - we don't put a test within the macro
because almost all calls are already within a block of UTF-8 only code. The
loop steps back over bytes whose top two bits are 10; there is no lower bound
check in the macro itself. */

#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--

#endif /* SUPPORT_UTF8 */
---|
488 | |
---|
489 | |
---|
490 | /* In case there is no definition of offsetof() provided - though any proper |
---|
491 | Standard C system should have one. */ |
---|
492 | |
---|
493 | #ifndef offsetof |
---|
494 | #define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field)) |
---|
495 | #endif |
---|
496 | |
---|
497 | |
---|
498 | /* These are the public options that can change during matching. */ |
---|
499 | |
---|
500 | #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL) |
---|
501 | |
---|
502 | /* Private flags containing information about the compiled regex. They used to |
---|
503 | live at the top end of the options word, but that got almost full, so now they |
---|
504 | are in a 16-bit flags word. */ |
---|
505 | |
---|
506 | #define PCRE_NOPARTIAL 0x0001 /* can't use partial with this regex */ |
---|
507 | #define PCRE_FIRSTSET 0x0002 /* first_byte is set */ |
---|
508 | #define PCRE_REQCHSET 0x0004 /* req_byte is set */ |
---|
509 | #define PCRE_STARTLINE 0x0008 /* start after \n for multiline */ |
---|
510 | #define PCRE_JCHANGED 0x0010 /* j option used in regex */ |
---|
511 | #define PCRE_HASCRORLF 0x0020 /* explicit \r or \n in pattern */ |
---|
512 | |
---|
513 | /* Options for the "extra" block produced by pcre_study(). */ |
---|
514 | |
---|
515 | #define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */ |
---|
516 | |
---|
517 | /* Masks for identifying the public options that are permitted at compile |
---|
518 | time, run time, or study time, respectively. */ |
---|
519 | |
---|
520 | #define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY| \ |
---|
521 | PCRE_NEWLINE_ANYCRLF) |
---|
522 | |
---|
523 | #define PUBLIC_OPTIONS \ |
---|
524 | (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ |
---|
525 | PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ |
---|
526 | PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \ |
---|
527 | PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \ |
---|
528 | PCRE_JAVASCRIPT_COMPAT) |
---|
529 | |
---|
530 | #define PUBLIC_EXEC_OPTIONS \ |
---|
531 | (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ |
---|
532 | PCRE_PARTIAL|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE) |
---|
533 | |
---|
534 | #define PUBLIC_DFA_EXEC_OPTIONS \ |
---|
535 | (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ |
---|
536 | PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS| \ |
---|
537 | PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE) |
---|
538 | |
---|
539 | #define PUBLIC_STUDY_OPTIONS 0 /* None defined */ |
---|
540 | |
---|
541 | /* Magic number to provide a small check against being handed junk. Also used |
---|
542 | to detect whether a pattern was compiled on a host of different endianness. */ |
---|
543 | |
---|
544 | #define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */ |
---|
545 | |
---|
546 | /* Negative values for the firstchar and reqchar variables */ |
---|
547 | |
---|
548 | #define REQ_UNSET (-2) |
---|
549 | #define REQ_NONE (-1) |
---|
550 | |
---|
551 | /* The maximum remaining length of subject we are prepared to search for a |
---|
552 | req_byte match. */ |
---|
553 | |
---|
554 | #define REQ_BYTE_MAX 1000 |
---|
555 | |
---|
556 | /* Flags added to firstbyte or reqbyte; a "non-literal" item is either a |
---|
557 | variable-length repeat, or anything other than literal characters. */ |
---|
558 | |
---|
559 | #define REQ_CASELESS 0x0100 /* indicates caselessness */ |
---|
560 | #define REQ_VARY 0x0200 /* reqbyte followed non-literal item */ |
---|
561 | |
---|
562 | /* Miscellaneous definitions. The #ifndef is to pacify compiler warnings in |
---|
563 | environments where these macros are defined elsewhere. */ |
---|
564 | |
---|
565 | #ifndef FALSE |
---|
566 | typedef int BOOL; |
---|
567 | |
---|
568 | #define FALSE 0 |
---|
569 | #define TRUE 1 |
---|
570 | #endif |
---|
571 | |
---|
572 | /* Escape items that are just an encoding of a particular data value. */ |
---|
573 | |
---|
574 | #ifndef ESC_e |
---|
575 | #define ESC_e 27 |
---|
576 | #endif |
---|
577 | |
---|
578 | #ifndef ESC_f |
---|
579 | #define ESC_f '\f' |
---|
580 | #endif |
---|
581 | |
---|
582 | #ifndef ESC_n |
---|
583 | #define ESC_n '\n' |
---|
584 | #endif |
---|
585 | |
---|
586 | #ifndef ESC_r |
---|
587 | #define ESC_r '\r' |
---|
588 | #endif |
---|
589 | |
---|
590 | /* We can't officially use ESC_t because it is a POSIX reserved identifier |
---|
591 | (presumably because of all the others like size_t). */ |
---|
592 | |
---|
593 | #ifndef ESC_tee |
---|
594 | #define ESC_tee '\t' |
---|
595 | #endif |
---|
596 | |
---|
597 | /* Codes for different types of Unicode property */ |
---|
598 | |
---|
599 | #define PT_ANY 0 /* Any property - matches all chars */ |
---|
600 | #define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */ |
---|
601 | #define PT_GC 2 /* General characteristic (e.g. L) */ |
---|
602 | #define PT_PC 3 /* Particular characteristic (e.g. Lu) */ |
---|
603 | #define PT_SC 4 /* Script (e.g. Han) */ |
---|
604 | |
---|
605 | /* Flag bits and data types for the extended class (OP_XCLASS) for classes that |
---|
606 | contain UTF-8 characters with values greater than 255. */ |
---|
607 | |
---|
608 | #define XCL_NOT 0x01 /* Flag: this is a negative class */ |
---|
609 | #define XCL_MAP 0x02 /* Flag: a 32-byte map is present */ |
---|
610 | |
---|
611 | #define XCL_END 0 /* Marks end of individual items */ |
---|
612 | #define XCL_SINGLE 1 /* Single item (one multibyte char) follows */ |
---|
613 | #define XCL_RANGE 2 /* A range (two multibyte chars) follows */ |
---|
614 | #define XCL_PROP 3 /* Unicode property (2-byte property code follows) */ |
---|
615 | #define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */ |
---|
616 | |
---|
617 | /* These are escaped items that aren't just an encoding of a particular data |
---|
618 | value such as \n. They must have non-zero values, as check_escape() returns |
---|
619 | their negation. Also, they must appear in the same order as in the opcode |
---|
620 | definitions below, up to ESC_z. There's a dummy for OP_ANY because it |
---|
621 | corresponds to "." rather than an escape sequence, and another for OP_ALLANY |
---|
622 | (which is used for [^] in JavaScript compatibility mode). |
---|
623 | |
---|
624 | The final escape must be ESC_REF as subsequent values are used for |
---|
625 | backreferences (\1, \2, \3, etc). There are two tests in the code for an escape |
---|
626 | greater than ESC_b and less than ESC_Z to detect the types that may be |
---|
627 | repeated. These are the types that consume characters. If any new escapes are |
---|
628 | put in between that don't consume a character, that code will have to change. |
---|
629 | */ |
---|
630 | |
---|
/* One enumerator per line. The values run consecutively from 1; the entries up
to ESC_z line up with the opcodes OP_SOD to OP_EOD, and ESC_REF must remain
the last one. */

enum {
  ESC_A = 1,     /*  1  pairs with OP_SOD */
  ESC_G,         /*  2  pairs with OP_SOM */
  ESC_K,         /*  3  pairs with OP_SET_SOM */
  ESC_B,         /*  4  pairs with OP_NOT_WORD_BOUNDARY */
  ESC_b,         /*  5  pairs with OP_WORD_BOUNDARY */
  ESC_D,         /*  6  pairs with OP_NOT_DIGIT */
  ESC_d,         /*  7  pairs with OP_DIGIT */
  ESC_S,         /*  8  pairs with OP_NOT_WHITESPACE */
  ESC_s,         /*  9  pairs with OP_WHITESPACE */
  ESC_W,         /* 10  pairs with OP_NOT_WORDCHAR */
  ESC_w,         /* 11  pairs with OP_WORDCHAR */
  ESC_dum1,      /* 12  placeholder for OP_ANY */
  ESC_dum2,      /* 13  placeholder for OP_ALLANY */
  ESC_C,         /* 14  pairs with OP_ANYBYTE */
  ESC_P,         /* 15  pairs with OP_NOTPROP */
  ESC_p,         /* 16  pairs with OP_PROP */
  ESC_R,         /* 17  pairs with OP_ANYNL */
  ESC_H,         /* 18  pairs with OP_NOT_HSPACE */
  ESC_h,         /* 19  pairs with OP_HSPACE */
  ESC_V,         /* 20  pairs with OP_NOT_VSPACE */
  ESC_v,         /* 21  pairs with OP_VSPACE */
  ESC_X,         /* 22  pairs with OP_EXTUNI */
  ESC_Z,         /* 23  pairs with OP_EODN */
  ESC_z,         /* 24  pairs with OP_EOD; opcode correspondence ends here */
  ESC_E,         /* 25 */
  ESC_Q,         /* 26 */
  ESC_g,         /* 27 */
  ESC_k,         /* 28 */
  ESC_REF        /* 29  must stay last; larger values are back references */
};
---|
635 | |
---|
636 | |
---|
637 | /* Opcode table: Starting from 1 (i.e. after OP_END), the values up to |
---|
638 | OP_EOD must correspond in order to the list of escapes immediately above. |
---|
639 | |
---|
640 | *** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions |
---|
641 | that follow must also be updated to match. There is also a table called |
---|
642 | "coptable" in pcre_dfa_exec.c that must be updated. */ |
---|
643 | |
---|
644 | enum { |
---|
645 | OP_END, /* 0 End of pattern */ |
---|
646 | |
---|
647 | /* Values corresponding to backslashed metacharacters */ |
---|
648 | |
---|
649 | OP_SOD, /* 1 Start of data: \A */ |
---|
650 | OP_SOM, /* 2 Start of match (subject + offset): \G */ |
---|
651 | OP_SET_SOM, /* 3 Set start of match (\K) */ |
---|
652 | OP_NOT_WORD_BOUNDARY, /* 4 \B */ |
---|
653 | OP_WORD_BOUNDARY, /* 5 \b */ |
---|
654 | OP_NOT_DIGIT, /* 6 \D */ |
---|
655 | OP_DIGIT, /* 7 \d */ |
---|
656 | OP_NOT_WHITESPACE, /* 8 \S */ |
---|
657 | OP_WHITESPACE, /* 9 \s */ |
---|
658 | OP_NOT_WORDCHAR, /* 10 \W */ |
---|
659 | OP_WORDCHAR, /* 11 \w */ |
---|
660 | OP_ANY, /* 12 Match any character (subject to DOTALL) */ |
---|
661 | OP_ALLANY, /* 13 Match any character (not subject to DOTALL) */ |
---|
662 | OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */ |
---|
663 | OP_NOTPROP, /* 15 \P (not Unicode property) */ |
---|
664 | OP_PROP, /* 16 \p (Unicode property) */ |
---|
665 | OP_ANYNL, /* 17 \R (any newline sequence) */ |
---|
666 | OP_NOT_HSPACE, /* 18 \H (not horizontal whitespace) */ |
---|
667 | OP_HSPACE, /* 19 \h (horizontal whitespace) */ |
---|
668 | OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */ |
---|
669 | OP_VSPACE, /* 21 \v (vertical whitespace) */ |
---|
670 | OP_EXTUNI, /* 22 \X (extended Unicode sequence */ |
---|
671 | OP_EODN, /* 23 End of data or \n at end of data: \Z. */ |
---|
672 | OP_EOD, /* 24 End of data: \z */ |
---|
673 | |
---|
674 | OP_OPT, /* 25 Set runtime options */ |
---|
675 | OP_CIRC, /* 26 Start of line - varies with multiline switch */ |
---|
676 | OP_DOLL, /* 27 End of line - varies with multiline switch */ |
---|
677 | OP_CHAR, /* 28 Match one character, casefully */ |
---|
678 | OP_CHARNC, /* 29 Match one character, caselessly */ |
---|
679 | OP_NOT, /* 30 Match one character, not the following one */ |
---|
680 | |
---|
681 | OP_STAR, /* 31 The maximizing and minimizing versions of */ |
---|
682 | OP_MINSTAR, /* 32 these six opcodes must come in pairs, with */ |
---|
683 | OP_PLUS, /* 33 the minimizing one second. */ |
---|
684 | OP_MINPLUS, /* 34 This first set applies to single characters.*/ |
---|
685 | OP_QUERY, /* 35 */ |
---|
686 | OP_MINQUERY, /* 36 */ |
---|
687 | |
---|
688 | OP_UPTO, /* 37 From 0 to n matches */ |
---|
689 | OP_MINUPTO, /* 38 */ |
---|
690 | OP_EXACT, /* 39 Exactly n matches */ |
---|
691 | |
---|
692 | OP_POSSTAR, /* 40 Possessified star */ |
---|
693 | OP_POSPLUS, /* 41 Possessified plus */ |
---|
694 | OP_POSQUERY, /* 42 Possessified query */ |
---|
695 | OP_POSUPTO, /* 43 Possessified upto */ |
---|
696 | |
---|
697 | OP_NOTSTAR, /* 44 The maximizing and minimizing versions of */ |
---|
698 | OP_NOTMINSTAR, /* 45 these six opcodes must come in pairs, with */ |
---|
699 | OP_NOTPLUS, /* 46 the minimizing one second. They must be in */ |
---|
700 | OP_NOTMINPLUS, /* 47 exactly the same order as those above. */ |
---|
701 | OP_NOTQUERY, /* 48 This set applies to "not" single characters. */ |
---|
702 | OP_NOTMINQUERY, /* 49 */ |
---|
703 | |
---|
704 | OP_NOTUPTO, /* 50 From 0 to n matches */ |
---|
705 | OP_NOTMINUPTO, /* 51 */ |
---|
706 | OP_NOTEXACT, /* 52 Exactly n matches */ |
---|
707 | |
---|
708 | OP_NOTPOSSTAR, /* 53 Possessified versions */ |
---|
709 | OP_NOTPOSPLUS, /* 54 */ |
---|
710 | OP_NOTPOSQUERY, /* 55 */ |
---|
711 | OP_NOTPOSUPTO, /* 56 */ |
---|
712 | |
---|
713 | OP_TYPESTAR, /* 57 The maximizing and minimizing versions of */ |
---|
714 | OP_TYPEMINSTAR, /* 58 these six opcodes must come in pairs, with */ |
---|
715 | OP_TYPEPLUS, /* 59 the minimizing one second. These codes must */ |
---|
716 | OP_TYPEMINPLUS, /* 60 be in exactly the same order as those above. */ |
---|
717 | OP_TYPEQUERY, /* 61 This set applies to character types such as \d */ |
---|
718 | OP_TYPEMINQUERY, /* 62 */ |
---|
719 | |
---|
720 | OP_TYPEUPTO, /* 63 From 0 to n matches */ |
---|
721 | OP_TYPEMINUPTO, /* 64 */ |
---|
722 | OP_TYPEEXACT, /* 65 Exactly n matches */ |
---|
723 | |
---|
724 | OP_TYPEPOSSTAR, /* 66 Possessified versions */ |
---|
725 | OP_TYPEPOSPLUS, /* 67 */ |
---|
726 | OP_TYPEPOSQUERY, /* 68 */ |
---|
727 | OP_TYPEPOSUPTO, /* 69 */ |
---|
728 | |
---|
729 | OP_CRSTAR, /* 70 The maximizing and minimizing versions of */ |
---|
730 | OP_CRMINSTAR, /* 71 all these opcodes must come in pairs, with */ |
---|
731 | OP_CRPLUS, /* 72 the minimizing one second. These codes must */ |
---|
732 | OP_CRMINPLUS, /* 73 be in exactly the same order as those above. */ |
---|
733 | OP_CRQUERY, /* 74 These are for character classes and back refs */ |
---|
734 | OP_CRMINQUERY, /* 75 */ |
---|
735 | OP_CRRANGE, /* 76 These are different to the three sets above. */ |
---|
736 | OP_CRMINRANGE, /* 77 */ |
---|
737 | |
---|
738 | OP_CLASS, /* 78 Match a character class, chars < 256 only */ |
---|
739 | OP_NCLASS, /* 79 Same, but the bitmap was created from a negative |
---|
740 | class - the difference is relevant only when a UTF-8 |
---|
741 | character > 255 is encountered. */ |
---|
742 | |
---|
743 | OP_XCLASS, /* 80 Extended class for handling UTF-8 chars within the |
---|
744 | class. This does both positive and negative. */ |
---|
745 | |
---|
746 | OP_REF, /* 81 Match a back reference */ |
---|
747 | OP_RECURSE, /* 82 Match a numbered subpattern (possibly recursive) */ |
---|
748 | OP_CALLOUT, /* 83 Call out to external function if provided */ |
---|
749 | |
---|
750 | OP_ALT, /* 84 Start of alternation */ |
---|
751 | OP_KET, /* 85 End of group that doesn't have an unbounded repeat */ |
---|
752 | OP_KETRMAX, /* 86 These two must remain together and in this */ |
---|
753 | OP_KETRMIN, /* 87 order. They are for groups that repeat for ever. */ |
---|
754 | |
---|
755 | /* The assertions must come before BRA, CBRA, ONCE, and COND.*/ |
---|
756 | |
---|
757 | OP_ASSERT, /* 88 Positive lookahead */ |
---|
758 | OP_ASSERT_NOT, /* 89 Negative lookahead */ |
---|
759 | OP_ASSERTBACK, /* 90 Positive lookbehind */ |
---|
760 | OP_ASSERTBACK_NOT, /* 91 Negative lookbehind */ |
---|
761 | OP_REVERSE, /* 92 Move pointer back - used in lookbehind assertions */ |
---|
762 | |
---|
763 | /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first, |
---|
764 | as there's a test for >= ONCE for a subpattern that isn't an assertion. */ |
---|
765 | |
---|
766 | OP_ONCE, /* 93 Atomic group */ |
---|
767 | OP_BRA, /* 94 Start of non-capturing bracket */ |
---|
768 | OP_CBRA, /* 95 Start of capturing bracket */ |
---|
769 | OP_COND, /* 96 Conditional group */ |
---|
770 | |
---|
771 | /* These three must follow the previous three, in the same order. There's a |
---|
772 | check for >= SBRA to distinguish the two sets. */ |
---|
773 | |
---|
774 | OP_SBRA, /* 97 Start of non-capturing bracket, check empty */ |
---|
775 | OP_SCBRA, /* 98 Start of capturing bracket, check empty */ |
---|
776 | OP_SCOND, /* 99 Conditional group, check empty */ |
---|
777 | |
---|
778 | OP_CREF, /* 100 Used to hold a capture number as condition */ |
---|
779 | OP_RREF, /* 101 Used to hold a recursion number as condition */ |
---|
780 | OP_DEF, /* 102 The DEFINE condition */ |
---|
781 | |
---|
782 | OP_BRAZERO, /* 103 These two must remain together and in this */ |
---|
783 | OP_BRAMINZERO, /* 104 order. */ |
---|
784 | |
---|
785 | /* These are backtracking control verbs */ |
---|
786 | |
---|
787 | OP_PRUNE, /* 105 */ |
---|
788 | OP_SKIP, /* 106 */ |
---|
789 | OP_THEN, /* 107 */ |
---|
790 | OP_COMMIT, /* 108 */ |
---|
791 | |
---|
792 | /* These are forced failure and success verbs */ |
---|
793 | |
---|
794 | OP_FAIL, /* 109 */ |
---|
795 | OP_ACCEPT, /* 110 */ |
---|
796 | |
---|
797 | /* This is used to skip a subpattern with a {0} quantifier */ |
---|
798 | |
---|
799 | OP_SKIPZERO /* 111 */ |
---|
800 | }; |
---|
801 | |
---|
802 | |
---|
803 | /* This macro defines textual names for all the opcodes. These are used only |
---|
804 | for debugging. The macro is referenced only in pcre_printint.c. */ |
---|
805 | /* The list is positional: each string's index is the opcode number shown in the enum comments above (for example the \d name is entry 7 and Skip zero is entry 111), so the enum and this list have to be changed together. */ |
---|
806 | #define OP_NAME_LIST \ |
---|
807 | "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \ |
---|
808 | "\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", \ |
---|
809 | "notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \ |
---|
810 | "extuni", "\\Z", "\\z", \ |
---|
811 | "Opt", "^", "$", "char", "charnc", "not", \ |
---|
812 | "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ |
---|
813 | "*+","++", "?+", "{", \ |
---|
814 | "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ |
---|
815 | "*+","++", "?+", "{", \ |
---|
816 | "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ |
---|
817 | "*+","++", "?+", "{", \ |
---|
818 | "*", "*?", "+", "+?", "?", "??", "{", "{", \ |
---|
819 | "class", "nclass", "xclass", "Ref", "Recurse", "Callout", \ |
---|
820 | "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \ |
---|
821 | "AssertB", "AssertB not", "Reverse", \ |
---|
822 | "Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \ |
---|
823 | "Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero", \ |
---|
824 | "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \ |
---|
825 | "Skip zero" |
---|
826 | |
---|
827 | |
---|
828 | /* This macro defines the length of fixed length operations in the compiled |
---|
829 | regex. The lengths are used when searching for specific things, and also in the |
---|
830 | debugging printing of a compiled regex. We use a macro so that it can be |
---|
831 | defined close to the definitions of the opcodes themselves. |
---|
832 | |
---|
833 | As things have been extended, some of these are no longer fixed lengths, but are |
---|
834 | minima instead. For example, the length of a single-character repeat may vary |
---|
835 | in UTF-8 mode. The code that uses this table must know about such things. */ |
---|
836 | /* Entries are positional, one per opcode in opcode-number order. The entries written in terms of LINK_SIZE vary with that macro, and the 0 given for XCLASS marks a variable-length item. */ |
---|
837 | #define OP_LENGTHS \ |
---|
838 | 1, /* End */ \ |
---|
839 | 1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \ |
---|
840 | 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \ |
---|
841 | 1, 1, 1, /* Any, AllAny, Anybyte */ \ |
---|
842 | 3, 3, 1, /* NOTPROP, PROP, \R */ \ |
---|
843 | 1, 1, 1, 1, 1, /* \H, \h, \V, \v, EXTUNI */ \ |
---|
844 | 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \ |
---|
845 | 2, /* Char - the minimum length */ \ |
---|
846 | 2, /* Charnc - the minimum length */ \ |
---|
847 | 2, /* not */ \ |
---|
848 | /* Positive single-char repeats ** These are */ \ |
---|
849 | 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \ |
---|
850 | 4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \ |
---|
851 | 2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \ |
---|
852 | /* Negative single-char repeats - only for chars < 256 */ \ |
---|
853 | 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \ |
---|
854 | 4, 4, 4, /* NOT upto, minupto, exact */ \ |
---|
855 | 2, 2, 2, 4, /* Possessive *, +, ?, upto */ \ |
---|
856 | /* Positive type repeats */ \ |
---|
857 | 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \ |
---|
858 | 4, 4, 4, /* Type upto, minupto, exact */ \ |
---|
859 | 2, 2, 2, 4, /* Possessive *+, ++, ?+, upto+ */ \ |
---|
860 | /* Character class & ref repeats */ \ |
---|
861 | 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \ |
---|
862 | 5, 5, /* CRRANGE, CRMINRANGE */ \ |
---|
863 | 33, /* CLASS */ \ |
---|
864 | 33, /* NCLASS */ \ |
---|
865 | 0, /* XCLASS - variable length */ \ |
---|
866 | 3, /* REF */ \ |
---|
867 | 1+LINK_SIZE, /* RECURSE */ \ |
---|
868 | 2+2*LINK_SIZE, /* CALLOUT */ \ |
---|
869 | 1+LINK_SIZE, /* Alt */ \ |
---|
870 | 1+LINK_SIZE, /* Ket */ \ |
---|
871 | 1+LINK_SIZE, /* KetRmax */ \ |
---|
872 | 1+LINK_SIZE, /* KetRmin */ \ |
---|
873 | 1+LINK_SIZE, /* Assert */ \ |
---|
874 | 1+LINK_SIZE, /* Assert not */ \ |
---|
875 | 1+LINK_SIZE, /* Assert behind */ \ |
---|
876 | 1+LINK_SIZE, /* Assert behind not */ \ |
---|
877 | 1+LINK_SIZE, /* Reverse */ \ |
---|
878 | 1+LINK_SIZE, /* ONCE */ \ |
---|
879 | 1+LINK_SIZE, /* BRA */ \ |
---|
880 | 3+LINK_SIZE, /* CBRA */ \ |
---|
881 | 1+LINK_SIZE, /* COND */ \ |
---|
882 | 1+LINK_SIZE, /* SBRA */ \ |
---|
883 | 3+LINK_SIZE, /* SCBRA */ \ |
---|
884 | 1+LINK_SIZE, /* SCOND */ \ |
---|
885 | 3, /* CREF */ \ |
---|
886 | 3, /* RREF */ \ |
---|
887 | 1, /* DEF */ \ |
---|
888 | 1, 1, /* BRAZERO, BRAMINZERO */ \ |
---|
889 | 1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \ |
---|
890 | 1, 1, 1 /* FAIL, ACCEPT, SKIPZERO */ |
---|
891 | |
---|
892 | |
---|
893 | /* A magic value for OP_RREF to indicate the "any recursion" condition. */ |
---|
894 | /* NOTE(review): 0xffff is the largest value a 16-bit field can hold; OP_LENGTHS lists RREF as 3, which is consistent with a two-byte operand after the opcode - confirm. */ |
---|
895 | #define RREF_ANY 0xffff |
---|
896 | |
---|
897 | /* Error code numbers. They are given names so that they can more easily be |
---|
898 | tracked. No explicit values are assigned, so ERRn has the value n (ERR0 is 0, ERR64 is 64); appending new codes at the end keeps the existing values unchanged. */ |
---|
899 | |
---|
900 | enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, |
---|
901 | ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, |
---|
902 | ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, |
---|
903 | ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, |
---|
904 | ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, |
---|
905 | ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, |
---|
906 | ERR60, ERR61, ERR62, ERR63, ERR64 }; |
---|
907 | |
---|
908 | /* The real format of the start of the pcre block; the index of names and the |
---|
909 | code vector run on as long as necessary after the end. We store an explicit |
---|
910 | offset to the name table so that if a regex is compiled on one host, saved, and |
---|
911 | then run on another where the size of pointers is different, all might still |
---|
912 | be well. For the case of a pattern compiled where pointers are 4 bytes and run |
---|
913 | where they are 8 bytes, we include an extra pointer (nullpad) that is always |
---|
914 | NULL. For future-proofing, a few dummy fields were originally included - even |
---|
915 | though you can never get this planning right - but only dummy1 is left now. |
---|
916 | |
---|
917 | NOTE NOTE NOTE: |
---|
918 | Because people can now save and re-use compiled patterns, any additions to this |
---|
919 | structure should be made at the end, and something earlier (e.g. a new |
---|
920 | flag in the options or one of the dummy fields) should indicate that the new |
---|
921 | fields are present. Currently PCRE always sets the dummy fields to zero. |
---|
922 | NOTE NOTE NOTE: |
---|
923 | */ |
---|
924 | |
---|
925 | typedef struct real_pcre { |
---|
926 | pcre_uint32 magic_number; |
---|
927 | pcre_uint32 size; /* Total that was malloced */ |
---|
928 | pcre_uint32 options; /* Public options */ |
---|
929 | pcre_uint16 flags; /* Private flags */ |
---|
930 | pcre_uint16 dummy1; /* For future use */ |
---|
931 | pcre_uint16 top_bracket; /* NOTE(review): presumably the highest capturing bracket number - confirm */ |
---|
932 | pcre_uint16 top_backref; /* NOTE(review): presumably the maximum back reference, cf. compile_data.top_backref - confirm */ |
---|
933 | pcre_uint16 first_byte; |
---|
934 | pcre_uint16 req_byte; |
---|
935 | pcre_uint16 name_table_offset; /* Offset to name table that follows */ |
---|
936 | pcre_uint16 name_entry_size; /* Size of any name items */ |
---|
937 | pcre_uint16 name_count; /* Number of name items */ |
---|
938 | pcre_uint16 ref_count; /* Reference count */ |
---|
939 | /* The two pointer members come last, after all the fixed-width members. */ |
---|
940 | const unsigned char *tables; /* Pointer to tables or NULL for std */ |
---|
941 | const unsigned char *nullpad; /* NULL padding */ |
---|
942 | } real_pcre; |
---|
943 | |
---|
944 | /* The format of the block used to store data from pcre_study(). The same |
---|
945 | remark as the NOTE on real_pcre above applies to extending this structure: add new fields only at the end, and signal their presence in an earlier field. */ |
---|
946 | |
---|
947 | typedef struct pcre_study_data { |
---|
948 | pcre_uint32 size; /* Total that was malloced */ |
---|
949 | pcre_uint32 options; |
---|
950 | uschar start_bits[32]; /* 32 elements; NOTE(review): presumably a 256-bit map of possible starting bytes - confirm */ |
---|
951 | } pcre_study_data; |
---|
952 | |
---|
953 | /* Structure for passing "static" information around between the functions |
---|
954 | doing the compiling, so that they are thread-safe. The members cover the character tables, the workspace and code/pattern boundaries, the name table, bracket and back reference counts, option bits, and the newline setting. */ |
---|
955 | |
---|
956 | typedef struct compile_data { |
---|
957 | const uschar *lcc; /* Points to lower casing table */ |
---|
958 | const uschar *fcc; /* Points to case-flipping table */ |
---|
959 | const uschar *cbits; /* Points to character type table */ |
---|
960 | const uschar *ctypes; /* Points to table of type maps */ |
---|
961 | const uschar *start_workspace;/* The start of working space */ |
---|
962 | const uschar *start_code; /* The start of the compiled code */ |
---|
963 | const uschar *start_pattern; /* The start of the pattern */ |
---|
964 | const uschar *end_pattern; /* The end of the pattern */ |
---|
965 | uschar *hwm; /* High watermark of workspace */ |
---|
966 | uschar *name_table; /* The name/number table */ |
---|
967 | int names_found; /* Number of entries so far */ |
---|
968 | int name_entry_size; /* Size of each entry */ |
---|
969 | int bracount; /* Count of capturing parens as we compile */ |
---|
970 | int final_bracount; /* Saved value after first pass */ |
---|
971 | int top_backref; /* Maximum back reference */ |
---|
972 | unsigned int backref_map; /* Bitmap of low back refs */ |
---|
973 | int external_options; /* External (initial) options */ |
---|
974 | int external_flags; /* External flag bits to be set */ |
---|
975 | int req_varyopt; /* "After variable item" flag for reqbyte */ |
---|
976 | BOOL had_accept; /* (*ACCEPT) encountered */ |
---|
977 | int nltype; /* Newline type */ |
---|
978 | int nllen; /* Newline string length */ |
---|
979 | uschar nl[4]; /* Newline string when fixed length */ |
---|
980 | } compile_data; |
---|
981 | |
---|
982 | /* Structure for maintaining a chain of pointers to the currently incomplete |
---|
983 | branches, for testing for left recursion. */ |
---|
984 | |
---|
985 | typedef struct branch_chain { |
---|
986 | struct branch_chain *outer; /* Link to another item in the chain; NOTE(review): presumably the enclosing branch - confirm */ |
---|
987 | uschar *current; /* NOTE(review): presumably points at this item's incomplete branch - confirm */ |
---|
988 | } branch_chain; |
---|
989 | |
---|
990 | /* Structure for items in a linked list that represents an explicit recursive |
---|
991 | call within the pattern. Items are linked through prevrec, which is NULL when there is no previous record. */ |
---|
992 | |
---|
993 | typedef struct recursion_info { |
---|
994 | struct recursion_info *prevrec; /* Previous recursion record (or NULL) */ |
---|
995 | int group_num; /* Number of group that was called */ |
---|
996 | const uschar *after_call; /* "Return value": points after the call in the expr */ |
---|
997 | USPTR save_start; /* Old value of mstart */ |
---|
998 | int *offset_save; /* Pointer to start of saved offsets */ |
---|
999 | int saved_max; /* Number of saved offsets */ |
---|
1000 | } recursion_info; |
---|
1001 | |
---|
1002 | /* Structure for building a chain of data for holding the values of the subject |
---|
1003 | pointer at the start of each subpattern, so as to detect when an empty string |
---|
1004 | has been matched by a subpattern - to break infinite loops. */ |
---|
1005 | |
---|
1006 | typedef struct eptrblock { |
---|
1007 | struct eptrblock *epb_prev; /* Link to another block in the chain; NOTE(review): presumably the previously added one - confirm */ |
---|
1008 | USPTR epb_saved_eptr; /* The saved subject pointer value described above */ |
---|
1009 | } eptrblock; |
---|
1010 | |
---|
1011 | |
---|
1012 | /* Structure for passing "static" information around between the functions |
---|
1013 | doing traditional NFA matching, so that they are thread-safe. */ |
---|
1014 | |
---|
1015 | typedef struct match_data { |
---|
1016 | unsigned long int match_call_count; /* NOTE(review): presumably a running count of match calls, checked against match_limit - confirm */ |
---|
1017 | unsigned long int match_limit; /* NOTE(review): presumably the ceiling for match_call_count - confirm */ |
---|
1018 | unsigned long int match_limit_recursion; /* NOTE(review): presumably the ceiling on recursion depth - confirm */ |
---|
1019 | int *offset_vector; /* Offset vector */ |
---|
1020 | int offset_end; /* One past the end */ |
---|
1021 | int offset_max; /* The maximum usable for return data */ |
---|
1022 | int nltype; /* Newline type */ |
---|
1023 | int nllen; /* Newline string length */ |
---|
1024 | uschar nl[4]; /* Newline string when fixed */ |
---|
1025 | const uschar *lcc; /* Points to lower casing table */ |
---|
1026 | const uschar *ctypes; /* Points to table of type maps */ |
---|
1027 | BOOL offset_overflow; /* Set if too many extractions */ |
---|
1028 | BOOL notbol; /* NOTBOL flag */ |
---|
1029 | BOOL noteol; /* NOTEOL flag */ |
---|
1030 | BOOL utf8; /* UTF8 flag */ |
---|
1031 | BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */ |
---|
1032 | BOOL endonly; /* Dollar not before final \n */ |
---|
1033 | BOOL notempty; /* Empty string match not wanted */ |
---|
1034 | BOOL partial; /* PARTIAL flag */ |
---|
1035 | BOOL hitend; /* Hit the end of the subject at some point */ |
---|
1036 | BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */ |
---|
1037 | const uschar *start_code; /* For use when recursing */ |
---|
1038 | USPTR start_subject; /* Start of the subject string */ |
---|
1039 | USPTR end_subject; /* End of the subject string */ |
---|
1040 | USPTR start_match_ptr; /* Start of matched string */ |
---|
1041 | USPTR end_match_ptr; /* Subject position at end match */ |
---|
1042 | int end_offset_top; /* Highwater mark at end of match */ |
---|
1043 | int capture_last; /* Most recent capture number */ |
---|
1044 | int start_offset; /* The start offset value */ |
---|
1045 | eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */ |
---|
1046 | int eptrn; /* Next free eptrblock */ |
---|
1047 | recursion_info *recursive; /* Linked list of recursion data */ |
---|
1048 | void *callout_data; /* To pass back to callouts */ |
---|
1049 | } match_data; |
---|
1050 | |
---|
1051 | /* A similar structure to match_data is used for the same purpose (passing |
---|
1052 | "static" information between functions so that they are thread-safe) by the DFA matching functions. */ |
---|
1053 | |
---|
1054 | typedef struct dfa_match_data { |
---|
1055 | const uschar *start_code; /* Start of the compiled pattern */ |
---|
1056 | const uschar *start_subject; /* Start of the subject string */ |
---|
1057 | const uschar *end_subject; /* End of subject string */ |
---|
1058 | const uschar *tables; /* Character tables */ |
---|
1059 | int moptions; /* Match options */ |
---|
1060 | int poptions; /* Pattern options */ |
---|
1061 | int nltype; /* Newline type */ |
---|
1062 | int nllen; /* Newline string length */ |
---|
1063 | uschar nl[4]; /* Newline string when fixed */ |
---|
1064 | void *callout_data; /* To pass back to callouts */ |
---|
1065 | } dfa_match_data; |
---|
1066 | |
---|
1067 | /* Bit definitions for entries in the pcre_ctypes table. Each value is a single distinct bit, so one entry can carry several of them; 0x20 and 0x40 are not assigned here. */ |
---|
1068 | |
---|
1069 | #define ctype_space 0x01 |
---|
1070 | #define ctype_letter 0x02 |
---|
1071 | #define ctype_digit 0x04 |
---|
1072 | #define ctype_xdigit 0x08 |
---|
1073 | #define ctype_word 0x10 /* alphanumeric or '_' */ |
---|
1074 | #define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */ |
---|
1075 | |
---|
1076 | /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set |
---|
1077 | of bits for a class map. Some classes are built by combining these tables. Consecutive offsets are 32 apart, and cbit_length (320) is ten tables of 32. */ |
---|
1078 | |
---|
1079 | #define cbit_space 0 /* [:space:] or \s */ |
---|
1080 | #define cbit_xdigit 32 /* [:xdigit:] */ |
---|
1081 | #define cbit_digit 64 /* [:digit:] or \d */ |
---|
1082 | #define cbit_upper 96 /* [:upper:] */ |
---|
1083 | #define cbit_lower 128 /* [:lower:] */ |
---|
1084 | #define cbit_word 160 /* [:word:] or \w */ |
---|
1085 | #define cbit_graph 192 /* [:graph:] */ |
---|
1086 | #define cbit_print 224 /* [:print:] */ |
---|
1087 | #define cbit_punct 256 /* [:punct:] */ |
---|
1088 | #define cbit_cntrl 288 /* [:cntrl:] */ |
---|
1089 | #define cbit_length 320 /* Length of the cbits table */ |
---|
1090 | |
---|
1091 | /* Offsets of the various tables from the base tables pointer, and |
---|
1092 | total length. The lcc and fcc tables take 256 entries each, cbits follows at 512, and ctypes follows cbits; with cbit_length at 320 this makes ctypes_offset 832 and tables_length 1088. */ |
---|
1093 | |
---|
1094 | #define lcc_offset 0 |
---|
1095 | #define fcc_offset 256 |
---|
1096 | #define cbits_offset 512 |
---|
1097 | #define ctypes_offset (cbits_offset + cbit_length) |
---|
1098 | #define tables_length (ctypes_offset + 256) |
---|
1099 | |
---|
1100 | /* Layout of the UCP type table that translates property names into types and |
---|
1101 | codes. Each entry used to point directly to a name, but to reduce the number of |
---|
1102 | relocations in shared libraries, it now has an offset into a single string |
---|
1103 | instead. */ |
---|
1104 | |
---|
1105 | typedef struct { |
---|
1106 | pcre_uint16 name_offset; /* Offset of this entry's name within the single names string */ |
---|
1107 | pcre_uint16 type; /* The type that the name translates to */ |
---|
1108 | pcre_uint16 value; /* NOTE(review): presumably the code that the name translates to - confirm */ |
---|
1109 | } ucp_type_table; |
---|
1110 | |
---|
1111 | |
---|
1112 | /* Internal shared data tables. These are tables that are used by more than one |
---|
1113 | of the exported public functions. They have to be "external" in the C sense, |
---|
1114 | but are not part of the PCRE public API. The data for these tables is in the |
---|
1115 | pcre_tables.c module. */ |
---|
1116 | |
---|
1117 | extern const int _pcre_utf8_table1[]; |
---|
1118 | extern const int _pcre_utf8_table2[]; |
---|
1119 | extern const int _pcre_utf8_table3[]; |
---|
1120 | extern const uschar _pcre_utf8_table4[]; |
---|
1121 | |
---|
1122 | extern const int _pcre_utf8_table1_size; |
---|
1123 | /* NOTE(review): presumably _pcre_utt_names is the single string that the name_offset member of each _pcre_utt entry indexes - confirm. */ |
---|
1124 | extern const char _pcre_utt_names[]; |
---|
1125 | extern const ucp_type_table _pcre_utt[]; |
---|
1126 | extern const int _pcre_utt_size; |
---|
1127 | |
---|
1128 | extern const uschar _pcre_default_tables[]; |
---|
1129 | /* NOTE(review): presumably initialized from the OP_LENGTHS macro above - confirm. */ |
---|
1130 | extern const uschar _pcre_OP_lengths[]; |
---|
1131 | |
---|
1132 | |
---|
1133 | /* Internal shared functions. These are functions that are used by more than |
---|
1134 | one of the exported public functions. They have to be "external" in the C |
---|
1135 | sense, but are not part of the PCRE public API. */ |
---|
1136 | /* The prototypes give parameter types only; the meaning of each argument is not visible here and has to be taken from the defining modules. */ |
---|
1137 | extern BOOL _pcre_is_newline(const uschar *, int, const uschar *, |
---|
1138 | int *, BOOL); |
---|
1139 | extern int _pcre_ord2utf8(int, uschar *); |
---|
1140 | extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *, |
---|
1141 | const pcre_study_data *, pcre_study_data *); |
---|
1142 | extern int _pcre_valid_utf8(const uschar *, int); |
---|
1143 | extern BOOL _pcre_was_newline(const uschar *, int, const uschar *, |
---|
1144 | int *, BOOL); |
---|
1145 | extern BOOL _pcre_xclass(int, const uschar *); |
---|
1146 | |
---|
1147 | |
---|
1148 | /* Unicode character database (UCD): the per-character record type and the tables that the GET_UCD macro below indexes. */ |
---|
1149 | |
---|
1150 | typedef struct { |
---|
1151 | uschar script; /* Read by UCD_SCRIPT */ |
---|
1152 | uschar chartype; /* Read by UCD_CHARTYPE; UCD_CATEGORY uses it to index _pcre_ucp_gentype */ |
---|
1153 | pcre_int32 other_case; /* Signed value that UCD_OTHERCASE adds to the character */ |
---|
1154 | } ucd_record; |
---|
1155 | /* GET_UCD goes from _pcre_ucd_stage1 to _pcre_ucd_stage2 to _pcre_ucd_records. */ |
---|
1156 | extern const ucd_record _pcre_ucd_records[]; |
---|
1157 | extern const uschar _pcre_ucd_stage1[]; |
---|
1158 | extern const pcre_uint16 _pcre_ucd_stage2[]; |
---|
1159 | extern const int _pcre_ucp_gentype[]; |
---|
1160 | |
---|
1161 | |
---|
1162 | /* UCD access macros. GET_UCD(ch) yields a pointer to the ucd_record for character ch: _pcre_ucd_stage1 is indexed by ch / UCD_BLOCK_SIZE, that entry times UCD_BLOCK_SIZE plus ch % UCD_BLOCK_SIZE indexes _pcre_ucd_stage2, and the result indexes _pcre_ucd_records. */ |
---|
1163 | /* Every use of the ch argument is parenthesized so that an expression argument such as c + 1 is grouped as a whole. The argument is still evaluated more than once, so it must not have side effects. */ |
---|
1164 | #define UCD_BLOCK_SIZE 128 |
---|
1165 | #define GET_UCD(ch) (_pcre_ucd_records + \ |
---|
1166 | _pcre_ucd_stage2[_pcre_ucd_stage1[(ch) / UCD_BLOCK_SIZE] * \ |
---|
1167 | UCD_BLOCK_SIZE + (ch) % UCD_BLOCK_SIZE]) |
---|
1168 | /* Field accessors built on GET_UCD. UCD_CATEGORY maps the chartype through _pcre_ucp_gentype; UCD_OTHERCASE adds the record's signed other_case value to ch. */ |
---|
1169 | #define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype |
---|
1170 | #define UCD_SCRIPT(ch) GET_UCD(ch)->script |
---|
1171 | #define UCD_CATEGORY(ch) _pcre_ucp_gentype[UCD_CHARTYPE(ch)] |
---|
1172 | #define UCD_OTHERCASE(ch) ((ch) + GET_UCD(ch)->other_case) |
---|
1173 | |
---|
1174 | #endif |
---|
1175 | |
---|
1176 | /* End of pcre_internal.h */ |
---|