source: XMLIO_V2/external/src/POCO/Foundation.save/pcre_compile.c @ 80

Last change on this file since 80 was 80, checked in by ymipsl, 14 years ago

ajout lib externe

  • Property svn:eol-style set to native
File size: 198.1 KB
Line 
1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9           Copyright (c) 1997-2008 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15    * Redistributions of source code must retain the above copyright notice,
16      this list of conditions and the following disclaimer.
17
18    * Redistributions in binary form must reproduce the above copyright
19      notice, this list of conditions and the following disclaimer in the
20      documentation and/or other materials provided with the distribution.
21
22    * Neither the name of the University of Cambridge nor the names of its
23      contributors may be used to endorse or promote products derived from
24      this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40
41/* This module contains the external function pcre_compile(), along with
42supporting internal functions that are not used by other modules. */
43
44
45#include "pcre_config.h"
46
47#define NLBLOCK cd             /* Block containing newline information */
48#define PSSTART start_pattern  /* Field containing processed string start */
49#define PSEND   end_pattern    /* Field containing processed string end */
50
51#include "pcre_internal.h"
52
53
54/* When DEBUG is defined, we need the pcre_printint() function, which is also
55used by pcretest. DEBUG is not defined when building a production library. */
56
57#ifdef DEBUG
58#include "pcre_printint.src"
59#endif
60
61
/* Macro for setting individual bits in class bitmaps. "a" is the bitmap (an
array of, or pointer to, bytes) and "b" is the bit number. Both arguments and
the whole expansion are parenthesized so that expression arguments such as
SETBIT(map, c + 1) address the intended bit. Note that "b" is evaluated twice,
so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
65
66/* Maximum length value to check against when making sure that the integer that
67holds the compiled pattern length does not overflow. We make it a bit less than
68INT_MAX to allow for adding in group terminating bytes, so that we don't have
69to check them every time. */
70
71#define OFLOW_MAX (INT_MAX - 20)
72
73
74/*************************************************
75*      Code parameters and static tables         *
76*************************************************/
77
78/* This value specifies the size of stack workspace that is used during the
79first pre-compile phase that determines how much memory is required. The regex
80is partly compiled into this space, but the compiled parts are discarded as
81soon as they can be, so that hopefully there will never be an overrun. The code
82does, however, check for an overrun. The largest amount I've seen used is 218,
83so this number is very generous.
84
85The same workspace is used during the second, actual compile phase for
86remembering forward references to groups so that they can be filled in at the
87end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
88is 4 there is plenty of room. */
89
90#define COMPILE_WORK_SIZE (4096)
91
92
93/* Table for handling escaped characters in the range '0'-'z'. Positive returns
94are simple data values; negative values are for special things like \d and so
95on. Zero means further processing is needed (for things like \x), or the escape
96is invalid. */
97
98#ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
99static const short int escapes[] = {
100     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
101     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
102   '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
103-ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
104-ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
105-ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
106   '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
107-ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
108-ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
109     0,      0, -ESC_z                                            /* x - z */
110};
111
112#else           /* This is the "abnormal" table for EBCDIC systems */
113static const short int escapes[] = {
114/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
115/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
116/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
117/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
118/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
119/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
120/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
121/*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
122/*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
123/*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
124/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
125/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
126/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
127/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
128/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
129/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
130/*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
131/*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
132/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
133/*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
134/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
135/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
136/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
137};
138#endif
139
140
141/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
142searched linearly. Put all the names into a single string, in order to reduce
143the number of relocations when a shared library is dynamically linked. */
144
145typedef struct verbitem {
146  int   len;
147  int   op;
148} verbitem;
149
150static const char verbnames[] =
151  "ACCEPT\0"
152  "COMMIT\0"
153  "F\0"
154  "FAIL\0"
155  "PRUNE\0"
156  "SKIP\0"
157  "THEN";
158
159static const verbitem verbs[] = {
160  { 6, OP_ACCEPT },
161  { 6, OP_COMMIT },
162  { 1, OP_FAIL },
163  { 4, OP_FAIL },
164  { 5, OP_PRUNE },
165  { 4, OP_SKIP  },
166  { 4, OP_THEN  }
167};
168
169static const int verbcount = sizeof(verbs)/sizeof(verbitem);
170
171
172/* Tables of names of POSIX character classes and their lengths. The names are
173now all in a single string, to reduce the number of relocations when a shared
174library is dynamically loaded. The list of lengths is terminated by a zero
175length entry. The first three must be alpha, lower, upper, as this is assumed
176for handling case independence. */
177
178static const char posix_names[] =
179  "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"
180  "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"
181  "word\0"   "xdigit";
182
183static const uschar posix_name_lengths[] = {
184  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
185
186/* Table of class bit maps for each POSIX class. Each class is formed from a
187base map, with an optional addition or removal of another map. Then, for some
188classes, there is some additional tweaking: for [:blank:] the vertical space
189characters are removed, and for [:alpha:] and [:alnum:] the underscore
190character is removed. The triples in the table consist of the base map offset,
191second map offset or -1 if no second map, and a non-negative value for map
192addition or a negative value for map subtraction (if there are two maps). The
193absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
194remove vertical space characters, 2 => remove underscore. */
195
196static const int posix_class_maps[] = {
197  cbit_word,  cbit_digit, -2,             /* alpha */
198  cbit_lower, -1,          0,             /* lower */
199  cbit_upper, -1,          0,             /* upper */
200  cbit_word,  -1,          2,             /* alnum - word without underscore */
201  cbit_print, cbit_cntrl,  0,             /* ascii */
202  cbit_space, -1,          1,             /* blank - a GNU extension */
203  cbit_cntrl, -1,          0,             /* cntrl */
204  cbit_digit, -1,          0,             /* digit */
205  cbit_graph, -1,          0,             /* graph */
206  cbit_print, -1,          0,             /* print */
207  cbit_punct, -1,          0,             /* punct */
208  cbit_space, -1,          0,             /* space */
209  cbit_word,  -1,          0,             /* word - a Perl extension */
210  cbit_xdigit,-1,          0              /* xdigit */
211};
212
213
214#define STRING(a)  # a
215#define XSTRING(s) STRING(s)
216
217/* The texts of compile-time error messages. These are "char *" because they
218are passed to the outside world. Do not ever re-use any error number, because
219they are documented. Always add a new error instead. Messages marked DEAD below
220are no longer used. This used to be a table of strings, but in order to reduce
221the number of relocations needed when a shared library is loaded dynamically,
222it is now one long string. We cannot use a table of offsets, because the
223lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
224simply count through to the one we want - this isn't a performance issue
225because these strings are used only when there is a compilation error. */
226
227static const char error_texts[] =
228  "no error\0"
229  "\\ at end of pattern\0"
230  "\\c at end of pattern\0"
231  "unrecognized character follows \\\0"
232  "numbers out of order in {} quantifier\0"
233  /* 5 */
234  "number too big in {} quantifier\0"
235  "missing terminating ] for character class\0"
236  "invalid escape sequence in character class\0"
237  "range out of order in character class\0"
238  "nothing to repeat\0"
239  /* 10 */
240  "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
241  "internal error: unexpected repeat\0"
242  "unrecognized character after (? or (?-\0"
243  "POSIX named classes are supported only within a class\0"
244  "missing )\0"
245  /* 15 */
246  "reference to non-existent subpattern\0"
247  "erroffset passed as NULL\0"
248  "unknown option bit(s) set\0"
249  "missing ) after comment\0"
250  "parentheses nested too deeply\0"  /** DEAD **/
251  /* 20 */
252  "regular expression is too large\0"
253  "failed to get memory\0"
254  "unmatched parentheses\0"
255  "internal error: code overflow\0"
256  "unrecognized character after (?<\0"
257  /* 25 */
258  "lookbehind assertion is not fixed length\0"
259  "malformed number or name after (?(\0"
260  "conditional group contains more than two branches\0"
261  "assertion expected after (?(\0"
262  "(?R or (?[+-]digits must be followed by )\0"
263  /* 30 */
264  "unknown POSIX class name\0"
265  "POSIX collating elements are not supported\0"
266  "this version of PCRE is not compiled with PCRE_UTF8 support\0"
267  "spare error\0"  /** DEAD **/
268  "character value in \\x{...} sequence is too large\0"
269  /* 35 */
270  "invalid condition (?(0)\0"
271  "\\C not allowed in lookbehind assertion\0"
272  "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
273  "number after (?C is > 255\0"
274  "closing ) for (?C expected\0"
275  /* 40 */
276  "recursive call could loop indefinitely\0"
277  "unrecognized character after (?P\0"
278  "syntax error in subpattern name (missing terminator)\0"
279  "two named subpatterns have the same name\0"
280  "invalid UTF-8 string\0"
281  /* 45 */
282  "support for \\P, \\p, and \\X has not been compiled\0"
283  "malformed \\P or \\p sequence\0"
284  "unknown property name after \\P or \\p\0"
285  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
286  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
287  /* 50 */
288  "repeated subpattern is too long\0"    /** DEAD **/
289  "octal value is greater than \\377 (not in UTF-8 mode)\0"
290  "internal error: overran compiling workspace\0"
291  "internal error: previously-checked referenced subpattern not found\0"
292  "DEFINE group contains more than one branch\0"
293  /* 55 */
294  "repeating a DEFINE group is not allowed\0"
295  "inconsistent NEWLINE options\0"
296  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
297  "a numbered reference must not be zero\0"
298  "(*VERB) with an argument is not supported\0"
299  /* 60 */
300  "(*VERB) not recognized\0"
301  "number is too big\0"
302  "subpattern name expected\0"
303  "digit expected after (?+\0"
304  "] is an invalid data character in JavaScript compatibility mode";
305
306
307/* Table to identify digits and hex digits. This is used when compiling
308patterns. Note that the tables in chartables are dependent on the locale, and
309may mark arbitrary characters as digits - but the PCRE compiling code expects
310to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
311a private table here. It costs 256 bytes, but it is a lot faster than doing
312character value tests (at least in some simple cases I timed), and in some
313applications one wants PCRE to compile efficiently as well as match
314efficiently.
315
316For convenience, we use the same bit definitions as in chartables:
317
318  0x04   decimal digit
319  0x08   hexadecimal digit
320
321Then we can use ctype_digit and ctype_xdigit in the code. */
322
323#ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
324static const unsigned char digitab[] =
325  {
326  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
327  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
328  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
329  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
330  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
331  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
332  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
333  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
334  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
335  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
336  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
337  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
338  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
339  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
340  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
341  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
342  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
343  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
344  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
345  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
346  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
347  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
348  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
349  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
350  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
351  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
352  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
353  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
354  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
355  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
356  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
357  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
358
359#else           /* This is the "abnormal" case, for EBCDIC systems */
360static const unsigned char digitab[] =
361  {
362  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
363  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
364  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
365  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
366  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
367  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
368  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
369  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
370  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
371  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
372  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
373  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
374  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
375  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
376  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
377  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
378  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
379  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
380  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
381  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
382  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
383  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
384  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
385  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
386  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
387  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
388  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
389  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
390  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
391  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
392  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
393  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
394
395static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
396  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
397  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
398  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
399  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
400  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
401  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
402  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
403  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
404  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
405  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
406  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
407  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
408  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
409  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
410  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
411  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
412  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
413  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
414  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
415  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
416  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
417  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
418  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
419  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
420  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
421  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
422  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
423  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
424  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
425  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
426  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
427  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
428#endif
429
430
431/* Definition to allow mutual recursion */
432
433static BOOL
434  compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
435    int *, int *, branch_chain *, compile_data *, int *);
436
437
438
439/*************************************************
440*            Find an error text                  *
441*************************************************/
442
443/* The error texts are now all in one long string, to save on relocations. As
444some of the text is of unknown length, we can't use a table of offsets.
445Instead, just count through the strings. This is not a performance issue
446because it happens only when there has been a compilation error.
447
448Argument:   the error number
449Returns:    pointer to the error string
450*/
451
452static const char *
453find_error_text(int n)
454{
455const char *s = error_texts;
456for (; n > 0; n--) while (*s++ != 0) {};
457return s;
458}
459
460
461/*************************************************
462*            Handle escapes                      *
463*************************************************/
464
465/* This function is called when a \ has been encountered. It either returns a
466positive value for a simple escape such as \n, or a negative value which
467encodes one of the more complicated things such as \d. A backreference to group
468n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
469UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
470ptr is pointing at the \. On exit, it is on the final character of the escape
471sequence.
472
473Arguments:
474  ptrptr         points to the pattern position pointer
475  errorcodeptr   points to the errorcode variable
476  bracount       number of previous extracting brackets
477  options        the options bits
478  isclass        TRUE if inside a character class
479
480Returns:         zero or positive => a data character
481                 negative => a special escape sequence
482                 on error, errorcodeptr is set
483*/
484
485static int
486check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
487  int options, BOOL isclass)
488{
489BOOL utf8 = (options & PCRE_UTF8) != 0;
490const uschar *ptr = *ptrptr + 1;
491int c, i;
492
493GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
494ptr--;                            /* Set pointer back to the last byte */
495
496/* If backslash is at the end of the pattern, it's an error. */
497
498if (c == 0) *errorcodeptr = ERR1;
499
500/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
501in a table. A non-zero result is something that can be returned immediately.
502Otherwise further processing may be required. */
503
504#ifndef EBCDIC  /* ASCII coding */
505else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */
506else if ((i = escapes[c - '0']) != 0) c = i;
507
508#else           /* EBCDIC coding */
509else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
510else if ((i = escapes[c - 0x48]) != 0)  c = i;
511#endif
512
513/* Escapes that need further processing, or are illegal. */
514
515else
516  {
517  const uschar *oldptr;
518  BOOL braced, negated;
519
520  switch (c)
521    {
522    /* A number of Perl escapes are not handled by PCRE. We give an explicit
523    error. */
524
525    case 'l':
526    case 'L':
527    case 'N':
528    case 'u':
529    case 'U':
530    *errorcodeptr = ERR37;
531    break;
532
533    /* \g must be followed by one of a number of specific things:
534
535    (1) A number, either plain or braced. If positive, it is an absolute
536    backreference. If negative, it is a relative backreference. This is a Perl
537    5.10 feature.
538
539    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
540    is part of Perl's movement towards a unified syntax for back references. As
541    this is synonymous with \k{name}, we fudge it up by pretending it really
542    was \k.
543
544    (3) For Oniguruma compatibility we also support \g followed by a name or a
545    number either in angle brackets or in single quotes. However, these are
546    (possibly recursive) subroutine calls, _not_ backreferences. Just return
547    the -ESC_g code (cf \k). */
548
549    case 'g':
550    if (ptr[1] == '<' || ptr[1] == '\'')
551      {
552      c = -ESC_g;
553      break;
554      }
555
556    /* Handle the Perl-compatible cases */
557
558    if (ptr[1] == '{')
559      {
560      const uschar *p;
561      for (p = ptr+2; *p != 0 && *p != '}'; p++)
562        if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
563      if (*p != 0 && *p != '}')
564        {
565        c = -ESC_k;
566        break;
567        }
568      braced = TRUE;
569      ptr++;
570      }
571    else braced = FALSE;
572
573    if (ptr[1] == '-')
574      {
575      negated = TRUE;
576      ptr++;
577      }
578    else negated = FALSE;
579
580    c = 0;
581    while ((digitab[ptr[1]] & ctype_digit) != 0)
582      c = c * 10 + *(++ptr) - '0';
583
584    if (c < 0)   /* Integer overflow */
585      {
586      *errorcodeptr = ERR61;
587      break;
588      }
589
590    if (braced && *(++ptr) != '}')
591      {
592      *errorcodeptr = ERR57;
593      break;
594      }
595
596    if (c == 0)
597      {
598      *errorcodeptr = ERR58;
599      break;
600      }
601
602    if (negated)
603      {
604      if (c > bracount)
605        {
606        *errorcodeptr = ERR15;
607        break;
608        }
609      c = bracount - (c - 1);
610      }
611
612    c = -(ESC_REF + c);
613    break;
614
615    /* The handling of escape sequences consisting of a string of digits
616    starting with one that is not zero is not straightforward. By experiment,
617    the way Perl works seems to be as follows:
618
619    Outside a character class, the digits are read as a decimal number. If the
620    number is less than 10, or if there are that many previous extracting
621    left brackets, then it is a back reference. Otherwise, up to three octal
622    digits are read to form an escaped byte. Thus \123 is likely to be octal
623    123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
624    value is greater than 377, the least significant 8 bits are taken. Inside a
625    character class, \ followed by a digit is always an octal number. */
626
627    case '1': case '2': case '3': case '4': case '5':
628    case '6': case '7': case '8': case '9':
629
630    if (!isclass)
631      {
632      oldptr = ptr;
633      c -= '0';
634      while ((digitab[ptr[1]] & ctype_digit) != 0)
635        c = c * 10 + *(++ptr) - '0';
636      if (c < 0)    /* Integer overflow */
637        {
638        *errorcodeptr = ERR61;
639        break;
640        }
641      if (c < 10 || c <= bracount)
642        {
643        c = -(ESC_REF + c);
644        break;
645        }
646      ptr = oldptr;      /* Put the pointer back and fall through */
647      }
648
649    /* Handle an octal number following \. If the first digit is 8 or 9, Perl
650    generates a binary zero byte and treats the digit as a following literal.
651    Thus we have to pull back the pointer by one. */
652
653    if ((c = *ptr) >= '8')
654      {
655      ptr--;
656      c = 0;
657      break;
658      }
659
660    /* \0 always starts an octal number, but we may drop through to here with a
661    larger first octal digit. The original code used just to take the least
662    significant 8 bits of octal numbers (I think this is what early Perls used
663    to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
664    than 3 octal digits. */
665
666    case '0':
667    c -= '0';
668    while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
669        c = c * 8 + *(++ptr) - '0';
670    if (!utf8 && c > 255) *errorcodeptr = ERR51;
671    break;
672
673    /* \x is complicated. \x{ddd} is a character number which can be greater
674    than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
675    treated as a data character. */
676
677    case 'x':
678    if (ptr[1] == '{')
679      {
680      const uschar *pt = ptr + 2;
681      int count = 0;
682
683      c = 0;
684      while ((digitab[*pt] & ctype_xdigit) != 0)
685        {
686        register int cc = *pt++;
687        if (c == 0 && cc == '0') continue;     /* Leading zeroes */
688        count++;
689
690#ifndef EBCDIC  /* ASCII coding */
691        if (cc >= 'a') cc -= 32;               /* Convert to upper case */
692        c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
693#else           /* EBCDIC coding */
694        if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
695        c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
696#endif
697        }
698
699      if (*pt == '}')
700        {
701        if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
702        ptr = pt;
703        break;
704        }
705
706      /* If the sequence of hex digits does not end with '}', then we don't
707      recognize this construct; fall through to the normal \x handling. */
708      }
709
710    /* Read just a single-byte hex-defined char */
711
712    c = 0;
713    while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
714      {
715      int cc;                               /* Some compilers don't like ++ */
716      cc = *(++ptr);                        /* in initializers */
717#ifndef EBCDIC  /* ASCII coding */
718      if (cc >= 'a') cc -= 32;              /* Convert to upper case */
719      c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
720#else           /* EBCDIC coding */
721      if (cc <= 'z') cc += 64;              /* Convert to upper case */
722      c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
723#endif
724      }
725    break;
726
727    /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
728    This coding is ASCII-specific, but then the whole concept of \cx is
729    ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
730
731    case 'c':
732    c = *(++ptr);
733    if (c == 0)
734      {
735      *errorcodeptr = ERR2;
736      break;
737      }
738
739#ifndef EBCDIC  /* ASCII coding */
740    if (c >= 'a' && c <= 'z') c -= 32;
741    c ^= 0x40;
742#else           /* EBCDIC coding */
743    if (c >= 'a' && c <= 'z') c += 64;
744    c ^= 0xC0;
745#endif
746    break;
747
748    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
749    other alphanumeric following \ is an error if PCRE_EXTRA was set;
750    otherwise, for Perl compatibility, it is a literal. This code looks a bit
751    odd, but there used to be some cases other than the default, and there may
752    be again in future, so I haven't "optimized" it. */
753
754    default:
755    if ((options & PCRE_EXTRA) != 0) switch(c)
756      {
757      default:
758      *errorcodeptr = ERR3;
759      break;
760      }
761    break;
762    }
763  }
764
765*ptrptr = ptr;
766return c;
767}
768
769
770
771#ifdef SUPPORT_UCP
772/*************************************************
773*               Handle \P and \p                 *
774*************************************************/
775
776/* This function is called after \P or \p has been encountered, provided that
777PCRE is compiled with support for Unicode properties. On entry, ptrptr is
778pointing at the P or p. On exit, it is pointing at the final character of the
779escape sequence.
780
781Argument:
782  ptrptr         points to the pattern position pointer
783  negptr         points to a boolean that is set TRUE for negation else FALSE
784  dptr           points to an int that is set to the detailed property value
785  errorcodeptr   points to the error code variable
786
787Returns:         type value from ucp_type_table, or -1 for an invalid type
788*/
789
790static int
791get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
792{
793int c, i, bot, top;
794const uschar *ptr = *ptrptr;
795char name[32];
796
797c = *(++ptr);
798if (c == 0) goto ERROR_RETURN;
799
800*negptr = FALSE;
801
802/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
803negation. */
804
805if (c == '{')
806  {
807  if (ptr[1] == '^')
808    {
809    *negptr = TRUE;
810    ptr++;
811    }
812  for (i = 0; i < (int)sizeof(name) - 1; i++)
813    {
814    c = *(++ptr);
815    if (c == 0) goto ERROR_RETURN;
816    if (c == '}') break;
817    name[i] = c;
818    }
819  if (c !='}') goto ERROR_RETURN;
820  name[i] = 0;
821  }
822
823/* Otherwise there is just one following character */
824
825else
826  {
827  name[0] = c;
828  name[1] = 0;
829  }
830
831*ptrptr = ptr;
832
833/* Search for a recognized property name using binary chop */
834
835bot = 0;
836top = _pcre_utt_size;
837
838while (bot < top)
839  {
840  i = (bot + top) >> 1;
841  c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
842  if (c == 0)
843    {
844    *dptr = _pcre_utt[i].value;
845    return _pcre_utt[i].type;
846    }
847  if (c > 0) bot = i + 1; else top = i;
848  }
849
850*errorcodeptr = ERR47;
851*ptrptr = ptr;
852return -1;
853
854ERROR_RETURN:
855*errorcodeptr = ERR46;
856*ptrptr = ptr;
857return -1;
858}
859#endif
860
861
862
863
864/*************************************************
865*            Check for counted repeat            *
866*************************************************/
867
868/* This function is called when a '{' is encountered in a place where it might
869start a quantifier. It looks ahead to see if it really is a quantifier or not.
870It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
871where the ddds are digits.
872
873Arguments:
874  p         pointer to the first char after '{'
875
876Returns:    TRUE or FALSE
877*/
878
879static BOOL
880is_counted_repeat(const uschar *p)
881{
882if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
883while ((digitab[*p] & ctype_digit) != 0) p++;
884if (*p == '}') return TRUE;
885
886if (*p++ != ',') return FALSE;
887if (*p == '}') return TRUE;
888
889if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
890while ((digitab[*p] & ctype_digit) != 0) p++;
891
892return (*p == '}');
893}
894
895
896
897/*************************************************
898*         Read repeat counts                     *
899*************************************************/
900
901/* Read an item of the form {n,m} and return the values. This is called only
902after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
903so the syntax is guaranteed to be correct, but we need to check the values.
904
905Arguments:
906  p              pointer to first char after '{'
907  minp           pointer to int for min
908  maxp           pointer to int for max
909                 returned as -1 if no max
910  errorcodeptr   points to error code variable
911
912Returns:         pointer to '}' on success;
913                 current ptr on error, with errorcodeptr set non-zero
914*/
915
916static const uschar *
917read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
918{
919int min = 0;
920int max = -1;
921
922/* Read the minimum value and do a paranoid check: a negative value indicates
923an integer overflow. */
924
925while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
926if (min < 0 || min > 65535)
927  {
928  *errorcodeptr = ERR5;
929  return p;
930  }
931
932/* Read the maximum value if there is one, and again do a paranoid on its size.
933Also, max must not be less than min. */
934
935if (*p == '}') max = min; else
936  {
937  if (*(++p) != '}')
938    {
939    max = 0;
940    while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
941    if (max < 0 || max > 65535)
942      {
943      *errorcodeptr = ERR5;
944      return p;
945      }
946    if (max < min)
947      {
948      *errorcodeptr = ERR4;
949      return p;
950      }
951    }
952  }
953
954/* Fill in the required variables, and pass back the pointer to the terminating
955'}'. */
956
957*minp = min;
958*maxp = max;
959return p;
960}
961
962
963
964/*************************************************
965*       Find forward referenced subpattern       *
966*************************************************/
967
968/* This function scans along a pattern's text looking for capturing
969subpatterns, and counting them. If it finds a named pattern that matches the
970name it is given, it returns its number. Alternatively, if the name is NULL, it
971returns when it reaches a given numbered subpattern. This is used for forward
972references to subpatterns. We know that if (?P< is encountered, the name will
973be terminated by '>' because that is checked in the first pass.
974
975Arguments:
976  ptr          current position in the pattern
977  cd           compile background data
978  name         name to seek, or NULL if seeking a numbered subpattern
979  lorn         name length, or subpattern number if name is NULL
980  xmode        TRUE if we are in /x mode
981
982Returns:       the number of the named subpattern, or -1 if not found
983*/
984
985static int
986find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
987  BOOL xmode)
988{
989const uschar *thisname;
990int count = cd->bracount;
991
992for (; *ptr != 0; ptr++)
993  {
994  int term;
995
996  /* Skip over backslashed characters and also entire \Q...\E */
997
998  if (*ptr == '\\')
999    {
1000    if (*(++ptr) == 0) return -1;
1001    if (*ptr == 'Q') for (;;)
1002      {
1003      while (*(++ptr) != 0 && *ptr != '\\') {};
1004      if (*ptr == 0) return -1;
1005      if (*(++ptr) == 'E') break;
1006      }
1007    continue;
1008    }
1009
1010  /* Skip over character classes; this logic must be similar to the way they
1011  are handled for real. If the first character is '^', skip it. Also, if the
1012  first few characters (either before or after ^) are \Q\E or \E we skip them
1013  too. This makes for compatibility with Perl. */
1014
1015  if (*ptr == '[')
1016    {
1017    BOOL negate_class = FALSE;
1018    for (;;)
1019      {
1020      int c = *(++ptr);
1021      if (c == '\\')
1022        {
1023        if (ptr[1] == 'E') ptr++;
1024          else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
1025            else break;
1026        }
1027      else if (!negate_class && c == '^')
1028        negate_class = TRUE;
1029      else break;
1030      }
1031
1032    /* If the next character is ']', it is a data character that must be
1033    skipped, except in JavaScript compatibility mode. */
1034
1035    if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1036      ptr++;
1037
1038    while (*(++ptr) != ']')
1039      {
1040      if (*ptr == 0) return -1;
1041      if (*ptr == '\\')
1042        {
1043        if (*(++ptr) == 0) return -1;
1044        if (*ptr == 'Q') for (;;)
1045          {
1046          while (*(++ptr) != 0 && *ptr != '\\') {};
1047          if (*ptr == 0) return -1;
1048          if (*(++ptr) == 'E') break;
1049          }
1050        continue;
1051        }
1052      }
1053    continue;
1054    }
1055
1056  /* Skip comments in /x mode */
1057
1058  if (xmode && *ptr == '#')
1059    {
1060    while (*(++ptr) != 0 && *ptr != '\n') {};
1061    if (*ptr == 0) return -1;
1062    continue;
1063    }
1064
1065  /* An opening parens must now be a real metacharacter */
1066
1067  if (*ptr != '(') continue;
1068  if (ptr[1] != '?' && ptr[1] != '*')
1069    {
1070    count++;
1071    if (name == NULL && count == lorn) return count;
1072    continue;
1073    }
1074
1075  ptr += 2;
1076  if (*ptr == 'P') ptr++;                      /* Allow optional P */
1077
1078  /* We have to disambiguate (?<! and (?<= from (?<name> */
1079
1080  if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1081       *ptr != '\'')
1082    continue;
1083
1084  count++;
1085
1086  if (name == NULL && count == lorn) return count;
1087  term = *ptr++;
1088  if (term == '<') term = '>';
1089  thisname = ptr;
1090  while (*ptr != term) ptr++;
1091  if (name != NULL && lorn == ptr - thisname &&
1092      strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1093    return count;
1094  }
1095
1096return -1;
1097}
1098
1099
1100
1101/*************************************************
1102*      Find first significant op code            *
1103*************************************************/
1104
1105/* This is called by several functions that scan a compiled expression looking
1106for a fixed first character, or an anchoring op code etc. It skips over things
1107that do not influence this. For some calls, a change of option is important.
1108For some calls, it makes sense to skip negative forward and all backward
1109assertions, and also the \b assertion; for others it does not.
1110
1111Arguments:
1112  code         pointer to the start of the group
1113  options      pointer to external options
1114  optbit       the option bit whose changing is significant, or
1115                 zero if none are
1116  skipassert   TRUE if certain assertions are to be skipped
1117
1118Returns:       pointer to the first significant opcode
1119*/
1120
1121static const uschar*
1122first_significant_code(const uschar *code, int *options, int optbit,
1123  BOOL skipassert)
1124{
1125for (;;)
1126  {
1127  switch ((int)*code)
1128    {
1129    case OP_OPT:
1130    if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1131      *options = (int)code[1];
1132    code += 2;
1133    break;
1134
1135    case OP_ASSERT_NOT:
1136    case OP_ASSERTBACK:
1137    case OP_ASSERTBACK_NOT:
1138    if (!skipassert) return code;
1139    do code += GET(code, 1); while (*code == OP_ALT);
1140    code += _pcre_OP_lengths[*code];
1141    break;
1142
1143    case OP_WORD_BOUNDARY:
1144    case OP_NOT_WORD_BOUNDARY:
1145    if (!skipassert) return code;
1146    /* Fall through */
1147
1148    case OP_CALLOUT:
1149    case OP_CREF:
1150    case OP_RREF:
1151    case OP_DEF:
1152    code += _pcre_OP_lengths[*code];
1153    break;
1154
1155    default:
1156    return code;
1157    }
1158  }
1159/* Control never reaches here */
1160}
1161
1162
1163
1164
1165/*************************************************
1166*        Find the fixed length of a pattern      *
1167*************************************************/
1168
1169/* Scan a pattern and compute the fixed length of subject that will match it,
1170if the length is fixed. This is needed for dealing with backward assertions.
1171In UTF8 mode, the result is in characters rather than bytes.
1172
1173Arguments:
1174  code     points to the start of the pattern (the bracket)
1175  options  the compiling options
1176
1177Returns:   the fixed length, or -1 if there is no fixed length,
1178             or -2 if \C was encountered
1179*/
1180
1181static int
1182find_fixedlength(uschar *code, int options)
1183{
1184int length = -1;
1185
1186register int branchlength = 0;
1187register uschar *cc = code + 1 + LINK_SIZE;
1188
1189/* Scan along the opcodes for this branch. If we get to the end of the
1190branch, check the length against that of the other branches. */
1191
1192for (;;)
1193  {
1194  int d;
1195  register int op = *cc;
1196  switch (op)
1197    {
1198    case OP_CBRA:
1199    case OP_BRA:
1200    case OP_ONCE:
1201    case OP_COND:
1202    d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1203    if (d < 0) return d;
1204    branchlength += d;
1205    do cc += GET(cc, 1); while (*cc == OP_ALT);
1206    cc += 1 + LINK_SIZE;
1207    break;
1208
1209    /* Reached end of a branch; if it's a ket it is the end of a nested
1210    call. If it's ALT it is an alternation in a nested call. If it is
1211    END it's the end of the outer call. All can be handled by the same code. */
1212
1213    case OP_ALT:
1214    case OP_KET:
1215    case OP_KETRMAX:
1216    case OP_KETRMIN:
1217    case OP_END:
1218    if (length < 0) length = branchlength;
1219      else if (length != branchlength) return -1;
1220    if (*cc != OP_ALT) return length;
1221    cc += 1 + LINK_SIZE;
1222    branchlength = 0;
1223    break;
1224
1225    /* Skip over assertive subpatterns */
1226
1227    case OP_ASSERT:
1228    case OP_ASSERT_NOT:
1229    case OP_ASSERTBACK:
1230    case OP_ASSERTBACK_NOT:
1231    do cc += GET(cc, 1); while (*cc == OP_ALT);
1232    /* Fall through */
1233
1234    /* Skip over things that don't match chars */
1235
1236    case OP_REVERSE:
1237    case OP_CREF:
1238    case OP_RREF:
1239    case OP_DEF:
1240    case OP_OPT:
1241    case OP_CALLOUT:
1242    case OP_SOD:
1243    case OP_SOM:
1244    case OP_EOD:
1245    case OP_EODN:
1246    case OP_CIRC:
1247    case OP_DOLL:
1248    case OP_NOT_WORD_BOUNDARY:
1249    case OP_WORD_BOUNDARY:
1250    cc += _pcre_OP_lengths[*cc];
1251    break;
1252
1253    /* Handle literal characters */
1254
1255    case OP_CHAR:
1256    case OP_CHARNC:
1257    case OP_NOT:
1258    branchlength++;
1259    cc += 2;
1260#ifdef SUPPORT_UTF8
1261    if ((options & PCRE_UTF8) != 0)
1262      {
1263      while ((*cc & 0xc0) == 0x80) cc++;
1264      }
1265#endif
1266    break;
1267
1268    /* Handle exact repetitions. The count is already in characters, but we
1269    need to skip over a multibyte character in UTF8 mode.  */
1270
1271    case OP_EXACT:
1272    branchlength += GET2(cc,1);
1273    cc += 4;
1274#ifdef SUPPORT_UTF8
1275    if ((options & PCRE_UTF8) != 0)
1276      {
1277      while((*cc & 0x80) == 0x80) cc++;
1278      }
1279#endif
1280    break;
1281
1282    case OP_TYPEEXACT:
1283    branchlength += GET2(cc,1);
1284    if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1285    cc += 4;
1286    break;
1287
1288    /* Handle single-char matchers */
1289
1290    case OP_PROP:
1291    case OP_NOTPROP:
1292    cc += 2;
1293    /* Fall through */
1294
1295    case OP_NOT_DIGIT:
1296    case OP_DIGIT:
1297    case OP_NOT_WHITESPACE:
1298    case OP_WHITESPACE:
1299    case OP_NOT_WORDCHAR:
1300    case OP_WORDCHAR:
1301    case OP_ANY:
1302    case OP_ALLANY:
1303    branchlength++;
1304    cc++;
1305    break;
1306
1307    /* The single-byte matcher isn't allowed */
1308
1309    case OP_ANYBYTE:
1310    return -2;
1311
1312    /* Check a class for variable quantification */
1313
1314#ifdef SUPPORT_UTF8
1315    case OP_XCLASS:
1316    cc += GET(cc, 1) - 33;
1317    /* Fall through */
1318#endif
1319
1320    case OP_CLASS:
1321    case OP_NCLASS:
1322    cc += 33;
1323
1324    switch (*cc)
1325      {
1326      case OP_CRSTAR:
1327      case OP_CRMINSTAR:
1328      case OP_CRQUERY:
1329      case OP_CRMINQUERY:
1330      return -1;
1331
1332      case OP_CRRANGE:
1333      case OP_CRMINRANGE:
1334      if (GET2(cc,1) != GET2(cc,3)) return -1;
1335      branchlength += GET2(cc,1);
1336      cc += 5;
1337      break;
1338
1339      default:
1340      branchlength++;
1341      }
1342    break;
1343
1344    /* Anything else is variable length */
1345
1346    default:
1347    return -1;
1348    }
1349  }
1350/* Control never gets here */
1351}
1352
1353
1354
1355
1356/*************************************************
1357*    Scan compiled regex for numbered bracket    *
1358*************************************************/
1359
1360/* This little function scans through a compiled pattern until it finds a
1361capturing bracket with the given number.
1362
1363Arguments:
1364  code        points to start of expression
1365  utf8        TRUE in UTF-8 mode
1366  number      the required bracket number
1367
1368Returns:      pointer to the opcode for the bracket, or NULL if not found
1369*/
1370
1371static const uschar *
1372find_bracket(const uschar *code, BOOL utf8, int number)
1373{
1374for (;;)
1375  {
1376  register int c = *code;
1377  if (c == OP_END) return NULL;
1378
1379  /* XCLASS is used for classes that cannot be represented just by a bit
1380  map. This includes negated single high-valued characters. The length in
1381  the table is zero; the actual length is stored in the compiled code. */
1382
1383  if (c == OP_XCLASS) code += GET(code, 1);
1384
1385  /* Handle capturing bracket */
1386
1387  else if (c == OP_CBRA)
1388    {
1389    int n = GET2(code, 1+LINK_SIZE);
1390    if (n == number) return (uschar *)code;
1391    code += _pcre_OP_lengths[c];
1392    }
1393
1394  /* Otherwise, we can get the item's length from the table, except that for
1395  repeated character types, we have to test for \p and \P, which have an extra
1396  two bytes of parameters. */
1397
1398  else
1399    {
1400    switch(c)
1401      {
1402      case OP_TYPESTAR:
1403      case OP_TYPEMINSTAR:
1404      case OP_TYPEPLUS:
1405      case OP_TYPEMINPLUS:
1406      case OP_TYPEQUERY:
1407      case OP_TYPEMINQUERY:
1408      case OP_TYPEPOSSTAR:
1409      case OP_TYPEPOSPLUS:
1410      case OP_TYPEPOSQUERY:
1411      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1412      break;
1413
1414      case OP_TYPEUPTO:
1415      case OP_TYPEMINUPTO:
1416      case OP_TYPEEXACT:
1417      case OP_TYPEPOSUPTO:
1418      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1419      break;
1420      }
1421
1422    /* Add in the fixed length from the table */
1423
1424    code += _pcre_OP_lengths[c];
1425
1426  /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1427  a multi-byte character. The length in the table is a minimum, so we have to
1428  arrange to skip the extra bytes. */
1429
1430#ifdef SUPPORT_UTF8
1431    if (utf8) switch(c)
1432      {
1433      case OP_CHAR:
1434      case OP_CHARNC:
1435      case OP_EXACT:
1436      case OP_UPTO:
1437      case OP_MINUPTO:
1438      case OP_POSUPTO:
1439      case OP_STAR:
1440      case OP_MINSTAR:
1441      case OP_POSSTAR:
1442      case OP_PLUS:
1443      case OP_MINPLUS:
1444      case OP_POSPLUS:
1445      case OP_QUERY:
1446      case OP_MINQUERY:
1447      case OP_POSQUERY:
1448      if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1449      break;
1450      }
1451#else
1452    (void)(utf8);  /* Keep compiler happy by referencing function argument */
1453#endif
1454    }
1455  }
1456}
1457
1458
1459
1460/*************************************************
1461*   Scan compiled regex for recursion reference  *
1462*************************************************/
1463
1464/* This little function scans through a compiled pattern until it finds an
1465instance of OP_RECURSE.
1466
1467Arguments:
1468  code        points to start of expression
1469  utf8        TRUE in UTF-8 mode
1470
1471Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1472*/
1473
1474static const uschar *
1475find_recurse(const uschar *code, BOOL utf8)
1476{
1477for (;;)
1478  {
1479  register int c = *code;
1480  if (c == OP_END) return NULL;
1481  if (c == OP_RECURSE) return code;
1482
1483  /* XCLASS is used for classes that cannot be represented just by a bit
1484  map. This includes negated single high-valued characters. The length in
1485  the table is zero; the actual length is stored in the compiled code. */
1486
1487  if (c == OP_XCLASS) code += GET(code, 1);
1488
1489  /* Otherwise, we can get the item's length from the table, except that for
1490  repeated character types, we have to test for \p and \P, which have an extra
1491  two bytes of parameters. */
1492
1493  else
1494    {
1495    switch(c)
1496      {
1497      case OP_TYPESTAR:
1498      case OP_TYPEMINSTAR:
1499      case OP_TYPEPLUS:
1500      case OP_TYPEMINPLUS:
1501      case OP_TYPEQUERY:
1502      case OP_TYPEMINQUERY:
1503      case OP_TYPEPOSSTAR:
1504      case OP_TYPEPOSPLUS:
1505      case OP_TYPEPOSQUERY:
1506      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1507      break;
1508
1509      case OP_TYPEPOSUPTO:
1510      case OP_TYPEUPTO:
1511      case OP_TYPEMINUPTO:
1512      case OP_TYPEEXACT:
1513      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1514      break;
1515      }
1516
1517    /* Add in the fixed length from the table */
1518
1519    code += _pcre_OP_lengths[c];
1520
1521    /* In UTF-8 mode, opcodes that are followed by a character may be followed
1522    by a multi-byte character. The length in the table is a minimum, so we have
1523    to arrange to skip the extra bytes. */
1524
1525#ifdef SUPPORT_UTF8
1526    if (utf8) switch(c)
1527      {
1528      case OP_CHAR:
1529      case OP_CHARNC:
1530      case OP_EXACT:
1531      case OP_UPTO:
1532      case OP_MINUPTO:
1533      case OP_POSUPTO:
1534      case OP_STAR:
1535      case OP_MINSTAR:
1536      case OP_POSSTAR:
1537      case OP_PLUS:
1538      case OP_MINPLUS:
1539      case OP_POSPLUS:
1540      case OP_QUERY:
1541      case OP_MINQUERY:
1542      case OP_POSQUERY:
1543      if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1544      break;
1545      }
1546#else
1547    (void)(utf8);  /* Keep compiler happy by referencing function argument */
1548#endif
1549    }
1550  }
1551}
1552
1553
1554
1555/*************************************************
1556*    Scan compiled branch for non-emptiness      *
1557*************************************************/
1558
1559/* This function scans through a branch of a compiled pattern to see whether it
1560can match the empty string or not. It is called from could_be_empty()
1561below and from compile_branch() when checking for an unlimited repeat of a
1562group that can match nothing. Note that first_significant_code() skips over
1563backward and negative forward assertions when its final argument is TRUE. If we
1564hit an unclosed bracket, we return "empty" - this means we've struck an inner
1565bracket whose current branch will already have been scanned.
1566
1567Arguments:
1568  code        points to start of search
1569  endcode     points to where to stop
1570  utf8        TRUE if in UTF8 mode
1571
1572Returns:      TRUE if what is matched could be empty
1573*/
1574
1575static BOOL
1576could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1577{
1578register int c;
1579for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1580     code < endcode;
1581     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1582  {
1583  const uschar *ccode;
1584
1585  c = *code;
1586
1587  /* Skip over forward assertions; the other assertions are skipped by
1588  first_significant_code() with a TRUE final argument. */
1589
1590  if (c == OP_ASSERT)
1591    {
1592    do code += GET(code, 1); while (*code == OP_ALT);
1593    c = *code;
1594    continue;
1595    }
1596
1597  /* Groups with zero repeats can of course be empty; skip them. */
1598
1599  if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1600    {
1601    code += _pcre_OP_lengths[c];
1602    do code += GET(code, 1); while (*code == OP_ALT);
1603    c = *code;
1604    continue;
1605    }
1606
1607  /* For other groups, scan the branches. */
1608
1609  if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1610    {
1611    BOOL empty_branch;
1612    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1613
1614    /* Scan a closed bracket */
1615
1616    empty_branch = FALSE;
1617    do
1618      {
1619      if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1620        empty_branch = TRUE;
1621      code += GET(code, 1);
1622      }
1623    while (*code == OP_ALT);
1624    if (!empty_branch) return FALSE;   /* All branches are non-empty */
1625    c = *code;
1626    continue;
1627    }
1628
1629  /* Handle the other opcodes */
1630
1631  switch (c)
1632    {
1633    /* Check for quantifiers after a class. XCLASS is used for classes that
1634    cannot be represented just by a bit map. This includes negated single
1635    high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1636    actual length is stored in the compiled code, so we must update "code"
1637    here. */
1638
1639#ifdef SUPPORT_UTF8
1640    case OP_XCLASS:
1641    ccode = code += GET(code, 1);
1642    goto CHECK_CLASS_REPEAT;
1643#endif
1644
1645    case OP_CLASS:
1646    case OP_NCLASS:
1647    ccode = code + 33;
1648
1649#ifdef SUPPORT_UTF8
1650    CHECK_CLASS_REPEAT:
1651#endif
1652
1653    switch (*ccode)
1654      {
1655      case OP_CRSTAR:            /* These could be empty; continue */
1656      case OP_CRMINSTAR:
1657      case OP_CRQUERY:
1658      case OP_CRMINQUERY:
1659      break;
1660
1661      default:                   /* Non-repeat => class must match */
1662      case OP_CRPLUS:            /* These repeats aren't empty */
1663      case OP_CRMINPLUS:
1664      return FALSE;
1665
1666      case OP_CRRANGE:
1667      case OP_CRMINRANGE:
1668      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1669      break;
1670      }
1671    break;
1672
1673    /* Opcodes that must match a character */
1674
1675    case OP_PROP:
1676    case OP_NOTPROP:
1677    case OP_EXTUNI:
1678    case OP_NOT_DIGIT:
1679    case OP_DIGIT:
1680    case OP_NOT_WHITESPACE:
1681    case OP_WHITESPACE:
1682    case OP_NOT_WORDCHAR:
1683    case OP_WORDCHAR:
1684    case OP_ANY:
1685    case OP_ALLANY:
1686    case OP_ANYBYTE:
1687    case OP_CHAR:
1688    case OP_CHARNC:
1689    case OP_NOT:
1690    case OP_PLUS:
1691    case OP_MINPLUS:
1692    case OP_POSPLUS:
1693    case OP_EXACT:
1694    case OP_NOTPLUS:
1695    case OP_NOTMINPLUS:
1696    case OP_NOTPOSPLUS:
1697    case OP_NOTEXACT:
1698    case OP_TYPEPLUS:
1699    case OP_TYPEMINPLUS:
1700    case OP_TYPEPOSPLUS:
1701    case OP_TYPEEXACT:
1702    return FALSE;
1703
1704    /* These are going to continue, as they may be empty, but we have to
1705    fudge the length for the \p and \P cases. */
1706
1707    case OP_TYPESTAR:
1708    case OP_TYPEMINSTAR:
1709    case OP_TYPEPOSSTAR:
1710    case OP_TYPEQUERY:
1711    case OP_TYPEMINQUERY:
1712    case OP_TYPEPOSQUERY:
1713    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1714    break;
1715
1716    /* Same for these */
1717
1718    case OP_TYPEUPTO:
1719    case OP_TYPEMINUPTO:
1720    case OP_TYPEPOSUPTO:
1721    if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1722    break;
1723
1724    /* End of branch */
1725
1726    case OP_KET:
1727    case OP_KETRMAX:
1728    case OP_KETRMIN:
1729    case OP_ALT:
1730    return TRUE;
1731
1732    /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1733    MINUPTO, and POSUPTO may be followed by a multibyte character */
1734
1735#ifdef SUPPORT_UTF8
1736    case OP_STAR:
1737    case OP_MINSTAR:
1738    case OP_POSSTAR:
1739    case OP_QUERY:
1740    case OP_MINQUERY:
1741    case OP_POSQUERY:
1742    case OP_UPTO:
1743    case OP_MINUPTO:
1744    case OP_POSUPTO:
1745    if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1746    break;
1747#endif
1748    }
1749  }
1750
1751return TRUE;
1752}
1753
1754
1755
1756/*************************************************
1757*    Scan compiled regex for non-emptiness       *
1758*************************************************/
1759
1760/* This function is called to check for left recursive calls. We want to check
1761the current branch of the current pattern to see if it could match the empty
1762string. If it could, we must look outwards for branches at other levels,
1763stopping when we pass beyond the bracket which is the subject of the recursion.
1764
1765Arguments:
1766  code        points to start of the recursion
1767  endcode     points to where to stop (current RECURSE item)
1768  bcptr       points to the chain of current (unclosed) branch starts
1769  utf8        TRUE if in UTF-8 mode
1770
1771Returns:      TRUE if what is matched could be empty
1772*/
1773
1774static BOOL
1775could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1776  BOOL utf8)
1777{
1778while (bcptr != NULL && bcptr->current >= code)
1779  {
1780  if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1781  bcptr = bcptr->outer;
1782  }
1783return TRUE;
1784}
1785
1786
1787
1788/*************************************************
1789*           Check for POSIX class syntax         *
1790*************************************************/
1791
1792/* This function is called when the sequence "[:" or "[." or "[=" is
1793encountered in a character class. It checks whether this is followed by a
1794sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1795reach an unescaped ']' without the special preceding character, return FALSE.
1796
1797Originally, this function only recognized a sequence of letters between the
1798terminators, but it seems that Perl recognizes any sequence of characters,
1799though of course unknown POSIX names are subsequently rejected. Perl gives an
1800"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1801didn't consider this to be a POSIX class. Likewise for [:1234:].
1802
1803The problem in trying to be exactly like Perl is in the handling of escapes. We
1804have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1805class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1806below handles the special case of \], but does not try to do any other escape
1807processing. This makes it different from Perl for cases such as [:l\ower:]
1808where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1809"l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1810I think.
1811
1812Arguments:
1813  ptr      pointer to the initial [
1814  endptr   where to return the end pointer
1815
1816Returns:   TRUE or FALSE
1817*/
1818
1819static BOOL
1820check_posix_syntax(const uschar *ptr, const uschar **endptr)
1821{
1822int terminator;          /* Don't combine these lines; the Solaris cc */
1823terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1824for (++ptr; *ptr != 0; ptr++)
1825  {
1826  if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1827    {
1828    if (*ptr == ']') return FALSE;
1829    if (*ptr == terminator && ptr[1] == ']')
1830      {
1831      *endptr = ptr;
1832      return TRUE;
1833      }
1834    }
1835  }
1836return FALSE;
1837}
1838
1839
1840
1841
1842/*************************************************
1843*          Check POSIX class name                *
1844*************************************************/
1845
1846/* This function is called to check the name given in a POSIX-style class entry
1847such as [:alnum:].
1848
1849Arguments:
1850  ptr        points to the first letter
1851  len        the length of the name
1852
1853Returns:     a value representing the name, or -1 if unknown
1854*/
1855
1856static int
1857check_posix_name(const uschar *ptr, int len)
1858{
1859const char *pn = posix_names;
1860register int yield = 0;
1861while (posix_name_lengths[yield] != 0)
1862  {
1863  if (len == posix_name_lengths[yield] &&
1864    strncmp((const char *)ptr, pn, len) == 0) return yield;
1865  pn += posix_name_lengths[yield] + 1;
1866  yield++;
1867  }
1868return -1;
1869}
1870
1871
1872/*************************************************
1873*    Adjust OP_RECURSE items in repeated group   *
1874*************************************************/
1875
1876/* OP_RECURSE items contain an offset from the start of the regex to the group
1877that is referenced. This means that groups can be replicated for fixed
1878repetition simply by copying (because the recursion is allowed to refer to
1879earlier groups that are outside the current group). However, when a group is
1880optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1881inserted before it, after it has been compiled. This means that any OP_RECURSE
1882items within it that refer to the group itself or any contained groups have to
1883have their offsets adjusted. That one of the jobs of this function. Before it
1884is called, the partially compiled regex must be temporarily terminated with
1885OP_END.
1886
1887This function has been extended with the possibility of forward references for
1888recursions and subroutine calls. It must also check the list of such references
1889for the group we are dealing with. If it finds that one of the recursions in
1890the current group is on this list, it adjusts the offset in the list, not the
1891value in the reference (which is a group number).
1892
1893Arguments:
1894  group      points to the start of the group
1895  adjust     the amount by which the group is to be moved
1896  utf8       TRUE in UTF-8 mode
1897  cd         contains pointers to tables etc.
1898  save_hwm   the hwm forward reference pointer at the start of the group
1899
1900Returns:     nothing
1901*/
1902
1903static void
1904adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1905  uschar *save_hwm)
1906{
1907uschar *ptr = group;
1908
1909while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1910  {
1911  int offset;
1912  uschar *hc;
1913
1914  /* See if this recursion is on the forward reference list. If so, adjust the
1915  reference. */
1916
1917  for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1918    {
1919    offset = GET(hc, 0);
1920    if (cd->start_code + offset == ptr + 1)
1921      {
1922      PUT(hc, 0, offset + adjust);
1923      break;
1924      }
1925    }
1926
1927  /* Otherwise, adjust the recursion offset if it's after the start of this
1928  group. */
1929
1930  if (hc >= cd->hwm)
1931    {
1932    offset = GET(ptr, 1);
1933    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1934    }
1935
1936  ptr += 1 + LINK_SIZE;
1937  }
1938}
1939
1940
1941
1942/*************************************************
1943*        Insert an automatic callout point       *
1944*************************************************/
1945
1946/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1947callout points before each pattern item.
1948
1949Arguments:
1950  code           current code pointer
1951  ptr            current pattern pointer
1952  cd             pointers to tables etc
1953
1954Returns:         new code pointer
1955*/
1956
1957static uschar *
1958auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1959{
1960*code++ = OP_CALLOUT;
1961*code++ = 255;
1962PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
1963PUT(code, LINK_SIZE, 0);                /* Default length */
1964return code + 2*LINK_SIZE;
1965}
1966
1967
1968
1969/*************************************************
1970*         Complete a callout item                *
1971*************************************************/
1972
1973/* A callout item contains the length of the next item in the pattern, which
1974we can't fill in till after we have reached the relevant point. This is used
1975for both automatic and manual callouts.
1976
1977Arguments:
1978  previous_callout   points to previous callout item
1979  ptr                current pattern pointer
1980  cd                 pointers to tables etc
1981
1982Returns:             nothing
1983*/
1984
1985static void
1986complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1987{
1988int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1989PUT(previous_callout, 2 + LINK_SIZE, length);
1990}
1991
1992
1993
1994#ifdef SUPPORT_UCP
1995/*************************************************
1996*           Get othercase range                  *
1997*************************************************/
1998
1999/* This function is passed the start and end of a class range, in UTF-8 mode
2000with UCP support. It searches up the characters, looking for internal ranges of
2001characters in the "other" case. Each call returns the next one, updating the
2002start address.
2003
2004Arguments:
2005  cptr        points to starting character value; updated
2006  d           end value
2007  ocptr       where to put start of othercase range
2008  odptr       where to put end of othercase range
2009
2010Yield:        TRUE when range returned; FALSE when no more
2011*/
2012
2013static BOOL
2014get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2015  unsigned int *odptr)
2016{
2017unsigned int c, othercase, next;
2018
2019for (c = *cptr; c <= d; c++)
2020  { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2021
2022if (c > d) return FALSE;
2023
2024*ocptr = othercase;
2025next = othercase + 1;
2026
2027for (++c; c <= d; c++)
2028  {
2029  if (UCD_OTHERCASE(c) != next) break;
2030  next++;
2031  }
2032
2033*odptr = next - 1;
2034*cptr = c;
2035
2036return TRUE;
2037}
2038#endif  /* SUPPORT_UCP */
2039
2040
2041
2042/*************************************************
2043*     Check if auto-possessifying is possible    *
2044*************************************************/
2045
2046/* This function is called for unlimited repeats of certain items, to see
2047whether the next thing could possibly match the repeated item. If not, it makes
2048sense to automatically possessify the repeated item.
2049
2050Arguments:
2051  op_code       the repeated op code
2052  this          data for this item, depends on the opcode
2053  utf8          TRUE in UTF-8 mode
2054  utf8_char     used for utf8 character bytes, NULL if not relevant
2055  ptr           next character in pattern
2056  options       options bits
2057  cd            contains pointers to tables etc.
2058
2059Returns:        TRUE if possessifying is wanted
2060*/
2061
2062static BOOL
2063check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2064  const uschar *ptr, int options, compile_data *cd)
2065{
2066int next;
2067
2068/* Skip whitespace and comments in extended mode */
2069
2070if ((options & PCRE_EXTENDED) != 0)
2071  {
2072  for (;;)
2073    {
2074    while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2075    if (*ptr == '#')
2076      {
2077      while (*(++ptr) != 0)
2078        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2079      }
2080    else break;
2081    }
2082  }
2083
2084/* If the next item is one that we can handle, get its value. A non-negative
2085value is a character, a negative value is an escape value. */
2086
2087if (*ptr == '\\')
2088  {
2089  int temperrorcode = 0;
2090  next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2091  if (temperrorcode != 0) return FALSE;
2092  ptr++;    /* Point after the escape sequence */
2093  }
2094
2095else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2096  {
2097#ifdef SUPPORT_UTF8
2098  if (utf8) { GETCHARINC(next, ptr); } else
2099#endif
2100  next = *ptr++;
2101  }
2102
2103else return FALSE;
2104
2105/* Skip whitespace and comments in extended mode */
2106
2107if ((options & PCRE_EXTENDED) != 0)
2108  {
2109  for (;;)
2110    {
2111    while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2112    if (*ptr == '#')
2113      {
2114      while (*(++ptr) != 0)
2115        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2116      }
2117    else break;
2118    }
2119  }
2120
2121/* If the next thing is itself optional, we have to give up. */
2122
2123if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2124  return FALSE;
2125
2126/* Now compare the next item with the previous opcode. If the previous is a
2127positive single character match, "item" either contains the character or, if
2128"item" is greater than 127 in utf8 mode, the character's bytes are in
2129utf8_char. */
2130
2131
2132/* Handle cases when the next item is a character. */
2133
2134if (next >= 0) switch(op_code)
2135  {
2136  case OP_CHAR:
2137#ifdef SUPPORT_UTF8
2138  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2139#else
2140  (void)(utf8_char);  /* Keep compiler happy by referencing function argument */
2141#endif
2142  return item != next;
2143
2144  /* For CHARNC (caseless character) we must check the other case. If we have
2145  Unicode property support, we can use it to test the other case of
2146  high-valued characters. */
2147
2148  case OP_CHARNC:
2149#ifdef SUPPORT_UTF8
2150  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2151#endif
2152  if (item == next) return FALSE;
2153#ifdef SUPPORT_UTF8
2154  if (utf8)
2155    {
2156    unsigned int othercase;
2157    if (next < 128) othercase = cd->fcc[next]; else
2158#ifdef SUPPORT_UCP
2159    othercase = UCD_OTHERCASE((unsigned int)next);
2160#else
2161    othercase = NOTACHAR;
2162#endif
2163    return (unsigned int)item != othercase;
2164    }
2165  else
2166#endif  /* SUPPORT_UTF8 */
2167  return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2168
2169  /* For OP_NOT, "item" must be a single-byte character. */
2170
2171  case OP_NOT:
2172  if (item == next) return TRUE;
2173  if ((options & PCRE_CASELESS) == 0) return FALSE;
2174#ifdef SUPPORT_UTF8
2175  if (utf8)
2176    {
2177    unsigned int othercase;
2178    if (next < 128) othercase = cd->fcc[next]; else
2179#ifdef SUPPORT_UCP
2180    othercase = UCD_OTHERCASE(next);
2181#else
2182    othercase = NOTACHAR;
2183#endif
2184    return (unsigned int)item == othercase;
2185    }
2186  else
2187#endif  /* SUPPORT_UTF8 */
2188  return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2189
2190  case OP_DIGIT:
2191  return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2192
2193  case OP_NOT_DIGIT:
2194  return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2195
2196  case OP_WHITESPACE:
2197  return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2198
2199  case OP_NOT_WHITESPACE:
2200  return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2201
2202  case OP_WORDCHAR:
2203  return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2204
2205  case OP_NOT_WORDCHAR:
2206  return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2207
2208  case OP_HSPACE:
2209  case OP_NOT_HSPACE:
2210  switch(next)
2211    {
2212    case 0x09:
2213    case 0x20:
2214    case 0xa0:
2215    case 0x1680:
2216    case 0x180e:
2217    case 0x2000:
2218    case 0x2001:
2219    case 0x2002:
2220    case 0x2003:
2221    case 0x2004:
2222    case 0x2005:
2223    case 0x2006:
2224    case 0x2007:
2225    case 0x2008:
2226    case 0x2009:
2227    case 0x200A:
2228    case 0x202f:
2229    case 0x205f:
2230    case 0x3000:
2231    return op_code != OP_HSPACE;
2232    default:
2233    return op_code == OP_HSPACE;
2234    }
2235
2236  case OP_VSPACE:
2237  case OP_NOT_VSPACE:
2238  switch(next)
2239    {
2240    case 0x0a:
2241    case 0x0b:
2242    case 0x0c:
2243    case 0x0d:
2244    case 0x85:
2245    case 0x2028:
2246    case 0x2029:
2247    return op_code != OP_VSPACE;
2248    default:
2249    return op_code == OP_VSPACE;
2250    }
2251
2252  default:
2253  return FALSE;
2254  }
2255
2256
2257/* Handle the case when the next item is \d, \s, etc. */
2258
2259switch(op_code)
2260  {
2261  case OP_CHAR:
2262  case OP_CHARNC:
2263#ifdef SUPPORT_UTF8
2264  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2265#endif
2266  switch(-next)
2267    {
2268    case ESC_d:
2269    return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2270
2271    case ESC_D:
2272    return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2273
2274    case ESC_s:
2275    return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2276
2277    case ESC_S:
2278    return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2279
2280    case ESC_w:
2281    return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2282
2283    case ESC_W:
2284    return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2285
2286    case ESC_h:
2287    case ESC_H:
2288    switch(item)
2289      {
2290      case 0x09:
2291      case 0x20:
2292      case 0xa0:
2293      case 0x1680:
2294      case 0x180e:
2295      case 0x2000:
2296      case 0x2001:
2297      case 0x2002:
2298      case 0x2003:
2299      case 0x2004:
2300      case 0x2005:
2301      case 0x2006:
2302      case 0x2007:
2303      case 0x2008:
2304      case 0x2009:
2305      case 0x200A:
2306      case 0x202f:
2307      case 0x205f:
2308      case 0x3000:
2309      return -next != ESC_h;
2310      default:
2311      return -next == ESC_h;
2312      }
2313
2314    case ESC_v:
2315    case ESC_V:
2316    switch(item)
2317      {
2318      case 0x0a:
2319      case 0x0b:
2320      case 0x0c:
2321      case 0x0d:
2322      case 0x85:
2323      case 0x2028:
2324      case 0x2029:
2325      return -next != ESC_v;
2326      default:
2327      return -next == ESC_v;
2328      }
2329
2330    default:
2331    return FALSE;
2332    }
2333
2334  case OP_DIGIT:
2335  return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2336         next == -ESC_h || next == -ESC_v;
2337
2338  case OP_NOT_DIGIT:
2339  return next == -ESC_d;
2340
2341  case OP_WHITESPACE:
2342  return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2343
2344  case OP_NOT_WHITESPACE:
2345  return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2346
2347  case OP_HSPACE:
2348  return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2349
2350  case OP_NOT_HSPACE:
2351  return next == -ESC_h;
2352
2353  /* Can't have \S in here because VT matches \S (Perl anomaly) */
2354  case OP_VSPACE:
2355  return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2356
2357  case OP_NOT_VSPACE:
2358  return next == -ESC_v;
2359
2360  case OP_WORDCHAR:
2361  return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2362
2363  case OP_NOT_WORDCHAR:
2364  return next == -ESC_w || next == -ESC_d;
2365
2366  default:
2367  return FALSE;
2368  }
2369
2370/* Control does not reach here */
2371}
2372
2373
2374
2375/*************************************************
2376*           Compile one branch                   *
2377*************************************************/
2378
/* Scan the pattern, compiling it into a vector. If the options are
2380changed during the branch, the pointer is used to change the external options
2381bits. This function is used during the pre-compile phase when we are trying
2382to find out the amount of memory needed, as well as during the real compile
2383phase. The value of lengthptr distinguishes the two phases.
2384
2385Arguments:
2386  optionsptr     pointer to the option bits
2387  codeptr        points to the pointer to the current code point
2388  ptrptr         points to the current pattern pointer
2389  errorcodeptr   points to error code variable
2390  firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2391  reqbyteptr     set to the last literal character required, else < 0
2392  bcptr          points to current branch chain
2393  cd             contains pointers to tables etc.
2394  lengthptr      NULL during the real compile phase
2395                 points to length accumulator during pre-compile phase
2396
2397Returns:         TRUE on success
2398                 FALSE, with *errorcodeptr set non-zero on error
2399*/
2400
2401static BOOL
2402compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2403  int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2404  compile_data *cd, int *lengthptr)
2405{
2406int repeat_type, op_type;
2407int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2408int bravalue = 0;
2409int greedy_default, greedy_non_default;
2410int firstbyte, reqbyte;
2411int zeroreqbyte, zerofirstbyte;
2412int req_caseopt, reqvary, tempreqvary;
2413int options = *optionsptr;
2414int after_manual_callout = 0;
2415int length_prevgroup = 0;
2416register int c;
2417register uschar *code = *codeptr;
2418uschar *last_code = code;
2419uschar *orig_code = code;
2420uschar *tempcode;
2421BOOL inescq = FALSE;
2422BOOL groupsetfirstbyte = FALSE;
2423const uschar *ptr = *ptrptr;
2424const uschar *tempptr;
2425uschar *previous = NULL;
2426uschar *previous_callout = NULL;
2427uschar *save_hwm = NULL;
2428uschar classbits[32];
2429
2430#ifdef SUPPORT_UTF8
2431BOOL class_utf8;
2432BOOL utf8 = (options & PCRE_UTF8) != 0;
2433uschar *class_utf8data;
2434uschar *class_utf8data_base;
2435uschar utf8_char[6];
2436#else
2437BOOL utf8 = FALSE;
2438uschar *utf8_char = NULL;
2439#endif
2440
2441#ifdef DEBUG
2442if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2443#endif
2444
2445/* Set up the default and non-default settings for greediness */
2446
2447greedy_default = ((options & PCRE_UNGREEDY) != 0);
2448greedy_non_default = greedy_default ^ 1;
2449
2450/* Initialize no first byte, no required byte. REQ_UNSET means "no char
2451matching encountered yet". It gets changed to REQ_NONE if we hit something that
2452matches a non-fixed char first char; reqbyte just remains unset if we never
2453find one.
2454
2455When we hit a repeat whose minimum is zero, we may have to adjust these values
2456to take the zero repeat into account. This is implemented by setting them to
2457zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2458item types that can be repeated set these backoff variables appropriately. */
2459
2460firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2461
2462/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2463according to the current setting of the caseless flag. REQ_CASELESS is a bit
2464value > 255. It is added into the firstbyte or reqbyte variables to record the
2465case status of the value. This is used only for ASCII characters. */
2466
2467req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2468
2469/* Switch on next character until the end of the branch */
2470
2471for (;; ptr++)
2472  {
2473  BOOL negate_class;
2474  BOOL should_flip_negation;
2475  BOOL possessive_quantifier;
2476  BOOL is_quantifier;
2477  BOOL is_recurse;
2478  BOOL reset_bracount;
2479  int class_charcount;
2480  int class_lastchar;
2481  int newoptions;
2482  int recno;
2483  int refsign;
2484  int skipbytes;
2485  int subreqbyte;
2486  int subfirstbyte;
2487  int terminator;
2488  int mclength;
2489  uschar mcbuffer[8];
2490
2491  /* Get next byte in the pattern */
2492
2493  c = *ptr;
2494
2495  /* If we are in the pre-compile phase, accumulate the length used for the
2496  previous cycle of this loop. */
2497
2498  if (lengthptr != NULL)
2499    {
2500#ifdef DEBUG
2501    if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2502#endif
2503    if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2504      {
2505      *errorcodeptr = ERR52;
2506      goto FAILED;
2507      }
2508
2509    /* There is at least one situation where code goes backwards: this is the
2510    case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2511    the class is simply eliminated. However, it is created first, so we have to
2512    allow memory for it. Therefore, don't ever reduce the length at this point.
2513    */
2514
2515    if (code < last_code) code = last_code;
2516
2517    /* Paranoid check for integer overflow */
2518
2519    if (OFLOW_MAX - *lengthptr < code - last_code)
2520      {
2521      *errorcodeptr = ERR20;
2522      goto FAILED;
2523      }
2524
2525    *lengthptr += code - last_code;
2526    DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2527
2528    /* If "previous" is set and it is not at the start of the work space, move
2529    it back to there, in order to avoid filling up the work space. Otherwise,
2530    if "previous" is NULL, reset the current code pointer to the start. */
2531
2532    if (previous != NULL)
2533      {
2534      if (previous > orig_code)
2535        {
2536        memmove(orig_code, previous, code - previous);
2537        code -= previous - orig_code;
2538        previous = orig_code;
2539        }
2540      }
2541    else code = orig_code;
2542
2543    /* Remember where this code item starts so we can pick up the length
2544    next time round. */
2545
2546    last_code = code;
2547    }
2548
2549  /* In the real compile phase, just check the workspace used by the forward
2550  reference list. */
2551
2552  else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2553    {
2554    *errorcodeptr = ERR52;
2555    goto FAILED;
2556    }
2557
2558  /* If in \Q...\E, check for the end; if not, we have a literal */
2559
2560  if (inescq && c != 0)
2561    {
2562    if (c == '\\' && ptr[1] == 'E')
2563      {
2564      inescq = FALSE;
2565      ptr++;
2566      continue;
2567      }
2568    else
2569      {
2570      if (previous_callout != NULL)
2571        {
2572        if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2573          complete_callout(previous_callout, ptr, cd);
2574        previous_callout = NULL;
2575        }
2576      if ((options & PCRE_AUTO_CALLOUT) != 0)
2577        {
2578        previous_callout = code;
2579        code = auto_callout(code, ptr, cd);
2580        }
2581      goto NORMAL_CHAR;
2582      }
2583    }
2584
2585  /* Fill in length of a previous callout, except when the next thing is
2586  a quantifier. */
2587
2588  is_quantifier = c == '*' || c == '+' || c == '?' ||
2589    (c == '{' && is_counted_repeat(ptr+1));
2590
2591  if (!is_quantifier && previous_callout != NULL &&
2592       after_manual_callout-- <= 0)
2593    {
2594    if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2595      complete_callout(previous_callout, ptr, cd);
2596    previous_callout = NULL;
2597    }
2598
2599  /* In extended mode, skip white space and comments */
2600
2601  if ((options & PCRE_EXTENDED) != 0)
2602    {
2603    if ((cd->ctypes[c] & ctype_space) != 0) continue;
2604    if (c == '#')
2605      {
2606      while (*(++ptr) != 0)
2607        {
2608        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2609        }
2610      if (*ptr != 0) continue;
2611
2612      /* Else fall through to handle end of string */
2613      c = 0;
2614      }
2615    }
2616
2617  /* No auto callout for quantifiers. */
2618
2619  if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2620    {
2621    previous_callout = code;
2622    code = auto_callout(code, ptr, cd);
2623    }
2624
2625  switch(c)
2626    {
2627    /* ===================================================================*/
2628    case 0:                        /* The branch terminates at string end */
2629    case '|':                      /* or | or ) */
2630    case ')':
2631    *firstbyteptr = firstbyte;
2632    *reqbyteptr = reqbyte;
2633    *codeptr = code;
2634    *ptrptr = ptr;
2635    if (lengthptr != NULL)
2636      {
2637      if (OFLOW_MAX - *lengthptr < code - last_code)
2638        {
2639        *errorcodeptr = ERR20;
2640        goto FAILED;
2641        }
2642      *lengthptr += code - last_code;   /* To include callout length */
2643      DPRINTF((">> end branch\n"));
2644      }
2645    return TRUE;
2646
2647
2648    /* ===================================================================*/
2649    /* Handle single-character metacharacters. In multiline mode, ^ disables
2650    the setting of any following char as a first character. */
2651
2652    case '^':
2653    if ((options & PCRE_MULTILINE) != 0)
2654      {
2655      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2656      }
2657    previous = NULL;
2658    *code++ = OP_CIRC;
2659    break;
2660
2661    case '$':
2662    previous = NULL;
2663    *code++ = OP_DOLL;
2664    break;
2665
2666    /* There can never be a first char if '.' is first, whatever happens about
2667    repeats. The value of reqbyte doesn't change either. */
2668
2669    case '.':
2670    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2671    zerofirstbyte = firstbyte;
2672    zeroreqbyte = reqbyte;
2673    previous = code;
2674    *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2675    break;
2676
2677
2678    /* ===================================================================*/
2679    /* Character classes. If the included characters are all < 256, we build a
2680    32-byte bitmap of the permitted characters, except in the special case
2681    where there is only one such character. For negated classes, we build the
2682    map as usual, then invert it at the end. However, we use a different opcode
2683    so that data characters > 255 can be handled correctly.
2684
2685    If the class contains characters outside the 0-255 range, a different
2686    opcode is compiled. It may optionally have a bit map for characters < 256,
2687    but those above are are explicitly listed afterwards. A flag byte tells
2688    whether the bitmap is present, and whether this is a negated class or not.
2689
2690    In JavaScript compatibility mode, an isolated ']' causes an error. In
2691    default (Perl) mode, it is treated as a data character. */
2692
2693    case ']':
2694    if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2695      {
2696      *errorcodeptr = ERR64;
2697      goto FAILED;
2698      }
2699    goto NORMAL_CHAR;
2700
2701    case '[':
2702    previous = code;
2703
2704    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2705    they are encountered at the top level, so we'll do that too. */
2706
2707    if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2708        check_posix_syntax(ptr, &tempptr))
2709      {
2710      *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2711      goto FAILED;
2712      }
2713
2714    /* If the first character is '^', set the negation flag and skip it. Also,
2715    if the first few characters (either before or after ^) are \Q\E or \E we
2716    skip them too. This makes for compatibility with Perl. */
2717
2718    negate_class = FALSE;
2719    for (;;)
2720      {
2721      c = *(++ptr);
2722      if (c == '\\')
2723        {
2724        if (ptr[1] == 'E') ptr++;
2725          else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2726            else break;
2727        }
2728      else if (!negate_class && c == '^')
2729        negate_class = TRUE;
2730      else break;
2731      }
2732
2733    /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2734    an initial ']' is taken as a data character -- the code below handles
2735    that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2736    [^] must match any character, so generate OP_ALLANY. */
2737
2738    if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2739      {
2740      *code++ = negate_class? OP_ALLANY : OP_FAIL;
2741      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2742      zerofirstbyte = firstbyte;
2743      break;
2744      }
2745
2746    /* If a class contains a negative special such as \S, we need to flip the
2747    negation flag at the end, so that support for characters > 255 works
2748    correctly (they are all included in the class). */
2749
2750    should_flip_negation = FALSE;
2751
2752    /* Keep a count of chars with values < 256 so that we can optimize the case
2753    of just a single character (as long as it's < 256). However, For higher
2754    valued UTF-8 characters, we don't yet do any optimization. */
2755
2756    class_charcount = 0;
2757    class_lastchar = -1;
2758
2759    /* Initialize the 32-char bit map to all zeros. We build the map in a
2760    temporary bit of memory, in case the class contains only 1 character (less
2761    than 256), because in that case the compiled code doesn't use the bit map.
2762    */
2763
2764    memset(classbits, 0, 32 * sizeof(uschar));
2765
2766#ifdef SUPPORT_UTF8
2767    class_utf8 = FALSE;                       /* No chars >= 256 */
2768    class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2769    class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2770#endif
2771
2772    /* Process characters until ] is reached. By writing this as a "do" it
2773    means that an initial ] is taken as a data character. At the start of the
2774    loop, c contains the first byte of the character. */
2775
2776    if (c != 0) do
2777      {
2778      const uschar *oldptr;
2779
2780#ifdef SUPPORT_UTF8
2781      if (utf8 && c > 127)
2782        {                           /* Braces are required because the */
2783        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2784        }
2785
2786      /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2787      data and reset the pointer. This is so that very large classes that
2788      contain a zillion UTF-8 characters no longer overwrite the work space
2789      (which is on the stack). */
2790
2791      if (lengthptr != NULL)
2792        {
2793        *lengthptr += class_utf8data - class_utf8data_base;
2794        class_utf8data = class_utf8data_base;
2795        }
2796
2797#endif
2798
2799      /* Inside \Q...\E everything is literal except \E */
2800
2801      if (inescq)
2802        {
2803        if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2804          {
2805          inescq = FALSE;                   /* Reset literal state */
2806          ptr++;                            /* Skip the 'E' */
2807          continue;                         /* Carry on with next */
2808          }
2809        goto CHECK_RANGE;                   /* Could be range if \E follows */
2810        }
2811
2812      /* Handle POSIX class names. Perl allows a negation extension of the
2813      form [:^name:]. A square bracket that doesn't match the syntax is
2814      treated as a literal. We also recognize the POSIX constructions
2815      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2816      5.6 and 5.8 do. */
2817
2818      if (c == '[' &&
2819          (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2820          check_posix_syntax(ptr, &tempptr))
2821        {
2822        BOOL local_negate = FALSE;
2823        int posix_class, taboffset, tabopt;
2824        register const uschar *cbits = cd->cbits;
2825        uschar pbits[32];
2826
2827        if (ptr[1] != ':')
2828          {
2829          *errorcodeptr = ERR31;
2830          goto FAILED;
2831          }
2832
2833        ptr += 2;
2834        if (*ptr == '^')
2835          {
2836          local_negate = TRUE;
2837          should_flip_negation = TRUE;  /* Note negative special */
2838          ptr++;
2839          }
2840
2841        posix_class = check_posix_name(ptr, tempptr - ptr);
2842        if (posix_class < 0)
2843          {
2844          *errorcodeptr = ERR30;
2845          goto FAILED;
2846          }
2847
2848        /* If matching is caseless, upper and lower are converted to
2849        alpha. This relies on the fact that the class table starts with
2850        alpha, lower, upper as the first 3 entries. */
2851
2852        if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2853          posix_class = 0;
2854
2855        /* We build the bit map for the POSIX class in a chunk of local store
2856        because we may be adding and subtracting from it, and we don't want to
2857        subtract bits that may be in the main map already. At the end we or the
2858        result into the bit map that is being built. */
2859
2860        posix_class *= 3;
2861
2862        /* Copy in the first table (always present) */
2863
2864        memcpy(pbits, cbits + posix_class_maps[posix_class],
2865          32 * sizeof(uschar));
2866
2867        /* If there is a second table, add or remove it as required. */
2868
2869        taboffset = posix_class_maps[posix_class + 1];
2870        tabopt = posix_class_maps[posix_class + 2];
2871
2872        if (taboffset >= 0)
2873          {
2874          if (tabopt >= 0)
2875            for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2876          else
2877            for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2878          }
2879
2880        /* Now see if we need to remove any special characters. An option
2881        value of 1 removes vertical space and 2 removes underscore. */
2882
2883        if (tabopt < 0) tabopt = -tabopt;
2884        if (tabopt == 1) pbits[1] &= ~0x3c;
2885          else if (tabopt == 2) pbits[11] &= 0x7f;
2886
2887        /* Add the POSIX table or its complement into the main table that is
2888        being built and we are done. */
2889
2890        if (local_negate)
2891          for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2892        else
2893          for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2894
2895        ptr = tempptr + 1;
2896        class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2897        continue;    /* End of POSIX syntax handling */
2898        }
2899
2900      /* Backslash may introduce a single character, or it may introduce one
2901      of the specials, which just set a flag. The sequence \b is a special
2902      case. Inside a class (and only there) it is treated as backspace.
2903      Elsewhere it marks a word boundary. Other escapes have preset maps ready
2904      to 'or' into the one we are building. We assume they have more than one
2905      character in them, so set class_charcount bigger than one. */
2906
2907      if (c == '\\')
2908        {
2909        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2910        if (*errorcodeptr != 0) goto FAILED;
2911
2912        if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2913        else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2914        else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2915        else if (-c == ESC_Q)            /* Handle start of quoted string */
2916          {
2917          if (ptr[1] == '\\' && ptr[2] == 'E')
2918            {
2919            ptr += 2; /* avoid empty string */
2920            }
2921          else inescq = TRUE;
2922          continue;
2923          }
2924        else if (-c == ESC_E) continue;  /* Ignore orphan \E */
2925
2926        if (c < 0)
2927          {
2928          register const uschar *cbits = cd->cbits;
2929          class_charcount += 2;     /* Greater than 1 is what matters */
2930
2931          /* Save time by not doing this in the pre-compile phase. */
2932
2933          if (lengthptr == NULL) switch (-c)
2934            {
2935            case ESC_d:
2936            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2937            continue;
2938
2939            case ESC_D:
2940            should_flip_negation = TRUE;
2941            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2942            continue;
2943
2944            case ESC_w:
2945            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2946            continue;
2947
2948            case ESC_W:
2949            should_flip_negation = TRUE;
2950            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2951            continue;
2952
2953            case ESC_s:
2954            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2955            classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
2956            continue;
2957
2958            case ESC_S:
2959            should_flip_negation = TRUE;
2960            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2961            classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2962            continue;
2963
2964            default:    /* Not recognized; fall through */
2965            break;      /* Need "default" setting to stop compiler warning. */
2966            }
2967
2968          /* In the pre-compile phase, just do the recognition. */
2969
2970          else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2971                   c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2972
2973          /* We need to deal with \H, \h, \V, and \v in both phases because
2974          they use extra memory. */
2975
2976          if (-c == ESC_h)
2977            {
2978            SETBIT(classbits, 0x09); /* HT */
2979            SETBIT(classbits, 0x20); /* SPACE */
2980            SETBIT(classbits, 0xa0); /* NBSP */
2981#ifdef SUPPORT_UTF8
2982            if (utf8)
2983              {
2984              class_utf8 = TRUE;
2985              *class_utf8data++ = XCL_SINGLE;
2986              class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2987              *class_utf8data++ = XCL_SINGLE;
2988              class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2989              *class_utf8data++ = XCL_RANGE;
2990              class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2991              class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2992              *class_utf8data++ = XCL_SINGLE;
2993              class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2994              *class_utf8data++ = XCL_SINGLE;
2995              class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2996              *class_utf8data++ = XCL_SINGLE;
2997              class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2998              }
2999#endif
3000            continue;
3001            }
3002
3003          if (-c == ESC_H)
3004            {
3005            for (c = 0; c < 32; c++)
3006              {
3007              int x = 0xff;
3008              switch (c)
3009                {
3010                case 0x09/8: x ^= 1 << (0x09%8); break;
3011                case 0x20/8: x ^= 1 << (0x20%8); break;
3012                case 0xa0/8: x ^= 1 << (0xa0%8); break;
3013                default: break;
3014                }
3015              classbits[c] |= x;
3016              }
3017
3018#ifdef SUPPORT_UTF8
3019            if (utf8)
3020              {
3021              class_utf8 = TRUE;
3022              *class_utf8data++ = XCL_RANGE;
3023              class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3024              class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3025              *class_utf8data++ = XCL_RANGE;
3026              class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3027              class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3028              *class_utf8data++ = XCL_RANGE;
3029              class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3030              class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3031              *class_utf8data++ = XCL_RANGE;
3032              class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3033              class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3034              *class_utf8data++ = XCL_RANGE;
3035              class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3036              class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3037              *class_utf8data++ = XCL_RANGE;
3038              class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3039              class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3040              *class_utf8data++ = XCL_RANGE;
3041              class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3042              class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3043              }
3044#endif
3045            continue;
3046            }
3047
3048          if (-c == ESC_v)
3049            {
3050            SETBIT(classbits, 0x0a); /* LF */
3051            SETBIT(classbits, 0x0b); /* VT */
3052            SETBIT(classbits, 0x0c); /* FF */
3053            SETBIT(classbits, 0x0d); /* CR */
3054            SETBIT(classbits, 0x85); /* NEL */
3055#ifdef SUPPORT_UTF8
3056            if (utf8)
3057              {
3058              class_utf8 = TRUE;
3059              *class_utf8data++ = XCL_RANGE;
3060              class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3061              class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3062              }
3063#endif
3064            continue;
3065            }
3066
3067          if (-c == ESC_V)
3068            {
3069            for (c = 0; c < 32; c++)
3070              {
3071              int x = 0xff;
3072              switch (c)
3073                {
3074                case 0x0a/8: x ^= 1 << (0x0a%8);
3075                             x ^= 1 << (0x0b%8);
3076                             x ^= 1 << (0x0c%8);
3077                             x ^= 1 << (0x0d%8);
3078                             break;
3079                case 0x85/8: x ^= 1 << (0x85%8); break;
3080                default: break;
3081                }
3082              classbits[c] |= x;
3083              }
3084
3085#ifdef SUPPORT_UTF8
3086            if (utf8)                          /* Code points > 255 that are NOT */
3087              {                                /* vertical space go in the XCLASS */
3088              class_utf8 = TRUE;
3089              *class_utf8data++ = XCL_RANGE;   /* 0x0100 up to just below U+2028 */
3090              class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3091              class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3092              *class_utf8data++ = XCL_RANGE;   /* Resume just above U+2029; both */
3093              class_utf8data += _pcre_ord2utf8(0x202a, class_utf8data);
3094              class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3095              }                                /* U+2028 and U+2029 are \v chars */
3096#endif
3097            continue;
3098            }
3099
3100          /* We need to deal with \P and \p in both phases. */
3101
3102#ifdef SUPPORT_UCP
3103          if (-c == ESC_p || -c == ESC_P)
3104            {
3105            BOOL negated;
3106            int pdata;
3107            int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3108            if (ptype < 0) goto FAILED;
3109            class_utf8 = TRUE;
3110            *class_utf8data++ = ((-c == ESC_p) != negated)?
3111              XCL_PROP : XCL_NOTPROP;
3112            *class_utf8data++ = ptype;
3113            *class_utf8data++ = pdata;
3114            class_charcount -= 2;   /* Not a < 256 character */
3115            continue;
3116            }
3117#endif
3118          /* Unrecognized escapes are faulted if PCRE is running in its
3119          strict mode. By default, for compatibility with Perl, they are
3120          treated as literals. */
3121
3122          if ((options & PCRE_EXTRA) != 0)
3123            {
3124            *errorcodeptr = ERR7;
3125            goto FAILED;
3126            }
3127
3128          class_charcount -= 2;  /* Undo the default count from above */
3129          c = *ptr;              /* Get the final character and fall through */
3130          }
3131
3132        /* Fall through if we have a single character (c >= 0). This may be
3133        greater than 256 in UTF-8 mode. */
3134
3135        }   /* End of backslash handling */
3136
3137      /* A single character may be followed by '-' to form a range. However,
3138      Perl does not permit ']' to be the end of the range. A '-' character
3139      at the end is treated as a literal. Perl ignores orphaned \E sequences
3140      entirely. The code for handling \Q and \E is messy. */
3141
3142      CHECK_RANGE:
3143      while (ptr[1] == '\\' && ptr[2] == 'E')
3144        {
3145        inescq = FALSE;
3146        ptr += 2;
3147        }
3148
3149      oldptr = ptr;
3150
3151      /* Remember \r or \n */
3152
3153      if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3154
3155      /* Check for range */
3156
3157      if (!inescq && ptr[1] == '-')
3158        {
3159        int d;
3160        ptr += 2;
3161        while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3162
3163        /* If we hit \Q (not followed by \E) at this point, go into escaped
3164        mode. */
3165
3166        while (*ptr == '\\' && ptr[1] == 'Q')
3167          {
3168          ptr += 2;
3169          if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3170          inescq = TRUE;
3171          break;
3172          }
3173
3174        if (*ptr == 0 || (!inescq && *ptr == ']'))
3175          {
3176          ptr = oldptr;
3177          goto LONE_SINGLE_CHARACTER;
3178          }
3179
3180#ifdef SUPPORT_UTF8
3181        if (utf8)
3182          {                           /* Braces are required because the */
3183          GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
3184          }
3185        else
3186#endif
3187        d = *ptr;  /* Not UTF-8 mode */
3188
3189        /* The second part of a range can be a single-character escape, but
3190        not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3191        in such circumstances. */
3192
3193        if (!inescq && d == '\\')
3194          {
3195          d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3196          if (*errorcodeptr != 0) goto FAILED;
3197
3198          /* \b is backspace; \X is literal X; \R is literal R; any other
3199          special means the '-' was literal */
3200
3201          if (d < 0)
3202            {
3203            if (d == -ESC_b) d = '\b';
3204            else if (d == -ESC_X) d = 'X';
3205            else if (d == -ESC_R) d = 'R'; else
3206              {
3207              ptr = oldptr;
3208              goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3209              }
3210            }
3211          }
3212
3213        /* Check that the two values are in the correct order. Optimize
3214        one-character ranges */
3215
3216        if (d < c)
3217          {
3218          *errorcodeptr = ERR8;
3219          goto FAILED;
3220          }
3221
3222        if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3223
3224        /* Remember \r or \n */
3225
3226        if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3227
3228        /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3229        matching, we have to use an XCLASS with extra data items. Caseless
3230        matching for characters > 127 is available only if UCP support is
3231        available. */
3232
3233#ifdef SUPPORT_UTF8
3234        if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3235          {
3236          class_utf8 = TRUE;
3237
3238          /* With UCP support, we can find the other case equivalents of
3239          the relevant characters. There may be several ranges. Optimize how
3240          they fit with the basic range. */
3241
3242#ifdef SUPPORT_UCP
3243          if ((options & PCRE_CASELESS) != 0)
3244            {
3245            unsigned int occ, ocd;
3246            unsigned int cc = c;
3247            unsigned int origd = d;
3248            while (get_othercase_range(&cc, origd, &occ, &ocd))
3249              {
3250              if (occ >= (unsigned int)c &&
3251                  ocd <= (unsigned int)d)
3252                continue;                          /* Skip embedded ranges */
3253
3254              if (occ < (unsigned int)c  &&
3255                  ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3256                {                                  /* if there is overlap,   */
3257                c = occ;                           /* noting that if occ < c */
3258                continue;                          /* we can't have ocd > d  */
3259                }                                  /* because a subrange is  */
3260              if (ocd > (unsigned int)d &&
3261                  occ <= (unsigned int)d + 1)      /* always shorter than    */
3262                {                                  /* the basic range.       */
3263                d = ocd;
3264                continue;
3265                }
3266
3267              if (occ == ocd)
3268                {
3269                *class_utf8data++ = XCL_SINGLE;
3270                }
3271              else
3272                {
3273                *class_utf8data++ = XCL_RANGE;
3274                class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3275                }
3276              class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3277              }
3278            }
3279#endif  /* SUPPORT_UCP */
3280
3281          /* Now record the original range, possibly modified for UCP caseless
3282          overlapping ranges. */
3283
3284          *class_utf8data++ = XCL_RANGE;
3285          class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3286          class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3287
3288          /* With UCP support, we are done. Without UCP support, there is no
3289          caseless matching for UTF-8 characters > 127; we can use the bit map
3290          for the smaller ones. */
3291
3292#ifdef SUPPORT_UCP
3293          continue;    /* With next character in the class */
3294#else
3295          if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3296
3297          /* Adjust upper limit and fall through to set up the map */
3298
3299          d = 127;
3300
3301#endif  /* SUPPORT_UCP */
3302          }
3303#endif  /* SUPPORT_UTF8 */
3304
3305        /* We use the bit map for all cases when not in UTF-8 mode; else
3306        ranges that lie entirely within 0-127 when there is UCP support; else
3307        for partial ranges without UCP support. */
3308
3309        class_charcount += d - c + 1;
3310        class_lastchar = d;
3311
3312        /* We can save a bit of time by skipping this in the pre-compile. */
3313
3314        if (lengthptr == NULL) for (; c <= d; c++)
3315          {
3316          classbits[c/8] |= (1 << (c&7));
3317          if ((options & PCRE_CASELESS) != 0)
3318            {
3319            int uc = cd->fcc[c];           /* flip case */
3320            classbits[uc/8] |= (1 << (uc&7));
3321            }
3322          }
3323
3324        continue;   /* Go get the next char in the class */
3325        }
3326
3327      /* Handle a lone single character - we can get here for a normal
3328      non-escape char, or after \ that introduces a single character or for an
3329      apparent range that isn't. */
3330
3331      LONE_SINGLE_CHARACTER:
3332
3333      /* Handle a character that cannot go in the bit map */
3334
3335#ifdef SUPPORT_UTF8
3336      if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3337        {
3338        class_utf8 = TRUE;
3339        *class_utf8data++ = XCL_SINGLE;
3340        class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3341
3342#ifdef SUPPORT_UCP
3343        if ((options & PCRE_CASELESS) != 0)
3344          {
3345          unsigned int othercase;
3346          if ((othercase = UCD_OTHERCASE(c)) != c)
3347            {
3348            *class_utf8data++ = XCL_SINGLE;
3349            class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3350            }
3351          }
3352#endif  /* SUPPORT_UCP */
3353
3354        }
3355      else
3356#endif  /* SUPPORT_UTF8 */
3357
3358      /* Handle a single-byte character */
3359        {
3360        classbits[c/8] |= (1 << (c&7));
3361        if ((options & PCRE_CASELESS) != 0)
3362          {
3363          c = cd->fcc[c];   /* flip case */
3364          classbits[c/8] |= (1 << (c&7));
3365          }
3366        class_charcount++;
3367        class_lastchar = c;
3368        }
3369      }
3370
3371    /* Loop until ']' reached. This "while" is the end of the "do" above. */
3372
3373    while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3374
3375    if (c == 0)                          /* Missing terminating ']' */
3376      {
3377      *errorcodeptr = ERR6;
3378      goto FAILED;
3379      }
3380
3381
3382/* This code has been disabled because it would mean that \s counts as
3383an explicit \r or \n reference, and that's not really what is wanted. Now
3384we set the flag only if there is a literal "\r" or "\n" in the class. */
3385
3386#if 0
3387    /* Remember whether \r or \n are in this class */
3388
3389    if (negate_class)
3390      {
3391      if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3392      }
3393    else
3394      {
3395      if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3396      }
3397#endif
3398
3399
3400    /* If class_charcount is 1, we saw precisely one character whose value is
3401    less than 256. As long as there were no characters >= 128 and there was no
3402    use of \p or \P, in other words, no use of any XCLASS features, we can
3403    optimize.
3404
3405    In UTF-8 mode, we can optimize the negative case only if there were no
3406    characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3407    operate on single-bytes only. This is an historical hangover. Maybe one day
3408    we can tidy these opcodes to handle multi-byte characters.
3409
3410    The optimization throws away the bit map. We turn the item into a
3411    1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3412    that OP_NOT does not support multibyte characters. In the positive case, it
3413    can cause firstbyte to be set. Otherwise, there can be no first char if
3414    this item is first, whatever repeat count may follow. In the case of
3415    reqbyte, save the previous value for reinstating. */
3416
3417#ifdef SUPPORT_UTF8
3418    if (class_charcount == 1 && !class_utf8 &&
3419      (!utf8 || !negate_class || class_lastchar < 128))
3420#else
3421    if (class_charcount == 1)
3422#endif
3423      {
3424      zeroreqbyte = reqbyte;
3425
3426      /* The OP_NOT opcode works on one-byte characters only. */
3427
3428      if (negate_class)
3429        {
3430        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3431        zerofirstbyte = firstbyte;
3432        *code++ = OP_NOT;
3433        *code++ = class_lastchar;
3434        break;
3435        }
3436
3437      /* For a single, positive character, get the value into mcbuffer, and
3438      then we can handle this with the normal one-character code. */
3439
3440#ifdef SUPPORT_UTF8
3441      if (utf8 && class_lastchar > 127)
3442        mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3443      else
3444#endif
3445        {
3446        mcbuffer[0] = class_lastchar;
3447        mclength = 1;
3448        }
3449      goto ONE_CHAR;
3450      }       /* End of 1-char optimization */
3451
3452    /* The general case - not the one-char optimization. If this is the first
3453    thing in the branch, there can be no first char setting, whatever the
3454    repeat count. Any reqbyte setting must remain unchanged after any kind of
3455    repeat. */
3456
3457    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3458    zerofirstbyte = firstbyte;
3459    zeroreqbyte = reqbyte;
3460
3461    /* If there are characters with values > 255, we have to compile an
3462    extended class, with its own opcode, unless there was a negated special
3463    such as \S in the class, because in that case all characters > 255 are in
3464    the class, so any that were explicitly given as well can be ignored. If
3465    (when there are explicit characters > 255 that must be listed) there are no
3466    characters < 256, we can omit the bitmap in the actual compiled code. */
3467
3468#ifdef SUPPORT_UTF8
3469    if (class_utf8 && !should_flip_negation)
3470      {
3471      *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3472      *code++ = OP_XCLASS;
3473      code += LINK_SIZE;
3474      *code = negate_class? XCL_NOT : 0;
3475
3476      /* If the map is required, move up the extra data to make room for it;
3477      otherwise just move the code pointer to the end of the extra data. */
3478
3479      if (class_charcount > 0)
3480        {
3481        *code++ |= XCL_MAP;
3482        memmove(code + 32, code, class_utf8data - code);
3483        memcpy(code, classbits, 32);
3484        code = class_utf8data + 32;
3485        }
3486      else code = class_utf8data;
3487
3488      /* Now fill in the complete length of the item */
3489
3490      PUT(previous, 1, code - previous);
3491      break;   /* End of class handling */
3492      }
3493#endif
3494
3495    /* If there are no characters > 255, set the opcode to OP_CLASS or
3496    OP_NCLASS, depending on whether the whole class was negated and whether
3497    there were negative specials such as \S in the class. Then copy the 32-byte
3498    map into the code vector, negating it if necessary. */
3499
3500    *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3501    if (negate_class)
3502      {
3503      if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3504        for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3505      }
3506    else
3507      {
3508      memcpy(code, classbits, 32);
3509      }
3510    code += 32;
3511    break;
3512
3513
3514    /* ===================================================================*/
3515    /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3516    has been tested above. */
3517
3518    case '{':
3519    if (!is_quantifier) goto NORMAL_CHAR;
3520    ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3521    if (*errorcodeptr != 0) goto FAILED;
3522    goto REPEAT;
3523
3524    case '*':
3525    repeat_min = 0;
3526    repeat_max = -1;
3527    goto REPEAT;
3528
3529    case '+':
3530    repeat_min = 1;
3531    repeat_max = -1;
3532    goto REPEAT;
3533
3534    case '?':
3535    repeat_min = 0;
3536    repeat_max = 1;
3537
3538    REPEAT:
3539    if (previous == NULL)
3540      {
3541      *errorcodeptr = ERR9;
3542      goto FAILED;
3543      }
3544
3545    if (repeat_min == 0)
3546      {
3547      firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
3548      reqbyte = zeroreqbyte;        /* Ditto */
3549      }
3550
3551    /* Remember whether this is a variable length repeat */
3552
3553    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3554
3555    op_type = 0;                    /* Default single-char op codes */
3556    possessive_quantifier = FALSE;  /* Default not possessive quantifier */
3557
3558    /* Save start of previous item, in case we have to move it up to make space
3559    for an inserted OP_ONCE for the additional '+' extension. */
3560
3561    tempcode = previous;
3562
3563    /* If the next character is '+', we have a possessive quantifier. This
3564    implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3565    If the next character is '?' this is a minimizing repeat, by default,
3566    but if PCRE_UNGREEDY is set, it works the other way round. We change the
3567    repeat type to the non-default. */
3568
3569    if (ptr[1] == '+')
3570      {
3571      repeat_type = 0;                  /* Force greedy */
3572      possessive_quantifier = TRUE;
3573      ptr++;
3574      }
3575    else if (ptr[1] == '?')
3576      {
3577      repeat_type = greedy_non_default;
3578      ptr++;
3579      }
3580    else repeat_type = greedy_default;
3581
3582    /* If previous was a character match, abolish the item and generate a
3583    repeat item instead. If a char item has a minimum of more than one, ensure
3584    that it is set in reqbyte - it might not be if a sequence such as x{3} is
3585    the first thing in a branch because the x will have gone into firstbyte
3586    instead.  */
3587
3588    if (*previous == OP_CHAR || *previous == OP_CHARNC)
3589      {
3590      /* Deal with UTF-8 characters that take up more than one byte. It's
3591      easier to write this out separately than try to macrify it. Use c to
3592      hold the length of the character in bytes, plus 0x80 to flag that it's a
3593      length rather than a small character. */
3594
3595#ifdef SUPPORT_UTF8
3596      if (utf8 && (code[-1] & 0x80) != 0)
3597        {
3598        uschar *lastchar = code - 1;
3599        while((*lastchar & 0xc0) == 0x80) lastchar--;
3600        c = code - lastchar;            /* Length of UTF-8 character */
3601        memcpy(utf8_char, lastchar, c); /* Save the char */
3602        c |= 0x80;                      /* Flag c as a length */
3603        }
3604      else
3605#endif
3606
3607      /* Handle the case of a single byte - either with no UTF8 support, or
3608      with UTF-8 disabled, or for a UTF-8 character < 128. */
3609
3610        {
3611        c = code[-1];
3612        if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3613        }
3614
3615      /* If the repetition is unlimited, it pays to see if the next thing on
3616      the line is something that cannot possibly match this character. If so,
3617      automatically possessifying this item gains some performance in the case
3618      where the match fails. */
3619
3620      if (!possessive_quantifier &&
3621          repeat_max < 0 &&
3622          check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3623            options, cd))
3624        {
3625        repeat_type = 0;    /* Force greedy */
3626        possessive_quantifier = TRUE;
3627        }
3628
3629      goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3630      }
3631
3632    /* If previous was a single negated character ([^a] or similar), we use
3633    one of the special opcodes, replacing it. The code is shared with single-
3634    character repeats by setting op_type to add a suitable offset into
3635    repeat_type. We can also test for auto-possessification. OP_NOT is
3636    currently used only for single-byte chars. */
3637
3638    else if (*previous == OP_NOT)
3639      {
3640      op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3641      c = previous[1];
3642      if (!possessive_quantifier &&
3643          repeat_max < 0 &&
3644          check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3645        {
3646        repeat_type = 0;    /* Force greedy */
3647        possessive_quantifier = TRUE;
3648        }
3649      goto OUTPUT_SINGLE_REPEAT;
3650      }
3651
3652    /* If previous was a character type match (\d or similar), abolish it and
3653    create a suitable repeat item. The code is shared with single-character
3654    repeats by setting op_type to add a suitable offset into repeat_type. Note
3655    that the Unicode property types will be present only when SUPPORT_UCP is
3656    defined, but we don't wrap the little bits of code here because it just
3657    makes it horribly messy. */
3658
3659    else if (*previous < OP_EODN)
3660      {
3661      uschar *oldcode;
3662      int prop_type, prop_value;
3663      op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3664      c = *previous;
3665
3666      if (!possessive_quantifier &&
3667          repeat_max < 0 &&
3668          check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3669        {
3670        repeat_type = 0;    /* Force greedy */
3671        possessive_quantifier = TRUE;
3672        }
3673
3674      OUTPUT_SINGLE_REPEAT:
3675      if (*previous == OP_PROP || *previous == OP_NOTPROP)
3676        {
3677        prop_type = previous[1];
3678        prop_value = previous[2];
3679        }
3680      else prop_type = prop_value = -1;
3681
3682      oldcode = code;
3683      code = previous;                  /* Usually overwrite previous item */
3684
3685      /* If the maximum is zero then the minimum must also be zero; Perl allows
3686      this case, so we do too - by simply omitting the item altogether. */
3687
3688      if (repeat_max == 0) goto END_REPEAT;
3689
3690      /* All real repeats make it impossible to handle partial matching (maybe
3691      one day we will be able to remove this restriction). */
3692
3693      if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3694
3695      /* Combine the op_type with the repeat_type */
3696
3697      repeat_type += op_type;
3698
3699      /* A minimum of zero is handled either as the special case * or ?, or as
3700      an UPTO, with the maximum given. */
3701
3702      if (repeat_min == 0)
3703        {
3704        if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3705          else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3706        else
3707          {
3708          *code++ = OP_UPTO + repeat_type;
3709          PUT2INC(code, 0, repeat_max);
3710          }
3711        }
3712
3713      /* A repeat minimum of 1 is optimized into some special cases. If the
3714      maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3715      left in place and, if the maximum is greater than 1, we use OP_UPTO with
3716      one less than the maximum. */
3717
3718      else if (repeat_min == 1)
3719        {
3720        if (repeat_max == -1)
3721          *code++ = OP_PLUS + repeat_type;
3722        else
3723          {
3724          code = oldcode;                 /* leave previous item in place */
3725          if (repeat_max == 1) goto END_REPEAT;
3726          *code++ = OP_UPTO + repeat_type;
3727          PUT2INC(code, 0, repeat_max - 1);
3728          }
3729        }
3730
3731      /* The case {n,n} is just an EXACT, while the general case {n,m} is
3732      handled as an EXACT followed by an UPTO. */
3733
3734      else
3735        {
3736        *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
3737        PUT2INC(code, 0, repeat_min);
3738
3739        /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3740        we have to insert the character for the previous code. For a repeated
3741        Unicode property match, there are two extra bytes that define the
3742        required property. In UTF-8 mode, long characters have their length in
3743        c, with the 0x80 bit as a flag. */
3744
3745        if (repeat_max < 0)
3746          {
3747#ifdef SUPPORT_UTF8
3748          if (utf8 && c >= 128)
3749            {
3750            memcpy(code, utf8_char, c & 7);
3751            code += c & 7;
3752            }
3753          else
3754#endif
3755            {
3756            *code++ = c;
3757            if (prop_type >= 0)
3758              {
3759              *code++ = prop_type;
3760              *code++ = prop_value;
3761              }
3762            }
3763          *code++ = OP_STAR + repeat_type;
3764          }
3765
3766        /* Else insert an UPTO if the max is greater than the min, again
3767        preceded by the character, for the previously inserted code. If the
3768        UPTO is just for 1 instance, we can use QUERY instead. */
3769
3770        else if (repeat_max != repeat_min)
3771          {
3772#ifdef SUPPORT_UTF8
3773          if (utf8 && c >= 128)
3774            {
3775            memcpy(code, utf8_char, c & 7);
3776            code += c & 7;
3777            }
3778          else
3779#endif
3780          *code++ = c;
3781          if (prop_type >= 0)
3782            {
3783            *code++ = prop_type;
3784            *code++ = prop_value;
3785            }
3786          repeat_max -= repeat_min;
3787
3788          if (repeat_max == 1)
3789            {
3790            *code++ = OP_QUERY + repeat_type;
3791            }
3792          else
3793            {
3794            *code++ = OP_UPTO + repeat_type;
3795            PUT2INC(code, 0, repeat_max);
3796            }
3797          }
3798        }
3799
3800      /* The character or character type itself comes last in all cases. */
3801
3802#ifdef SUPPORT_UTF8
3803      if (utf8 && c >= 128)
3804        {
3805        memcpy(code, utf8_char, c & 7);
3806        code += c & 7;
3807        }
3808      else
3809#endif
3810      *code++ = c;
3811
3812      /* For a repeated Unicode property match, there are two extra bytes that
3813      define the required property. */
3814
3815#ifdef SUPPORT_UCP
3816      if (prop_type >= 0)
3817        {
3818        *code++ = prop_type;
3819        *code++ = prop_value;
3820        }
3821#endif
3822      }
3823
3824    /* If previous was a character class or a back reference, we put the repeat
3825    stuff after it, but just skip the item if the repeat was {0,0}. */
3826
3827    else if (*previous == OP_CLASS ||
3828             *previous == OP_NCLASS ||
3829#ifdef SUPPORT_UTF8
3830             *previous == OP_XCLASS ||
3831#endif
3832             *previous == OP_REF)
3833      {
3834      if (repeat_max == 0)
3835        {
3836        code = previous;
3837        goto END_REPEAT;
3838        }
3839
3840      /* All real repeats make it impossible to handle partial matching (maybe
3841      one day we will be able to remove this restriction). */
3842
3843      if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3844
3845      if (repeat_min == 0 && repeat_max == -1)
3846        *code++ = OP_CRSTAR + repeat_type;
3847      else if (repeat_min == 1 && repeat_max == -1)
3848        *code++ = OP_CRPLUS + repeat_type;
3849      else if (repeat_min == 0 && repeat_max == 1)
3850        *code++ = OP_CRQUERY + repeat_type;
3851      else
3852        {
3853        *code++ = OP_CRRANGE + repeat_type;
3854        PUT2INC(code, 0, repeat_min);
3855        if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
3856        PUT2INC(code, 0, repeat_max);
3857        }
3858      }
3859
3860    /* If previous was a bracket group, we may have to replicate it in certain
3861    cases. */
3862
3863    else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3864             *previous == OP_ONCE || *previous == OP_COND)
3865      {
3866      register int i;
3867      int ketoffset = 0;
3868      int len = code - previous;
3869      uschar *bralink = NULL;
3870
3871      /* Repeating a DEFINE group is pointless */
3872
3873      if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3874        {
3875        *errorcodeptr = ERR55;
3876        goto FAILED;
3877        }
3878
3879      /* If the maximum repeat count is unlimited, find the end of the bracket
3880      by scanning through from the start, and compute the offset back to it
3881      from the current code pointer. There may be an OP_OPT setting following
3882      the final KET, so we can't find the end just by going back from the code
3883      pointer. */
3884
3885      if (repeat_max == -1)
3886        {
3887        register uschar *ket = previous;
3888        do ket += GET(ket, 1); while (*ket != OP_KET);
3889        ketoffset = code - ket;
3890        }
3891
3892      /* The case of a zero minimum is special because of the need to stick
3893      OP_BRAZERO in front of it, and because the group appears once in the
3894      data, whereas in other cases it appears the minimum number of times. For
3895      this reason, it is simplest to treat this case separately, as otherwise
3896      the code gets far too messy. There are several special subcases when the
3897      minimum is zero. */
3898
3899      if (repeat_min == 0)
3900        {
3901        /* If the maximum is also zero, we used to just omit the group from the
3902        output altogether, like this:
3903
3904        ** if (repeat_max == 0)
3905        **   {
3906        **   code = previous;
3907        **   goto END_REPEAT;
3908        **   }
3909
3910        However, that fails when a group is referenced as a subroutine from
3911        elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3912        so that it is skipped on execution. As we don't have a list of which
3913        groups are referenced, we cannot do this selectively.
3914
3915        If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3916        and do no more at this point. However, we do need to adjust any
3917        OP_RECURSE calls inside the group that refer to the group itself or any
3918        internal or forward referenced group, because the offset is from the
3919        start of the whole regex. Temporarily terminate the pattern while doing
3920        this. */
3921
3922        if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
3923          {
3924          *code = OP_END;
3925          adjust_recurse(previous, 1, utf8, cd, save_hwm);
3926          memmove(previous+1, previous, len);
3927          code++;
3928          if (repeat_max == 0)
3929            {
3930            *previous++ = OP_SKIPZERO;
3931            goto END_REPEAT;
3932            }
3933          *previous++ = OP_BRAZERO + repeat_type;
3934          }
3935
3936        /* If the maximum is greater than 1 and limited, we have to replicate
3937        in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3938        The first one has to be handled carefully because it's the original
3939        copy, which has to be moved up. The remainder can be handled by code
3940        that is common with the non-zero minimum case below. We have to
3941        adjust the value of repeat_max, since one less copy is required. Once
3942        again, we may have to adjust any OP_RECURSE calls inside the group. */
3943
3944        else
3945          {
3946          int offset;
3947          *code = OP_END;
3948          adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3949          memmove(previous + 2 + LINK_SIZE, previous, len);
3950          code += 2 + LINK_SIZE;
3951          *previous++ = OP_BRAZERO + repeat_type;
3952          *previous++ = OP_BRA;
3953
3954          /* We chain together the bracket offset fields that have to be
3955          filled in later when the ends of the brackets are reached. */
3956
3957          offset = (bralink == NULL)? 0 : previous - bralink;
3958          bralink = previous;
3959          PUTINC(previous, 0, offset);
3960          }
3961
3962        repeat_max--;
3963        }
3964
3965      /* If the minimum is greater than zero, replicate the group as many
3966      times as necessary, and adjust the maximum to the number of subsequent
3967      copies that we need. If we set a first char from the group, and didn't
3968      set a required char, copy the latter from the former. If there are any
3969      forward reference subroutine calls in the group, there will be entries on
3970      the workspace list; replicate these with an appropriate increment. */
3971
3972      else
3973        {
3974        if (repeat_min > 1)
3975          {
3976          /* In the pre-compile phase, we don't actually do the replication. We
3977          just adjust the length as if we had. Do some paranoid checks for
3978          potential integer overflow. */
3979
3980          if (lengthptr != NULL)
3981            {
3982            int delta = (repeat_min - 1)*length_prevgroup;
3983            if ((double)(repeat_min - 1)*(double)length_prevgroup >
3984                                                            (double)INT_MAX ||
3985                OFLOW_MAX - *lengthptr < delta)
3986              {
3987              *errorcodeptr = ERR20;
3988              goto FAILED;
3989              }
3990            *lengthptr += delta;
3991            }
3992
3993          /* This is compiling for real */
3994
3995          else
3996            {
3997            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3998            for (i = 1; i < repeat_min; i++)
3999              {
4000              uschar *hc;
4001              uschar *this_hwm = cd->hwm;
4002              memcpy(code, previous, len);
4003              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4004                {
4005                PUT(cd->hwm, 0, GET(hc, 0) + len);
4006                cd->hwm += LINK_SIZE;
4007                }
4008              save_hwm = this_hwm;
4009              code += len;
4010              }
4011            }
4012          }
4013
4014        if (repeat_max > 0) repeat_max -= repeat_min;
4015        }
4016
4017      /* This code is common to both the zero and non-zero minimum cases. If
4018      the maximum is limited, it replicates the group in a nested fashion,
4019      remembering the bracket starts on a stack. In the case of a zero minimum,
4020      the first one was set up above. In all cases the repeat_max now specifies
4021      the number of additional copies needed. Again, we must remember to
4022      replicate entries on the forward reference list. */
4023
4024      if (repeat_max >= 0)
4025        {
4026        /* In the pre-compile phase, we don't actually do the replication. We
4027        just adjust the length as if we had. For each repetition we must add 1
4028        to the length for BRAZERO and for all but the last repetition we must
4029        add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4030        paranoid checks to avoid integer overflow. */
4031
4032        if (lengthptr != NULL && repeat_max > 0)
4033          {
4034          int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4035                      2 - 2*LINK_SIZE;   /* Last one doesn't nest */
4036          if ((double)repeat_max *
4037                (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4038                  > (double)INT_MAX ||
4039              OFLOW_MAX - *lengthptr < delta)
4040            {
4041            *errorcodeptr = ERR20;
4042            goto FAILED;
4043            }
4044          *lengthptr += delta;
4045          }
4046
4047        /* This is compiling for real */
4048
4049        else for (i = repeat_max - 1; i >= 0; i--)
4050          {
4051          uschar *hc;
4052          uschar *this_hwm = cd->hwm;
4053
4054          *code++ = OP_BRAZERO + repeat_type;
4055
4056          /* All but the final copy start a new nesting, maintaining the
4057          chain of brackets outstanding. */
4058
4059          if (i != 0)
4060            {
4061            int offset;
4062            *code++ = OP_BRA;
4063            offset = (bralink == NULL)? 0 : code - bralink;
4064            bralink = code;
4065            PUTINC(code, 0, offset);
4066            }
4067
4068          memcpy(code, previous, len);
4069          for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4070            {
4071            PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4072            cd->hwm += LINK_SIZE;
4073            }
4074          save_hwm = this_hwm;
4075          code += len;
4076          }
4077
4078        /* Now chain through the pending brackets, and fill in their length
4079        fields (which are holding the chain links pro tem). */
4080
4081        while (bralink != NULL)
4082          {
4083          int oldlinkoffset;
4084          int offset = code - bralink + 1;
4085          uschar *bra = code - offset;
4086          oldlinkoffset = GET(bra, 1);
4087          bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4088          *code++ = OP_KET;
4089          PUTINC(code, 0, offset);
4090          PUT(bra, 1, offset);
4091          }
4092        }
4093
4094      /* If the maximum is unlimited, set a repeater in the final copy. We
4095      can't just offset backwards from the current code point, because we
4096      don't know if there's been an options resetting after the ket. The
4097      correct offset was computed above.
4098
4099      Then, when we are doing the actual compile phase, check to see whether
4100      this group is a non-atomic one that could match an empty string. If so,
4101      convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4102      that runtime checking can be done. [This check is also applied to
4103      atomic groups at runtime, but in a different way.] */
4104
4105      else
4106        {
4107        uschar *ketcode = code - ketoffset;
4108        uschar *bracode = ketcode - GET(ketcode, 1);
4109        *ketcode = OP_KETRMAX + repeat_type;
4110        if (lengthptr == NULL && *bracode != OP_ONCE)
4111          {
4112          uschar *scode = bracode;
4113          do
4114            {
4115            if (could_be_empty_branch(scode, ketcode, utf8))
4116              {
4117              *bracode += OP_SBRA - OP_BRA;
4118              break;
4119              }
4120            scode += GET(scode, 1);
4121            }
4122          while (*scode == OP_ALT);
4123          }
4124        }
4125      }
4126
4127    /* If previous is OP_FAIL, it was generated by an empty class [] in
4128    JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4129    by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4130    error above. We can just ignore the repeat in JS case. */
4131
4132    else if (*previous == OP_FAIL) goto END_REPEAT;
4133
4134    /* Else there's some kind of shambles */
4135
4136    else
4137      {
4138      *errorcodeptr = ERR11;
4139      goto FAILED;
4140      }
4141
4142    /* If the character following a repeat is '+', or if certain optimization
4143    tests above succeeded, possessive_quantifier is TRUE. For some of the
4144    simpler opcodes, there is a special alternative opcode for this. For
4145    anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4146    The '+' notation is just syntactic sugar, taken from Sun's Java package,
4147    but the special opcodes can optimize it a bit. The repeated item starts at
4148    tempcode, not at previous, which might be the first part of a string whose
4149    (former) last char we repeated.
4150
4151    Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4152    an 'upto' may follow. We skip over an 'exact' item, and then test the
4153    length of what remains before proceeding. */
4154
4155    if (possessive_quantifier)
4156      {
4157      int len;
4158      if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4159          *tempcode == OP_NOTEXACT)
4160        tempcode += _pcre_OP_lengths[*tempcode] +
4161          ((*tempcode == OP_TYPEEXACT &&
4162             (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4163      len = code - tempcode;
4164      if (len > 0) switch (*tempcode)
4165        {
4166        case OP_STAR:  *tempcode = OP_POSSTAR; break;
4167        case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4168        case OP_QUERY: *tempcode = OP_POSQUERY; break;
4169        case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4170
4171        case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4172        case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4173        case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4174        case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4175
4176        case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4177        case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4178        case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4179        case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4180
4181        default:
4182        memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4183        code += 1 + LINK_SIZE;
4184        len += 1 + LINK_SIZE;
4185        tempcode[0] = OP_ONCE;
4186        *code++ = OP_KET;
4187        PUTINC(code, 0, len);
4188        PUT(tempcode, 1, len);
4189        break;
4190        }
4191      }
4192
4193    /* In all cases we no longer have a previous item. We also set the
4194    "follows varying string" flag for subsequently encountered reqbytes if
4195    it isn't already set and we have just passed a varying length item. */
4196
4197    END_REPEAT:
4198    previous = NULL;
4199    cd->req_varyopt |= reqvary;
4200    break;
4201
4202
4203    /* ===================================================================*/
4204    /* Start of nested parenthesized sub-expression, or comment or lookahead or
4205    lookbehind or option setting or condition or all the other extended
4206    parenthesis forms.  */
4207
4208    case '(':
4209    newoptions = options;
4210    skipbytes = 0;
4211    bravalue = OP_CBRA;
4212    save_hwm = cd->hwm;
4213    reset_bracount = FALSE;
4214
4215    /* First deal with various "verbs" that can be introduced by '*'. */
4216
4217    if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4218      {
4219      int i, namelen;
4220      const char *vn = verbnames;
4221      const uschar *name = ++ptr;
4222      previous = NULL;
4223      while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4224      if (*ptr == ':')
4225        {
4226        *errorcodeptr = ERR59;   /* Not supported */
4227        goto FAILED;
4228        }
4229      if (*ptr != ')')
4230        {
4231        *errorcodeptr = ERR60;
4232        goto FAILED;
4233        }
4234      namelen = ptr - name;
4235      for (i = 0; i < verbcount; i++)
4236        {
4237        if (namelen == verbs[i].len &&
4238            strncmp((char *)name, vn, namelen) == 0)
4239          {
4240          *code = verbs[i].op;
4241          if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4242          break;
4243          }
4244        vn += verbs[i].len + 1;
4245        }
4246      if (i < verbcount) continue;
4247      *errorcodeptr = ERR60;
4248      goto FAILED;
4249      }
4250
4251    /* Deal with the extended parentheses; all are introduced by '?', and the
4252    appearance of any of them means that this is not a capturing group. */
4253
4254    else if (*ptr == '?')
4255      {
4256      int i, set, unset, namelen;
4257      int *optset;
4258      const uschar *name;
4259      uschar *slot;
4260
4261      switch (*(++ptr))
4262        {
4263        case '#':                 /* Comment; skip to ket */
4264        ptr++;
4265        while (*ptr != 0 && *ptr != ')') ptr++;
4266        if (*ptr == 0)
4267          {
4268          *errorcodeptr = ERR18;
4269          goto FAILED;
4270          }
4271        continue;
4272
4273
4274        /* ------------------------------------------------------------ */
4275        case '|':                 /* Reset capture count for each branch */
4276        reset_bracount = TRUE;
4277        /* Fall through */
4278
4279        /* ------------------------------------------------------------ */
4280        case ':':                 /* Non-capturing bracket */
4281        bravalue = OP_BRA;
4282        ptr++;
4283        break;
4284
4285
4286        /* ------------------------------------------------------------ */
4287        case '(':
4288        bravalue = OP_COND;       /* Conditional group */
4289
4290        /* A condition can be an assertion, a number (referring to a numbered
4291        group), a name (referring to a named group), or 'R', referring to
4292        recursion. R<digits> and R&name are also permitted for recursion tests.
4293
4294        There are several syntaxes for testing a named group: (?(name)) is used
4295        by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4296
4297        There are two unfortunate ambiguities, caused by history. (a) 'R' can
4298        be the recursive thing or the name 'R' (and similarly for 'R' followed
4299        by digits), and (b) a number could be a name that consists of digits.
4300        In both cases, we look for a name first; if not found, we try the other
4301        cases. */
4302
4303        /* For conditions that are assertions, check the syntax, and then exit
4304        the switch. This will take control down to where bracketed groups,
4305        including assertions, are processed. */
4306
4307        if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4308          break;
4309
4310        /* Most other conditions use OP_CREF (a couple change to OP_RREF
4311        below), and all need to skip 3 bytes at the start of the group. */
4312
4313        code[1+LINK_SIZE] = OP_CREF;
4314        skipbytes = 3;
4315        refsign = -1;
4316
4317        /* Check for a test for recursion in a named group. */
4318
4319        if (ptr[1] == 'R' && ptr[2] == '&')
4320          {
4321          terminator = -1;
4322          ptr += 2;
4323          code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
4324          }
4325
4326        /* Check for a test for a named group's having been set, using the Perl
4327        syntax (?(<name>) or (?('name') */
4328
4329        else if (ptr[1] == '<')
4330          {
4331          terminator = '>';
4332          ptr++;
4333          }
4334        else if (ptr[1] == '\'')
4335          {
4336          terminator = '\'';
4337          ptr++;
4338          }
4339        else
4340          {
4341          terminator = 0;
4342          if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4343          }
4344
4345        /* We now expect to read a name; anything else is an error */
4346
4347        if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4348          {
4349          ptr += 1;  /* To get the right offset */
4350          *errorcodeptr = ERR28;
4351          goto FAILED;
4352          }
4353
4354        /* Read the name, but also get it as a number if it's all digits */
4355
4356        recno = 0;
4357        name = ++ptr;
4358        while ((cd->ctypes[*ptr] & ctype_word) != 0)
4359          {
4360          if (recno >= 0)
4361            recno = ((digitab[*ptr] & ctype_digit) != 0)?
4362              recno * 10 + *ptr - '0' : -1;
4363          ptr++;
4364          }
4365        namelen = ptr - name;
4366
4367        if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4368          {
4369          ptr--;      /* Error offset */
4370          *errorcodeptr = ERR26;
4371          goto FAILED;
4372          }
4373
4374        /* Do no further checking in the pre-compile phase. */
4375
4376        if (lengthptr != NULL) break;
4377
4378        /* In the real compile we do the work of looking for the actual
4379        reference. If the string started with "+" or "-" we require the rest to
4380        be digits, in which case recno will be set. */
4381
4382        if (refsign > 0)
4383          {
4384          if (recno <= 0)
4385            {
4386            *errorcodeptr = ERR58;
4387            goto FAILED;
4388            }
4389          recno = (refsign == '-')?
4390            cd->bracount - recno + 1 : recno +cd->bracount;
4391          if (recno <= 0 || recno > cd->final_bracount)
4392            {
4393            *errorcodeptr = ERR15;
4394            goto FAILED;
4395            }
4396          PUT2(code, 2+LINK_SIZE, recno);
4397          break;
4398          }
4399
4400        /* Otherwise (did not start with "+" or "-"), start by looking for the
4401        name. */
4402
4403        slot = cd->name_table;
4404        for (i = 0; i < cd->names_found; i++)
4405          {
4406          if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4407          slot += cd->name_entry_size;
4408          }
4409
4410        /* Found a previous named subpattern */
4411
4412        if (i < cd->names_found)
4413          {
4414          recno = GET2(slot, 0);
4415          PUT2(code, 2+LINK_SIZE, recno);
4416          }
4417
4418        /* Search the pattern for a forward reference */
4419
4420        else if ((i = find_parens(ptr, cd, name, namelen,
4421                        (options & PCRE_EXTENDED) != 0)) > 0)
4422          {
4423          PUT2(code, 2+LINK_SIZE, i);
4424          }
4425
4426        /* If terminator == 0 it means that the name followed directly after
4427        the opening parenthesis [e.g. (?(abc)...] and in this case there are
4428        some further alternatives to try. For the cases where terminator != 0
4429        [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4430        now checked all the possibilities, so give an error. */
4431
4432        else if (terminator != 0)
4433          {
4434          *errorcodeptr = ERR15;
4435          goto FAILED;
4436          }
4437
4438        /* Check for (?(R) for recursion. Allow digits after R to specify a
4439        specific group number. */
4440
4441        else if (*name == 'R')
4442          {
4443          recno = 0;
4444          for (i = 1; i < namelen; i++)
4445            {
4446            if ((digitab[name[i]] & ctype_digit) == 0)
4447              {
4448              *errorcodeptr = ERR15;
4449              goto FAILED;
4450              }
4451            recno = recno * 10 + name[i] - '0';
4452            }
4453          if (recno == 0) recno = RREF_ANY;
4454          code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4455          PUT2(code, 2+LINK_SIZE, recno);
4456          }
4457
4458        /* Similarly, check for the (?(DEFINE) "condition", which is always
4459        false. */
4460
4461        else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4462          {
4463          code[1+LINK_SIZE] = OP_DEF;
4464          skipbytes = 1;
4465          }
4466
4467        /* Check for the "name" actually being a subpattern number. We are
4468        in the second pass here, so final_bracount is set. */
4469
4470        else if (recno > 0 && recno <= cd->final_bracount)
4471          {
4472          PUT2(code, 2+LINK_SIZE, recno);
4473          }
4474
4475        /* Either an unidentified subpattern, or a reference to (?(0) */
4476
4477        else
4478          {
4479          *errorcodeptr = (recno == 0)? ERR35: ERR15;
4480          goto FAILED;
4481          }
4482        break;
4483
4484
4485        /* ------------------------------------------------------------ */
4486        case '=':                 /* Positive lookahead */
4487        bravalue = OP_ASSERT;
4488        ptr++;
4489        break;
4490
4491
4492        /* ------------------------------------------------------------ */
4493        case '!':                 /* Negative lookahead */
4494        ptr++;
4495        if (*ptr == ')')          /* Optimize (?!) */
4496          {
4497          *code++ = OP_FAIL;
4498          previous = NULL;
4499          continue;
4500          }
4501        bravalue = OP_ASSERT_NOT;
4502        break;
4503
4504
4505        /* ------------------------------------------------------------ */
4506        case '<':                 /* Lookbehind or named define */
4507        switch (ptr[1])
4508          {
4509          case '=':               /* Positive lookbehind */
4510          bravalue = OP_ASSERTBACK;
4511          ptr += 2;
4512          break;
4513
4514          case '!':               /* Negative lookbehind */
4515          bravalue = OP_ASSERTBACK_NOT;
4516          ptr += 2;
4517          break;
4518
4519          default:                /* Could be name define, else bad */
4520          if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4521          ptr++;                  /* Correct offset for error */
4522          *errorcodeptr = ERR24;
4523          goto FAILED;
4524          }
4525        break;
4526
4527
4528        /* ------------------------------------------------------------ */
4529        case '>':                 /* One-time brackets */
4530        bravalue = OP_ONCE;
4531        ptr++;
4532        break;
4533
4534
4535        /* ------------------------------------------------------------ */
4536        case 'C':                 /* Callout - may be followed by digits; */
4537        previous_callout = code;  /* Save for later completion */
4538        after_manual_callout = 1; /* Skip one item before completing */
4539        *code++ = OP_CALLOUT;
4540          {
4541          int n = 0;
4542          while ((digitab[*(++ptr)] & ctype_digit) != 0)
4543            n = n * 10 + *ptr - '0';
4544          if (*ptr != ')')
4545            {
4546            *errorcodeptr = ERR39;
4547            goto FAILED;
4548            }
4549          if (n > 255)
4550            {
4551            *errorcodeptr = ERR38;
4552            goto FAILED;
4553            }
4554          *code++ = n;
4555          PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4556          PUT(code, LINK_SIZE, 0);                    /* Default length */
4557          code += 2 * LINK_SIZE;
4558          }
4559        previous = NULL;
4560        continue;
4561
4562
4563        /* ------------------------------------------------------------ */
4564        case 'P':                 /* Python-style named subpattern handling */
4565        if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4566          {
4567          is_recurse = *ptr == '>';
4568          terminator = ')';
4569          goto NAMED_REF_OR_RECURSE;
4570          }
4571        else if (*ptr != '<')    /* Test for Python-style definition */
4572          {
4573          *errorcodeptr = ERR41;
4574          goto FAILED;
4575          }
4576        /* Fall through to handle (?P< as (?< is handled */
4577
4578
4579        /* ------------------------------------------------------------ */
4580        DEFINE_NAME:    /* Come here from (?< handling */
4581        case '\'':
4582          {
4583          terminator = (*ptr == '<')? '>' : '\'';
4584          name = ++ptr;
4585
4586          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4587          namelen = ptr - name;
4588
4589          /* In the pre-compile phase, just do a syntax check. */
4590
4591          if (lengthptr != NULL)
4592            {
4593            if (*ptr != terminator)
4594              {
4595              *errorcodeptr = ERR42;
4596              goto FAILED;
4597              }
4598            if (cd->names_found >= MAX_NAME_COUNT)
4599              {
4600              *errorcodeptr = ERR49;
4601              goto FAILED;
4602              }
4603            if (namelen + 3 > cd->name_entry_size)
4604              {
4605              cd->name_entry_size = namelen + 3;
4606              if (namelen > MAX_NAME_SIZE)
4607                {
4608                *errorcodeptr = ERR48;
4609                goto FAILED;
4610                }
4611              }
4612            }
4613
4614          /* In the real compile, create the entry in the table */
4615
4616          else
4617            {
4618            slot = cd->name_table;
4619            for (i = 0; i < cd->names_found; i++)
4620              {
4621              int crc = memcmp(name, slot+2, namelen);
4622              if (crc == 0)
4623                {
4624                if (slot[2+namelen] == 0)
4625                  {
4626                  if ((options & PCRE_DUPNAMES) == 0)
4627                    {
4628                    *errorcodeptr = ERR43;
4629                    goto FAILED;
4630                    }
4631                  }
4632                else crc = -1;      /* Current name is substring */
4633                }
4634              if (crc < 0)
4635                {
4636                memmove(slot + cd->name_entry_size, slot,
4637                  (cd->names_found - i) * cd->name_entry_size);
4638                break;
4639                }
4640              slot += cd->name_entry_size;
4641              }
4642
4643            PUT2(slot, 0, cd->bracount + 1);
4644            memcpy(slot + 2, name, namelen);
4645            slot[2+namelen] = 0;
4646            }
4647          }
4648
4649        /* In both cases, count the number of names we've encountered. */
4650
4651        ptr++;                    /* Move past > or ' */
4652        cd->names_found++;
4653        goto NUMBERED_GROUP;
4654
4655
4656        /* ------------------------------------------------------------ */
4657        case '&':                 /* Perl recursion/subroutine syntax */
4658        terminator = ')';
4659        is_recurse = TRUE;
4660        /* Fall through */
4661
4662        /* We come here from the Python syntax above that handles both
4663        references (?P=name) and recursion (?P>name), as well as falling
4664        through from the Perl recursion syntax (?&name). We also come here from
4665        the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4666        .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4667
4668        NAMED_REF_OR_RECURSE:
4669        name = ++ptr;
4670        while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4671        namelen = ptr - name;
4672
4673        /* In the pre-compile phase, do a syntax check and set a dummy
4674        reference number. */
4675
4676        if (lengthptr != NULL)
4677          {
4678          if (namelen == 0)
4679            {
4680            *errorcodeptr = ERR62;
4681            goto FAILED;
4682            }
4683          if (*ptr != terminator)
4684            {
4685            *errorcodeptr = ERR42;
4686            goto FAILED;
4687            }
4688          if (namelen > MAX_NAME_SIZE)
4689            {
4690            *errorcodeptr = ERR48;
4691            goto FAILED;
4692            }
4693          recno = 0;
4694          }
4695
4696        /* In the real compile, seek the name in the table. We check the name
4697        first, and then check that we have reached the end of the name in the
4698        table. That way, if the name is longer than any in the table,
4699        the comparison will fail without reading beyond the table entry. */
4700
4701        else
4702          {
4703          slot = cd->name_table;
4704          for (i = 0; i < cd->names_found; i++)
4705            {
4706            if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4707                slot[2+namelen] == 0)
4708              break;
4709            slot += cd->name_entry_size;
4710            }
4711
4712          if (i < cd->names_found)         /* Back reference */
4713            {
4714            recno = GET2(slot, 0);
4715            }
4716          else if ((recno =                /* Forward back reference */
4717                    find_parens(ptr, cd, name, namelen,
4718                      (options & PCRE_EXTENDED) != 0)) <= 0)
4719            {
4720            *errorcodeptr = ERR15;
4721            goto FAILED;
4722            }
4723          }
4724
4725        /* In both phases, we can now go to the code that handles numerical
4726        recursion or backreferences. */
4727
4728        if (is_recurse) goto HANDLE_RECURSION;
4729          else goto HANDLE_REFERENCE;
4730
4731
4732        /* ------------------------------------------------------------ */
4733        case 'R':                 /* Recursion */
4734        ptr++;                    /* Same as (?0)      */
4735        /* Fall through */
4736
4737
4738        /* ------------------------------------------------------------ */
4739        case '-': case '+':
4740        case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4741        case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4742          {
4743          const uschar *called;
4744          terminator = ')';
4745
4746          /* Come here from the \g<...> and \g'...' code (Oniguruma
4747          compatibility). However, the syntax has been checked to ensure that
4748          the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4749          be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4750          ever be taken. */
4751
4752          HANDLE_NUMERICAL_RECURSION:
4753
4754          if ((refsign = *ptr) == '+')
4755            {
4756            ptr++;
4757            if ((digitab[*ptr] & ctype_digit) == 0)
4758              {
4759              *errorcodeptr = ERR63;
4760              goto FAILED;
4761              }
4762            }
4763          else if (refsign == '-')
4764            {
4765            if ((digitab[ptr[1]] & ctype_digit) == 0)
4766              goto OTHER_CHAR_AFTER_QUERY;
4767            ptr++;
4768            }
4769
4770          recno = 0;
4771          while((digitab[*ptr] & ctype_digit) != 0)
4772            recno = recno * 10 + *ptr++ - '0';
4773
4774          if (*ptr != terminator)
4775            {
4776            *errorcodeptr = ERR29;
4777            goto FAILED;
4778            }
4779
4780          if (refsign == '-')
4781            {
4782            if (recno == 0)
4783              {
4784              *errorcodeptr = ERR58;
4785              goto FAILED;
4786              }
4787            recno = cd->bracount - recno + 1;
4788            if (recno <= 0)
4789              {
4790              *errorcodeptr = ERR15;
4791              goto FAILED;
4792              }
4793            }
4794          else if (refsign == '+')
4795            {
4796            if (recno == 0)
4797              {
4798              *errorcodeptr = ERR58;
4799              goto FAILED;
4800              }
4801            recno += cd->bracount;
4802            }
4803
4804          /* Come here from code above that handles a named recursion */
4805
4806          HANDLE_RECURSION:
4807
4808          previous = code;
4809          called = cd->start_code;
4810
4811          /* When we are actually compiling, find the bracket that is being
4812          referenced. Temporarily end the regex in case it doesn't exist before
4813          this point. If we end up with a forward reference, first check that
4814          the bracket does occur later so we can give the error (and position)
4815          now. Then remember this forward reference in the workspace so it can
4816          be filled in at the end. */
4817
4818          if (lengthptr == NULL)
4819            {
4820            *code = OP_END;
4821            if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4822
4823            /* Forward reference */
4824
4825            if (called == NULL)
4826              {
4827              if (find_parens(ptr, cd, NULL, recno,
4828                    (options & PCRE_EXTENDED) != 0) < 0)
4829                {
4830                *errorcodeptr = ERR15;
4831                goto FAILED;
4832                }
4833              called = cd->start_code + recno;
4834              PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4835              }
4836
4837            /* If not a forward reference, and the subpattern is still open,
4838            this is a recursive call. We check to see if this is a left
4839            recursion that could loop for ever, and diagnose that case. */
4840
4841            else if (GET(called, 1) == 0 &&
4842                     could_be_empty(called, code, bcptr, utf8))
4843              {
4844              *errorcodeptr = ERR40;
4845              goto FAILED;
4846              }
4847            }
4848
4849          /* Insert the recursion/subroutine item, automatically wrapped inside
4850          "once" brackets. Set up a "previous group" length so that a
4851          subsequent quantifier will work. */
4852
4853          *code = OP_ONCE;
4854          PUT(code, 1, 2 + 2*LINK_SIZE);
4855          code += 1 + LINK_SIZE;
4856
4857          *code = OP_RECURSE;
4858          PUT(code, 1, called - cd->start_code);
4859          code += 1 + LINK_SIZE;
4860
4861          *code = OP_KET;
4862          PUT(code, 1, 2 + 2*LINK_SIZE);
4863          code += 1 + LINK_SIZE;
4864
4865          length_prevgroup = 3 + 3*LINK_SIZE;
4866          }
4867
4868        /* Can't determine a first byte now */
4869
4870        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4871        continue;
4872
4873
4874        /* ------------------------------------------------------------ */
4875        default:              /* Other characters: check option setting */
4876        OTHER_CHAR_AFTER_QUERY:
4877        set = unset = 0;
4878        optset = &set;
4879
4880        while (*ptr != ')' && *ptr != ':')
4881          {
4882          switch (*ptr++)
4883            {
4884            case '-': optset = &unset; break;
4885
4886            case 'J':    /* Record that it changed in the external options */
4887            *optset |= PCRE_DUPNAMES;
4888            cd->external_flags |= PCRE_JCHANGED;
4889            break;
4890
4891            case 'i': *optset |= PCRE_CASELESS; break;
4892            case 'm': *optset |= PCRE_MULTILINE; break;
4893            case 's': *optset |= PCRE_DOTALL; break;
4894            case 'x': *optset |= PCRE_EXTENDED; break;
4895            case 'U': *optset |= PCRE_UNGREEDY; break;
4896            case 'X': *optset |= PCRE_EXTRA; break;
4897
4898            default:  *errorcodeptr = ERR12;
4899                      ptr--;    /* Correct the offset */
4900                      goto FAILED;
4901            }
4902          }
4903
4904        /* Set up the changed option bits, but don't change anything yet. */
4905
4906        newoptions = (options | set) & (~unset);
4907
4908        /* If the options ended with ')' this is not the start of a nested
4909        group with option changes, so the options change at this level. If this
4910        item is right at the start of the pattern, the options can be
4911        abstracted and made external in the pre-compile phase, and ignored in
4912        the compile phase. This can be helpful when matching -- for instance in
4913        caseless checking of required bytes.
4914
4915        If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4916        definitely *not* at the start of the pattern because something has been
4917        compiled. In the pre-compile phase, however, the code pointer can have
4918        that value after the start, because it gets reset as code is discarded
4919        during the pre-compile. However, this can happen only at top level - if
4920        we are within parentheses, the starting BRA will still be present. At
4921        any parenthesis level, the length value can be used to test if anything
4922        has been compiled at that level. Thus, a test for both these conditions
4923        is necessary to ensure we correctly detect the start of the pattern in
4924        both phases.
4925
4926        If we are not at the pattern start, compile code to change the ims
4927        options if this setting actually changes any of them, and reset the
4928        greedy defaults and the case value for firstbyte and reqbyte. */
4929
4930        if (*ptr == ')')
4931          {
4932          if (code == cd->start_code + 1 + LINK_SIZE &&
4933               (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4934            {
4935            cd->external_options = newoptions;
4936            }
4937         else
4938            {
4939            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4940              {
4941              *code++ = OP_OPT;
4942              *code++ = newoptions & PCRE_IMS;
4943              }
4944            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4945            greedy_non_default = greedy_default ^ 1;
4946            req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4947            }
4948
4949          /* Change options at this level, and pass them back for use
4950          in subsequent branches. When not at the start of the pattern, this
4951          information is also necessary so that a resetting item can be
4952          compiled at the end of a group (if we are in a group). */
4953
4954          *optionsptr = options = newoptions;
4955          previous = NULL;       /* This item can't be repeated */
4956          continue;              /* It is complete */
4957          }
4958
4959        /* If the options ended with ':' we are heading into a nested group
4960        with possible change of options. Such groups are non-capturing and are
4961        not assertions of any kind. All we need to do is skip over the ':';
4962        the newoptions value is handled below. */
4963
4964        bravalue = OP_BRA;
4965        ptr++;
4966        }     /* End of switch for character following (? */
4967      }       /* End of (? handling */
4968
4969    /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4970    all unadorned brackets become non-capturing and behave like (?:...)
4971    brackets. */
4972
4973    else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4974      {
4975      bravalue = OP_BRA;
4976      }
4977
4978    /* Else we have a capturing group. */
4979
4980    else
4981      {
4982      NUMBERED_GROUP:
4983      cd->bracount += 1;
4984      PUT2(code, 1+LINK_SIZE, cd->bracount);
4985      skipbytes = 2;
4986      }
4987
4988    /* Process nested bracketed regex. Assertions may not be repeated, but
4989    other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4990    non-register variable in order to be able to pass its address because some
4991    compilers complain otherwise. Pass in a new setting for the ims options if
4992    they have changed. */
4993
4994    previous = (bravalue >= OP_ONCE)? code : NULL;
4995    *code = bravalue;
4996    tempcode = code;
4997    tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4998    length_prevgroup = 0;              /* Initialize for pre-compile phase */
4999
5000    if (!compile_regex(
5001         newoptions,                   /* The complete new option state */
5002         options & PCRE_IMS,           /* The previous ims option state */
5003         &tempcode,                    /* Where to put code (updated) */
5004         &ptr,                         /* Input pointer (updated) */
5005         errorcodeptr,                 /* Where to put an error message */
5006         (bravalue == OP_ASSERTBACK ||
5007          bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5008         reset_bracount,               /* True if (?| group */
5009         skipbytes,                    /* Skip over bracket number */
5010         &subfirstbyte,                /* For possible first char */
5011         &subreqbyte,                  /* For possible last char */
5012         bcptr,                        /* Current branch chain */
5013         cd,                           /* Tables block */
5014         (lengthptr == NULL)? NULL :   /* Actual compile phase */
5015           &length_prevgroup           /* Pre-compile phase */
5016         ))
5017      goto FAILED;
5018
5019    /* At the end of compiling, code is still pointing to the start of the
5020    group, while tempcode has been updated to point past the end of the group
5021    and any option resetting that may follow it. The pattern pointer (ptr)
5022    is on the bracket. */
5023
5024    /* If this is a conditional bracket, check that there are no more than
5025    two branches in the group, or just one if it's a DEFINE group. We do this
5026    in the real compile phase, not in the pre-pass, where the whole group may
5027    not be available. */
5028
5029    if (bravalue == OP_COND && lengthptr == NULL)
5030      {
5031      uschar *tc = code;
5032      int condcount = 0;
5033
5034      do {
5035         condcount++;
5036         tc += GET(tc,1);
5037         }
5038      while (*tc != OP_KET);
5039
5040      /* A DEFINE group is never obeyed inline (the "condition" is always
5041      false). It must have only one branch. */
5042
5043      if (code[LINK_SIZE+1] == OP_DEF)
5044        {
5045        if (condcount > 1)
5046          {
5047          *errorcodeptr = ERR54;
5048          goto FAILED;
5049          }
5050        bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
5051        }
5052
5053      /* A "normal" conditional group. If there is just one branch, we must not
5054      make use of its firstbyte or reqbyte, because this is equivalent to an
5055      empty second branch. */
5056
5057      else
5058        {
5059        if (condcount > 2)
5060          {
5061          *errorcodeptr = ERR27;
5062          goto FAILED;
5063          }
5064        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5065        }
5066      }
5067
5068    /* Error if hit end of pattern */
5069
5070    if (*ptr != ')')
5071      {
5072      *errorcodeptr = ERR14;
5073      goto FAILED;
5074      }
5075
5076    /* In the pre-compile phase, update the length by the length of the group,
5077    less the brackets at either end. Then reduce the compiled code to just a
5078    set of non-capturing brackets so that it doesn't use much memory if it is
5079    duplicated by a quantifier.*/
5080
5081    if (lengthptr != NULL)
5082      {
5083      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5084        {
5085        *errorcodeptr = ERR20;
5086        goto FAILED;
5087        }
5088      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5089      *code++ = OP_BRA;
5090      PUTINC(code, 0, 1 + LINK_SIZE);
5091      *code++ = OP_KET;
5092      PUTINC(code, 0, 1 + LINK_SIZE);
5093      break;    /* No need to waste time with special character handling */
5094      }
5095
5096    /* Otherwise update the main code pointer to the end of the group. */
5097
5098    code = tempcode;
5099
5100    /* For a DEFINE group, required and first character settings are not
5101    relevant. */
5102
5103    if (bravalue == OP_DEF) break;
5104
5105    /* Handle updating of the required and first characters for other types of
5106    group. Update for normal brackets of all kinds, and conditions with two
5107    branches (see code above). If the bracket is followed by a quantifier with
5108    zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5109    zerofirstbyte outside the main loop so that they can be accessed for the
5110    back off. */
5111
5112    zeroreqbyte = reqbyte;
5113    zerofirstbyte = firstbyte;
5114    groupsetfirstbyte = FALSE;
5115
5116    if (bravalue >= OP_ONCE)
5117      {
5118      /* If we have not yet set a firstbyte in this branch, take it from the
5119      subpattern, remembering that it was set here so that a repeat of more
5120      than one can replicate it as reqbyte if necessary. If the subpattern has
5121      no firstbyte, set "none" for the whole branch. In both cases, a zero
5122      repeat forces firstbyte to "none". */
5123
5124      if (firstbyte == REQ_UNSET)
5125        {
5126        if (subfirstbyte >= 0)
5127          {
5128          firstbyte = subfirstbyte;
5129          groupsetfirstbyte = TRUE;
5130          }
5131        else firstbyte = REQ_NONE;
5132        zerofirstbyte = REQ_NONE;
5133        }
5134
5135      /* If firstbyte was previously set, convert the subpattern's firstbyte
5136      into reqbyte if there wasn't one, using the vary flag that was in
5137      existence beforehand. */
5138
5139      else if (subfirstbyte >= 0 && subreqbyte < 0)
5140        subreqbyte = subfirstbyte | tempreqvary;
5141
5142      /* If the subpattern set a required byte (or set a first byte that isn't
5143      really the first byte - see above), set it. */
5144
5145      if (subreqbyte >= 0) reqbyte = subreqbyte;
5146      }
5147
5148    /* For a forward assertion, we take the reqbyte, if set. This can be
5149    helpful if the pattern that follows the assertion doesn't set a different
5150    char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5151    for an assertion, however, because it leads to an incorrect effect for patterns
5152    such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5153    of a firstbyte. This is overcome by a scan at the end if there's no
5154    firstbyte, looking for an asserted first char. */
5155
5156    else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5157    break;     /* End of processing '(' */
5158
5159
5160    /* ===================================================================*/
5161    /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5162    are arranged to be the negation of the corresponding OP_values. For the
5163    back references, the values are ESC_REF plus the reference number. Only
5164    back references and those types that consume a character may be repeated.
5165    We can test for values between ESC_b and ESC_Z for the latter; this may
5166    have to change if any new ones are ever created. */
5167
5168    case '\\':
5169    tempptr = ptr;
5170    c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5171    if (*errorcodeptr != 0) goto FAILED;
5172
5173    if (c < 0)
5174      {
5175      if (-c == ESC_Q)            /* Handle start of quoted string */
5176        {
5177        if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5178          else inescq = TRUE;
5179        continue;
5180        }
5181
5182      if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
5183
5184      /* For metasequences that actually match a character, we disable the
5185      setting of a first character if it hasn't already been set. */
5186
5187      if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5188        firstbyte = REQ_NONE;
5189
5190      /* Set values to reset to if this is followed by a zero repeat. */
5191
5192      zerofirstbyte = firstbyte;
5193      zeroreqbyte = reqbyte;
5194
5195      /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5196      is a subroutine call by number (Oniguruma syntax). In fact, the value
5197      -ESC_g is returned only for these cases. So we don't need to check for <
5198      or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5199      -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5200      that is a synonym for a named back reference). */
5201
5202      if (-c == ESC_g)
5203        {
5204        const uschar *p;
5205        save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
5206        terminator = (*(++ptr) == '<')? '>' : '\'';
5207
5208        /* These two statements stop the compiler from warning about possibly
5209        unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5210        fact, because we actually check for a number below, the paths that
5211        would actually be in error are never taken. */
5212
5213        skipbytes = 0;
5214        reset_bracount = FALSE;
5215
5216        /* Test for a name */
5217
5218        if (ptr[1] != '+' && ptr[1] != '-')
5219          {
5220          BOOL isnumber = TRUE;
5221          for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5222            {
5223            if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5224            if ((cd->ctypes[*p] & ctype_word) == 0) break;
5225            }
5226          if (*p != terminator)
5227            {
5228            *errorcodeptr = ERR57;
5229            break;
5230            }
5231          if (isnumber)
5232            {
5233            ptr++;
5234            goto HANDLE_NUMERICAL_RECURSION;
5235            }
5236          is_recurse = TRUE;
5237          goto NAMED_REF_OR_RECURSE;
5238          }
5239
5240        /* Test a signed number in angle brackets or quotes. */
5241
5242        p = ptr + 2;
5243        while ((digitab[*p] & ctype_digit) != 0) p++;
5244        if (*p != terminator)
5245          {
5246          *errorcodeptr = ERR57;
5247          break;
5248          }
5249        ptr++;
5250        goto HANDLE_NUMERICAL_RECURSION;
5251        }
5252
5253      /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5254      We also support \k{name} (.NET syntax) */
5255
5256      if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5257        {
5258        is_recurse = FALSE;
5259        terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5260        goto NAMED_REF_OR_RECURSE;
5261        }
5262
5263      /* Back references are handled specially; must disable firstbyte if
5264      not set to cope with cases like (?=(\w+))\1: which would otherwise set
5265      ':' later. */
5266
5267      if (-c >= ESC_REF)
5268        {
5269        recno = -c - ESC_REF;
5270
5271        HANDLE_REFERENCE:    /* Come here from named backref handling */
5272        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5273        previous = code;
5274        *code++ = OP_REF;
5275        PUT2INC(code, 0, recno);
5276        cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5277        if (recno > cd->top_backref) cd->top_backref = recno;
5278        }
5279
5280      /* So are Unicode property matches, if supported. */
5281
5282#ifdef SUPPORT_UCP
5283      else if (-c == ESC_P || -c == ESC_p)
5284        {
5285        BOOL negated;
5286        int pdata;
5287        int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5288        if (ptype < 0) goto FAILED;
5289        previous = code;
5290        *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5291        *code++ = ptype;
5292        *code++ = pdata;
5293        }
5294#else
5295
5296      /* If Unicode properties are not supported, \X, \P, and \p are not
5297      allowed. */
5298
5299      else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5300        {
5301        *errorcodeptr = ERR45;
5302        goto FAILED;
5303        }
5304#endif
5305
5306      /* For the rest (including \X when Unicode properties are supported), we
5307      can obtain the OP value by negating the escape value. */
5308
5309      else
5310        {
5311        previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5312        *code++ = -c;
5313        }
5314      continue;
5315      }
5316
5317    /* We have a data character whose value is in c. In UTF-8 mode it may have
5318    a value > 127. We set its representation in the length/buffer, and then
5319    handle it as a data character. */
5320
5321#ifdef SUPPORT_UTF8
5322    if (utf8 && c > 127)
5323      mclength = _pcre_ord2utf8(c, mcbuffer);
5324    else
5325#endif
5326
5327     {
5328     mcbuffer[0] = c;
5329     mclength = 1;
5330     }
5331    goto ONE_CHAR;
5332
5333
5334    /* ===================================================================*/
5335    /* Handle a literal character. It is guaranteed not to be whitespace or #
5336    when the extended flag is set. If we are in UTF-8 mode, it may be a
5337    multi-byte literal character. */
5338
5339    default:
5340    NORMAL_CHAR:
5341    mclength = 1;
5342    mcbuffer[0] = c;
5343
5344#ifdef SUPPORT_UTF8
5345    if (utf8 && c >= 0xc0)
5346      {
5347      while ((ptr[1] & 0xc0) == 0x80)
5348        mcbuffer[mclength++] = *(++ptr);
5349      }
5350#endif
5351
5352    /* At this point we have the character's bytes in mcbuffer, and the length
5353    in mclength. When not in UTF-8 mode, the length is always 1. */
5354
5355    ONE_CHAR:
5356    previous = code;
5357    *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5358    for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5359
5360    /* Remember if \r or \n were seen */
5361
5362    if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5363      cd->external_flags |= PCRE_HASCRORLF;
5364
5365    /* Set the first and required bytes appropriately. If no previous first
5366    byte, set it from this character, but revert to none on a zero repeat.
5367    Otherwise, leave the firstbyte value alone, and don't change it on a zero
5368    repeat. */
5369
5370    if (firstbyte == REQ_UNSET)
5371      {
5372      zerofirstbyte = REQ_NONE;
5373      zeroreqbyte = reqbyte;
5374
5375      /* If the character is more than one byte long, we can set firstbyte
5376      only if it is not to be matched caselessly. */
5377
5378      if (mclength == 1 || req_caseopt == 0)
5379        {
5380        firstbyte = mcbuffer[0] | req_caseopt;
5381        if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5382        }
5383      else firstbyte = reqbyte = REQ_NONE;
5384      }
5385
5386    /* firstbyte was previously set; we can set reqbyte only if the length is
5387    1 or the matching is caseful. */
5388
5389    else
5390      {
5391      zerofirstbyte = firstbyte;
5392      zeroreqbyte = reqbyte;
5393      if (mclength == 1 || req_caseopt == 0)
5394        reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5395      }
5396
5397    break;            /* End of literal character handling */
5398    }
5399  }                   /* end of big loop */
5400
5401
5402/* Control never reaches here by falling through, only by a goto for all the
5403error states. Pass back the position in the pattern so that it can be displayed
5404to the user for diagnosing the error. */
5405
5406FAILED:
5407*ptrptr = ptr;
5408return FALSE;
5409}
5410
5411
5412
5413
5414/*************************************************
5415*     Compile sequence of alternatives           *
5416*************************************************/
5417
5418/* On entry, ptr is pointing past the bracket character, but on return it
5419points to the closing bracket, or vertical bar, or end of string. The code
5420variable is pointing at the byte into which the BRA operator has been stored.
5421If the ims options are changed at the start (for a (?ims: group) or during any
5422branch, we need to insert an OP_OPT item at the start of every following branch
5423to ensure they get set correctly at run time, and also pass the new options
5424into every subsequent branch compile.
5425
5426This function is used during the pre-compile phase when we are trying to find
5427out the amount of memory needed, as well as during the real compile phase. The
5428value of lengthptr distinguishes the two phases.
5429
5430Arguments:
5431  options        option bits, including any changes for this subpattern
5432  oldims         previous settings of ims option bits
5433  codeptr        -> the address of the current code pointer
5434  ptrptr         -> the address of the current pattern pointer
5435  errorcodeptr   -> pointer to error code variable
5436  lookbehind     TRUE if this is a lookbehind assertion
5437  reset_bracount TRUE to reset the count for each branch
5438  skipbytes      skip this many bytes at start (for brackets and OP_COND)
5439  firstbyteptr   place to put the first required character, or a negative number
5440  reqbyteptr     place to put the last required character, or a negative number
5441  bcptr          pointer to the chain of currently open branches
5442  cd             points to the data block with tables pointers etc.
5443  lengthptr      NULL during the real compile phase
5444                 points to length accumulator during pre-compile phase
5445
5446Returns:         TRUE on success
5447*/
5448
5449static BOOL
5450compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5451  int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5452  int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5453  int *lengthptr)
5454{
5455const uschar *ptr = *ptrptr;
5456uschar *code = *codeptr;
5457uschar *last_branch = code;
5458uschar *start_bracket = code;
5459uschar *reverse_count = NULL;
5460int firstbyte, reqbyte;
5461int branchfirstbyte, branchreqbyte;
5462int length;
5463int orig_bracount;
5464int max_bracount;
5465branch_chain bc;
5466
5467bc.outer = bcptr;
5468bc.current = code;
5469
5470firstbyte = reqbyte = REQ_UNSET;
5471
5472/* Accumulate the length for use in the pre-compile phase. Start with the
5473length of the BRA and KET and any extra bytes that are required at the
5474beginning. We accumulate in a local variable to save frequent testing of
5475lenthptr for NULL. We cannot do this by looking at the value of code at the
5476start and end of each alternative, because compiled items are discarded during
5477the pre-compile phase so that the work space is not exceeded. */
5478
5479length = 2 + 2*LINK_SIZE + skipbytes;
5480
5481/* WARNING: If the above line is changed for any reason, you must also change
5482the code that abstracts option settings at the start of the pattern and makes
5483them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5484pre-compile phase to find out whether anything has yet been compiled or not. */
5485
5486/* Offset is set zero to mark that this bracket is still open */
5487
5488PUT(code, 1, 0);
5489code += 1 + LINK_SIZE + skipbytes;
5490
5491/* Loop for each alternative branch */
5492
5493orig_bracount = max_bracount = cd->bracount;
5494for (;;)
5495  {
5496  /* For a (?| group, reset the capturing bracket count so that each branch
5497  uses the same numbers. */
5498
5499  if (reset_bracount) cd->bracount = orig_bracount;
5500
5501  /* Handle a change of ims options at the start of the branch */
5502
5503  if ((options & PCRE_IMS) != oldims)
5504    {
5505    *code++ = OP_OPT;
5506    *code++ = options & PCRE_IMS;
5507    length += 2;
5508    }
5509
5510  /* Set up dummy OP_REVERSE if lookbehind assertion */
5511
5512  if (lookbehind)
5513    {
5514    *code++ = OP_REVERSE;
5515    reverse_count = code;
5516    PUTINC(code, 0, 0);
5517    length += 1 + LINK_SIZE;
5518    }
5519
5520  /* Now compile the branch; in the pre-compile phase its length gets added
5521  into the length. */
5522
5523  if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5524        &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5525    {
5526    *ptrptr = ptr;
5527    return FALSE;
5528    }
5529
5530  /* Keep the highest bracket count in case (?| was used and some branch
5531  has fewer than the rest. */
5532
5533  if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5534
5535  /* In the real compile phase, there is some post-processing to be done. */
5536
5537  if (lengthptr == NULL)
5538    {
5539    /* If this is the first branch, the firstbyte and reqbyte values for the
5540    branch become the values for the regex. */
5541
5542    if (*last_branch != OP_ALT)
5543      {
5544      firstbyte = branchfirstbyte;
5545      reqbyte = branchreqbyte;
5546      }
5547
5548    /* If this is not the first branch, the first char and reqbyte have to
5549    match the values from all the previous branches, except that if the
5550    previous value for reqbyte didn't have REQ_VARY set, it can still match,
5551    and we set REQ_VARY for the regex. */
5552
5553    else
5554      {
5555      /* If we previously had a firstbyte, but it doesn't match the new branch,
5556      we have to abandon the firstbyte for the regex, but if there was
5557      previously no reqbyte, it takes on the value of the old firstbyte. */
5558
5559      if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5560        {
5561        if (reqbyte < 0) reqbyte = firstbyte;
5562        firstbyte = REQ_NONE;
5563        }
5564
5565      /* If we (now or from before) have no firstbyte, a firstbyte from the
5566      branch becomes a reqbyte if there isn't a branch reqbyte. */
5567
5568      if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5569          branchreqbyte = branchfirstbyte;
5570
5571      /* Now ensure that the reqbytes match */
5572
5573      if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5574        reqbyte = REQ_NONE;
5575      else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
5576      }
5577
5578    /* If lookbehind, check that this branch matches a fixed-length string, and
5579    put the length into the OP_REVERSE item. Temporarily mark the end of the
5580    branch with OP_END. */
5581
5582    if (lookbehind)
5583      {
5584      int fixed_length;
5585      *code = OP_END;
5586      fixed_length = find_fixedlength(last_branch, options);
5587      DPRINTF(("fixed length = %d\n", fixed_length));
5588      if (fixed_length < 0)
5589        {
5590        *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5591        *ptrptr = ptr;
5592        return FALSE;
5593        }
5594      PUT(reverse_count, 0, fixed_length);
5595      }
5596    }
5597
5598  /* Reached end of expression, either ')' or end of pattern. In the real
5599  compile phase, go back through the alternative branches and reverse the chain
5600  of offsets, with the field in the BRA item now becoming an offset to the
5601  first alternative. If there are no alternatives, it points to the end of the
5602  group. The length in the terminating ket is always the length of the whole
5603  bracketed item. If any of the ims options were changed inside the group,
5604  compile a resetting op-code following, except at the very end of the pattern.
5605  Return leaving the pointer at the terminating char. */
5606
5607  if (*ptr != '|')
5608    {
5609    if (lengthptr == NULL)
5610      {
5611      int branch_length = code - last_branch;
5612      do
5613        {
5614        int prev_length = GET(last_branch, 1);
5615        PUT(last_branch, 1, branch_length);
5616        branch_length = prev_length;
5617        last_branch -= branch_length;
5618        }
5619      while (branch_length > 0);
5620      }
5621
5622    /* Fill in the ket */
5623
5624    *code = OP_KET;
5625    PUT(code, 1, code - start_bracket);
5626    code += 1 + LINK_SIZE;
5627
5628    /* Resetting option if needed */
5629
5630    if ((options & PCRE_IMS) != oldims && *ptr == ')')
5631      {
5632      *code++ = OP_OPT;
5633      *code++ = oldims;
5634      length += 2;
5635      }
5636
5637    /* Retain the highest bracket number, in case resetting was used. */
5638
5639    cd->bracount = max_bracount;
5640
5641    /* Set values to pass back */
5642
5643    *codeptr = code;
5644    *ptrptr = ptr;
5645    *firstbyteptr = firstbyte;
5646    *reqbyteptr = reqbyte;
5647    if (lengthptr != NULL)
5648      {
5649      if (OFLOW_MAX - *lengthptr < length)
5650        {
5651        *errorcodeptr = ERR20;
5652        return FALSE;
5653        }
5654      *lengthptr += length;
5655      }
5656    return TRUE;
5657    }
5658
5659  /* Another branch follows. In the pre-compile phase, we can move the code
5660  pointer back to where it was for the start of the first branch. (That is,
5661  pretend that each branch is the only one.)
5662
5663  In the real compile phase, insert an ALT node. Its length field points back
5664  to the previous branch while the bracket remains open. At the end the chain
5665  is reversed. It's done like this so that the start of the bracket has a
5666  zero offset until it is closed, making it possible to detect recursion. */
5667
5668  if (lengthptr != NULL)
5669    {
5670    code = *codeptr + 1 + LINK_SIZE + skipbytes;
5671    length += 1 + LINK_SIZE;
5672    }
5673  else
5674    {
5675    *code = OP_ALT;
5676    PUT(code, 1, code - last_branch);
5677    bc.current = last_branch = code;
5678    code += 1 + LINK_SIZE;
5679    }
5680
5681  ptr++;
5682  }
5683/* Control never reaches here */
5684}
5685
5686
5687
5688
5689/*************************************************
5690*          Check for anchored expression         *
5691*************************************************/
5692
5693/* Try to find out if this is an anchored regular expression. Consider each
5694alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5695all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5696it's anchored. However, if this is a multiline pattern, then only OP_SOD
5697counts, since OP_CIRC can match in the middle.
5698
5699We can also consider a regex to be anchored if OP_SOM starts all its branches.
5700This is the code for \G, which means "match at start of match position, taking
5701into account the match offset".
5702
5703A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5704because that will try the rest of the pattern at all possible matching points,
5705so there is no point trying again.... er ....
5706
5707.... except when the .* appears inside capturing parentheses, and there is a
5708subsequent back reference to those parentheses. We haven't enough information
5709to catch that case precisely.
5710
5711At first, the best we could do was to detect when .* was in capturing brackets
5712and the highest back reference was greater than or equal to that level.
5713However, by keeping a bitmap of the first 31 back references, we can catch some
5714of the more common cases more precisely.
5715
5716Arguments:
5717  code           points to start of expression (the bracket)
5718  options        points to the options setting
5719  bracket_map    a bitmap of which brackets we are inside while testing; this
5720                  handles up to substring 31; after that we just have to take
5721                  the less precise approach
5722  backref_map    the back reference bitmap
5723
5724Returns:     TRUE or FALSE
5725*/
5726
5727static BOOL
5728is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5729  unsigned int backref_map)
5730{
5731do {
5732   const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5733     options, PCRE_MULTILINE, FALSE);
5734   register int op = *scode;
5735
5736   /* Non-capturing brackets */
5737
5738   if (op == OP_BRA)
5739     {
5740     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5741     }
5742
5743   /* Capturing brackets */
5744
5745   else if (op == OP_CBRA)
5746     {
5747     int n = GET2(scode, 1+LINK_SIZE);
5748     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5749     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5750     }
5751
5752   /* Other brackets */
5753
5754   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5755     {
5756     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5757     }
5758
5759   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
5760   it isn't in brackets that are or may be referenced. */
5761
5762   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5763             op == OP_TYPEPOSSTAR))
5764     {
5765     if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
5766       return FALSE;
5767     }
5768
5769   /* Check for explicit anchoring */
5770
5771   else if (op != OP_SOD && op != OP_SOM &&
5772           ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5773     return FALSE;
5774   code += GET(code, 1);
5775   }
5776while (*code == OP_ALT);   /* Loop for each alternative */
5777return TRUE;
5778}
5779
5780
5781
5782/*************************************************
5783*         Check for starting with ^ or .*        *
5784*************************************************/
5785
5786/* This is called to find out if every branch starts with ^ or .* so that
5787"first char" processing can be done to speed things up in multiline
5788matching and for non-DOTALL patterns that start with .* (which must start at
5789the beginning or after \n). As in the case of is_anchored() (see above), we
5790have to take account of back references to capturing brackets that contain .*
5791because in that case we can't make the assumption.
5792
5793Arguments:
5794  code           points to start of expression (the bracket)
5795  bracket_map    a bitmap of which brackets we are inside while testing; this
5796                  handles up to substring 31; after that we just have to take
5797                  the less precise approach
5798  backref_map    the back reference bitmap
5799
5800Returns:         TRUE or FALSE
5801*/
5802
5803static BOOL
5804is_startline(const uschar *code, unsigned int bracket_map,
5805  unsigned int backref_map)
5806{
5807do {
5808   const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5809     NULL, 0, FALSE);
5810   register int op = *scode;
5811
5812   /* Non-capturing brackets */
5813
5814   if (op == OP_BRA)
5815     {
5816     if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5817     }
5818
5819   /* Capturing brackets */
5820
5821   else if (op == OP_CBRA)
5822     {
5823     int n = GET2(scode, 1+LINK_SIZE);
5824     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5825     if (!is_startline(scode, new_map, backref_map)) return FALSE;
5826     }
5827
5828   /* Other brackets */
5829
5830   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5831     { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5832
5833   /* .* means "start at start or after \n" if it isn't in brackets that
5834   may be referenced. */
5835
5836   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5837     {
5838     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5839     }
5840
5841   /* Check for explicit circumflex */
5842
5843   else if (op != OP_CIRC) return FALSE;
5844
5845   /* Move on to the next alternative */
5846
5847   code += GET(code, 1);
5848   }
5849while (*code == OP_ALT);  /* Loop for each alternative */
5850return TRUE;
5851}
5852
5853
5854
5855/*************************************************
5856*       Check for asserted fixed first char      *
5857*************************************************/
5858
5859/* During compilation, the "first char" settings from forward assertions are
5860discarded, because they can cause conflicts with actual literals that follow.
5861However, if we end up without a first char setting for an unanchored pattern,
5862it is worth scanning the regex to see if there is an initial asserted first
5863char. If all branches start with the same asserted char, or with a bracket all
5864of whose alternatives start with the same asserted char (recurse ad lib), then
5865we return that char, otherwise -1.
5866
5867Arguments:
5868  code       points to start of expression (the bracket)
5869  options    pointer to the options (used to check casing changes)
5870  inassert   TRUE if in an assertion
5871
5872Returns:     -1 or the fixed first char
5873*/
5874
5875static int
5876find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5877{
5878register int c = -1;
5879do {
5880   int d;
5881   const uschar *scode =
5882     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5883   register int op = *scode;
5884
5885   switch(op)
5886     {
5887     default:
5888     return -1;
5889
5890     case OP_BRA:
5891     case OP_CBRA:
5892     case OP_ASSERT:
5893     case OP_ONCE:
5894     case OP_COND:
5895     if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5896       return -1;
5897     if (c < 0) c = d; else if (c != d) return -1;
5898     break;
5899
5900     case OP_EXACT:       /* Fall through */
5901     scode += 2;
5902
5903     case OP_CHAR:
5904     case OP_CHARNC:
5905     case OP_PLUS:
5906     case OP_MINPLUS:
5907     case OP_POSPLUS:
5908     if (!inassert) return -1;
5909     if (c < 0)
5910       {
5911       c = scode[1];
5912       if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5913       }
5914     else if (c != scode[1]) return -1;
5915     break;
5916     }
5917
5918   code += GET(code, 1);
5919   }
5920while (*code == OP_ALT);
5921return c;
5922}
5923
5924
5925
5926/*************************************************
5927*        Compile a Regular Expression            *
5928*************************************************/
5929
5930/* This function takes a string and returns a pointer to a block of store
5931holding a compiled version of the expression. The original API for this
5932function had no error code return variable; it is retained for backwards
5933compatibility. The new function is given a new name.
5934
5935Arguments:
5936  pattern       the regular expression
5937  options       various option bits
5938  errorcodeptr  pointer to error code variable (pcre_compile2() only)
5939                  can be NULL if you don't want a code value
5940  errorptr      pointer to pointer to error text
5941  erroroffset   ptr offset in pattern where error was detected
5942  tables        pointer to character tables or NULL
5943
5944Returns:        pointer to compiled data block, or NULL on error,
5945                with errorptr and erroroffset set
5946*/
5947
5948PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
5949pcre_compile(const char *pattern, int options, const char **errorptr,
5950  int *erroroffset, const unsigned char *tables)
5951{
5952return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5953}
5954
5955
5956PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
5957pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5958  const char **errorptr, int *erroroffset, const unsigned char *tables)
5959{
5960real_pcre *re;
5961int length = 1;  /* For final END opcode */
5962int firstbyte, reqbyte, newline;
5963int errorcode = 0;
5964int skipatstart = 0;
5965#ifdef SUPPORT_UTF8
5966BOOL utf8;
5967#endif
5968size_t size;
5969uschar *code;
5970const uschar *codestart;
5971const uschar *ptr;
5972compile_data compile_block;
5973compile_data *cd = &compile_block;
5974
5975/* This space is used for "compiling" into during the first phase, when we are
5976computing the amount of memory that is needed. Compiled items are thrown away
5977as soon as possible, so that a fairly large buffer should be sufficient for
5978this purpose. The same space is used in the second phase for remembering where
5979to fill in forward references to subpatterns. */
5980
5981uschar cworkspace[COMPILE_WORK_SIZE];
5982
5983/* Set this early so that early errors get offset 0. */
5984
5985ptr = (const uschar *)pattern;
5986
5987/* We can't pass back an error message if errorptr is NULL; I guess the best we
5988can do is just return NULL, but we can set a code value if there is a code
5989pointer. */
5990
5991if (errorptr == NULL)
5992  {
5993  if (errorcodeptr != NULL) *errorcodeptr = 99;
5994  return NULL;
5995  }
5996
5997*errorptr = NULL;
5998if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5999
6000/* However, we can give a message for this error */
6001
6002if (erroroffset == NULL)
6003  {
6004  errorcode = ERR16;
6005  goto PCRE_EARLY_ERROR_RETURN2;
6006  }
6007
6008*erroroffset = 0;
6009
6010/* Can't support UTF8 unless PCRE has been compiled to include the code. */
6011
6012#ifdef SUPPORT_UTF8
6013utf8 = (options & PCRE_UTF8) != 0;
6014if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6015     (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
6016  {
6017  errorcode = ERR44;
6018  goto PCRE_EARLY_ERROR_RETURN2;
6019  }
6020#else
6021if ((options & PCRE_UTF8) != 0)
6022  {
6023  errorcode = ERR32;
6024  goto PCRE_EARLY_ERROR_RETURN;
6025  }
6026#endif
6027
6028if ((options & ~PUBLIC_OPTIONS) != 0)
6029  {
6030  errorcode = ERR17;
6031  goto PCRE_EARLY_ERROR_RETURN;
6032  }
6033
6034/* Set up pointers to the individual character tables */
6035
6036if (tables == NULL) tables = _pcre_default_tables;
6037cd->lcc = tables + lcc_offset;
6038cd->fcc = tables + fcc_offset;
6039cd->cbits = tables + cbits_offset;
6040cd->ctypes = tables + ctypes_offset;
6041
6042/* Check for global one-time settings at the start of the pattern, and remember
6043the offset for later. */
6044
6045while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
6046  {
6047  int newnl = 0;
6048  int newbsr = 0;
6049
6050  if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
6051    { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6052  else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3)  == 0)
6053    { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6054  else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5)  == 0)
6055    { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6056  else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
6057    { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6058  else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8)  == 0)
6059    { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6060
6061  else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
6062    { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6063  else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
6064    { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6065
6066  if (newnl != 0)
6067    options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6068  else if (newbsr != 0)
6069    options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6070  else break;
6071  }
6072
6073/* Check validity of \R options. */
6074
6075switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6076  {
6077  case 0:
6078  case PCRE_BSR_ANYCRLF:
6079  case PCRE_BSR_UNICODE:
6080  break;
6081  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6082  }
6083
6084/* Handle different types of newline. The three bits give seven cases. The
6085current code allows for fixed one- or two-byte sequences, plus "any" and
6086"anycrlf". */
6087
6088switch (options & PCRE_NEWLINE_BITS)
6089  {
6090  case 0: newline = NEWLINE; break;   /* Build-time default */
6091  case PCRE_NEWLINE_CR: newline = '\r'; break;
6092  case PCRE_NEWLINE_LF: newline = '\n'; break;
6093  case PCRE_NEWLINE_CR+
6094       PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
6095  case PCRE_NEWLINE_ANY: newline = -1; break;
6096  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6097  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6098  }
6099
6100if (newline == -2)
6101  {
6102  cd->nltype = NLTYPE_ANYCRLF;
6103  }
6104else if (newline < 0)
6105  {
6106  cd->nltype = NLTYPE_ANY;
6107  }
6108else
6109  {
6110  cd->nltype = NLTYPE_FIXED;
6111  if (newline > 255)
6112    {
6113    cd->nllen = 2;
6114    cd->nl[0] = (newline >> 8) & 255;
6115    cd->nl[1] = newline & 255;
6116    }
6117  else
6118    {
6119    cd->nllen = 1;
6120    cd->nl[0] = newline;
6121    }
6122  }
6123
6124/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6125references to help in deciding whether (.*) can be treated as anchored or not.
6126*/
6127
6128cd->top_backref = 0;
6129cd->backref_map = 0;
6130
6131/* Reflect pattern for debugging output */
6132
6133DPRINTF(("------------------------------------------------------------------\n"));
6134DPRINTF(("%s\n", pattern));
6135
6136/* Pretend to compile the pattern while actually just accumulating the length
6137of memory required. This behaviour is triggered by passing a non-NULL final
6138argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6139to compile parts of the pattern into; the compiled code is discarded when it is
6140no longer needed, so hopefully this workspace will never overflow, though there
6141is a test for its doing so. */
6142
6143cd->bracount = cd->final_bracount = 0;
6144cd->names_found = 0;
6145cd->name_entry_size = 0;
6146cd->name_table = NULL;
6147cd->start_workspace = cworkspace;
6148cd->start_code = cworkspace;
6149cd->hwm = cworkspace;
6150cd->start_pattern = (const uschar *)pattern;
6151cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6152cd->req_varyopt = 0;
6153cd->external_options = options;
6154cd->external_flags = 0;
6155
6156/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6157don't need to look at the result of the function here. The initial options have
6158been put into the cd block so that they can be changed if an option setting is
6159found within the regex right at the beginning. Bringing initial option settings
6160outside can help speed up starting point checks. */
6161
6162ptr += skipatstart;
6163code = cworkspace;
6164*code = OP_BRA;
6165(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6166  &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6167  &length);
6168if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6169
6170DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6171  cd->hwm - cworkspace));
6172
6173if (length > MAX_PATTERN_SIZE)
6174  {
6175  errorcode = ERR20;
6176  goto PCRE_EARLY_ERROR_RETURN;
6177  }
6178
6179/* Compute the size of data block needed and get it, either from malloc or
6180externally provided function. Integer overflow should no longer be possible
6181because nowadays we limit the maximum value of cd->names_found and
6182cd->name_entry_size. */
6183
6184size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6185re = (real_pcre *)(pcre_malloc)(size);
6186
6187if (re == NULL)
6188  {
6189  errorcode = ERR21;
6190  goto PCRE_EARLY_ERROR_RETURN;
6191  }
6192
6193/* Put in the magic number, and save the sizes, initial options, internal
6194flags, and character table pointer. NULL is used for the default character
6195tables. The nullpad field is at the end; it's there to help in the case when a
6196regex compiled on a system with 4-byte pointers is run on another with 8-byte
6197pointers. */
6198
6199re->magic_number = MAGIC_NUMBER;
6200re->size = size;
6201re->options = cd->external_options;
6202re->flags = cd->external_flags;
6203re->dummy1 = 0;
6204re->first_byte = 0;
6205re->req_byte = 0;
6206re->name_table_offset = sizeof(real_pcre);
6207re->name_entry_size = cd->name_entry_size;
6208re->name_count = cd->names_found;
6209re->ref_count = 0;
6210re->tables = (tables == _pcre_default_tables)? NULL : tables;
6211re->nullpad = NULL;
6212
6213/* The starting points of the name/number translation table and of the code are
6214passed around in the compile data block. The start/end pattern and initial
6215options are already set from the pre-compile phase, as is the name_entry_size
6216field. Reset the bracket count and the names_found field. Also reset the hwm
6217field; this time it's used for remembering forward references to subpatterns.
6218*/
6219
6220cd->final_bracount = cd->bracount;  /* Save for checking forward references */
6221cd->bracount = 0;
6222cd->names_found = 0;
6223cd->name_table = (uschar *)re + re->name_table_offset;
6224codestart = cd->name_table + re->name_entry_size * re->name_count;
6225cd->start_code = codestart;
6226cd->hwm = cworkspace;
6227cd->req_varyopt = 0;
6228cd->had_accept = FALSE;
6229
6230/* Set up a starting, non-extracting bracket, then compile the expression. On
6231error, errorcode will be set non-zero, so we don't need to look at the result
6232of the function here. */
6233
6234ptr = (const uschar *)pattern + skipatstart;
6235code = (uschar *)codestart;
6236*code = OP_BRA;
6237(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6238  &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6239re->top_bracket = cd->bracount;
6240re->top_backref = cd->top_backref;
6241re->flags = cd->external_flags;
6242
6243if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
6244
6245/* If not reached end of pattern on success, there's an excess bracket. */
6246
6247if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6248
6249/* Fill in the terminating state and check for disastrous overflow, but
6250if debugging, leave the test till after things are printed out. */
6251
6252*code++ = OP_END;
6253
6254#ifndef DEBUG
6255if (code - codestart > length) errorcode = ERR23;
6256#endif
6257
6258/* Fill in any forward references that are required. */
6259
6260while (errorcode == 0 && cd->hwm > cworkspace)
6261  {
6262  int offset, recno;
6263  const uschar *groupptr;
6264  cd->hwm -= LINK_SIZE;
6265  offset = GET(cd->hwm, 0);
6266  recno = GET(codestart, offset);
6267  groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6268  if (groupptr == NULL) errorcode = ERR53;
6269    else PUT(((uschar *)codestart), offset, groupptr - codestart);
6270  }
6271
6272/* Give an error if there's back reference to a non-existent capturing
6273subpattern. */
6274
6275if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6276
6277/* Failed to compile, or error while post-processing */
6278
6279if (errorcode != 0)
6280  {
6281  (pcre_free)(re);
6282  PCRE_EARLY_ERROR_RETURN:
6283  *erroroffset = ptr - (const uschar *)pattern;
6284  PCRE_EARLY_ERROR_RETURN2:
6285  *errorptr = find_error_text(errorcode);
6286  if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6287  return NULL;
6288  }
6289
6290/* If the anchored option was not passed, set the flag if we can determine that
6291the pattern is anchored by virtue of ^ characters or \A or anything else (such
6292as starting with .* when DOTALL is set).
6293
6294Otherwise, if we know what the first byte has to be, save it, because that
6295speeds up unanchored matches no end. If not, see if we can set the
6296PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6297start with ^, and also when all branches start with .* for non-DOTALL matches.
6298*/
6299
6300if ((re->options & PCRE_ANCHORED) == 0)
6301  {
6302  int temp_options = re->options;   /* May get changed during these scans */
6303  if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6304    re->options |= PCRE_ANCHORED;
6305  else
6306    {
6307    if (firstbyte < 0)
6308      firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6309    if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
6310      {
6311      int ch = firstbyte & 255;
6312      re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6313         cd->fcc[ch] == ch)? ch : firstbyte;
6314      re->flags |= PCRE_FIRSTSET;
6315      }
6316    else if (is_startline(codestart, 0, cd->backref_map))
6317      re->flags |= PCRE_STARTLINE;
6318    }
6319  }
6320
6321/* For an anchored pattern, we use the "required byte" only if it follows a
6322variable length item in the regex. Remove the caseless flag for non-caseable
6323bytes. */
6324
6325if (reqbyte >= 0 &&
6326     ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6327  {
6328  int ch = reqbyte & 255;
6329  re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6330    cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6331  re->flags |= PCRE_REQCHSET;
6332  }
6333
6334/* Print out the compiled data if debugging is enabled. This is never the
6335case when building a production library. */
6336
6337#ifdef DEBUG
6338
6339printf("Length = %d top_bracket = %d top_backref = %d\n",
6340  length, re->top_bracket, re->top_backref);
6341
6342printf("Options=%08x\n", re->options);
6343
6344if ((re->flags & PCRE_FIRSTSET) != 0)
6345  {
6346  int ch = re->first_byte & 255;
6347  const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6348    "" : " (caseless)";
6349  if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6350    else printf("First char = \\x%02x%s\n", ch, caseless);
6351  }
6352
6353if ((re->flags & PCRE_REQCHSET) != 0)
6354  {
6355  int ch = re->req_byte & 255;
6356  const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6357    "" : " (caseless)";
6358  if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6359    else printf("Req char = \\x%02x%s\n", ch, caseless);
6360  }
6361
6362pcre_printint(re, stdout, TRUE);
6363
6364/* This check is done here in the debugging case so that the code that
6365was compiled can be seen. */
6366
6367if (code - codestart > length)
6368  {
6369  (pcre_free)(re);
6370  *errorptr = find_error_text(ERR23);
6371  *erroroffset = ptr - (uschar *)pattern;
6372  if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6373  return NULL;
6374  }
6375#endif   /* DEBUG */
6376
6377return (pcre *)re;
6378}
6379
6380/* End of pcre_compile.c */
Note: See TracBrowser for help on using the repository browser.