Context Navigation

source: XMLIO_V2/external/src/POCO/Foundation.save/pcre_compile.c @ 80

Last change on this file since 80 was 80, checked in by ymipsl, 14 years ago
ajout lib externe
Property svn:eol-style set to `native`
File size: 198.1 KB

Line
1	/*************************************************
2	* Perl-Compatible Regular Expressions *
3	*************************************************/
4
5	/* PCRE is a library of functions to support regular expressions whose syntax
6	and semantics are as close as possible to those of the Perl 5 language.
7
8	Written by Philip Hazel
9	Copyright (c) 1997-2008 University of Cambridge
10
11	-----------------------------------------------------------------------------
12	Redistribution and use in source and binary forms, with or without
13	modification, are permitted provided that the following conditions are met:
14
15	* Redistributions of source code must retain the above copyright notice,
16	this list of conditions and the following disclaimer.
17
18	* Redistributions in binary form must reproduce the above copyright
19	notice, this list of conditions and the following disclaimer in the
20	documentation and/or other materials provided with the distribution.
21
22	* Neither the name of the University of Cambridge nor the names of its
23	contributors may be used to endorse or promote products derived from
24	this software without specific prior written permission.
25
26	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36	POSSIBILITY OF SUCH DAMAGE.
37	-----------------------------------------------------------------------------
38	*/
39
40
41	/* This module contains the external function pcre_compile(), along with
42	supporting internal functions that are not used by other modules. */
43
44
45	#include "pcre_config.h"
46
47	#define NLBLOCK cd /* Block containing newline information */
48	#define PSSTART start_pattern /* Field containing processed string start */
49	#define PSEND end_pattern /* Field containing processed string end */
50
51	#include "pcre_internal.h"
52
53
54	/* When DEBUG is defined, we need the pcre_printint() function, which is also
55	used by pcretest. DEBUG is not defined when building a production library. */
56
57	#ifdef DEBUG
58	#include "pcre_printint.src"
59	#endif
60
61
62	/* Macro for setting individual bits in class bitmaps. */
63
/* SETBIT(a,b) sets bit number b in the byte-array bitmap a: byte b/8, bit b%8
counted from the least significant end. Both arguments and the whole expansion
are parenthesized so that expression arguments expand as intended. Note that b
is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
65
66	/* Maximum length value to check against when making sure that the integer that
67	holds the compiled pattern length does not overflow. We make it a bit less than
68	INT_MAX to allow for adding in group terminating bytes, so that we don't have
69	to check them every time. */
70
71	#define OFLOW_MAX (INT_MAX - 20)
72
73
74	/*************************************************
75	* Code parameters and static tables *
76	*************************************************/
77
78	/* This value specifies the size of stack workspace that is used during the
79	first pre-compile phase that determines how much memory is required. The regex
80	is partly compiled into this space, but the compiled parts are discarded as
81	soon as they can be, so that hopefully there will never be an overrun. The code
82	does, however, check for an overrun. The largest amount I've seen used is 218,
83	so this number is very generous.
84
85	The same workspace is used during the second, actual compile phase for
86	remembering forward references to groups so that they can be filled in at the
87	end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
88	is 4 there is plenty of room. */
89
90	#define COMPILE_WORK_SIZE (4096)
91
92
93	/* Table for handling escaped characters in the range '0'-'z'. Positive returns
94	are simple data values; negative values are for special things like \d and so
95	on. Zero means further processing is needed (for things like \x), or the escape
96	is invalid. */
97
98	#ifndef EBCDIC /* This is the "normal" table for ASCII systems */
99	static const short int escapes[] = {
100	0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
101	0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
102	'@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
103	-ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
104	-ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
105	-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
106	'`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
107	-ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
108	-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
109	0, 0, -ESC_z /* x - z */
110	};
111
112	#else /* This is the "abnormal" table for EBCDIC systems */
113	static const short int escapes[] = {
114	/* 48 */ 0, 0, 0, '.', '<', '(', '+', '\|',
115	/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
116	/* 58 / 0, 0, '!', '$', '', ')', ';', '~',
117	/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
118	/* 68 */ 0, 0, '\|', ',', '%', '_', '>', '?',
119	/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
120	/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
121	/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
122	/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
123	/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
124	/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
125	/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
126	/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
127	/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
128	/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
129	/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
130	/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
131	/* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
132	/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
133	/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
134	/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
135	/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
136	/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
137	};
138	#endif
139
140
141	/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
142	searched linearly. Put all the names into a single string, in order to reduce
143	the number of relocations when a shared library is dynamically linked. */
144
145	typedef struct verbitem {
146	int len;
147	int op;
148	} verbitem;
149
150	static const char verbnames[] =
151	"ACCEPT\0"
152	"COMMIT\0"
153	"F\0"
154	"FAIL\0"
155	"PRUNE\0"
156	"SKIP\0"
157	"THEN";
158
159	static const verbitem verbs[] = {
160	{ 6, OP_ACCEPT },
161	{ 6, OP_COMMIT },
162	{ 1, OP_FAIL },
163	{ 4, OP_FAIL },
164	{ 5, OP_PRUNE },
165	{ 4, OP_SKIP },
166	{ 4, OP_THEN }
167	};
168
169	static const int verbcount = sizeof(verbs)/sizeof(verbitem);
170
171
172	/* Tables of names of POSIX character classes and their lengths. The names are
173	now all in a single string, to reduce the number of relocations when a shared
174	library is dynamically loaded. The list of lengths is terminated by a zero
175	length entry. The first three must be alpha, lower, upper, as this is assumed
176	for handling case independence. */
177
178	static const char posix_names[] =
179	"alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
180	"cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
181	"word\0" "xdigit";
182
183	static const uschar posix_name_lengths[] = {
184	5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
185
186	/* Table of class bit maps for each POSIX class. Each class is formed from a
187	base map, with an optional addition or removal of another map. Then, for some
188	classes, there is some additional tweaking: for [:blank:] the vertical space
189	characters are removed, and for [:alpha:] and [:alnum:] the underscore
190	character is removed. The triples in the table consist of the base map offset,
191	second map offset or -1 if no second map, and a non-negative value for map
192	addition or a negative value for map subtraction (if there are two maps). The
193	absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
194	remove vertical space characters, 2 => remove underscore. */
195
196	static const int posix_class_maps[] = {
197	cbit_word, cbit_digit, -2, /* alpha */
198	cbit_lower, -1, 0, /* lower */
199	cbit_upper, -1, 0, /* upper */
200	cbit_word, -1, 2, /* alnum - word without underscore */
201	cbit_print, cbit_cntrl, 0, /* ascii */
202	cbit_space, -1, 1, /* blank - a GNU extension */
203	cbit_cntrl, -1, 0, /* cntrl */
204	cbit_digit, -1, 0, /* digit */
205	cbit_graph, -1, 0, /* graph */
206	cbit_print, -1, 0, /* print */
207	cbit_punct, -1, 0, /* punct */
208	cbit_space, -1, 0, /* space */
209	cbit_word, -1, 0, /* word - a Perl extension */
210	cbit_xdigit,-1, 0 /* xdigit */
211	};
212
213
214	#define STRING(a) # a
215	#define XSTRING(s) STRING(s)
216
217	/* The texts of compile-time error messages. These are "char *" because they
218	are passed to the outside world. Do not ever re-use any error number, because
219	they are documented. Always add a new error instead. Messages marked DEAD below
220	are no longer used. This used to be a table of strings, but in order to reduce
221	the number of relocations needed when a shared library is loaded dynamically,
222	it is now one long string. We cannot use a table of offsets, because the
223	lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
224	simply count through to the one we want - this isn't a performance issue
225	because these strings are used only when there is a compilation error. */
226
227	static const char error_texts[] =
228	"no error\0"
229	"\\ at end of pattern\0"
230	"\\c at end of pattern\0"
231	"unrecognized character follows \\\0"
232	"numbers out of order in {} quantifier\0"
233	/* 5 */
234	"number too big in {} quantifier\0"
235	"missing terminating ] for character class\0"
236	"invalid escape sequence in character class\0"
237	"range out of order in character class\0"
238	"nothing to repeat\0"
239	/* 10 */
240	"operand of unlimited repeat could match the empty string\0" / DEAD /
241	"internal error: unexpected repeat\0"
242	"unrecognized character after (? or (?-\0"
243	"POSIX named classes are supported only within a class\0"
244	"missing )\0"
245	/* 15 */
246	"reference to non-existent subpattern\0"
247	"erroffset passed as NULL\0"
248	"unknown option bit(s) set\0"
249	"missing ) after comment\0"
250	"parentheses nested too deeply\0" / DEAD /
251	/* 20 */
252	"regular expression is too large\0"
253	"failed to get memory\0"
254	"unmatched parentheses\0"
255	"internal error: code overflow\0"
256	"unrecognized character after (?<\0"
257	/* 25 */
258	"lookbehind assertion is not fixed length\0"
259	"malformed number or name after (?(\0"
260	"conditional group contains more than two branches\0"
261	"assertion expected after (?(\0"
262	"(?R or (?[+-]digits must be followed by )\0"
263	/* 30 */
264	"unknown POSIX class name\0"
265	"POSIX collating elements are not supported\0"
266	"this version of PCRE is not compiled with PCRE_UTF8 support\0"
267	"spare error\0" / DEAD /
268	"character value in \\x{...} sequence is too large\0"
269	/* 35 */
270	"invalid condition (?(0)\0"
271	"\\C not allowed in lookbehind assertion\0"
272	"PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
273	"number after (?C is > 255\0"
274	"closing ) for (?C expected\0"
275	/* 40 */
276	"recursive call could loop indefinitely\0"
277	"unrecognized character after (?P\0"
278	"syntax error in subpattern name (missing terminator)\0"
279	"two named subpatterns have the same name\0"
280	"invalid UTF-8 string\0"
281	/* 45 */
282	"support for \\P, \\p, and \\X has not been compiled\0"
283	"malformed \\P or \\p sequence\0"
284	"unknown property name after \\P or \\p\0"
285	"subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
286	"too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
287	/* 50 */
288	"repeated subpattern is too long\0" / DEAD /
289	"octal value is greater than \\377 (not in UTF-8 mode)\0"
290	"internal error: overran compiling workspace\0"
291	"internal error: previously-checked referenced subpattern not found\0"
292	"DEFINE group contains more than one branch\0"
293	/* 55 */
294	"repeating a DEFINE group is not allowed\0"
295	"inconsistent NEWLINE options\0"
296	"\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
297	"a numbered reference must not be zero\0"
298	"(*VERB) with an argument is not supported\0"
299	/* 60 */
300	"(*VERB) not recognized\0"
301	"number is too big\0"
302	"subpattern name expected\0"
303	"digit expected after (?+\0"
304	"] is an invalid data character in JavaScript compatibility mode";
305
306
307	/* Table to identify digits and hex digits. This is used when compiling
308	patterns. Note that the tables in chartables are dependent on the locale, and
309	may mark arbitrary characters as digits - but the PCRE compiling code expects
310	to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
311	a private table here. It costs 256 bytes, but it is a lot faster than doing
312	character value tests (at least in some simple cases I timed), and in some
313	applications one wants PCRE to compile efficiently as well as match
314	efficiently.
315
316	For convenience, we use the same bit definitions as in chartables:
317
318	0x04 decimal digit
319	0x08 hexadecimal digit
320
321	Then we can use ctype_digit and ctype_xdigit in the code. */
322
323	#ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
324	static const unsigned char digitab[] =
325	{
326	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
327	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
328	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
329	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
330	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
331	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
332	0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
333	0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
334	0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
335	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
336	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
337	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
338	0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
339	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
340	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
341	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
342	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
343	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
344	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
345	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
346	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
347	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
348	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
349	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
350	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
351	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
352	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
353	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
354	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
355	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
356	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
357	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
358
359	#else /* This is the "abnormal" case, for EBCDIC systems */
360	static const unsigned char digitab[] =
361	{
362	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
363	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
364	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
365	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
366	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
367	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
368	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
369	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
370	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
371	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- \| */
372	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
373	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
374	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
375	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
376	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
377	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
378	0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
379	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
380	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
381	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
382	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
383	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
384	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
385	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
386	0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
387	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
388	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
389	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
390	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
391	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
392	0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
393	0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
394
395	static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
396	0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
397	0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
398	0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
399	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
400	0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
401	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
402	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
403	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
404	0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
405	0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- \| */
406	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
407	0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
408	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
409	0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
410	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
411	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
412	0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
413	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
414	0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
415	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
416	0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
417	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
418	0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
419	0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
420	0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
421	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
422	0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
423	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
424	0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
425	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
426	0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
427	0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
428	#endif
429
430
431	/* Definition to allow mutual recursion */
432
433	static BOOL
434	compile_regex(int, int, uschar , const uschar , int *, BOOL, BOOL, int,
435	int , int , branch_chain , compile_data , int *);
436
437
438
439	/*************************************************
440	* Find an error text *
441	*************************************************/
442
443	/* The error texts are now all in one long string, to save on relocations. As
444	some of the text is of unknown length, we can't use a table of offsets.
445	Instead, just count through the strings. This is not a performance issue
446	because it happens only when there has been a compilation error.
447
448	Argument: the error number
449	Returns: pointer to the error string
450	*/
451
452	static const char *
453	find_error_text(int n)
454	{
455	const char *s = error_texts;
456	for (; n > 0; n--) while (*s++ != 0) {};
457	return s;
458	}
459
460
461	/*************************************************
462	* Handle escapes *
463	*************************************************/
464
465	/* This function is called when a \ has been encountered. It either returns a
466	positive value for a simple escape such as \n, or a negative value which
467	encodes one of the more complicated things such as \d. A backreference to group
468	n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
469	UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
470	ptr is pointing at the \. On exit, it is on the final character of the escape
471	sequence.
472
473	Arguments:
474	ptrptr points to the pattern position pointer
475	errorcodeptr points to the errorcode variable
476	bracount number of previous extracting brackets
477	options the options bits
478	isclass TRUE if inside a character class
479
480	Returns: zero or positive => a data character
481	negative => a special escape sequence
482	on error, errorcodeptr is set
483	*/
484
485	static int
486	check_escape(const uschar *ptrptr, int errorcodeptr, int bracount,
487	int options, BOOL isclass)
488	{
489	BOOL utf8 = (options & PCRE_UTF8) != 0;
490	const uschar ptr = ptrptr + 1;
491	int c, i;
492
493	GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
494	ptr--; /* Set pointer back to the last byte */
495
496	/* If backslash is at the end of the pattern, it's an error. */
497
498	if (c == 0) *errorcodeptr = ERR1;
499
500	/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
501	in a table. A non-zero result is something that can be returned immediately.
502	Otherwise further processing may be required. */
503
504	#ifndef EBCDIC /* ASCII coding */
505	else if (c < '0' \|\| c > 'z') {} /* Not alphanumeric */
506	else if ((i = escapes[c - '0']) != 0) c = i;
507
508	#else /* EBCDIC coding */
509	else if (c < 'a' \|\| (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
510	else if ((i = escapes[c - 0x48]) != 0) c = i;
511	#endif
512
513	/* Escapes that need further processing, or are illegal. */
514
515	else
516	{
517	const uschar *oldptr;
518	BOOL braced, negated;
519
520	switch (c)
521	{
522	/* A number of Perl escapes are not handled by PCRE. We give an explicit
523	error. */
524
525	case 'l':
526	case 'L':
527	case 'N':
528	case 'u':
529	case 'U':
530	*errorcodeptr = ERR37;
531	break;
532
533	/* \g must be followed by one of a number of specific things:
534
535	(1) A number, either plain or braced. If positive, it is an absolute
536	backreference. If negative, it is a relative backreference. This is a Perl
537	5.10 feature.
538
539	(2) Perl 5.10 also supports \g{name} as a reference to a named group. This
540	is part of Perl's movement towards a unified syntax for back references. As
541	this is synonymous with \k{name}, we fudge it up by pretending it really
542	was \k.
543
544	(3) For Oniguruma compatibility we also support \g followed by a name or a
545	number either in angle brackets or in single quotes. However, these are
546	(possibly recursive) subroutine calls, _not_ backreferences. Just return
547	the -ESC_g code (cf \k). */
548
549	case 'g':
550	if (ptr[1] == '<' \|\| ptr[1] == '\'')
551	{
552	c = -ESC_g;
553	break;
554	}
555
556	/* Handle the Perl-compatible cases */
557
558	if (ptr[1] == '{')
559	{
560	const uschar *p;
561	for (p = ptr+2; p != 0 && p != '}'; p++)
562	if (p != '-' && (digitab[p] & ctype_digit) == 0) break;
563	if (p != 0 && p != '}')
564	{
565	c = -ESC_k;
566	break;
567	}
568	braced = TRUE;
569	ptr++;
570	}
571	else braced = FALSE;
572
573	if (ptr[1] == '-')
574	{
575	negated = TRUE;
576	ptr++;
577	}
578	else negated = FALSE;
579
580	c = 0;
581	while ((digitab[ptr[1]] & ctype_digit) != 0)
582	c = c * 10 + *(++ptr) - '0';
583
584	if (c < 0) /* Integer overflow */
585	{
586	*errorcodeptr = ERR61;
587	break;
588	}
589
590	if (braced && *(++ptr) != '}')
591	{
592	*errorcodeptr = ERR57;
593	break;
594	}
595
596	if (c == 0)
597	{
598	*errorcodeptr = ERR58;
599	break;
600	}
601
602	if (negated)
603	{
604	if (c > bracount)
605	{
606	*errorcodeptr = ERR15;
607	break;
608	}
609	c = bracount - (c - 1);
610	}
611
612	c = -(ESC_REF + c);
613	break;
614
615	/* The handling of escape sequences consisting of a string of digits
616	starting with one that is not zero is not straightforward. By experiment,
617	the way Perl works seems to be as follows:
618
619	Outside a character class, the digits are read as a decimal number. If the
620	number is less than 10, or if there are that many previous extracting
621	left brackets, then it is a back reference. Otherwise, up to three octal
622	digits are read to form an escaped byte. Thus \123 is likely to be octal
623	123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
624	value is greater than 377, the least significant 8 bits are taken. Inside a
625	character class, \ followed by a digit is always an octal number. */
626
627	case '1': case '2': case '3': case '4': case '5':
628	case '6': case '7': case '8': case '9':
629
630	if (!isclass)
631	{
632	oldptr = ptr;
633	c -= '0';
634	while ((digitab[ptr[1]] & ctype_digit) != 0)
635	c = c * 10 + *(++ptr) - '0';
636	if (c < 0) /* Integer overflow */
637	{
638	*errorcodeptr = ERR61;
639	break;
640	}
641	if (c < 10 \|\| c <= bracount)
642	{
643	c = -(ESC_REF + c);
644	break;
645	}
646	ptr = oldptr; /* Put the pointer back and fall through */
647	}
648
649	/* Handle an octal number following \. If the first digit is 8 or 9, Perl
650	generates a binary zero byte and treats the digit as a following literal.
651	Thus we have to pull back the pointer by one. */
652
653	if ((c = *ptr) >= '8')
654	{
655	ptr--;
656	c = 0;
657	break;
658	}
659
660	/* \0 always starts an octal number, but we may drop through to here with a
661	larger first octal digit. The original code used just to take the least
662	significant 8 bits of octal numbers (I think this is what early Perls used
663	to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
664	than 3 octal digits. */
665
666	case '0':
667	c -= '0';
668	while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
669	c = c * 8 + *(++ptr) - '0';
670	if (!utf8 && c > 255) *errorcodeptr = ERR51;
671	break;
672
673	/* \x is complicated. \x{ddd} is a character number which can be greater
674	than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
675	treated as a data character. */
676
677	case 'x':
678	if (ptr[1] == '{')
679	{
680	const uschar *pt = ptr + 2;
681	int count = 0;
682
683	c = 0;
684	while ((digitab[*pt] & ctype_xdigit) != 0)
685	{
686	register int cc = *pt++;
687	if (c == 0 && cc == '0') continue; /* Leading zeroes */
688	count++;
689
690	#ifndef EBCDIC /* ASCII coding */
691	if (cc >= 'a') cc -= 32; /* Convert to upper case */
692	c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
693	#else /* EBCDIC coding */
694	if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
695	c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
696	#endif
697	}
698
699	if (*pt == '}')
700	{
701	if (c < 0 \|\| count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
702	ptr = pt;
703	break;
704	}
705
706	/* If the sequence of hex digits does not end with '}', then we don't
707	recognize this construct; fall through to the normal \x handling. */
708	}
709
710	/* Read just a single-byte hex-defined char */
711
712	c = 0;
713	while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
714	{
715	int cc; /* Some compilers don't like ++ */
716	cc = (++ptr); / in initializers */
717	#ifndef EBCDIC /* ASCII coding */
718	if (cc >= 'a') cc -= 32; /* Convert to upper case */
719	c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
720	#else /* EBCDIC coding */
721	if (cc <= 'z') cc += 64; /* Convert to upper case */
722	c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
723	#endif
724	}
725	break;
726
727	/* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
728	This coding is ASCII-specific, but then the whole concept of \cx is
729	ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
730
731	case 'c':
732	c = *(++ptr);
733	if (c == 0)
734	{
735	*errorcodeptr = ERR2;
736	break;
737	}
738
739	#ifndef EBCDIC /* ASCII coding */
740	if (c >= 'a' && c <= 'z') c -= 32;
741	c ^= 0x40;
742	#else /* EBCDIC coding */
743	if (c >= 'a' && c <= 'z') c += 64;
744	c ^= 0xC0;
745	#endif
746	break;
747
748	/* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
749	other alphanumeric following \ is an error if PCRE_EXTRA was set;
750	otherwise, for Perl compatibility, it is a literal. This code looks a bit
751	odd, but there used to be some cases other than the default, and there may
752	be again in future, so I haven't "optimized" it. */
753
754	default:
755	if ((options & PCRE_EXTRA) != 0) switch(c)
756	{
757	default:
758	*errorcodeptr = ERR3;
759	break;
760	}
761	break;
762	}
763	}
764
765	*ptrptr = ptr;
766	return c;
767	}
768
769
770
771	#ifdef SUPPORT_UCP
772	/*************************************************
773	* Handle \P and \p *
774	*************************************************/
775
776	/* This function is called after \P or \p has been encountered, provided that
777	PCRE is compiled with support for Unicode properties. On entry, ptrptr is
778	pointing at the P or p. On exit, it is pointing at the final character of the
779	escape sequence.
780
781	Argument:
782	ptrptr points to the pattern position pointer
783	negptr points to a boolean that is set TRUE for negation else FALSE
784	dptr points to an int that is set to the detailed property value
785	errorcodeptr points to the error code variable
786
787	Returns: type value from ucp_type_table, or -1 for an invalid type
788	*/
789
790	static int
791	get_ucp(const uschar *ptrptr, BOOL negptr, int dptr, int errorcodeptr)
792	{
793	int c, i, bot, top;
794	const uschar ptr = ptrptr;
795	char name[32];
796
797	c = *(++ptr);
798	if (c == 0) goto ERROR_RETURN;
799
800	*negptr = FALSE;
801
802	/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
803	negation. */
804
805	if (c == '{')
806	{
807	if (ptr[1] == '^')
808	{
809	*negptr = TRUE;
810	ptr++;
811	}
812	for (i = 0; i < (int)sizeof(name) - 1; i++)
813	{
814	c = *(++ptr);
815	if (c == 0) goto ERROR_RETURN;
816	if (c == '}') break;
817	name[i] = c;
818	}
819	if (c !='}') goto ERROR_RETURN;
820	name[i] = 0;
821	}
822
823	/* Otherwise there is just one following character */
824
825	else
826	{
827	name[0] = c;
828	name[1] = 0;
829	}
830
831	*ptrptr = ptr;
832
833	/* Search for a recognized property name using binary chop */
834
835	bot = 0;
836	top = _pcre_utt_size;
837
838	while (bot < top)
839	{
840	i = (bot + top) >> 1;
841	c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
842	if (c == 0)
843	{
844	*dptr = _pcre_utt[i].value;
845	return _pcre_utt[i].type;
846	}
847	if (c > 0) bot = i + 1; else top = i;
848	}
849
850	*errorcodeptr = ERR47;
851	*ptrptr = ptr;
852	return -1;
853
854	ERROR_RETURN:
855	*errorcodeptr = ERR46;
856	*ptrptr = ptr;
857	return -1;
858	}
859	#endif
860
861
862
863
864	/*************************************************
865	* Check for counted repeat *
866	*************************************************/
867
868	/* This function is called when a '{' is encountered in a place where it might
869	start a quantifier. It looks ahead to see if it really is a quantifier or not.
870	It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
871	where the ddds are digits.
872
873	Arguments:
874	p pointer to the first char after '{'
875
876	Returns: TRUE or FALSE
877	*/
878
879	static BOOL
880	is_counted_repeat(const uschar *p)
881	{
882	if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
883	while ((digitab[*p] & ctype_digit) != 0) p++;
884	if (*p == '}') return TRUE;
885
886	if (*p++ != ',') return FALSE;
887	if (*p == '}') return TRUE;
888
889	if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
890	while ((digitab[*p] & ctype_digit) != 0) p++;
891
892	return (*p == '}');
893	}
894
895
896
897	/*************************************************
898	* Read repeat counts *
899	*************************************************/
900
901	/* Read an item of the form {n,m} and return the values. This is called only
902	after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
903	so the syntax is guaranteed to be correct, but we need to check the values.
904
905	Arguments:
906	p pointer to first char after '{'
907	minp pointer to int for min
908	maxp pointer to int for max
909	returned as -1 if no max
910	errorcodeptr points to error code variable
911
912	Returns: pointer to '}' on success;
913	current ptr on error, with errorcodeptr set non-zero
914	*/
915
916	static const uschar *
917	read_repeat_counts(const uschar p, int minp, int maxp, int errorcodeptr)
918	{
919	int min = 0;
920	int max = -1;
921
922	/* Read the minimum value and do a paranoid check: a negative value indicates
923	an integer overflow. */
924
925	while ((digitab[p] & ctype_digit) != 0) min = min 10 + *p++ - '0';
926	if (min < 0 \|\| min > 65535)
927	{
928	*errorcodeptr = ERR5;
929	return p;
930	}
931
932	/* Read the maximum value if there is one, and again do a paranoid on its size.
933	Also, max must not be less than min. */
934
935	if (*p == '}') max = min; else
936	{
937	if (*(++p) != '}')
938	{
939	max = 0;
940	while((digitab[p] & ctype_digit) != 0) max = max 10 + *p++ - '0';
941	if (max < 0 \|\| max > 65535)
942	{
943	*errorcodeptr = ERR5;
944	return p;
945	}
946	if (max < min)
947	{
948	*errorcodeptr = ERR4;
949	return p;
950	}
951	}
952	}
953
954	/* Fill in the required variables, and pass back the pointer to the terminating
955	'}'. */
956
957	*minp = min;
958	*maxp = max;
959	return p;
960	}
961
962
963
964	/*************************************************
965	* Find forward referenced subpattern *
966	*************************************************/
967
968	/* This function scans along a pattern's text looking for capturing
969	subpatterns, and counting them. If it finds a named pattern that matches the
970	name it is given, it returns its number. Alternatively, if the name is NULL, it
971	returns when it reaches a given numbered subpattern. This is used for forward
972	references to subpatterns. We know that if (?P< is encountered, the name will
973	be terminated by '>' because that is checked in the first pass.
974
975	Arguments:
976	ptr current position in the pattern
977	cd compile background data
978	name name to seek, or NULL if seeking a numbered subpattern
979	lorn name length, or subpattern number if name is NULL
980	xmode TRUE if we are in /x mode
981
982	Returns: the number of the named subpattern, or -1 if not found
983	*/
984
985	static int
986	find_parens(const uschar ptr, compile_data cd, const uschar *name, int lorn,
987	BOOL xmode)
988	{
989	const uschar *thisname;
990	int count = cd->bracount;
991
992	for (; *ptr != 0; ptr++)
993	{
994	int term;
995
996	/* Skip over backslashed characters and also entire \Q...\E */
997
998	if (*ptr == '\\')
999	{
1000	if (*(++ptr) == 0) return -1;
1001	if (*ptr == 'Q') for (;;)
1002	{
1003	while ((++ptr) != 0 && ptr != '\\') {};
1004	if (*ptr == 0) return -1;
1005	if (*(++ptr) == 'E') break;
1006	}
1007	continue;
1008	}
1009
1010	/* Skip over character classes; this logic must be similar to the way they
1011	are handled for real. If the first character is '^', skip it. Also, if the
1012	first few characters (either before or after ^) are \Q\E or \E we skip them
1013	too. This makes for compatibility with Perl. */
1014
1015	if (*ptr == '[')
1016	{
1017	BOOL negate_class = FALSE;
1018	for (;;)
1019	{
1020	int c = *(++ptr);
1021	if (c == '\\')
1022	{
1023	if (ptr[1] == 'E') ptr++;
1024	else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
1025	else break;
1026	}
1027	else if (!negate_class && c == '^')
1028	negate_class = TRUE;
1029	else break;
1030	}
1031
1032	/* If the next character is ']', it is a data character that must be
1033	skipped, except in JavaScript compatibility mode. */
1034
1035	if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1036	ptr++;
1037
1038	while (*(++ptr) != ']')
1039	{
1040	if (*ptr == 0) return -1;
1041	if (*ptr == '\\')
1042	{
1043	if (*(++ptr) == 0) return -1;
1044	if (*ptr == 'Q') for (;;)
1045	{
1046	while ((++ptr) != 0 && ptr != '\\') {};
1047	if (*ptr == 0) return -1;
1048	if (*(++ptr) == 'E') break;
1049	}
1050	continue;
1051	}
1052	}
1053	continue;
1054	}
1055
1056	/* Skip comments in /x mode */
1057
1058	if (xmode && *ptr == '#')
1059	{
1060	while ((++ptr) != 0 && ptr != '\n') {};
1061	if (*ptr == 0) return -1;
1062	continue;
1063	}
1064
1065	/* An opening parens must now be a real metacharacter */
1066
1067	if (*ptr != '(') continue;
1068	if (ptr[1] != '?' && ptr[1] != '*')
1069	{
1070	count++;
1071	if (name == NULL && count == lorn) return count;
1072	continue;
1073	}
1074
1075	ptr += 2;
1076	if (ptr == 'P') ptr++; / Allow optional P */
1077
1078	/* We have to disambiguate (?<! and (?<= from (?<name> */
1079
1080	if ((*ptr != '<' \|\| ptr[1] == '!' \|\| ptr[1] == '=') &&
1081	*ptr != '\'')
1082	continue;
1083
1084	count++;
1085
1086	if (name == NULL && count == lorn) return count;
1087	term = *ptr++;
1088	if (term == '<') term = '>';
1089	thisname = ptr;
1090	while (*ptr != term) ptr++;
1091	if (name != NULL && lorn == ptr - thisname &&
1092	strncmp((const char )name, (const char )thisname, lorn) == 0)
1093	return count;
1094	}
1095
1096	return -1;
1097	}
1098
1099
1100
1101	/*************************************************
1102	* Find first significant op code *
1103	*************************************************/
1104
1105	/* This is called by several functions that scan a compiled expression looking
1106	for a fixed first character, or an anchoring op code etc. It skips over things
1107	that do not influence this. For some calls, a change of option is important.
1108	For some calls, it makes sense to skip negative forward and all backward
1109	assertions, and also the \b assertion; for others it does not.
1110
1111	Arguments:
1112	code pointer to the start of the group
1113	options pointer to external options
1114	optbit the option bit whose changing is significant, or
1115	zero if none are
1116	skipassert TRUE if certain assertions are to be skipped
1117
1118	Returns: pointer to the first significant opcode
1119	*/
1120
1121	static const uschar*
1122	first_significant_code(const uschar code, int options, int optbit,
1123	BOOL skipassert)
1124	{
1125	for (;;)
1126	{
1127	switch ((int)*code)
1128	{
1129	case OP_OPT:
1130	if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1131	*options = (int)code[1];
1132	code += 2;
1133	break;
1134
1135	case OP_ASSERT_NOT:
1136	case OP_ASSERTBACK:
1137	case OP_ASSERTBACK_NOT:
1138	if (!skipassert) return code;
1139	do code += GET(code, 1); while (*code == OP_ALT);
1140	code += _pcre_OP_lengths[*code];
1141	break;
1142
1143	case OP_WORD_BOUNDARY:
1144	case OP_NOT_WORD_BOUNDARY:
1145	if (!skipassert) return code;
1146	/* Fall through */
1147
1148	case OP_CALLOUT:
1149	case OP_CREF:
1150	case OP_RREF:
1151	case OP_DEF:
1152	code += _pcre_OP_lengths[*code];
1153	break;
1154
1155	default:
1156	return code;
1157	}
1158	}
1159	/* Control never reaches here */
1160	}
1161
1162
1163
1164
1165	/*************************************************
1166	* Find the fixed length of a pattern *
1167	*************************************************/
1168
1169	/* Scan a pattern and compute the fixed length of subject that will match it,
1170	if the length is fixed. This is needed for dealing with backward assertions.
1171	In UTF8 mode, the result is in characters rather than bytes.
1172
1173	Arguments:
1174	code points to the start of the pattern (the bracket)
1175	options the compiling options
1176
1177	Returns: the fixed length, or -1 if there is no fixed length,
1178	or -2 if \C was encountered
1179	*/
1180
1181	static int
1182	find_fixedlength(uschar *code, int options)
1183	{
1184	int length = -1;
1185
1186	register int branchlength = 0;
1187	register uschar *cc = code + 1 + LINK_SIZE;
1188
1189	/* Scan along the opcodes for this branch. If we get to the end of the
1190	branch, check the length against that of the other branches. */
1191
1192	for (;;)
1193	{
1194	int d;
1195	register int op = *cc;
1196	switch (op)
1197	{
1198	case OP_CBRA:
1199	case OP_BRA:
1200	case OP_ONCE:
1201	case OP_COND:
1202	d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1203	if (d < 0) return d;
1204	branchlength += d;
1205	do cc += GET(cc, 1); while (*cc == OP_ALT);
1206	cc += 1 + LINK_SIZE;
1207	break;
1208
1209	/* Reached end of a branch; if it's a ket it is the end of a nested
1210	call. If it's ALT it is an alternation in a nested call. If it is
1211	END it's the end of the outer call. All can be handled by the same code. */
1212
1213	case OP_ALT:
1214	case OP_KET:
1215	case OP_KETRMAX:
1216	case OP_KETRMIN:
1217	case OP_END:
1218	if (length < 0) length = branchlength;
1219	else if (length != branchlength) return -1;
1220	if (*cc != OP_ALT) return length;
1221	cc += 1 + LINK_SIZE;
1222	branchlength = 0;
1223	break;
1224
1225	/* Skip over assertive subpatterns */
1226
1227	case OP_ASSERT:
1228	case OP_ASSERT_NOT:
1229	case OP_ASSERTBACK:
1230	case OP_ASSERTBACK_NOT:
1231	do cc += GET(cc, 1); while (*cc == OP_ALT);
1232	/* Fall through */
1233
1234	/* Skip over things that don't match chars */
1235
1236	case OP_REVERSE:
1237	case OP_CREF:
1238	case OP_RREF:
1239	case OP_DEF:
1240	case OP_OPT:
1241	case OP_CALLOUT:
1242	case OP_SOD:
1243	case OP_SOM:
1244	case OP_EOD:
1245	case OP_EODN:
1246	case OP_CIRC:
1247	case OP_DOLL:
1248	case OP_NOT_WORD_BOUNDARY:
1249	case OP_WORD_BOUNDARY:
1250	cc += _pcre_OP_lengths[*cc];
1251	break;
1252
1253	/* Handle literal characters */
1254
1255	case OP_CHAR:
1256	case OP_CHARNC:
1257	case OP_NOT:
1258	branchlength++;
1259	cc += 2;
1260	#ifdef SUPPORT_UTF8
1261	if ((options & PCRE_UTF8) != 0)
1262	{
1263	while ((*cc & 0xc0) == 0x80) cc++;
1264	}
1265	#endif
1266	break;
1267
1268	/* Handle exact repetitions. The count is already in characters, but we
1269	need to skip over a multibyte character in UTF8 mode. */
1270
1271	case OP_EXACT:
1272	branchlength += GET2(cc,1);
1273	cc += 4;
1274	#ifdef SUPPORT_UTF8
1275	if ((options & PCRE_UTF8) != 0)
1276	{
1277	while((*cc & 0x80) == 0x80) cc++;
1278	}
1279	#endif
1280	break;
1281
1282	case OP_TYPEEXACT:
1283	branchlength += GET2(cc,1);
1284	if (cc[3] == OP_PROP \|\| cc[3] == OP_NOTPROP) cc += 2;
1285	cc += 4;
1286	break;
1287
1288	/* Handle single-char matchers */
1289
1290	case OP_PROP:
1291	case OP_NOTPROP:
1292	cc += 2;
1293	/* Fall through */
1294
1295	case OP_NOT_DIGIT:
1296	case OP_DIGIT:
1297	case OP_NOT_WHITESPACE:
1298	case OP_WHITESPACE:
1299	case OP_NOT_WORDCHAR:
1300	case OP_WORDCHAR:
1301	case OP_ANY:
1302	case OP_ALLANY:
1303	branchlength++;
1304	cc++;
1305	break;
1306
1307	/* The single-byte matcher isn't allowed */
1308
1309	case OP_ANYBYTE:
1310	return -2;
1311
1312	/* Check a class for variable quantification */
1313
1314	#ifdef SUPPORT_UTF8
1315	case OP_XCLASS:
1316	cc += GET(cc, 1) - 33;
1317	/* Fall through */
1318	#endif
1319
1320	case OP_CLASS:
1321	case OP_NCLASS:
1322	cc += 33;
1323
1324	switch (*cc)
1325	{
1326	case OP_CRSTAR:
1327	case OP_CRMINSTAR:
1328	case OP_CRQUERY:
1329	case OP_CRMINQUERY:
1330	return -1;
1331
1332	case OP_CRRANGE:
1333	case OP_CRMINRANGE:
1334	if (GET2(cc,1) != GET2(cc,3)) return -1;
1335	branchlength += GET2(cc,1);
1336	cc += 5;
1337	break;
1338
1339	default:
1340	branchlength++;
1341	}
1342	break;
1343
1344	/* Anything else is variable length */
1345
1346	default:
1347	return -1;
1348	}
1349	}
1350	/* Control never gets here */
1351	}
1352
1353
1354
1355
1356	/*************************************************
1357	* Scan compiled regex for numbered bracket *
1358	*************************************************/
1359
1360	/* This little function scans through a compiled pattern until it finds a
1361	capturing bracket with the given number.
1362
1363	Arguments:
1364	code points to start of expression
1365	utf8 TRUE in UTF-8 mode
1366	number the required bracket number
1367
1368	Returns: pointer to the opcode for the bracket, or NULL if not found
1369	*/
1370
1371	static const uschar *
1372	find_bracket(const uschar *code, BOOL utf8, int number)
1373	{
1374	for (;;)
1375	{
1376	register int c = *code;
1377	if (c == OP_END) return NULL;
1378
1379	/* XCLASS is used for classes that cannot be represented just by a bit
1380	map. This includes negated single high-valued characters. The length in
1381	the table is zero; the actual length is stored in the compiled code. */
1382
1383	if (c == OP_XCLASS) code += GET(code, 1);
1384
1385	/* Handle capturing bracket */
1386
1387	else if (c == OP_CBRA)
1388	{
1389	int n = GET2(code, 1+LINK_SIZE);
1390	if (n == number) return (uschar *)code;
1391	code += _pcre_OP_lengths[c];
1392	}
1393
1394	/* Otherwise, we can get the item's length from the table, except that for
1395	repeated character types, we have to test for \p and \P, which have an extra
1396	two bytes of parameters. */
1397
1398	else
1399	{
1400	switch(c)
1401	{
1402	case OP_TYPESTAR:
1403	case OP_TYPEMINSTAR:
1404	case OP_TYPEPLUS:
1405	case OP_TYPEMINPLUS:
1406	case OP_TYPEQUERY:
1407	case OP_TYPEMINQUERY:
1408	case OP_TYPEPOSSTAR:
1409	case OP_TYPEPOSPLUS:
1410	case OP_TYPEPOSQUERY:
1411	if (code[1] == OP_PROP \|\| code[1] == OP_NOTPROP) code += 2;
1412	break;
1413
1414	case OP_TYPEUPTO:
1415	case OP_TYPEMINUPTO:
1416	case OP_TYPEEXACT:
1417	case OP_TYPEPOSUPTO:
1418	if (code[3] == OP_PROP \|\| code[3] == OP_NOTPROP) code += 2;
1419	break;
1420	}
1421
1422	/* Add in the fixed length from the table */
1423
1424	code += _pcre_OP_lengths[c];
1425
1426	/* In UTF-8 mode, opcodes that are followed by a character may be followed by
1427	a multi-byte character. The length in the table is a minimum, so we have to
1428	arrange to skip the extra bytes. */
1429
1430	#ifdef SUPPORT_UTF8
1431	if (utf8) switch(c)
1432	{
1433	case OP_CHAR:
1434	case OP_CHARNC:
1435	case OP_EXACT:
1436	case OP_UPTO:
1437	case OP_MINUPTO:
1438	case OP_POSUPTO:
1439	case OP_STAR:
1440	case OP_MINSTAR:
1441	case OP_POSSTAR:
1442	case OP_PLUS:
1443	case OP_MINPLUS:
1444	case OP_POSPLUS:
1445	case OP_QUERY:
1446	case OP_MINQUERY:
1447	case OP_POSQUERY:
1448	if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1449	break;
1450	}
1451	#else
1452	(void)(utf8); /* Keep compiler happy by referencing function argument */
1453	#endif
1454	}
1455	}
1456	}
1457
1458
1459
1460	/*************************************************
1461	* Scan compiled regex for recursion reference *
1462	*************************************************/
1463
1464	/* This little function scans through a compiled pattern until it finds an
1465	instance of OP_RECURSE.
1466
1467	Arguments:
1468	code points to start of expression
1469	utf8 TRUE in UTF-8 mode
1470
1471	Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1472	*/
1473
1474	static const uschar *
1475	find_recurse(const uschar *code, BOOL utf8)
1476	{
1477	for (;;)
1478	{
1479	register int c = *code;
1480	if (c == OP_END) return NULL;
1481	if (c == OP_RECURSE) return code;
1482
1483	/* XCLASS is used for classes that cannot be represented just by a bit
1484	map. This includes negated single high-valued characters. The length in
1485	the table is zero; the actual length is stored in the compiled code. */
1486
1487	if (c == OP_XCLASS) code += GET(code, 1);
1488
1489	/* Otherwise, we can get the item's length from the table, except that for
1490	repeated character types, we have to test for \p and \P, which have an extra
1491	two bytes of parameters. */
1492
1493	else
1494	{
1495	switch(c)
1496	{
1497	case OP_TYPESTAR:
1498	case OP_TYPEMINSTAR:
1499	case OP_TYPEPLUS:
1500	case OP_TYPEMINPLUS:
1501	case OP_TYPEQUERY:
1502	case OP_TYPEMINQUERY:
1503	case OP_TYPEPOSSTAR:
1504	case OP_TYPEPOSPLUS:
1505	case OP_TYPEPOSQUERY:
1506	if (code[1] == OP_PROP \|\| code[1] == OP_NOTPROP) code += 2;
1507	break;
1508
1509	case OP_TYPEPOSUPTO:
1510	case OP_TYPEUPTO:
1511	case OP_TYPEMINUPTO:
1512	case OP_TYPEEXACT:
1513	if (code[3] == OP_PROP \|\| code[3] == OP_NOTPROP) code += 2;
1514	break;
1515	}
1516
1517	/* Add in the fixed length from the table */
1518
1519	code += _pcre_OP_lengths[c];
1520
1521	/* In UTF-8 mode, opcodes that are followed by a character may be followed
1522	by a multi-byte character. The length in the table is a minimum, so we have
1523	to arrange to skip the extra bytes. */
1524
1525	#ifdef SUPPORT_UTF8
1526	if (utf8) switch(c)
1527	{
1528	case OP_CHAR:
1529	case OP_CHARNC:
1530	case OP_EXACT:
1531	case OP_UPTO:
1532	case OP_MINUPTO:
1533	case OP_POSUPTO:
1534	case OP_STAR:
1535	case OP_MINSTAR:
1536	case OP_POSSTAR:
1537	case OP_PLUS:
1538	case OP_MINPLUS:
1539	case OP_POSPLUS:
1540	case OP_QUERY:
1541	case OP_MINQUERY:
1542	case OP_POSQUERY:
1543	if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1544	break;
1545	}
1546	#else
1547	(void)(utf8); /* Keep compiler happy by referencing function argument */
1548	#endif
1549	}
1550	}
1551	}
1552
1553
1554
1555	/*************************************************
1556	* Scan compiled branch for non-emptiness *
1557	*************************************************/
1558
1559	/* This function scans through a branch of a compiled pattern to see whether it
1560	can match the empty string or not. It is called from could_be_empty()
1561	below and from compile_branch() when checking for an unlimited repeat of a
1562	group that can match nothing. Note that first_significant_code() skips over
1563	backward and negative forward assertions when its final argument is TRUE. If we
1564	hit an unclosed bracket, we return "empty" - this means we've struck an inner
1565	bracket whose current branch will already have been scanned.
1566
1567	Arguments:
1568	code points to start of search
1569	endcode points to where to stop
1570	utf8 TRUE if in UTF8 mode
1571
1572	Returns: TRUE if what is matched could be empty
1573	*/
1574
1575	static BOOL
1576	could_be_empty_branch(const uschar code, const uschar endcode, BOOL utf8)
1577	{
1578	register int c;
1579	for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1580	code < endcode;
1581	code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1582	{
1583	const uschar *ccode;
1584
1585	c = *code;
1586
1587	/* Skip over forward assertions; the other assertions are skipped by
1588	first_significant_code() with a TRUE final argument. */
1589
1590	if (c == OP_ASSERT)
1591	{
1592	do code += GET(code, 1); while (*code == OP_ALT);
1593	c = *code;
1594	continue;
1595	}
1596
1597	/* Groups with zero repeats can of course be empty; skip them. */
1598
1599	if (c == OP_BRAZERO \|\| c == OP_BRAMINZERO \|\| c == OP_SKIPZERO)
1600	{
1601	code += _pcre_OP_lengths[c];
1602	do code += GET(code, 1); while (*code == OP_ALT);
1603	c = *code;
1604	continue;
1605	}
1606
1607	/* For other groups, scan the branches. */
1608
1609	if (c == OP_BRA \|\| c == OP_CBRA \|\| c == OP_ONCE \|\| c == OP_COND)
1610	{
1611	BOOL empty_branch;
1612	if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1613
1614	/* Scan a closed bracket */
1615
1616	empty_branch = FALSE;
1617	do
1618	{
1619	if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1620	empty_branch = TRUE;
1621	code += GET(code, 1);
1622	}
1623	while (*code == OP_ALT);
1624	if (!empty_branch) return FALSE; /* All branches are non-empty */
1625	c = *code;
1626	continue;
1627	}
1628
1629	/* Handle the other opcodes */
1630
1631	switch (c)
1632	{
1633	/* Check for quantifiers after a class. XCLASS is used for classes that
1634	cannot be represented just by a bit map. This includes negated single
1635	high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1636	actual length is stored in the compiled code, so we must update "code"
1637	here. */
1638
1639	#ifdef SUPPORT_UTF8
1640	case OP_XCLASS:
1641	ccode = code += GET(code, 1);
1642	goto CHECK_CLASS_REPEAT;
1643	#endif
1644
1645	case OP_CLASS:
1646	case OP_NCLASS:
1647	ccode = code + 33;
1648
1649	#ifdef SUPPORT_UTF8
1650	CHECK_CLASS_REPEAT:
1651	#endif
1652
1653	switch (*ccode)
1654	{
1655	case OP_CRSTAR: /* These could be empty; continue */
1656	case OP_CRMINSTAR:
1657	case OP_CRQUERY:
1658	case OP_CRMINQUERY:
1659	break;
1660
1661	default: /* Non-repeat => class must match */
1662	case OP_CRPLUS: /* These repeats aren't empty */
1663	case OP_CRMINPLUS:
1664	return FALSE;
1665
1666	case OP_CRRANGE:
1667	case OP_CRMINRANGE:
1668	if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1669	break;
1670	}
1671	break;
1672
1673	/* Opcodes that must match a character */
1674
1675	case OP_PROP:
1676	case OP_NOTPROP:
1677	case OP_EXTUNI:
1678	case OP_NOT_DIGIT:
1679	case OP_DIGIT:
1680	case OP_NOT_WHITESPACE:
1681	case OP_WHITESPACE:
1682	case OP_NOT_WORDCHAR:
1683	case OP_WORDCHAR:
1684	case OP_ANY:
1685	case OP_ALLANY:
1686	case OP_ANYBYTE:
1687	case OP_CHAR:
1688	case OP_CHARNC:
1689	case OP_NOT:
1690	case OP_PLUS:
1691	case OP_MINPLUS:
1692	case OP_POSPLUS:
1693	case OP_EXACT:
1694	case OP_NOTPLUS:
1695	case OP_NOTMINPLUS:
1696	case OP_NOTPOSPLUS:
1697	case OP_NOTEXACT:
1698	case OP_TYPEPLUS:
1699	case OP_TYPEMINPLUS:
1700	case OP_TYPEPOSPLUS:
1701	case OP_TYPEEXACT:
1702	return FALSE;
1703
1704	/* These are going to continue, as they may be empty, but we have to
1705	fudge the length for the \p and \P cases. */
1706
1707	case OP_TYPESTAR:
1708	case OP_TYPEMINSTAR:
1709	case OP_TYPEPOSSTAR:
1710	case OP_TYPEQUERY:
1711	case OP_TYPEMINQUERY:
1712	case OP_TYPEPOSQUERY:
1713	if (code[1] == OP_PROP \|\| code[1] == OP_NOTPROP) code += 2;
1714	break;
1715
1716	/* Same for these */
1717
1718	case OP_TYPEUPTO:
1719	case OP_TYPEMINUPTO:
1720	case OP_TYPEPOSUPTO:
1721	if (code[3] == OP_PROP \|\| code[3] == OP_NOTPROP) code += 2;
1722	break;
1723
1724	/* End of branch */
1725
1726	case OP_KET:
1727	case OP_KETRMAX:
1728	case OP_KETRMIN:
1729	case OP_ALT:
1730	return TRUE;
1731
1732	/* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1733	MINUPTO, and POSUPTO may be followed by a multibyte character */
1734
1735	#ifdef SUPPORT_UTF8
1736	case OP_STAR:
1737	case OP_MINSTAR:
1738	case OP_POSSTAR:
1739	case OP_QUERY:
1740	case OP_MINQUERY:
1741	case OP_POSQUERY:
1742	case OP_UPTO:
1743	case OP_MINUPTO:
1744	case OP_POSUPTO:
1745	if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1746	break;
1747	#endif
1748	}
1749	}
1750
1751	return TRUE;
1752	}
1753
1754
1755
1756	/*************************************************
1757	* Scan compiled regex for non-emptiness *
1758	*************************************************/
1759
1760	/* This function is called to check for left recursive calls. We want to check
1761	the current branch of the current pattern to see if it could match the empty
1762	string. If it could, we must look outwards for branches at other levels,
1763	stopping when we pass beyond the bracket which is the subject of the recursion.
1764
1765	Arguments:
1766	code points to start of the recursion
1767	endcode points to where to stop (current RECURSE item)
1768	bcptr points to the chain of current (unclosed) branch starts
1769	utf8 TRUE if in UTF-8 mode
1770
1771	Returns: TRUE if what is matched could be empty
1772	*/
1773
1774	static BOOL
1775	could_be_empty(const uschar code, const uschar endcode, branch_chain *bcptr,
1776	BOOL utf8)
1777	{
1778	while (bcptr != NULL && bcptr->current >= code)
1779	{
1780	if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1781	bcptr = bcptr->outer;
1782	}
1783	return TRUE;
1784	}
1785
1786
1787
1788	/*************************************************
1789	* Check for POSIX class syntax *
1790	*************************************************/
1791
1792	/* This function is called when the sequence "[:" or "[." or "[=" is
1793	encountered in a character class. It checks whether this is followed by a
1794	sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1795	reach an unescaped ']' without the special preceding character, return FALSE.
1796
1797	Originally, this function only recognized a sequence of letters between the
1798	terminators, but it seems that Perl recognizes any sequence of characters,
1799	though of course unknown POSIX names are subsequently rejected. Perl gives an
1800	"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1801	didn't consider this to be a POSIX class. Likewise for [:1234:].
1802
1803	The problem in trying to be exactly like Perl is in the handling of escapes. We
1804	have to be sure that [abc[:x\]pqr] is not treated as containing a POSIX
1805	class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1806	below handles the special case of \], but does not try to do any other escape
1807	processing. This makes it different from Perl for cases such as [:l\ower:]
1808	where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1809	"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1810	I think.
1811
1812	Arguments:
1813	ptr pointer to the initial [
1814	endptr where to return the end pointer
1815
1816	Returns: TRUE or FALSE
1817	*/
1818
1819	static BOOL
1820	check_posix_syntax(const uschar ptr, const uschar *endptr)
1821	{
1822	int terminator; /* Don't combine these lines; the Solaris cc */
1823	terminator = (++ptr); / compiler warns about "non-constant" initializer. */
1824	for (++ptr; *ptr != 0; ptr++)
1825	{
1826	if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1827	{
1828	if (*ptr == ']') return FALSE;
1829	if (*ptr == terminator && ptr[1] == ']')
1830	{
1831	*endptr = ptr;
1832	return TRUE;
1833	}
1834	}
1835	}
1836	return FALSE;
1837	}
1838
1839
1840
1841
1842	/*************************************************
1843	* Check POSIX class name *
1844	*************************************************/
1845
1846	/* This function is called to check the name given in a POSIX-style class entry
1847	such as [:alnum:].
1848
1849	Arguments:
1850	ptr points to the first letter
1851	len the length of the name
1852
1853	Returns: a value representing the name, or -1 if unknown
1854	*/
1855
1856	static int
1857	check_posix_name(const uschar *ptr, int len)
1858	{
1859	const char *pn = posix_names;
1860	register int yield = 0;
1861	while (posix_name_lengths[yield] != 0)
1862	{
1863	if (len == posix_name_lengths[yield] &&
1864	strncmp((const char *)ptr, pn, len) == 0) return yield;
1865	pn += posix_name_lengths[yield] + 1;
1866	yield++;
1867	}
1868	return -1;
1869	}
1870
1871
1872	/*************************************************
1873	* Adjust OP_RECURSE items in repeated group *
1874	*************************************************/
1875
1876	/* OP_RECURSE items contain an offset from the start of the regex to the group
1877	that is referenced. This means that groups can be replicated for fixed
1878	repetition simply by copying (because the recursion is allowed to refer to
1879	earlier groups that are outside the current group). However, when a group is
1880	optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1881	inserted before it, after it has been compiled. This means that any OP_RECURSE
1882	items within it that refer to the group itself or any contained groups have to
1883	have their offsets adjusted. That is one of the jobs of this function. Before it
1884	is called, the partially compiled regex must be temporarily terminated with
1885	OP_END.
1886
1887	This function has been extended with the possibility of forward references for
1888	recursions and subroutine calls. It must also check the list of such references
1889	for the group we are dealing with. If it finds that one of the recursions in
1890	the current group is on this list, it adjusts the offset in the list, not the
1891	value in the reference (which is a group number).
1892
1893	Arguments:
1894	group points to the start of the group
1895	adjust the amount by which the group is to be moved
1896	utf8 TRUE in UTF-8 mode
1897	cd contains pointers to tables etc.
1898	save_hwm the hwm forward reference pointer at the start of the group
1899
1900	Returns: nothing
1901	*/
1902
1903	static void
1904	adjust_recurse(uschar group, int adjust, BOOL utf8, compile_data cd,
1905	uschar *save_hwm)
1906	{
1907	uschar *ptr = group;
1908
1909	while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1910	{
1911	int offset;
1912	uschar *hc;
1913
1914	/* See if this recursion is on the forward reference list. If so, adjust the
1915	reference. */
1916
1917	for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1918	{
1919	offset = GET(hc, 0);
1920	if (cd->start_code + offset == ptr + 1)
1921	{
1922	PUT(hc, 0, offset + adjust);
1923	break;
1924	}
1925	}
1926
1927	/* Otherwise, adjust the recursion offset if it's after the start of this
1928	group. */
1929
1930	if (hc >= cd->hwm)
1931	{
1932	offset = GET(ptr, 1);
1933	if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1934	}
1935
1936	ptr += 1 + LINK_SIZE;
1937	}
1938	}
1939
1940
1941
1942	/*************************************************
1943	* Insert an automatic callout point *
1944	*************************************************/
1945
1946	/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1947	callout points before each pattern item.
1948
1949	Arguments:
1950	code current code pointer
1951	ptr current pattern pointer
1952	cd pointers to tables etc
1953
1954	Returns: new code pointer
1955	*/
1956
1957	static uschar *
1958	auto_callout(uschar code, const uschar ptr, compile_data *cd)
1959	{
1960	*code++ = OP_CALLOUT;
1961	*code++ = 255;
1962	PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1963	PUT(code, LINK_SIZE, 0); /* Default length */
1964	return code + 2*LINK_SIZE;
1965	}
1966
1967
1968
1969	/*************************************************
1970	* Complete a callout item *
1971	*************************************************/
1972
1973	/* A callout item contains the length of the next item in the pattern, which
1974	we can't fill in till after we have reached the relevant point. This is used
1975	for both automatic and manual callouts.
1976
1977	Arguments:
1978	previous_callout points to previous callout item
1979	ptr current pattern pointer
1980	cd pointers to tables etc
1981
1982	Returns: nothing
1983	*/
1984
1985	static void
1986	complete_callout(uschar previous_callout, const uschar ptr, compile_data *cd)
1987	{
1988	int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1989	PUT(previous_callout, 2 + LINK_SIZE, length);
1990	}
1991
1992
1993
1994	#ifdef SUPPORT_UCP
1995	/*************************************************
1996	* Get othercase range *
1997	*************************************************/
1998
1999	/* This function is passed the start and end of a class range, in UTF-8 mode
2000	with UCP support. It searches up the characters, looking for internal ranges of
2001	characters in the "other" case. Each call returns the next one, updating the
2002	start address.
2003
2004	Arguments:
2005	cptr points to starting character value; updated
2006	d end value
2007	ocptr where to put start of othercase range
2008	odptr where to put end of othercase range
2009
2010	Yield: TRUE when range returned; FALSE when no more
2011	*/
2012
2013	static BOOL
2014	get_othercase_range(unsigned int cptr, unsigned int d, unsigned int ocptr,
2015	unsigned int *odptr)
2016	{
2017	unsigned int c, othercase, next;
2018
2019	for (c = *cptr; c <= d; c++)
2020	{ if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2021
2022	if (c > d) return FALSE;
2023
2024	*ocptr = othercase;
2025	next = othercase + 1;
2026
2027	for (++c; c <= d; c++)
2028	{
2029	if (UCD_OTHERCASE(c) != next) break;
2030	next++;
2031	}
2032
2033	*odptr = next - 1;
2034	*cptr = c;
2035
2036	return TRUE;
2037	}
2038	#endif /* SUPPORT_UCP */
2039
2040
2041
2042	/*************************************************
2043	* Check if auto-possessifying is possible *
2044	*************************************************/
2045
2046	/* This function is called for unlimited repeats of certain items, to see
2047	whether the next thing could possibly match the repeated item. If not, it makes
2048	sense to automatically possessify the repeated item.
2049
2050	Arguments:
2051	op_code the repeated op code
2052	this data for this item, depends on the opcode
2053	utf8 TRUE in UTF-8 mode
2054	utf8_char used for utf8 character bytes, NULL if not relevant
2055	ptr next character in pattern
2056	options options bits
2057	cd contains pointers to tables etc.
2058
2059	Returns: TRUE if possessifying is wanted
2060	*/
2061
2062	static BOOL
2063	check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2064	const uschar ptr, int options, compile_data cd)
2065	{
2066	int next;
2067
2068	/* Skip whitespace and comments in extended mode */
2069
2070	if ((options & PCRE_EXTENDED) != 0)
2071	{
2072	for (;;)
2073	{
2074	while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2075	if (*ptr == '#')
2076	{
2077	while (*(++ptr) != 0)
2078	if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2079	}
2080	else break;
2081	}
2082	}
2083
2084	/* If the next item is one that we can handle, get its value. A non-negative
2085	value is a character, a negative value is an escape value. */
2086
2087	if (*ptr == '\\')
2088	{
2089	int temperrorcode = 0;
2090	next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2091	if (temperrorcode != 0) return FALSE;
2092	ptr++; /* Point after the escape sequence */
2093	}
2094
2095	else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2096	{
2097	#ifdef SUPPORT_UTF8
2098	if (utf8) { GETCHARINC(next, ptr); } else
2099	#endif
2100	next = *ptr++;
2101	}
2102
2103	else return FALSE;
2104
2105	/* Skip whitespace and comments in extended mode */
2106
2107	if ((options & PCRE_EXTENDED) != 0)
2108	{
2109	for (;;)
2110	{
2111	while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2112	if (*ptr == '#')
2113	{
2114	while (*(++ptr) != 0)
2115	if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2116	}
2117	else break;
2118	}
2119	}
2120
2121	/* If the next thing is itself optional, we have to give up. */
2122
2123	if (ptr == '' \|\| ptr == '?' \|\| strncmp((char )ptr, "{0,", 3) == 0)
2124	return FALSE;
2125
2126	/* Now compare the next item with the previous opcode. If the previous is a
2127	positive single character match, "item" either contains the character or, if
2128	"item" is greater than 127 in utf8 mode, the character's bytes are in
2129	utf8_char. */
2130
2131
2132	/* Handle cases when the next item is a character. */
2133
2134	if (next >= 0) switch(op_code)
2135	{
2136	case OP_CHAR:
2137	#ifdef SUPPORT_UTF8
2138	if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2139	#else
2140	(void)(utf8_char); /* Keep compiler happy by referencing function argument */
2141	#endif
2142	return item != next;
2143
2144	/* For CHARNC (caseless character) we must check the other case. If we have
2145	Unicode property support, we can use it to test the other case of
2146	high-valued characters. */
2147
2148	case OP_CHARNC:
2149	#ifdef SUPPORT_UTF8
2150	if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2151	#endif
2152	if (item == next) return FALSE;
2153	#ifdef SUPPORT_UTF8
2154	if (utf8)
2155	{
2156	unsigned int othercase;
2157	if (next < 128) othercase = cd->fcc[next]; else
2158	#ifdef SUPPORT_UCP
2159	othercase = UCD_OTHERCASE((unsigned int)next);
2160	#else
2161	othercase = NOTACHAR;
2162	#endif
2163	return (unsigned int)item != othercase;
2164	}
2165	else
2166	#endif /* SUPPORT_UTF8 */
2167	return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2168
2169	/* For OP_NOT, "item" must be a single-byte character. */
2170
2171	case OP_NOT:
2172	if (item == next) return TRUE;
2173	if ((options & PCRE_CASELESS) == 0) return FALSE;
2174	#ifdef SUPPORT_UTF8
2175	if (utf8)
2176	{
2177	unsigned int othercase;
2178	if (next < 128) othercase = cd->fcc[next]; else
2179	#ifdef SUPPORT_UCP
2180	othercase = UCD_OTHERCASE(next);
2181	#else
2182	othercase = NOTACHAR;
2183	#endif
2184	return (unsigned int)item == othercase;
2185	}
2186	else
2187	#endif /* SUPPORT_UTF8 */
2188	return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2189
2190	case OP_DIGIT:
2191	return next > 127 \|\| (cd->ctypes[next] & ctype_digit) == 0;
2192
2193	case OP_NOT_DIGIT:
2194	return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2195
2196	case OP_WHITESPACE:
2197	return next > 127 \|\| (cd->ctypes[next] & ctype_space) == 0;
2198
2199	case OP_NOT_WHITESPACE:
2200	return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2201
2202	case OP_WORDCHAR:
2203	return next > 127 \|\| (cd->ctypes[next] & ctype_word) == 0;
2204
2205	case OP_NOT_WORDCHAR:
2206	return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2207
2208	case OP_HSPACE:
2209	case OP_NOT_HSPACE:
2210	switch(next)
2211	{
2212	case 0x09:
2213	case 0x20:
2214	case 0xa0:
2215	case 0x1680:
2216	case 0x180e:
2217	case 0x2000:
2218	case 0x2001:
2219	case 0x2002:
2220	case 0x2003:
2221	case 0x2004:
2222	case 0x2005:
2223	case 0x2006:
2224	case 0x2007:
2225	case 0x2008:
2226	case 0x2009:
2227	case 0x200A:
2228	case 0x202f:
2229	case 0x205f:
2230	case 0x3000:
2231	return op_code != OP_HSPACE;
2232	default:
2233	return op_code == OP_HSPACE;
2234	}
2235
2236	case OP_VSPACE:
2237	case OP_NOT_VSPACE:
2238	switch(next)
2239	{
2240	case 0x0a:
2241	case 0x0b:
2242	case 0x0c:
2243	case 0x0d:
2244	case 0x85:
2245	case 0x2028:
2246	case 0x2029:
2247	return op_code != OP_VSPACE;
2248	default:
2249	return op_code == OP_VSPACE;
2250	}
2251
2252	default:
2253	return FALSE;
2254	}
2255
2256
2257	/* Handle the case when the next item is \d, \s, etc. */
2258
2259	switch(op_code)
2260	{
2261	case OP_CHAR:
2262	case OP_CHARNC:
2263	#ifdef SUPPORT_UTF8
2264	if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2265	#endif
2266	switch(-next)
2267	{
2268	case ESC_d:
2269	return item > 127 \|\| (cd->ctypes[item] & ctype_digit) == 0;
2270
2271	case ESC_D:
2272	return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2273
2274	case ESC_s:
2275	return item > 127 \|\| (cd->ctypes[item] & ctype_space) == 0;
2276
2277	case ESC_S:
2278	return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2279
2280	case ESC_w:
2281	return item > 127 \|\| (cd->ctypes[item] & ctype_word) == 0;
2282
2283	case ESC_W:
2284	return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2285
2286	case ESC_h:
2287	case ESC_H:
2288	switch(item)
2289	{
2290	case 0x09:
2291	case 0x20:
2292	case 0xa0:
2293	case 0x1680:
2294	case 0x180e:
2295	case 0x2000:
2296	case 0x2001:
2297	case 0x2002:
2298	case 0x2003:
2299	case 0x2004:
2300	case 0x2005:
2301	case 0x2006:
2302	case 0x2007:
2303	case 0x2008:
2304	case 0x2009:
2305	case 0x200A:
2306	case 0x202f:
2307	case 0x205f:
2308	case 0x3000:
2309	return -next != ESC_h;
2310	default:
2311	return -next == ESC_h;
2312	}
2313
2314	case ESC_v:
2315	case ESC_V:
2316	switch(item)
2317	{
2318	case 0x0a:
2319	case 0x0b:
2320	case 0x0c:
2321	case 0x0d:
2322	case 0x85:
2323	case 0x2028:
2324	case 0x2029:
2325	return -next != ESC_v;
2326	default:
2327	return -next == ESC_v;
2328	}
2329
2330	default:
2331	return FALSE;
2332	}
2333
2334	case OP_DIGIT:
2335	return next == -ESC_D \|\| next == -ESC_s \|\| next == -ESC_W \|\|
2336	next == -ESC_h \|\| next == -ESC_v;
2337
2338	case OP_NOT_DIGIT:
2339	return next == -ESC_d;
2340
2341	case OP_WHITESPACE:
2342	return next == -ESC_S \|\| next == -ESC_d \|\| next == -ESC_w;
2343
2344	case OP_NOT_WHITESPACE:
2345	return next == -ESC_s \|\| next == -ESC_h \|\| next == -ESC_v;
2346
2347	case OP_HSPACE:
2348	return next == -ESC_S \|\| next == -ESC_H \|\| next == -ESC_d \|\| next == -ESC_w;
2349
2350	case OP_NOT_HSPACE:
2351	return next == -ESC_h;
2352
2353	/* Can't have \S in here because VT matches \S (Perl anomaly) */
2354	case OP_VSPACE:
2355	return next == -ESC_V \|\| next == -ESC_d \|\| next == -ESC_w;
2356
2357	case OP_NOT_VSPACE:
2358	return next == -ESC_v;
2359
2360	case OP_WORDCHAR:
2361	return next == -ESC_W \|\| next == -ESC_s \|\| next == -ESC_h \|\| next == -ESC_v;
2362
2363	case OP_NOT_WORDCHAR:
2364	return next == -ESC_w \|\| next == -ESC_d;
2365
2366	default:
2367	return FALSE;
2368	}
2369
2370	/* Control does not reach here */
2371	}
2372
2373
2374
2375	/*************************************************
2376	* Compile one branch *
2377	*************************************************/
2378
2379	/* Scan the pattern, compiling it into a vector. If the options are
2380	changed during the branch, the pointer is used to change the external options
2381	bits. This function is used during the pre-compile phase when we are trying
2382	to find out the amount of memory needed, as well as during the real compile
2383	phase. The value of lengthptr distinguishes the two phases.
2384
2385	Arguments:
2386	optionsptr pointer to the option bits
2387	codeptr points to the pointer to the current code point
2388	ptrptr points to the current pattern pointer
2389	errorcodeptr points to error code variable
2390	firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2391	reqbyteptr set to the last literal character required, else < 0
2392	bcptr points to current branch chain
2393	cd contains pointers to tables etc.
2394	lengthptr NULL during the real compile phase
2395	points to length accumulator during pre-compile phase
2396
2397	Returns: TRUE on success
2398	FALSE, with *errorcodeptr set non-zero on error
2399	*/
2400
2401	static BOOL
2402	compile_branch(int optionsptr, uschar codeptr, const uschar *ptrptr,
2403	int errorcodeptr, int firstbyteptr, int reqbyteptr, branch_chain bcptr,
2404	compile_data cd, int lengthptr)
2405	{
2406	int repeat_type, op_type;
2407	int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2408	int bravalue = 0;
2409	int greedy_default, greedy_non_default;
2410	int firstbyte, reqbyte;
2411	int zeroreqbyte, zerofirstbyte;
2412	int req_caseopt, reqvary, tempreqvary;
2413	int options = *optionsptr;
2414	int after_manual_callout = 0;
2415	int length_prevgroup = 0;
2416	register int c;
2417	register uschar code = codeptr;
2418	uschar *last_code = code;
2419	uschar *orig_code = code;
2420	uschar *tempcode;
2421	BOOL inescq = FALSE;
2422	BOOL groupsetfirstbyte = FALSE;
2423	const uschar ptr = ptrptr;
2424	const uschar *tempptr;
2425	uschar *previous = NULL;
2426	uschar *previous_callout = NULL;
2427	uschar *save_hwm = NULL;
2428	uschar classbits[32];
2429
2430	#ifdef SUPPORT_UTF8
2431	BOOL class_utf8;
2432	BOOL utf8 = (options & PCRE_UTF8) != 0;
2433	uschar *class_utf8data;
2434	uschar *class_utf8data_base;
2435	uschar utf8_char[6];
2436	#else
2437	BOOL utf8 = FALSE;
2438	uschar *utf8_char = NULL;
2439	#endif
2440
2441	#ifdef DEBUG
2442	if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2443	#endif
2444
2445	/* Set up the default and non-default settings for greediness */
2446
2447	greedy_default = ((options & PCRE_UNGREEDY) != 0);
2448	greedy_non_default = greedy_default ^ 1;
2449
2450	/* Initialize no first byte, no required byte. REQ_UNSET means "no char
2451	matching encountered yet". It gets changed to REQ_NONE if we hit something that
2452	matches a non-fixed char first char; reqbyte just remains unset if we never
2453	find one.
2454
2455	When we hit a repeat whose minimum is zero, we may have to adjust these values
2456	to take the zero repeat into account. This is implemented by setting them to
2457	zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2458	item types that can be repeated set these backoff variables appropriately. */
2459
2460	firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2461
2462	/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2463	according to the current setting of the caseless flag. REQ_CASELESS is a bit
2464	value > 255. It is added into the firstbyte or reqbyte variables to record the
2465	case status of the value. This is used only for ASCII characters. */
2466
2467	req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2468
2469	/* Switch on next character until the end of the branch */
2470
2471	for (;; ptr++)
2472	{
2473	BOOL negate_class;
2474	BOOL should_flip_negation;
2475	BOOL possessive_quantifier;
2476	BOOL is_quantifier;
2477	BOOL is_recurse;
2478	BOOL reset_bracount;
2479	int class_charcount;
2480	int class_lastchar;
2481	int newoptions;
2482	int recno;
2483	int refsign;
2484	int skipbytes;
2485	int subreqbyte;
2486	int subfirstbyte;
2487	int terminator;
2488	int mclength;
2489	uschar mcbuffer[8];
2490
2491	/* Get next byte in the pattern */
2492
2493	c = *ptr;
2494
2495	/* If we are in the pre-compile phase, accumulate the length used for the
2496	previous cycle of this loop. */
2497
2498	if (lengthptr != NULL)
2499	{
2500	#ifdef DEBUG
2501	if (code > cd->hwm) cd->hwm = code; /* High water info */
2502	#endif
2503	if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2504	{
2505	*errorcodeptr = ERR52;
2506	goto FAILED;
2507	}
2508
2509	/* There is at least one situation where code goes backwards: this is the
2510	case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2511	the class is simply eliminated. However, it is created first, so we have to
2512	allow memory for it. Therefore, don't ever reduce the length at this point.
2513	*/
2514
2515	if (code < last_code) code = last_code;
2516
2517	/* Paranoid check for integer overflow */
2518
2519	if (OFLOW_MAX - *lengthptr < code - last_code)
2520	{
2521	*errorcodeptr = ERR20;
2522	goto FAILED;
2523	}
2524
2525	*lengthptr += code - last_code;
2526	DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2527
2528	/* If "previous" is set and it is not at the start of the work space, move
2529	it back to there, in order to avoid filling up the work space. Otherwise,
2530	if "previous" is NULL, reset the current code pointer to the start. */
2531
2532	if (previous != NULL)
2533	{
2534	if (previous > orig_code)
2535	{
2536	memmove(orig_code, previous, code - previous);
2537	code -= previous - orig_code;
2538	previous = orig_code;
2539	}
2540	}
2541	else code = orig_code;
2542
2543	/* Remember where this code item starts so we can pick up the length
2544	next time round. */
2545
2546	last_code = code;
2547	}
2548
2549	/* In the real compile phase, just check the workspace used by the forward
2550	reference list. */
2551
2552	else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2553	{
2554	*errorcodeptr = ERR52;
2555	goto FAILED;
2556	}
2557
2558	/* If in \Q...\E, check for the end; if not, we have a literal */
2559
2560	if (inescq && c != 0)
2561	{
2562	if (c == '\\' && ptr[1] == 'E')
2563	{
2564	inescq = FALSE;
2565	ptr++;
2566	continue;
2567	}
2568	else
2569	{
2570	if (previous_callout != NULL)
2571	{
2572	if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2573	complete_callout(previous_callout, ptr, cd);
2574	previous_callout = NULL;
2575	}
2576	if ((options & PCRE_AUTO_CALLOUT) != 0)
2577	{
2578	previous_callout = code;
2579	code = auto_callout(code, ptr, cd);
2580	}
2581	goto NORMAL_CHAR;
2582	}
2583	}
2584
2585	/* Fill in length of a previous callout, except when the next thing is
2586	a quantifier. */
2587
2588	is_quantifier = c == '*' \|\| c == '+' \|\| c == '?' \|\|
2589	(c == '{' && is_counted_repeat(ptr+1));
2590
2591	if (!is_quantifier && previous_callout != NULL &&
2592	after_manual_callout-- <= 0)
2593	{
2594	if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2595	complete_callout(previous_callout, ptr, cd);
2596	previous_callout = NULL;
2597	}
2598
2599	/* In extended mode, skip white space and comments */
2600
2601	if ((options & PCRE_EXTENDED) != 0)
2602	{
2603	if ((cd->ctypes[c] & ctype_space) != 0) continue;
2604	if (c == '#')
2605	{
2606	while (*(++ptr) != 0)
2607	{
2608	if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2609	}
2610	if (*ptr != 0) continue;
2611
2612	/* Else fall through to handle end of string */
2613	c = 0;
2614	}
2615	}
2616
2617	/* No auto callout for quantifiers. */
2618
2619	if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2620	{
2621	previous_callout = code;
2622	code = auto_callout(code, ptr, cd);
2623	}
2624
2625	switch(c)
2626	{
2627	/* ===================================================================*/
2628	case 0: /* The branch terminates at string end */
2629	case '\|': /* or \| or ) */
2630	case ')':
2631	*firstbyteptr = firstbyte;
2632	*reqbyteptr = reqbyte;
2633	*codeptr = code;
2634	*ptrptr = ptr;
2635	if (lengthptr != NULL)
2636	{
2637	if (OFLOW_MAX - *lengthptr < code - last_code)
2638	{
2639	*errorcodeptr = ERR20;
2640	goto FAILED;
2641	}
2642	lengthptr += code - last_code; / To include callout length */
2643	DPRINTF((">> end branch\n"));
2644	}
2645	return TRUE;
2646
2647
2648	/* ===================================================================*/
2649	/* Handle single-character metacharacters. In multiline mode, ^ disables
2650	the setting of any following char as a first character. */
2651
2652	case '^':
2653	if ((options & PCRE_MULTILINE) != 0)
2654	{
2655	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2656	}
2657	previous = NULL;
2658	*code++ = OP_CIRC;
2659	break;
2660
2661	case '$':
2662	previous = NULL;
2663	*code++ = OP_DOLL;
2664	break;
2665
2666	/* There can never be a first char if '.' is first, whatever happens about
2667	repeats. The value of reqbyte doesn't change either. */
2668
2669	case '.':
2670	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2671	zerofirstbyte = firstbyte;
2672	zeroreqbyte = reqbyte;
2673	previous = code;
2674	*code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2675	break;
2676
2677
2678	/* ===================================================================*/
2679	/* Character classes. If the included characters are all < 256, we build a
2680	32-byte bitmap of the permitted characters, except in the special case
2681	where there is only one such character. For negated classes, we build the
2682	map as usual, then invert it at the end. However, we use a different opcode
2683	so that data characters > 255 can be handled correctly.
2684
2685	If the class contains characters outside the 0-255 range, a different
2686	opcode is compiled. It may optionally have a bit map for characters < 256,
2687	but those above are explicitly listed afterwards. A flag byte tells
2688	whether the bitmap is present, and whether this is a negated class or not.
2689
2690	In JavaScript compatibility mode, an isolated ']' causes an error. In
2691	default (Perl) mode, it is treated as a data character. */
2692
2693	case ']':
2694	if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2695	{
2696	*errorcodeptr = ERR64;
2697	goto FAILED;
2698	}
2699	goto NORMAL_CHAR;
2700
2701	case '[':
2702	previous = code;
2703
2704	/* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2705	they are encountered at the top level, so we'll do that too. */
2706
2707	if ((ptr[1] == ':' \|\| ptr[1] == '.' \|\| ptr[1] == '=') &&
2708	check_posix_syntax(ptr, &tempptr))
2709	{
2710	*errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2711	goto FAILED;
2712	}
2713
2714	/* If the first character is '^', set the negation flag and skip it. Also,
2715	if the first few characters (either before or after ^) are \Q\E or \E we
2716	skip them too. This makes for compatibility with Perl. */
2717
2718	negate_class = FALSE;
2719	for (;;)
2720	{
2721	c = *(++ptr);
2722	if (c == '\\')
2723	{
2724	if (ptr[1] == 'E') ptr++;
2725	else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2726	else break;
2727	}
2728	else if (!negate_class && c == '^')
2729	negate_class = TRUE;
2730	else break;
2731	}
2732
2733	/* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2734	an initial ']' is taken as a data character -- the code below handles
2735	that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2736	[^] must match any character, so generate OP_ALLANY. */
2737
2738	if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2739	{
2740	*code++ = negate_class? OP_ALLANY : OP_FAIL;
2741	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2742	zerofirstbyte = firstbyte;
2743	break;
2744	}
2745
2746	/* If a class contains a negative special such as \S, we need to flip the
2747	negation flag at the end, so that support for characters > 255 works
2748	correctly (they are all included in the class). */
2749
2750	should_flip_negation = FALSE;
2751
2752	/* Keep a count of chars with values < 256 so that we can optimize the case
2753	of just a single character (as long as it's < 256). However, For higher
2754	valued UTF-8 characters, we don't yet do any optimization. */
2755
2756	class_charcount = 0;
2757	class_lastchar = -1;
2758
2759	/* Initialize the 32-char bit map to all zeros. We build the map in a
2760	temporary bit of memory, in case the class contains only 1 character (less
2761	than 256), because in that case the compiled code doesn't use the bit map.
2762	*/
2763
2764	memset(classbits, 0, 32 * sizeof(uschar));
2765
2766	#ifdef SUPPORT_UTF8
2767	class_utf8 = FALSE; /* No chars >= 256 */
2768	class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2769	class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2770	#endif
2771
2772	/* Process characters until ] is reached. By writing this as a "do" it
2773	means that an initial ] is taken as a data character. At the start of the
2774	loop, c contains the first byte of the character. */
2775
2776	if (c != 0) do
2777	{
2778	const uschar *oldptr;
2779
2780	#ifdef SUPPORT_UTF8
2781	if (utf8 && c > 127)
2782	{ /* Braces are required because the */
2783	GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2784	}
2785
2786	/* In the pre-compile phase, accumulate the length of any UTF-8 extra
2787	data and reset the pointer. This is so that very large classes that
2788	contain a zillion UTF-8 characters no longer overwrite the work space
2789	(which is on the stack). */
2790
2791	if (lengthptr != NULL)
2792	{
2793	*lengthptr += class_utf8data - class_utf8data_base;
2794	class_utf8data = class_utf8data_base;
2795	}
2796
2797	#endif
2798
2799	/* Inside \Q...\E everything is literal except \E */
2800
2801	if (inescq)
2802	{
2803	if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2804	{
2805	inescq = FALSE; /* Reset literal state */
2806	ptr++; /* Skip the 'E' */
2807	continue; /* Carry on with next */
2808	}
2809	goto CHECK_RANGE; /* Could be range if \E follows */
2810	}
2811
2812	/* Handle POSIX class names. Perl allows a negation extension of the
2813	form [:^name:]. A square bracket that doesn't match the syntax is
2814	treated as a literal. We also recognize the POSIX constructions
2815	[.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2816	5.6 and 5.8 do. */
2817
2818	if (c == '[' &&
2819	(ptr[1] == ':' \|\| ptr[1] == '.' \|\| ptr[1] == '=') &&
2820	check_posix_syntax(ptr, &tempptr))
2821	{
2822	BOOL local_negate = FALSE;
2823	int posix_class, taboffset, tabopt;
2824	register const uschar *cbits = cd->cbits;
2825	uschar pbits[32];
2826
2827	if (ptr[1] != ':')
2828	{
2829	*errorcodeptr = ERR31;
2830	goto FAILED;
2831	}
2832
2833	ptr += 2;
2834	if (*ptr == '^')
2835	{
2836	local_negate = TRUE;
2837	should_flip_negation = TRUE; /* Note negative special */
2838	ptr++;
2839	}
2840
2841	posix_class = check_posix_name(ptr, tempptr - ptr);
2842	if (posix_class < 0)
2843	{
2844	*errorcodeptr = ERR30;
2845	goto FAILED;
2846	}
2847
2848	/* If matching is caseless, upper and lower are converted to
2849	alpha. This relies on the fact that the class table starts with
2850	alpha, lower, upper as the first 3 entries. */
2851
2852	if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2853	posix_class = 0;
2854
2855	/* We build the bit map for the POSIX class in a chunk of local store
2856	because we may be adding and subtracting from it, and we don't want to
2857	subtract bits that may be in the main map already. At the end we or the
2858	result into the bit map that is being built. */
2859
2860	posix_class *= 3;
2861
2862	/* Copy in the first table (always present) */
2863
2864	memcpy(pbits, cbits + posix_class_maps[posix_class],
2865	32 * sizeof(uschar));
2866
2867	/* If there is a second table, add or remove it as required. */
2868
2869	taboffset = posix_class_maps[posix_class + 1];
2870	tabopt = posix_class_maps[posix_class + 2];
2871
2872	if (taboffset >= 0)
2873	{
2874	if (tabopt >= 0)
2875	for (c = 0; c < 32; c++) pbits[c] \|= cbits[c + taboffset];
2876	else
2877	for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2878	}
2879
2880	/* Now see if we need to remove any special characters. An option
2881	value of 1 removes vertical space and 2 removes underscore. */
2882
2883	if (tabopt < 0) tabopt = -tabopt;
2884	if (tabopt == 1) pbits[1] &= ~0x3c;
2885	else if (tabopt == 2) pbits[11] &= 0x7f;
2886
2887	/* Add the POSIX table or its complement into the main table that is
2888	being built and we are done. */
2889
2890	if (local_negate)
2891	for (c = 0; c < 32; c++) classbits[c] \|= ~pbits[c];
2892	else
2893	for (c = 0; c < 32; c++) classbits[c] \|= pbits[c];
2894
2895	ptr = tempptr + 1;
2896	class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2897	continue; /* End of POSIX syntax handling */
2898	}
2899
2900	/* Backslash may introduce a single character, or it may introduce one
2901	of the specials, which just set a flag. The sequence \b is a special
2902	case. Inside a class (and only there) it is treated as backspace.
2903	Elsewhere it marks a word boundary. Other escapes have preset maps ready
2904	to 'or' into the one we are building. We assume they have more than one
2905	character in them, so set class_charcount bigger than one. */
2906
2907	if (c == '\\')
2908	{
2909	c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2910	if (*errorcodeptr != 0) goto FAILED;
2911
2912	if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2913	else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2914	else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2915	else if (-c == ESC_Q) /* Handle start of quoted string */
2916	{
2917	if (ptr[1] == '\\' && ptr[2] == 'E')
2918	{
2919	ptr += 2; /* avoid empty string */
2920	}
2921	else inescq = TRUE;
2922	continue;
2923	}
2924	else if (-c == ESC_E) continue; /* Ignore orphan \E */
2925
2926	if (c < 0)
2927	{
2928	register const uschar *cbits = cd->cbits;
2929	class_charcount += 2; /* Greater than 1 is what matters */
2930
2931	/* Save time by not doing this in the pre-compile phase. */
2932
2933	if (lengthptr == NULL) switch (-c)
2934	{
2935	case ESC_d:
2936	for (c = 0; c < 32; c++) classbits[c] \|= cbits[c+cbit_digit];
2937	continue;
2938
2939	case ESC_D:
2940	should_flip_negation = TRUE;
2941	for (c = 0; c < 32; c++) classbits[c] \|= ~cbits[c+cbit_digit];
2942	continue;
2943
2944	case ESC_w:
2945	for (c = 0; c < 32; c++) classbits[c] \|= cbits[c+cbit_word];
2946	continue;
2947
2948	case ESC_W:
2949	should_flip_negation = TRUE;
2950	for (c = 0; c < 32; c++) classbits[c] \|= ~cbits[c+cbit_word];
2951	continue;
2952
2953	case ESC_s:
2954	for (c = 0; c < 32; c++) classbits[c] \|= cbits[c+cbit_space];
2955	classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2956	continue;
2957
2958	case ESC_S:
2959	should_flip_negation = TRUE;
2960	for (c = 0; c < 32; c++) classbits[c] \|= ~cbits[c+cbit_space];
2961	classbits[1] \|= 0x08; /* Perl 5.004 onwards omits VT from \s */
2962	continue;
2963
2964	default: /* Not recognized; fall through */
2965	break; /* Need "default" setting to stop compiler warning. */
2966	}
2967
2968	/* In the pre-compile phase, just do the recognition. */
2969
2970	else if (c == -ESC_d \|\| c == -ESC_D \|\| c == -ESC_w \|\|
2971	c == -ESC_W \|\| c == -ESC_s \|\| c == -ESC_S) continue;
2972
2973	/* We need to deal with \H, \h, \V, and \v in both phases because
2974	they use extra memory. */
2975
2976	if (-c == ESC_h)
2977	{
2978	SETBIT(classbits, 0x09); /* VT */
2979	SETBIT(classbits, 0x20); /* SPACE */
2980	SETBIT(classbits, 0xa0); /* NSBP */
2981	#ifdef SUPPORT_UTF8
2982	if (utf8)
2983	{
2984	class_utf8 = TRUE;
2985	*class_utf8data++ = XCL_SINGLE;
2986	class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2987	*class_utf8data++ = XCL_SINGLE;
2988	class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2989	*class_utf8data++ = XCL_RANGE;
2990	class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2991	class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2992	*class_utf8data++ = XCL_SINGLE;
2993	class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2994	*class_utf8data++ = XCL_SINGLE;
2995	class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2996	*class_utf8data++ = XCL_SINGLE;
2997	class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2998	}
2999	#endif
3000	continue;
3001	}
3002
3003	if (-c == ESC_H)
3004	{
3005	for (c = 0; c < 32; c++)
3006	{
3007	int x = 0xff;
3008	switch (c)
3009	{
3010	case 0x09/8: x ^= 1 << (0x09%8); break;
3011	case 0x20/8: x ^= 1 << (0x20%8); break;
3012	case 0xa0/8: x ^= 1 << (0xa0%8); break;
3013	default: break;
3014	}
3015	classbits[c] \|= x;
3016	}
3017
3018	#ifdef SUPPORT_UTF8
3019	if (utf8)
3020	{
3021	class_utf8 = TRUE;
3022	*class_utf8data++ = XCL_RANGE;
3023	class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3024	class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3025	*class_utf8data++ = XCL_RANGE;
3026	class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3027	class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3028	*class_utf8data++ = XCL_RANGE;
3029	class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3030	class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3031	*class_utf8data++ = XCL_RANGE;
3032	class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3033	class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3034	*class_utf8data++ = XCL_RANGE;
3035	class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3036	class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3037	*class_utf8data++ = XCL_RANGE;
3038	class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3039	class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3040	*class_utf8data++ = XCL_RANGE;
3041	class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3042	class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3043	}
3044	#endif
3045	continue;
3046	}
3047
3048	if (-c == ESC_v)
3049	{
3050	SETBIT(classbits, 0x0a); /* LF */
3051	SETBIT(classbits, 0x0b); /* VT */
3052	SETBIT(classbits, 0x0c); /* FF */
3053	SETBIT(classbits, 0x0d); /* CR */
3054	SETBIT(classbits, 0x85); /* NEL */
3055	#ifdef SUPPORT_UTF8
3056	if (utf8)
3057	{
3058	class_utf8 = TRUE;
3059	*class_utf8data++ = XCL_RANGE;
3060	class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3061	class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3062	}
3063	#endif
3064	continue;
3065	}
3066
3067	if (-c == ESC_V)
3068	{
3069	for (c = 0; c < 32; c++)
3070	{
3071	int x = 0xff;
3072	switch (c)
3073	{
3074	case 0x0a/8: x ^= 1 << (0x0a%8);
3075	x ^= 1 << (0x0b%8);
3076	x ^= 1 << (0x0c%8);
3077	x ^= 1 << (0x0d%8);
3078	break;
3079	case 0x85/8: x ^= 1 << (0x85%8); break;
3080	default: break;
3081	}
3082	classbits[c] \|= x;
3083	}
3084
3085	#ifdef SUPPORT_UTF8
3086	if (utf8)
3087	{
3088	class_utf8 = TRUE;
3089	*class_utf8data++ = XCL_RANGE;
3090	class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3091	class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3092	*class_utf8data++ = XCL_RANGE;
3093	class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3094	class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3095	}
3096	#endif
3097	continue;
3098	}
3099
3100	/* We need to deal with \P and \p in both phases. */
3101
3102	#ifdef SUPPORT_UCP
3103	if (-c == ESC_p \|\| -c == ESC_P)
3104	{
3105	BOOL negated;
3106	int pdata;
3107	int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3108	if (ptype < 0) goto FAILED;
3109	class_utf8 = TRUE;
3110	*class_utf8data++ = ((-c == ESC_p) != negated)?
3111	XCL_PROP : XCL_NOTPROP;
3112	*class_utf8data++ = ptype;
3113	*class_utf8data++ = pdata;
3114	class_charcount -= 2; /* Not a < 256 character */
3115	continue;
3116	}
3117	#endif
3118	/* Unrecognized escapes are faulted if PCRE is running in its
3119	strict mode. By default, for compatibility with Perl, they are
3120	treated as literals. */
3121
3122	if ((options & PCRE_EXTRA) != 0)
3123	{
3124	*errorcodeptr = ERR7;
3125	goto FAILED;
3126	}
3127
3128	class_charcount -= 2; /* Undo the default count from above */
3129	c = ptr; / Get the final character and fall through */
3130	}
3131
3132	/* Fall through if we have a single character (c >= 0). This may be
3133	greater than 256 in UTF-8 mode. */
3134
3135	} /* End of backslash handling */
3136
3137	/* A single character may be followed by '-' to form a range. However,
3138	Perl does not permit ']' to be the end of the range. A '-' character
3139	at the end is treated as a literal. Perl ignores orphaned \E sequences
3140	entirely. The code for handling \Q and \E is messy. */
3141
3142	CHECK_RANGE:
3143	while (ptr[1] == '\\' && ptr[2] == 'E')
3144	{
3145	inescq = FALSE;
3146	ptr += 2;
3147	}
3148
3149	oldptr = ptr;
3150
3151	/* Remember \r or \n */
3152
3153	if (c == '\r' \|\| c == '\n') cd->external_flags \|= PCRE_HASCRORLF;
3154
3155	/* Check for range */
3156
3157	if (!inescq && ptr[1] == '-')
3158	{
3159	int d;
3160	ptr += 2;
3161	while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3162
3163	/* If we hit \Q (not followed by \E) at this point, go into escaped
3164	mode. */
3165
3166	while (*ptr == '\\' && ptr[1] == 'Q')
3167	{
3168	ptr += 2;
3169	if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3170	inescq = TRUE;
3171	break;
3172	}
3173
3174	if (ptr == 0 \|\| (!inescq && ptr == ']'))
3175	{
3176	ptr = oldptr;
3177	goto LONE_SINGLE_CHARACTER;
3178	}
3179
3180	#ifdef SUPPORT_UTF8
3181	if (utf8)
3182	{ /* Braces are required because the */
3183	GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3184	}
3185	else
3186	#endif
3187	d = ptr; / Not UTF-8 mode */
3188
3189	/* The second part of a range can be a single-character escape, but
3190	not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3191	in such circumstances. */
3192
3193	if (!inescq && d == '\\')
3194	{
3195	d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3196	if (*errorcodeptr != 0) goto FAILED;
3197
3198	/* \b is backspace; \X is literal X; \R is literal R; any other
3199	special means the '-' was literal */
3200
3201	if (d < 0)
3202	{
3203	if (d == -ESC_b) d = '\b';
3204	else if (d == -ESC_X) d = 'X';
3205	else if (d == -ESC_R) d = 'R'; else
3206	{
3207	ptr = oldptr;
3208	goto LONE_SINGLE_CHARACTER; /* A few lines below */
3209	}
3210	}
3211	}
3212
3213	/* Check that the two values are in the correct order. Optimize
3214	one-character ranges */
3215
3216	if (d < c)
3217	{
3218	*errorcodeptr = ERR8;
3219	goto FAILED;
3220	}
3221
3222	if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3223
3224	/* Remember \r or \n */
3225
3226	if (d == '\r' \|\| d == '\n') cd->external_flags \|= PCRE_HASCRORLF;
3227
3228	/* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3229	matching, we have to use an XCLASS with extra data items. Caseless
3230	matching for characters > 127 is available only if UCP support is
3231	available. */
3232
3233	#ifdef SUPPORT_UTF8
3234	if (utf8 && (d > 255 \|\| ((options & PCRE_CASELESS) != 0 && d > 127)))
3235	{
3236	class_utf8 = TRUE;
3237
3238	/* With UCP support, we can find the other case equivalents of
3239	the relevant characters. There may be several ranges. Optimize how
3240	they fit with the basic range. */
3241
3242	#ifdef SUPPORT_UCP
3243	if ((options & PCRE_CASELESS) != 0)
3244	{
3245	unsigned int occ, ocd;
3246	unsigned int cc = c;
3247	unsigned int origd = d;
3248	while (get_othercase_range(&cc, origd, &occ, &ocd))
3249	{
3250	if (occ >= (unsigned int)c &&
3251	ocd <= (unsigned int)d)
3252	continue; /* Skip embedded ranges */
3253
3254	if (occ < (unsigned int)c &&
3255	ocd >= (unsigned int)c - 1) /* Extend the basic range */
3256	{ /* if there is overlap, */
3257	c = occ; /* noting that if occ < c */
3258	continue; /* we can't have ocd > d */
3259	} /* because a subrange is */
3260	if (ocd > (unsigned int)d &&
3261	occ <= (unsigned int)d + 1) /* always shorter than */
3262	{ /* the basic range. */
3263	d = ocd;
3264	continue;
3265	}
3266
3267	if (occ == ocd)
3268	{
3269	*class_utf8data++ = XCL_SINGLE;
3270	}
3271	else
3272	{
3273	*class_utf8data++ = XCL_RANGE;
3274	class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3275	}
3276	class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3277	}
3278	}
3279	#endif /* SUPPORT_UCP */
3280
3281	/* Now record the original range, possibly modified for UCP caseless
3282	overlapping ranges. */
3283
3284	*class_utf8data++ = XCL_RANGE;
3285	class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3286	class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3287
3288	/* With UCP support, we are done. Without UCP support, there is no
3289	caseless matching for UTF-8 characters > 127; we can use the bit map
3290	for the smaller ones. */
3291
3292	#ifdef SUPPORT_UCP
3293	continue; /* With next character in the class */
3294	#else
3295	if ((options & PCRE_CASELESS) == 0 \|\| c > 127) continue;
3296
3297	/* Adjust upper limit and fall through to set up the map */
3298
3299	d = 127;
3300
3301	#endif /* SUPPORT_UCP */
3302	}
3303	#endif /* SUPPORT_UTF8 */
3304
3305	/* We use the bit map for all cases when not in UTF-8 mode; else
3306	ranges that lie entirely within 0-127 when there is UCP support; else
3307	for partial ranges without UCP support. */
3308
3309	class_charcount += d - c + 1;
3310	class_lastchar = d;
3311
3312	/* We can save a bit of time by skipping this in the pre-compile. */
3313
3314	if (lengthptr == NULL) for (; c <= d; c++)
3315	{
3316	classbits[c/8] \|= (1 << (c&7));
3317	if ((options & PCRE_CASELESS) != 0)
3318	{
3319	int uc = cd->fcc[c]; /* flip case */
3320	classbits[uc/8] \|= (1 << (uc&7));
3321	}
3322	}
3323
3324	continue; /* Go get the next char in the class */
3325	}
3326
3327	/* Handle a lone single character - we can get here for a normal
3328	non-escape char, or after \ that introduces a single character or for an
3329	apparent range that isn't. */
3330
3331	LONE_SINGLE_CHARACTER:
3332
3333	/* Handle a character that cannot go in the bit map */
3334
3335	#ifdef SUPPORT_UTF8
3336	if (utf8 && (c > 255 \|\| ((options & PCRE_CASELESS) != 0 && c > 127)))
3337	{
3338	class_utf8 = TRUE;
3339	*class_utf8data++ = XCL_SINGLE;
3340	class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3341
3342	#ifdef SUPPORT_UCP
3343	if ((options & PCRE_CASELESS) != 0)
3344	{
3345	unsigned int othercase;
3346	if ((othercase = UCD_OTHERCASE(c)) != c)
3347	{
3348	*class_utf8data++ = XCL_SINGLE;
3349	class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3350	}
3351	}
3352	#endif /* SUPPORT_UCP */
3353
3354	}
3355	else
3356	#endif /* SUPPORT_UTF8 */
3357
3358	/* Handle a single-byte character */
3359	{
3360	classbits[c/8] \|= (1 << (c&7));
3361	if ((options & PCRE_CASELESS) != 0)
3362	{
3363	c = cd->fcc[c]; /* flip case */
3364	classbits[c/8] \|= (1 << (c&7));
3365	}
3366	class_charcount++;
3367	class_lastchar = c;
3368	}
3369	}
3370
3371	/* Loop until ']' reached. This "while" is the end of the "do" above. */
3372
3373	while ((c = *(++ptr)) != 0 && (c != ']' \|\| inescq));
3374
3375	if (c == 0) /* Missing terminating ']' */
3376	{
3377	*errorcodeptr = ERR6;
3378	goto FAILED;
3379	}
3380
3381
3382	/* This code has been disabled because it would mean that \s counts as
3383	an explicit \r or \n reference, and that's not really what is wanted. Now
3384	we set the flag only if there is a literal "\r" or "\n" in the class. */
3385
3386	#if 0
3387	/* Remember whether \r or \n are in this class */
3388
3389	if (negate_class)
3390	{
3391	if ((classbits[1] & 0x24) != 0x24) cd->external_flags \|= PCRE_HASCRORLF;
3392	}
3393	else
3394	{
3395	if ((classbits[1] & 0x24) != 0) cd->external_flags \|= PCRE_HASCRORLF;
3396	}
3397	#endif
3398
3399
3400	/* If class_charcount is 1, we saw precisely one character whose value is
3401	less than 256. As long as there were no characters >= 128 and there was no
3402	use of \p or \P, in other words, no use of any XCLASS features, we can
3403	optimize.
3404
3405	In UTF-8 mode, we can optimize the negative case only if there were no
3406	characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3407	operate on single-bytes only. This is an historical hangover. Maybe one day
3408	we can tidy these opcodes to handle multi-byte characters.
3409
3410	The optimization throws away the bit map. We turn the item into a
3411	1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3412	that OP_NOT does not support multibyte characters. In the positive case, it
3413	can cause firstbyte to be set. Otherwise, there can be no first char if
3414	this item is first, whatever repeat count may follow. In the case of
3415	reqbyte, save the previous value for reinstating. */
3416
3417	#ifdef SUPPORT_UTF8
3418	if (class_charcount == 1 && !class_utf8 &&
3419	(!utf8 \|\| !negate_class \|\| class_lastchar < 128))
3420	#else
3421	if (class_charcount == 1)
3422	#endif
3423	{
3424	zeroreqbyte = reqbyte;
3425
3426	/* The OP_NOT opcode works on one-byte characters only. */
3427
3428	if (negate_class)
3429	{
3430	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3431	zerofirstbyte = firstbyte;
3432	*code++ = OP_NOT;
3433	*code++ = class_lastchar;
3434	break;
3435	}
3436
3437	/* For a single, positive character, get the value into mcbuffer, and
3438	then we can handle this with the normal one-character code. */
3439
3440	#ifdef SUPPORT_UTF8
3441	if (utf8 && class_lastchar > 127)
3442	mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3443	else
3444	#endif
3445	{
3446	mcbuffer[0] = class_lastchar;
3447	mclength = 1;
3448	}
3449	goto ONE_CHAR;
3450	} /* End of 1-char optimization */
3451
3452	/* The general case - not the one-char optimization. If this is the first
3453	thing in the branch, there can be no first char setting, whatever the
3454	repeat count. Any reqbyte setting must remain unchanged after any kind of
3455	repeat. */
3456
3457	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3458	zerofirstbyte = firstbyte;
3459	zeroreqbyte = reqbyte;
3460
3461	/* If there are characters with values > 255, we have to compile an
3462	extended class, with its own opcode, unless there was a negated special
3463	such as \S in the class, because in that case all characters > 255 are in
3464	the class, so any that were explicitly given as well can be ignored. If
3465	(when there are explicit characters > 255 that must be listed) there are no
3466	characters < 256, we can omit the bitmap in the actual compiled code. */
3467
3468	#ifdef SUPPORT_UTF8
3469	if (class_utf8 && !should_flip_negation)
3470	{
3471	class_utf8data++ = XCL_END; / Marks the end of extra data */
3472	*code++ = OP_XCLASS;
3473	code += LINK_SIZE;
3474	*code = negate_class? XCL_NOT : 0;
3475
3476	/* If the map is required, move up the extra data to make room for it;
3477	otherwise just move the code pointer to the end of the extra data. */
3478
3479	if (class_charcount > 0)
3480	{
3481	*code++ \|= XCL_MAP;
3482	memmove(code + 32, code, class_utf8data - code);
3483	memcpy(code, classbits, 32);
3484	code = class_utf8data + 32;
3485	}
3486	else code = class_utf8data;
3487
3488	/* Now fill in the complete length of the item */
3489
3490	PUT(previous, 1, code - previous);
3491	break; /* End of class handling */
3492	}
3493	#endif
3494
3495	/* If there are no characters > 255, set the opcode to OP_CLASS or
3496	OP_NCLASS, depending on whether the whole class was negated and whether
3497	there were negative specials such as \S in the class. Then copy the 32-byte
3498	map into the code vector, negating it if necessary. */
3499
3500	*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3501	if (negate_class)
3502	{
3503	if (lengthptr == NULL) /* Save time in the pre-compile phase */
3504	for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3505	}
3506	else
3507	{
3508	memcpy(code, classbits, 32);
3509	}
3510	code += 32;
3511	break;
3512
3513
3514	/* ===================================================================*/
3515	/* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3516	has been tested above. */
3517
3518	case '{':
3519	if (!is_quantifier) goto NORMAL_CHAR;
3520	ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3521	if (*errorcodeptr != 0) goto FAILED;
3522	goto REPEAT;
3523
3524	case '*':
3525	repeat_min = 0;
3526	repeat_max = -1;
3527	goto REPEAT;
3528
3529	case '+':
3530	repeat_min = 1;
3531	repeat_max = -1;
3532	goto REPEAT;
3533
3534	case '?':
3535	repeat_min = 0;
3536	repeat_max = 1;
3537
3538	REPEAT:
3539	if (previous == NULL)
3540	{
3541	*errorcodeptr = ERR9;
3542	goto FAILED;
3543	}
3544
3545	if (repeat_min == 0)
3546	{
3547	firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3548	reqbyte = zeroreqbyte; /* Ditto */
3549	}
3550
3551	/* Remember whether this is a variable length repeat */
3552
3553	reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3554
3555	op_type = 0; /* Default single-char op codes */
3556	possessive_quantifier = FALSE; /* Default not possessive quantifier */
3557
3558	/* Save start of previous item, in case we have to move it up to make space
3559	for an inserted OP_ONCE for the additional '+' extension. */
3560
3561	tempcode = previous;
3562
3563	/* If the next character is '+', we have a possessive quantifier. This
3564	implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3565	If the next character is '?' this is a minimizing repeat, by default,
3566	but if PCRE_UNGREEDY is set, it works the other way round. We change the
3567	repeat type to the non-default. */
3568
3569	if (ptr[1] == '+')
3570	{
3571	repeat_type = 0; /* Force greedy */
3572	possessive_quantifier = TRUE;
3573	ptr++;
3574	}
3575	else if (ptr[1] == '?')
3576	{
3577	repeat_type = greedy_non_default;
3578	ptr++;
3579	}
3580	else repeat_type = greedy_default;
3581
3582	/* If previous was a character match, abolish the item and generate a
3583	repeat item instead. If a char item has a minimum of more than one, ensure
3584	that it is set in reqbyte - it might not be if a sequence such as x{3} is
3585	the first thing in a branch because the x will have gone into firstbyte
3586	instead. */
3587
3588	if (previous == OP_CHAR \|\| previous == OP_CHARNC)
3589	{
3590	/* Deal with UTF-8 characters that take up more than one byte. It's
3591	easier to write this out separately than try to macrify it. Use c to
3592	hold the length of the character in bytes, plus 0x80 to flag that it's a
3593	length rather than a small character. */
3594
3595	#ifdef SUPPORT_UTF8
3596	if (utf8 && (code[-1] & 0x80) != 0)
3597	{
3598	uschar *lastchar = code - 1;
3599	while((*lastchar & 0xc0) == 0x80) lastchar--;
3600	c = code - lastchar; /* Length of UTF-8 character */
3601	memcpy(utf8_char, lastchar, c); /* Save the char */
3602	c \|= 0x80; /* Flag c as a length */
3603	}
3604	else
3605	#endif
3606
3607	/* Handle the case of a single byte - either with no UTF8 support, or
3608	with UTF-8 disabled, or for a UTF-8 character < 128. */
3609
3610	{
3611	c = code[-1];
3612	if (repeat_min > 1) reqbyte = c \| req_caseopt \| cd->req_varyopt;
3613	}
3614
3615	/* If the repetition is unlimited, it pays to see if the next thing on
3616	the line is something that cannot possibly match this character. If so,
3617	automatically possessifying this item gains some performance in the case
3618	where the match fails. */
3619
3620	if (!possessive_quantifier &&
3621	repeat_max < 0 &&
3622	check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3623	options, cd))
3624	{
3625	repeat_type = 0; /* Force greedy */
3626	possessive_quantifier = TRUE;
3627	}
3628
3629	goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3630	}
3631
3632	/* If previous was a single negated character ([^a] or similar), we use
3633	one of the special opcodes, replacing it. The code is shared with single-
3634	character repeats by setting op_type to add a suitable offset into
3635	repeat_type. We can also test for auto-possessification. OP_NOT is
3636	currently used only for single-byte chars. */
3637
3638	else if (*previous == OP_NOT)
3639	{
3640	op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3641	c = previous[1];
3642	if (!possessive_quantifier &&
3643	repeat_max < 0 &&
3644	check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3645	{
3646	repeat_type = 0; /* Force greedy */
3647	possessive_quantifier = TRUE;
3648	}
3649	goto OUTPUT_SINGLE_REPEAT;
3650	}
3651
3652	/* If previous was a character type match (\d or similar), abolish it and
3653	create a suitable repeat item. The code is shared with single-character
3654	repeats by setting op_type to add a suitable offset into repeat_type. Note
3655	that the Unicode property types will be present only when SUPPORT_UCP is
3656	defined, but we don't wrap the little bits of code here because it just
3657	makes it horribly messy. */
3658
3659	else if (*previous < OP_EODN)
3660	{
3661	uschar *oldcode;
3662	int prop_type, prop_value;
3663	op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3664	c = *previous;
3665
3666	if (!possessive_quantifier &&
3667	repeat_max < 0 &&
3668	check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3669	{
3670	repeat_type = 0; /* Force greedy */
3671	possessive_quantifier = TRUE;
3672	}
3673
3674	OUTPUT_SINGLE_REPEAT:
3675	if (previous == OP_PROP \|\| previous == OP_NOTPROP)
3676	{
3677	prop_type = previous[1];
3678	prop_value = previous[2];
3679	}
3680	else prop_type = prop_value = -1;
3681
3682	oldcode = code;
3683	code = previous; /* Usually overwrite previous item */
3684
3685	/* If the maximum is zero then the minimum must also be zero; Perl allows
3686	this case, so we do too - by simply omitting the item altogether. */
3687
3688	if (repeat_max == 0) goto END_REPEAT;
3689
3690	/* All real repeats make it impossible to handle partial matching (maybe
3691	one day we will be able to remove this restriction). */
3692
3693	if (repeat_max != 1) cd->external_flags \|= PCRE_NOPARTIAL;
3694
3695	/* Combine the op_type with the repeat_type */
3696
3697	repeat_type += op_type;
3698
3699	/* A minimum of zero is handled either as the special case * or ?, or as
3700	an UPTO, with the maximum given. */
3701
3702	if (repeat_min == 0)
3703	{
3704	if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3705	else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3706	else
3707	{
3708	*code++ = OP_UPTO + repeat_type;
3709	PUT2INC(code, 0, repeat_max);
3710	}
3711	}
3712
3713	/* A repeat minimum of 1 is optimized into some special cases. If the
3714	maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3715	left in place and, if the maximum is greater than 1, we use OP_UPTO with
3716	one less than the maximum. */
3717
3718	else if (repeat_min == 1)
3719	{
3720	if (repeat_max == -1)
3721	*code++ = OP_PLUS + repeat_type;
3722	else
3723	{
3724	code = oldcode; /* leave previous item in place */
3725	if (repeat_max == 1) goto END_REPEAT;
3726	*code++ = OP_UPTO + repeat_type;
3727	PUT2INC(code, 0, repeat_max - 1);
3728	}
3729	}
3730
3731	/* The case {n,n} is just an EXACT, while the general case {n,m} is
3732	handled as an EXACT followed by an UPTO. */
3733
3734	else
3735	{
3736	code++ = OP_EXACT + op_type; / NB EXACT doesn't have repeat_type */
3737	PUT2INC(code, 0, repeat_min);
3738
3739	/* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3740	we have to insert the character for the previous code. For a repeated
3741	Unicode property match, there are two extra bytes that define the
3742	required property. In UTF-8 mode, long characters have their length in
3743	c, with the 0x80 bit as a flag. */
3744
3745	if (repeat_max < 0)
3746	{
3747	#ifdef SUPPORT_UTF8
3748	if (utf8 && c >= 128)
3749	{
3750	memcpy(code, utf8_char, c & 7);
3751	code += c & 7;
3752	}
3753	else
3754	#endif
3755	{
3756	*code++ = c;
3757	if (prop_type >= 0)
3758	{
3759	*code++ = prop_type;
3760	*code++ = prop_value;
3761	}
3762	}
3763	*code++ = OP_STAR + repeat_type;
3764	}
3765
3766	/* Else insert an UPTO if the max is greater than the min, again
3767	preceded by the character, for the previously inserted code. If the
3768	UPTO is just for 1 instance, we can use QUERY instead. */
3769
3770	else if (repeat_max != repeat_min)
3771	{
3772	#ifdef SUPPORT_UTF8
3773	if (utf8 && c >= 128)
3774	{
3775	memcpy(code, utf8_char, c & 7);
3776	code += c & 7;
3777	}
3778	else
3779	#endif
3780	*code++ = c;
3781	if (prop_type >= 0)
3782	{
3783	*code++ = prop_type;
3784	*code++ = prop_value;
3785	}
3786	repeat_max -= repeat_min;
3787
3788	if (repeat_max == 1)
3789	{
3790	*code++ = OP_QUERY + repeat_type;
3791	}
3792	else
3793	{
3794	*code++ = OP_UPTO + repeat_type;
3795	PUT2INC(code, 0, repeat_max);
3796	}
3797	}
3798	}
3799
3800	/* The character or character type itself comes last in all cases. */
3801
3802	#ifdef SUPPORT_UTF8
3803	if (utf8 && c >= 128)
3804	{
3805	memcpy(code, utf8_char, c & 7);
3806	code += c & 7;
3807	}
3808	else
3809	#endif
3810	*code++ = c;
3811
3812	/* For a repeated Unicode property match, there are two extra bytes that
3813	define the required property. */
3814
3815	#ifdef SUPPORT_UCP
3816	if (prop_type >= 0)
3817	{
3818	*code++ = prop_type;
3819	*code++ = prop_value;
3820	}
3821	#endif
3822	}
3823
3824	/* If previous was a character class or a back reference, we put the repeat
3825	stuff after it, but just skip the item if the repeat was {0,0}. */
3826
3827	else if (*previous == OP_CLASS \|\|
3828	*previous == OP_NCLASS \|\|
3829	#ifdef SUPPORT_UTF8
3830	*previous == OP_XCLASS \|\|
3831	#endif
3832	*previous == OP_REF)
3833	{
3834	if (repeat_max == 0)
3835	{
3836	code = previous;
3837	goto END_REPEAT;
3838	}
3839
3840	/* All real repeats make it impossible to handle partial matching (maybe
3841	one day we will be able to remove this restriction). */
3842
3843	if (repeat_max != 1) cd->external_flags \|= PCRE_NOPARTIAL;
3844
3845	if (repeat_min == 0 && repeat_max == -1)
3846	*code++ = OP_CRSTAR + repeat_type;
3847	else if (repeat_min == 1 && repeat_max == -1)
3848	*code++ = OP_CRPLUS + repeat_type;
3849	else if (repeat_min == 0 && repeat_max == 1)
3850	*code++ = OP_CRQUERY + repeat_type;
3851	else
3852	{
3853	*code++ = OP_CRRANGE + repeat_type;
3854	PUT2INC(code, 0, repeat_min);
3855	if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3856	PUT2INC(code, 0, repeat_max);
3857	}
3858	}
3859
3860	/* If previous was a bracket group, we may have to replicate it in certain
3861	cases. */
3862
3863	else if (previous == OP_BRA \|\| previous == OP_CBRA \|\|
3864	previous == OP_ONCE \|\| previous == OP_COND)
3865	{
3866	register int i;
3867	int ketoffset = 0;
3868	int len = code - previous;
3869	uschar *bralink = NULL;
3870
3871	/* Repeating a DEFINE group is pointless */
3872
3873	if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3874	{
3875	*errorcodeptr = ERR55;
3876	goto FAILED;
3877	}
3878
3879	/* If the maximum repeat count is unlimited, find the end of the bracket
3880	by scanning through from the start, and compute the offset back to it
3881	from the current code pointer. There may be an OP_OPT setting following
3882	the final KET, so we can't find the end just by going back from the code
3883	pointer. */
3884
3885	if (repeat_max == -1)
3886	{
3887	register uschar *ket = previous;
3888	do ket += GET(ket, 1); while (*ket != OP_KET);
3889	ketoffset = code - ket;
3890	}
3891
3892	/* The case of a zero minimum is special because of the need to stick
3893	OP_BRAZERO in front of it, and because the group appears once in the
3894	data, whereas in other cases it appears the minimum number of times. For
3895	this reason, it is simplest to treat this case separately, as otherwise
3896	the code gets far too messy. There are several special subcases when the
3897	minimum is zero. */
3898
3899	if (repeat_min == 0)
3900	{
3901	/* If the maximum is also zero, we used to just omit the group from the
3902	output altogether, like this:
3903
3904	** if (repeat_max == 0)
3905	** {
3906	** code = previous;
3907	** goto END_REPEAT;
3908	** }
3909
3910	However, that fails when a group is referenced as a subroutine from
3911	elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3912	so that it is skipped on execution. As we don't have a list of which
3913	groups are referenced, we cannot do this selectively.
3914
3915	If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3916	and do no more at this point. However, we do need to adjust any
3917	OP_RECURSE calls inside the group that refer to the group itself or any
3918	internal or forward referenced group, because the offset is from the
3919	start of the whole regex. Temporarily terminate the pattern while doing
3920	this. */
3921
3922	if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
3923	{
3924	*code = OP_END;
3925	adjust_recurse(previous, 1, utf8, cd, save_hwm);
3926	memmove(previous+1, previous, len);
3927	code++;
3928	if (repeat_max == 0)
3929	{
3930	*previous++ = OP_SKIPZERO;
3931	goto END_REPEAT;
3932	}
3933	*previous++ = OP_BRAZERO + repeat_type;
3934	}
3935
3936	/* If the maximum is greater than 1 and limited, we have to replicate
3937	in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3938	The first one has to be handled carefully because it's the original
3939	copy, which has to be moved up. The remainder can be handled by code
3940	that is common with the non-zero minimum case below. We have to
3941	adjust the value of repeat_max, since one less copy is required. Once
3942	again, we may have to adjust any OP_RECURSE calls inside the group. */
3943
3944	else
3945	{
3946	int offset;
3947	*code = OP_END;
3948	adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3949	memmove(previous + 2 + LINK_SIZE, previous, len);
3950	code += 2 + LINK_SIZE;
3951	*previous++ = OP_BRAZERO + repeat_type;
3952	*previous++ = OP_BRA;
3953
3954	/* We chain together the bracket offset fields that have to be
3955	filled in later when the ends of the brackets are reached. */
3956
3957	offset = (bralink == NULL)? 0 : previous - bralink;
3958	bralink = previous;
3959	PUTINC(previous, 0, offset);
3960	}
3961
3962	repeat_max--;
3963	}
3964
3965	/* If the minimum is greater than zero, replicate the group as many
3966	times as necessary, and adjust the maximum to the number of subsequent
3967	copies that we need. If we set a first char from the group, and didn't
3968	set a required char, copy the latter from the former. If there are any
3969	forward reference subroutine calls in the group, there will be entries on
3970	the workspace list; replicate these with an appropriate increment. */
3971
3972	else
3973	{
3974	if (repeat_min > 1)
3975	{
3976	/* In the pre-compile phase, we don't actually do the replication. We
3977	just adjust the length as if we had. Do some paranoid checks for
3978	potential integer overflow. */
3979
3980	if (lengthptr != NULL)
3981	{
3982	int delta = (repeat_min - 1)*length_prevgroup;
3983	if ((double)(repeat_min - 1)*(double)length_prevgroup >
3984	(double)INT_MAX \|\|
3985	OFLOW_MAX - *lengthptr < delta)
3986	{
3987	*errorcodeptr = ERR20;
3988	goto FAILED;
3989	}
3990	*lengthptr += delta;
3991	}
3992
3993	/* This is compiling for real */
3994
3995	else
3996	{
3997	if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3998	for (i = 1; i < repeat_min; i++)
3999	{
4000	uschar *hc;
4001	uschar *this_hwm = cd->hwm;
4002	memcpy(code, previous, len);
4003	for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4004	{
4005	PUT(cd->hwm, 0, GET(hc, 0) + len);
4006	cd->hwm += LINK_SIZE;
4007	}
4008	save_hwm = this_hwm;
4009	code += len;
4010	}
4011	}
4012	}
4013
4014	if (repeat_max > 0) repeat_max -= repeat_min;
4015	}
4016
4017	/* This code is common to both the zero and non-zero minimum cases. If
4018	the maximum is limited, it replicates the group in a nested fashion,
4019	remembering the bracket starts on a stack. In the case of a zero minimum,
4020	the first one was set up above. In all cases the repeat_max now specifies
4021	the number of additional copies needed. Again, we must remember to
4022	replicate entries on the forward reference list. */
4023
4024	if (repeat_max >= 0)
4025	{
4026	/* In the pre-compile phase, we don't actually do the replication. We
4027	just adjust the length as if we had. For each repetition we must add 1
4028	to the length for BRAZERO and for all but the last repetition we must
4029	add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4030	paranoid checks to avoid integer overflow. */
4031
4032	if (lengthptr != NULL && repeat_max > 0)
4033	{
4034	int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4035	2 - 2LINK_SIZE; / Last one doesn't nest */
4036	if ((double)repeat_max *
4037	(double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4038	> (double)INT_MAX \|\|
4039	OFLOW_MAX - *lengthptr < delta)
4040	{
4041	*errorcodeptr = ERR20;
4042	goto FAILED;
4043	}
4044	*lengthptr += delta;
4045	}
4046
4047	/* This is compiling for real */
4048
4049	else for (i = repeat_max - 1; i >= 0; i--)
4050	{
4051	uschar *hc;
4052	uschar *this_hwm = cd->hwm;
4053
4054	*code++ = OP_BRAZERO + repeat_type;
4055
4056	/* All but the final copy start a new nesting, maintaining the
4057	chain of brackets outstanding. */
4058
4059	if (i != 0)
4060	{
4061	int offset;
4062	*code++ = OP_BRA;
4063	offset = (bralink == NULL)? 0 : code - bralink;
4064	bralink = code;
4065	PUTINC(code, 0, offset);
4066	}
4067
4068	memcpy(code, previous, len);
4069	for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4070	{
4071	PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4072	cd->hwm += LINK_SIZE;
4073	}
4074	save_hwm = this_hwm;
4075	code += len;
4076	}
4077
4078	/* Now chain through the pending brackets, and fill in their length
4079	fields (which are holding the chain links pro tem). */
4080
4081	while (bralink != NULL)
4082	{
4083	int oldlinkoffset;
4084	int offset = code - bralink + 1;
4085	uschar *bra = code - offset;
4086	oldlinkoffset = GET(bra, 1);
4087	bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4088	*code++ = OP_KET;
4089	PUTINC(code, 0, offset);
4090	PUT(bra, 1, offset);
4091	}
4092	}
4093
4094	/* If the maximum is unlimited, set a repeater in the final copy. We
4095	can't just offset backwards from the current code point, because we
4096	don't know if there's been an options resetting after the ket. The
4097	correct offset was computed above.
4098
4099	Then, when we are doing the actual compile phase, check to see whether
4100	this group is a non-atomic one that could match an empty string. If so,
4101	convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4102	that runtime checking can be done. [This check is also applied to
4103	atomic groups at runtime, but in a different way.] */
4104
4105	else
4106	{
4107	uschar *ketcode = code - ketoffset;
4108	uschar *bracode = ketcode - GET(ketcode, 1);
4109	*ketcode = OP_KETRMAX + repeat_type;
4110	if (lengthptr == NULL && *bracode != OP_ONCE)
4111	{
4112	uschar *scode = bracode;
4113	do
4114	{
4115	if (could_be_empty_branch(scode, ketcode, utf8))
4116	{
4117	*bracode += OP_SBRA - OP_BRA;
4118	break;
4119	}
4120	scode += GET(scode, 1);
4121	}
4122	while (*scode == OP_ALT);
4123	}
4124	}
4125	}
4126
4127	/* If previous is OP_FAIL, it was generated by an empty class [] in
4128	JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4129	by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4130	error above. We can just ignore the repeat in JS case. */
4131
4132	else if (*previous == OP_FAIL) goto END_REPEAT;
4133
4134	/* Else there's some kind of shambles */
4135
4136	else
4137	{
4138	*errorcodeptr = ERR11;
4139	goto FAILED;
4140	}
4141
4142	/* If the character following a repeat is '+', or if certain optimization
4143	tests above succeeded, possessive_quantifier is TRUE. For some of the
4144	simpler opcodes, there is a special alternative opcode for this. For
4145	anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4146	The '+' notation is just syntactic sugar, taken from Sun's Java package,
4147	but the special opcodes can optimize it a bit. The repeated item starts at
4148	tempcode, not at previous, which might be the first part of a string whose
4149	(former) last char we repeated.
4150
4151	Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4152	an 'upto' may follow. We skip over an 'exact' item, and then test the
4153	length of what remains before proceeding. */
4154
4155	if (possessive_quantifier)
4156	{
4157	int len;
4158	if (tempcode == OP_EXACT \|\| tempcode == OP_TYPEEXACT \|\|
4159	*tempcode == OP_NOTEXACT)
4160	tempcode += _pcre_OP_lengths[*tempcode] +
4161	((*tempcode == OP_TYPEEXACT &&
4162	(tempcode[3] == OP_PROP \|\| tempcode[3] == OP_NOTPROP))? 2:0);
4163	len = code - tempcode;
4164	if (len > 0) switch (*tempcode)
4165	{
4166	case OP_STAR: *tempcode = OP_POSSTAR; break;
4167	case OP_PLUS: *tempcode = OP_POSPLUS; break;
4168	case OP_QUERY: *tempcode = OP_POSQUERY; break;
4169	case OP_UPTO: *tempcode = OP_POSUPTO; break;
4170
4171	case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4172	case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4173	case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4174	case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4175
4176	case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4177	case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4178	case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4179	case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4180
4181	default:
4182	memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4183	code += 1 + LINK_SIZE;
4184	len += 1 + LINK_SIZE;
4185	tempcode[0] = OP_ONCE;
4186	*code++ = OP_KET;
4187	PUTINC(code, 0, len);
4188	PUT(tempcode, 1, len);
4189	break;
4190	}
4191	}
4192
4193	/* In all cases we no longer have a previous item. We also set the
4194	"follows varying string" flag for subsequently encountered reqbytes if
4195	it isn't already set and we have just passed a varying length item. */
4196
4197	END_REPEAT:
4198	previous = NULL;
4199	cd->req_varyopt \|= reqvary;
4200	break;
4201
4202
4203	/* ===================================================================*/
4204	/* Start of nested parenthesized sub-expression, or comment or lookahead or
4205	lookbehind or option setting or condition or all the other extended
4206	parenthesis forms. */
4207
4208	case '(':
4209	newoptions = options;
4210	skipbytes = 0;
4211	bravalue = OP_CBRA;
4212	save_hwm = cd->hwm;
4213	reset_bracount = FALSE;
4214
4215	/* First deal with various "verbs" that can be introduced by '*'. */
4216
4217	if ((++ptr) == '' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4218	{
4219	int i, namelen;
4220	const char *vn = verbnames;
4221	const uschar *name = ++ptr;
4222	previous = NULL;
4223	while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4224	if (*ptr == ':')
4225	{
4226	errorcodeptr = ERR59; / Not supported */
4227	goto FAILED;
4228	}
4229	if (*ptr != ')')
4230	{
4231	*errorcodeptr = ERR60;
4232	goto FAILED;
4233	}
4234	namelen = ptr - name;
4235	for (i = 0; i < verbcount; i++)
4236	{
4237	if (namelen == verbs[i].len &&
4238	strncmp((char *)name, vn, namelen) == 0)
4239	{
4240	*code = verbs[i].op;
4241	if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4242	break;
4243	}
4244	vn += verbs[i].len + 1;
4245	}
4246	if (i < verbcount) continue;
4247	*errorcodeptr = ERR60;
4248	goto FAILED;
4249	}
4250
4251	/* Deal with the extended parentheses; all are introduced by '?', and the
4252	appearance of any of them means that this is not a capturing group. */
4253
4254	else if (*ptr == '?')
4255	{
4256	int i, set, unset, namelen;
4257	int *optset;
4258	const uschar *name;
4259	uschar *slot;
4260
4261	switch (*(++ptr))
4262	{
4263	case '#': /* Comment; skip to ket */
4264	ptr++;
4265	while (ptr != 0 && ptr != ')') ptr++;
4266	if (*ptr == 0)
4267	{
4268	*errorcodeptr = ERR18;
4269	goto FAILED;
4270	}
4271	continue;
4272
4273
4274	/* ------------------------------------------------------------ */
4275	case '\|': /* Reset capture count for each branch */
4276	reset_bracount = TRUE;
4277	/* Fall through */
4278
4279	/* ------------------------------------------------------------ */
4280	case ':': /* Non-capturing bracket */
4281	bravalue = OP_BRA;
4282	ptr++;
4283	break;
4284
4285
4286	/* ------------------------------------------------------------ */
4287	case '(':
4288	bravalue = OP_COND; /* Conditional group */
4289
4290	/* A condition can be an assertion, a number (referring to a numbered
4291	group), a name (referring to a named group), or 'R', referring to
4292	recursion. R<digits> and R&name are also permitted for recursion tests.
4293
4294	There are several syntaxes for testing a named group: (?(name)) is used
4295	by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4296
4297	There are two unfortunate ambiguities, caused by history. (a) 'R' can
4298	be the recursive thing or the name 'R' (and similarly for 'R' followed
4299	by digits), and (b) a number could be a name that consists of digits.
4300	In both cases, we look for a name first; if not found, we try the other
4301	cases. */
4302
4303	/* For conditions that are assertions, check the syntax, and then exit
4304	the switch. This will take control down to where bracketed groups,
4305	including assertions, are processed. */
4306
4307	if (ptr[1] == '?' && (ptr[2] == '=' \|\| ptr[2] == '!' \|\| ptr[2] == '<'))
4308	break;
4309
4310	/* Most other conditions use OP_CREF (a couple change to OP_RREF
4311	below), and all need to skip 3 bytes at the start of the group. */
4312
4313	code[1+LINK_SIZE] = OP_CREF;
4314	skipbytes = 3;
4315	refsign = -1;
4316
4317	/* Check for a test for recursion in a named group. */
4318
4319	if (ptr[1] == 'R' && ptr[2] == '&')
4320	{
4321	terminator = -1;
4322	ptr += 2;
4323	code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4324	}
4325
4326	/* Check for a test for a named group's having been set, using the Perl
4327	syntax (?(<name>) or (?('name') */
4328
4329	else if (ptr[1] == '<')
4330	{
4331	terminator = '>';
4332	ptr++;
4333	}
4334	else if (ptr[1] == '\'')
4335	{
4336	terminator = '\'';
4337	ptr++;
4338	}
4339	else
4340	{
4341	terminator = 0;
4342	if (ptr[1] == '-' \|\| ptr[1] == '+') refsign = *(++ptr);
4343	}
4344
4345	/* We now expect to read a name; anything else is an error */
4346
4347	if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4348	{
4349	ptr += 1; /* To get the right offset */
4350	*errorcodeptr = ERR28;
4351	goto FAILED;
4352	}
4353
4354	/* Read the name, but also get it as a number if it's all digits */
4355
4356	recno = 0;
4357	name = ++ptr;
4358	while ((cd->ctypes[*ptr] & ctype_word) != 0)
4359	{
4360	if (recno >= 0)
4361	recno = ((digitab[*ptr] & ctype_digit) != 0)?
4362	recno * 10 + *ptr - '0' : -1;
4363	ptr++;
4364	}
4365	namelen = ptr - name;
4366
4367	if ((terminator > 0 && ptr++ != terminator) \|\| ptr++ != ')')
4368	{
4369	ptr--; /* Error offset */
4370	*errorcodeptr = ERR26;
4371	goto FAILED;
4372	}
4373
4374	/* Do no further checking in the pre-compile phase. */
4375
4376	if (lengthptr != NULL) break;
4377
4378	/* In the real compile we do the work of looking for the actual
4379	reference. If the string started with "+" or "-" we require the rest to
4380	be digits, in which case recno will be set. */
4381
4382	if (refsign > 0)
4383	{
4384	if (recno <= 0)
4385	{
4386	*errorcodeptr = ERR58;
4387	goto FAILED;
4388	}
4389	recno = (refsign == '-')?
4390	cd->bracount - recno + 1 : recno +cd->bracount;
4391	if (recno <= 0 \|\| recno > cd->final_bracount)
4392	{
4393	*errorcodeptr = ERR15;
4394	goto FAILED;
4395	}
4396	PUT2(code, 2+LINK_SIZE, recno);
4397	break;
4398	}
4399
4400	/* Otherwise (did not start with "+" or "-"), start by looking for the
4401	name. */
4402
4403	slot = cd->name_table;
4404	for (i = 0; i < cd->names_found; i++)
4405	{
4406	if (strncmp((char )name, (char )slot+2, namelen) == 0) break;
4407	slot += cd->name_entry_size;
4408	}
4409
4410	/* Found a previous named subpattern */
4411
4412	if (i < cd->names_found)
4413	{
4414	recno = GET2(slot, 0);
4415	PUT2(code, 2+LINK_SIZE, recno);
4416	}
4417
4418	/* Search the pattern for a forward reference */
4419
4420	else if ((i = find_parens(ptr, cd, name, namelen,
4421	(options & PCRE_EXTENDED) != 0)) > 0)
4422	{
4423	PUT2(code, 2+LINK_SIZE, i);
4424	}
4425
4426	/* If terminator == 0 it means that the name followed directly after
4427	the opening parenthesis [e.g. (?(abc)...] and in this case there are
4428	some further alternatives to try. For the cases where terminator != 0
4429	[things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4430	now checked all the possibilities, so give an error. */
4431
4432	else if (terminator != 0)
4433	{
4434	*errorcodeptr = ERR15;
4435	goto FAILED;
4436	}
4437
4438	/* Check for (?(R) for recursion. Allow digits after R to specify a
4439	specific group number. */
4440
4441	else if (*name == 'R')
4442	{
4443	recno = 0;
4444	for (i = 1; i < namelen; i++)
4445	{
4446	if ((digitab[name[i]] & ctype_digit) == 0)
4447	{
4448	*errorcodeptr = ERR15;
4449	goto FAILED;
4450	}
4451	recno = recno * 10 + name[i] - '0';
4452	}
4453	if (recno == 0) recno = RREF_ANY;
4454	code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4455	PUT2(code, 2+LINK_SIZE, recno);
4456	}
4457
4458	/* Similarly, check for the (?(DEFINE) "condition", which is always
4459	false. */
4460
4461	else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4462	{
4463	code[1+LINK_SIZE] = OP_DEF;
4464	skipbytes = 1;
4465	}
4466
4467	/* Check for the "name" actually being a subpattern number. We are
4468	in the second pass here, so final_bracount is set. */
4469
4470	else if (recno > 0 && recno <= cd->final_bracount)
4471	{
4472	PUT2(code, 2+LINK_SIZE, recno);
4473	}
4474
4475	/* Either an unidentified subpattern, or a reference to (?(0) */
4476
4477	else
4478	{
4479	*errorcodeptr = (recno == 0)? ERR35: ERR15;
4480	goto FAILED;
4481	}
4482	break;
4483
4484
4485	/* ------------------------------------------------------------ */
4486	case '=': /* Positive lookahead */
4487	bravalue = OP_ASSERT;
4488	ptr++;
4489	break;
4490
4491
4492	/* ------------------------------------------------------------ */
4493	case '!': /* Negative lookahead */
4494	ptr++;
4495	if (ptr == ')') / Optimize (?!) */
4496	{
4497	*code++ = OP_FAIL;
4498	previous = NULL;
4499	continue;
4500	}
4501	bravalue = OP_ASSERT_NOT;
4502	break;
4503
4504
4505	/* ------------------------------------------------------------ */
4506	case '<': /* Lookbehind or named define */
4507	switch (ptr[1])
4508	{
4509	case '=': /* Positive lookbehind */
4510	bravalue = OP_ASSERTBACK;
4511	ptr += 2;
4512	break;
4513
4514	case '!': /* Negative lookbehind */
4515	bravalue = OP_ASSERTBACK_NOT;
4516	ptr += 2;
4517	break;
4518
4519	default: /* Could be name define, else bad */
4520	if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4521	ptr++; /* Correct offset for error */
4522	*errorcodeptr = ERR24;
4523	goto FAILED;
4524	}
4525	break;
4526
4527
4528	/* ------------------------------------------------------------ */
4529	case '>': /* One-time brackets */
4530	bravalue = OP_ONCE;
4531	ptr++;
4532	break;
4533
4534
4535	/* ------------------------------------------------------------ */
4536	case 'C': /* Callout - may be followed by digits; */
4537	previous_callout = code; /* Save for later completion */
4538	after_manual_callout = 1; /* Skip one item before completing */
4539	*code++ = OP_CALLOUT;
4540	{
4541	int n = 0;
4542	while ((digitab[*(++ptr)] & ctype_digit) != 0)
4543	n = n * 10 + *ptr - '0';
4544	if (*ptr != ')')
4545	{
4546	*errorcodeptr = ERR39;
4547	goto FAILED;
4548	}
4549	if (n > 255)
4550	{
4551	*errorcodeptr = ERR38;
4552	goto FAILED;
4553	}
4554	*code++ = n;
4555	PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4556	PUT(code, LINK_SIZE, 0); /* Default length */
4557	code += 2 * LINK_SIZE;
4558	}
4559	previous = NULL;
4560	continue;
4561
4562
4563	/* ------------------------------------------------------------ */
4564	case 'P': /* Python-style named subpattern handling */
4565	if ((++ptr) == '=' \|\| ptr == '>') /* Reference or recursion */
4566	{
4567	is_recurse = *ptr == '>';
4568	terminator = ')';
4569	goto NAMED_REF_OR_RECURSE;
4570	}
4571	else if (ptr != '<') / Test for Python-style definition */
4572	{
4573	*errorcodeptr = ERR41;
4574	goto FAILED;
4575	}
4576	/* Fall through to handle (?P< as (?< is handled */
4577
4578
4579	/* ------------------------------------------------------------ */
4580	DEFINE_NAME: /* Come here from (?< handling */
4581	case '\'':
4582	{
4583	terminator = (*ptr == '<')? '>' : '\'';
4584	name = ++ptr;
4585
4586	while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4587	namelen = ptr - name;
4588
4589	/* In the pre-compile phase, just do a syntax check. */
4590
4591	if (lengthptr != NULL)
4592	{
4593	if (*ptr != terminator)
4594	{
4595	*errorcodeptr = ERR42;
4596	goto FAILED;
4597	}
4598	if (cd->names_found >= MAX_NAME_COUNT)
4599	{
4600	*errorcodeptr = ERR49;
4601	goto FAILED;
4602	}
4603	if (namelen + 3 > cd->name_entry_size)
4604	{
4605	cd->name_entry_size = namelen + 3;
4606	if (namelen > MAX_NAME_SIZE)
4607	{
4608	*errorcodeptr = ERR48;
4609	goto FAILED;
4610	}
4611	}
4612	}
4613
4614	/* In the real compile, create the entry in the table */
4615
4616	else
4617	{
4618	slot = cd->name_table;
4619	for (i = 0; i < cd->names_found; i++)
4620	{
4621	int crc = memcmp(name, slot+2, namelen);
4622	if (crc == 0)
4623	{
4624	if (slot[2+namelen] == 0)
4625	{
4626	if ((options & PCRE_DUPNAMES) == 0)
4627	{
4628	*errorcodeptr = ERR43;
4629	goto FAILED;
4630	}
4631	}
4632	else crc = -1; /* Current name is substring */
4633	}
4634	if (crc < 0)
4635	{
4636	memmove(slot + cd->name_entry_size, slot,
4637	(cd->names_found - i) * cd->name_entry_size);
4638	break;
4639	}
4640	slot += cd->name_entry_size;
4641	}
4642
4643	PUT2(slot, 0, cd->bracount + 1);
4644	memcpy(slot + 2, name, namelen);
4645	slot[2+namelen] = 0;
4646	}
4647	}
4648
4649	/* In both cases, count the number of names we've encountered. */
4650
4651	ptr++; /* Move past > or ' */
4652	cd->names_found++;
4653	goto NUMBERED_GROUP;
4654
4655
4656	/* ------------------------------------------------------------ */
4657	case '&': /* Perl recursion/subroutine syntax */
4658	terminator = ')';
4659	is_recurse = TRUE;
4660	/* Fall through */
4661
4662	/* We come here from the Python syntax above that handles both
4663	references (?P=name) and recursion (?P>name), as well as falling
4664	through from the Perl recursion syntax (?&name). We also come here from
4665	the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4666	.NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4667
4668	NAMED_REF_OR_RECURSE:
4669	name = ++ptr;
4670	while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4671	namelen = ptr - name;
4672
4673	/* In the pre-compile phase, do a syntax check and set a dummy
4674	reference number. */
4675
4676	if (lengthptr != NULL)
4677	{
4678	if (namelen == 0)
4679	{
4680	*errorcodeptr = ERR62;
4681	goto FAILED;
4682	}
4683	if (*ptr != terminator)
4684	{
4685	*errorcodeptr = ERR42;
4686	goto FAILED;
4687	}
4688	if (namelen > MAX_NAME_SIZE)
4689	{
4690	*errorcodeptr = ERR48;
4691	goto FAILED;
4692	}
4693	recno = 0;
4694	}
4695
4696	/* In the real compile, seek the name in the table. We check the name
4697	first, and then check that we have reached the end of the name in the
4698	table. That way, if the name is longer than any in the table,
4699	the comparison will fail without reading beyond the table entry. */
4700
4701	else
4702	{
4703	slot = cd->name_table;
4704	for (i = 0; i < cd->names_found; i++)
4705	{
4706	if (strncmp((char )name, (char )slot+2, namelen) == 0 &&
4707	slot[2+namelen] == 0)
4708	break;
4709	slot += cd->name_entry_size;
4710	}
4711
4712	if (i < cd->names_found) /* Back reference */
4713	{
4714	recno = GET2(slot, 0);
4715	}
4716	else if ((recno = /* Forward back reference */
4717	find_parens(ptr, cd, name, namelen,
4718	(options & PCRE_EXTENDED) != 0)) <= 0)
4719	{
4720	*errorcodeptr = ERR15;
4721	goto FAILED;
4722	}
4723	}
4724
4725	/* In both phases, we can now go to the code that handles numerical
4726	recursion or backreferences. */
4727
4728	if (is_recurse) goto HANDLE_RECURSION;
4729	else goto HANDLE_REFERENCE;
4730
4731
4732	/* ------------------------------------------------------------ */
4733	case 'R': /* Recursion */
4734	ptr++; /* Same as (?0) */
4735	/* Fall through */
4736
4737
4738	/* ------------------------------------------------------------ */
4739	case '-': case '+':
4740	case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4741	case '5': case '6': case '7': case '8': case '9': /* subroutine */
4742	{
4743	const uschar *called;
4744	terminator = ')';
4745
4746	/* Come here from the \g<...> and \g'...' code (Oniguruma
4747	compatibility). However, the syntax has been checked to ensure that
4748	the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4749	be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4750	ever be taken. */
4751
4752	HANDLE_NUMERICAL_RECURSION:
4753
4754	if ((refsign = *ptr) == '+')
4755	{
4756	ptr++;
4757	if ((digitab[*ptr] & ctype_digit) == 0)
4758	{
4759	*errorcodeptr = ERR63;
4760	goto FAILED;
4761	}
4762	}
4763	else if (refsign == '-')
4764	{
4765	if ((digitab[ptr[1]] & ctype_digit) == 0)
4766	goto OTHER_CHAR_AFTER_QUERY;
4767	ptr++;
4768	}
4769
4770	recno = 0;
4771	while((digitab[*ptr] & ctype_digit) != 0)
4772	recno = recno * 10 + *ptr++ - '0';
4773
4774	if (*ptr != terminator)
4775	{
4776	*errorcodeptr = ERR29;
4777	goto FAILED;
4778	}
4779
4780	if (refsign == '-')
4781	{
4782	if (recno == 0)
4783	{
4784	*errorcodeptr = ERR58;
4785	goto FAILED;
4786	}
4787	recno = cd->bracount - recno + 1;
4788	if (recno <= 0)
4789	{
4790	*errorcodeptr = ERR15;
4791	goto FAILED;
4792	}
4793	}
4794	else if (refsign == '+')
4795	{
4796	if (recno == 0)
4797	{
4798	*errorcodeptr = ERR58;
4799	goto FAILED;
4800	}
4801	recno += cd->bracount;
4802	}
4803
4804	/* Come here from code above that handles a named recursion */
4805
4806	HANDLE_RECURSION:
4807
4808	previous = code;
4809	called = cd->start_code;
4810
4811	/* When we are actually compiling, find the bracket that is being
4812	referenced. Temporarily end the regex in case it doesn't exist before
4813	this point. If we end up with a forward reference, first check that
4814	the bracket does occur later so we can give the error (and position)
4815	now. Then remember this forward reference in the workspace so it can
4816	be filled in at the end. */
4817
4818	if (lengthptr == NULL)
4819	{
4820	*code = OP_END;
4821	if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4822
4823	/* Forward reference */
4824
4825	if (called == NULL)
4826	{
4827	if (find_parens(ptr, cd, NULL, recno,
4828	(options & PCRE_EXTENDED) != 0) < 0)
4829	{
4830	*errorcodeptr = ERR15;
4831	goto FAILED;
4832	}
4833	called = cd->start_code + recno;
4834	PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4835	}
4836
4837	/* If not a forward reference, and the subpattern is still open,
4838	this is a recursive call. We check to see if this is a left
4839	recursion that could loop for ever, and diagnose that case. */
4840
4841	else if (GET(called, 1) == 0 &&
4842	could_be_empty(called, code, bcptr, utf8))
4843	{
4844	*errorcodeptr = ERR40;
4845	goto FAILED;
4846	}
4847	}
4848
4849	/* Insert the recursion/subroutine item, automatically wrapped inside
4850	"once" brackets. Set up a "previous group" length so that a
4851	subsequent quantifier will work. */
4852
4853	*code = OP_ONCE;
4854	PUT(code, 1, 2 + 2*LINK_SIZE);
4855	code += 1 + LINK_SIZE;
4856
4857	*code = OP_RECURSE;
4858	PUT(code, 1, called - cd->start_code);
4859	code += 1 + LINK_SIZE;
4860
4861	*code = OP_KET;
4862	PUT(code, 1, 2 + 2*LINK_SIZE);
4863	code += 1 + LINK_SIZE;
4864
4865	length_prevgroup = 3 + 3*LINK_SIZE;
4866	}
4867
4868	/* Can't determine a first byte now */
4869
4870	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4871	continue;
4872
4873
4874	/* ------------------------------------------------------------ */
4875	default: /* Other characters: check option setting */
4876	OTHER_CHAR_AFTER_QUERY:
4877	set = unset = 0;
4878	optset = &set;
4879
4880	while (ptr != ')' && ptr != ':')
4881	{
4882	switch (*ptr++)
4883	{
4884	case '-': optset = &unset; break;
4885
4886	case 'J': /* Record that it changed in the external options */
4887	*optset \|= PCRE_DUPNAMES;
4888	cd->external_flags \|= PCRE_JCHANGED;
4889	break;
4890
4891	case 'i': *optset \|= PCRE_CASELESS; break;
4892	case 'm': *optset \|= PCRE_MULTILINE; break;
4893	case 's': *optset \|= PCRE_DOTALL; break;
4894	case 'x': *optset \|= PCRE_EXTENDED; break;
4895	case 'U': *optset \|= PCRE_UNGREEDY; break;
4896	case 'X': *optset \|= PCRE_EXTRA; break;
4897
4898	default: *errorcodeptr = ERR12;
4899	ptr--; /* Correct the offset */
4900	goto FAILED;
4901	}
4902	}
4903
4904	/* Set up the changed option bits, but don't change anything yet. */
4905
4906	newoptions = (options \| set) & (~unset);
4907
4908	/* If the options ended with ')' this is not the start of a nested
4909	group with option changes, so the options change at this level. If this
4910	item is right at the start of the pattern, the options can be
4911	abstracted and made external in the pre-compile phase, and ignored in
4912	the compile phase. This can be helpful when matching -- for instance in
4913	caseless checking of required bytes.
4914
4915	If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4916	definitely not at the start of the pattern because something has been
4917	compiled. In the pre-compile phase, however, the code pointer can have
4918	that value after the start, because it gets reset as code is discarded
4919	during the pre-compile. However, this can happen only at top level - if
4920	we are within parentheses, the starting BRA will still be present. At
4921	any parenthesis level, the length value can be used to test if anything
4922	has been compiled at that level. Thus, a test for both these conditions
4923	is necessary to ensure we correctly detect the start of the pattern in
4924	both phases.
4925
4926	If we are not at the pattern start, compile code to change the ims
4927	options if this setting actually changes any of them, and reset the
4928	greedy defaults and the case value for firstbyte and reqbyte. */
4929
4930	if (*ptr == ')')
4931	{
4932	if (code == cd->start_code + 1 + LINK_SIZE &&
4933	(lengthptr == NULL \|\| lengthptr == 2 + 2LINK_SIZE))
4934	{
4935	cd->external_options = newoptions;
4936	}
4937	else
4938	{
4939	if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4940	{
4941	*code++ = OP_OPT;
4942	*code++ = newoptions & PCRE_IMS;
4943	}
4944	greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4945	greedy_non_default = greedy_default ^ 1;
4946	req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4947	}
4948
4949	/* Change options at this level, and pass them back for use
4950	in subsequent branches. When not at the start of the pattern, this
4951	information is also necessary so that a resetting item can be
4952	compiled at the end of a group (if we are in a group). */
4953
4954	*optionsptr = options = newoptions;
4955	previous = NULL; /* This item can't be repeated */
4956	continue; /* It is complete */
4957	}
4958
4959	/* If the options ended with ':' we are heading into a nested group
4960	with possible change of options. Such groups are non-capturing and are
4961	not assertions of any kind. All we need to do is skip over the ':';
4962	the newoptions value is handled below. */
4963
4964	bravalue = OP_BRA;
4965	ptr++;
4966	} /* End of switch for character following (? */
4967	} /* End of (? handling */
4968
4969	/* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4970	all unadorned brackets become non-capturing and behave like (?:...)
4971	brackets. */
4972
4973	else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4974	{
4975	bravalue = OP_BRA;
4976	}
4977
4978	/* Else we have a capturing group. */
4979
4980	else
4981	{
4982	NUMBERED_GROUP:
4983	cd->bracount += 1;
4984	PUT2(code, 1+LINK_SIZE, cd->bracount);
4985	skipbytes = 2;
4986	}
4987
4988	/* Process nested bracketed regex. Assertions may not be repeated, but
4989	other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4990	non-register variable in order to be able to pass its address because some
4991	compilers complain otherwise. Pass in a new setting for the ims options if
4992	they have changed. */
4993
4994	previous = (bravalue >= OP_ONCE)? code : NULL;
4995	*code = bravalue;
4996	tempcode = code;
4997	tempreqvary = cd->req_varyopt; /* Save value before bracket */
4998	length_prevgroup = 0; /* Initialize for pre-compile phase */
4999
5000	if (!compile_regex(
5001	newoptions, /* The complete new option state */
5002	options & PCRE_IMS, /* The previous ims option state */
5003	&tempcode, /* Where to put code (updated) */
5004	&ptr, /* Input pointer (updated) */
5005	errorcodeptr, /* Where to put an error message */
5006	(bravalue == OP_ASSERTBACK \|\|
5007	bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5008	reset_bracount, /* True if (?\| group */
5009	skipbytes, /* Skip over bracket number */
5010	&subfirstbyte, /* For possible first char */
5011	&subreqbyte, /* For possible last char */
5012	bcptr, /* Current branch chain */
5013	cd, /* Tables block */
5014	(lengthptr == NULL)? NULL : /* Actual compile phase */
5015	&length_prevgroup /* Pre-compile phase */
5016	))
5017	goto FAILED;
5018
5019	/* At the end of compiling, code is still pointing to the start of the
5020	group, while tempcode has been updated to point past the end of the group
5021	and any option resetting that may follow it. The pattern pointer (ptr)
5022	is on the bracket. */
5023
5024	/* If this is a conditional bracket, check that there are no more than
5025	two branches in the group, or just one if it's a DEFINE group. We do this
5026	in the real compile phase, not in the pre-pass, where the whole group may
5027	not be available. */
5028
5029	if (bravalue == OP_COND && lengthptr == NULL)
5030	{
5031	uschar *tc = code;
5032	int condcount = 0;
5033
5034	do {
5035	condcount++;
5036	tc += GET(tc,1);
5037	}
5038	while (*tc != OP_KET);
5039
5040	/* A DEFINE group is never obeyed inline (the "condition" is always
5041	false). It must have only one branch. */
5042
5043	if (code[LINK_SIZE+1] == OP_DEF)
5044	{
5045	if (condcount > 1)
5046	{
5047	*errorcodeptr = ERR54;
5048	goto FAILED;
5049	}
5050	bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5051	}
5052
5053	/* A "normal" conditional group. If there is just one branch, we must not
5054	make use of its firstbyte or reqbyte, because this is equivalent to an
5055	empty second branch. */
5056
5057	else
5058	{
5059	if (condcount > 2)
5060	{
5061	*errorcodeptr = ERR27;
5062	goto FAILED;
5063	}
5064	if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5065	}
5066	}
5067
5068	/* Error if hit end of pattern */
5069
5070	if (*ptr != ')')
5071	{
5072	*errorcodeptr = ERR14;
5073	goto FAILED;
5074	}
5075
5076	/* In the pre-compile phase, update the length by the length of the group,
5077	less the brackets at either end. Then reduce the compiled code to just a
5078	set of non-capturing brackets so that it doesn't use much memory if it is
5079	duplicated by a quantifier.*/
5080
5081	if (lengthptr != NULL)
5082	{
5083	if (OFLOW_MAX - lengthptr < length_prevgroup - 2 - 2LINK_SIZE)
5084	{
5085	*errorcodeptr = ERR20;
5086	goto FAILED;
5087	}
5088	lengthptr += length_prevgroup - 2 - 2LINK_SIZE;
5089	*code++ = OP_BRA;
5090	PUTINC(code, 0, 1 + LINK_SIZE);
5091	*code++ = OP_KET;
5092	PUTINC(code, 0, 1 + LINK_SIZE);
5093	break; /* No need to waste time with special character handling */
5094	}
5095
5096	/* Otherwise update the main code pointer to the end of the group. */
5097
5098	code = tempcode;
5099
5100	/* For a DEFINE group, required and first character settings are not
5101	relevant. */
5102
5103	if (bravalue == OP_DEF) break;
5104
5105	/* Handle updating of the required and first characters for other types of
5106	group. Update for normal brackets of all kinds, and conditions with two
5107	branches (see code above). If the bracket is followed by a quantifier with
5108	zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5109	zerofirstbyte outside the main loop so that they can be accessed for the
5110	back off. */
5111
5112	zeroreqbyte = reqbyte;
5113	zerofirstbyte = firstbyte;
5114	groupsetfirstbyte = FALSE;
5115
5116	if (bravalue >= OP_ONCE)
5117	{
5118	/* If we have not yet set a firstbyte in this branch, take it from the
5119	subpattern, remembering that it was set here so that a repeat of more
5120	than one can replicate it as reqbyte if necessary. If the subpattern has
5121	no firstbyte, set "none" for the whole branch. In both cases, a zero
5122	repeat forces firstbyte to "none". */
5123
5124	if (firstbyte == REQ_UNSET)
5125	{
5126	if (subfirstbyte >= 0)
5127	{
5128	firstbyte = subfirstbyte;
5129	groupsetfirstbyte = TRUE;
5130	}
5131	else firstbyte = REQ_NONE;
5132	zerofirstbyte = REQ_NONE;
5133	}
5134
5135	/* If firstbyte was previously set, convert the subpattern's firstbyte
5136	into reqbyte if there wasn't one, using the vary flag that was in
5137	existence beforehand. */
5138
5139	else if (subfirstbyte >= 0 && subreqbyte < 0)
5140	subreqbyte = subfirstbyte \| tempreqvary;
5141
5142	/* If the subpattern set a required byte (or set a first byte that isn't
5143	really the first byte - see above), set it. */
5144
5145	if (subreqbyte >= 0) reqbyte = subreqbyte;
5146	}
5147
5148	/* For a forward assertion, we take the reqbyte, if set. This can be
5149	helpful if the pattern that follows the assertion doesn't set a different
5150	char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5151	for an assertion, however because it leads to incorrect effect for patterns
5152	such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5153	of a firstbyte. This is overcome by a scan at the end if there's no
5154	firstbyte, looking for an asserted first char. */
5155
5156	else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5157	break; /* End of processing '(' */
5158
5159
5160	/* ===================================================================*/
5161	/* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5162	are arranged to be the negation of the corresponding OP_values. For the
5163	back references, the values are ESC_REF plus the reference number. Only
5164	back references and those types that consume a character may be repeated.
5165	We can test for values between ESC_b and ESC_Z for the latter; this may
5166	have to change if any new ones are ever created. */
5167
5168	case '\\':
5169	tempptr = ptr;
5170	c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5171	if (*errorcodeptr != 0) goto FAILED;
5172
5173	if (c < 0)
5174	{
5175	if (-c == ESC_Q) /* Handle start of quoted string */
5176	{
5177	if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5178	else inescq = TRUE;
5179	continue;
5180	}
5181
5182	if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5183
5184	/* For metasequences that actually match a character, we disable the
5185	setting of a first character if it hasn't already been set. */
5186
5187	if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5188	firstbyte = REQ_NONE;
5189
5190	/* Set values to reset to if this is followed by a zero repeat. */
5191
5192	zerofirstbyte = firstbyte;
5193	zeroreqbyte = reqbyte;
5194
5195	/* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5196	is a subroutine call by number (Oniguruma syntax). In fact, the value
5197	-ESC_g is returned only for these cases. So we don't need to check for <
5198	or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5199	-ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5200	that is a synonym for a named back reference). */
5201
5202	if (-c == ESC_g)
5203	{
5204	const uschar *p;
5205	save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5206	terminator = (*(++ptr) == '<')? '>' : '\'';
5207
5208	/* These two statements stop the compiler for warning about possibly
5209	unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5210	fact, because we actually check for a number below, the paths that
5211	would actually be in error are never taken. */
5212
5213	skipbytes = 0;
5214	reset_bracount = FALSE;
5215
5216	/* Test for a name */
5217
5218	if (ptr[1] != '+' && ptr[1] != '-')
5219	{
5220	BOOL isnumber = TRUE;
5221	for (p = ptr + 1; p != 0 && p != terminator; p++)
5222	{
5223	if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5224	if ((cd->ctypes[*p] & ctype_word) == 0) break;
5225	}
5226	if (*p != terminator)
5227	{
5228	*errorcodeptr = ERR57;
5229	break;
5230	}
5231	if (isnumber)
5232	{
5233	ptr++;
5234	goto HANDLE_NUMERICAL_RECURSION;
5235	}
5236	is_recurse = TRUE;
5237	goto NAMED_REF_OR_RECURSE;
5238	}
5239
5240	/* Test a signed number in angle brackets or quotes. */
5241
5242	p = ptr + 2;
5243	while ((digitab[*p] & ctype_digit) != 0) p++;
5244	if (*p != terminator)
5245	{
5246	*errorcodeptr = ERR57;
5247	break;
5248	}
5249	ptr++;
5250	goto HANDLE_NUMERICAL_RECURSION;
5251	}
5252
5253	/* \k<name> or \k'name' is a back reference by name (Perl syntax).
5254	We also support \k{name} (.NET syntax) */
5255
5256	if (-c == ESC_k && (ptr[1] == '<' \|\| ptr[1] == '\'' \|\| ptr[1] == '{'))
5257	{
5258	is_recurse = FALSE;
5259	terminator = ((++ptr) == '<')? '>' : (ptr == '\'')? '\'' : '}';
5260	goto NAMED_REF_OR_RECURSE;
5261	}
5262
5263	/* Back references are handled specially; must disable firstbyte if
5264	not set to cope with cases like (?=(\w+))\1: which would otherwise set
5265	':' later. */
5266
5267	if (-c >= ESC_REF)
5268	{
5269	recno = -c - ESC_REF;
5270
5271	HANDLE_REFERENCE: /* Come here from named backref handling */
5272	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5273	previous = code;
5274	*code++ = OP_REF;
5275	PUT2INC(code, 0, recno);
5276	cd->backref_map \|= (recno < 32)? (1 << recno) : 1;
5277	if (recno > cd->top_backref) cd->top_backref = recno;
5278	}
5279
5280	/* So are Unicode property matches, if supported. */
5281
5282	#ifdef SUPPORT_UCP
5283	else if (-c == ESC_P \|\| -c == ESC_p)
5284	{
5285	BOOL negated;
5286	int pdata;
5287	int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5288	if (ptype < 0) goto FAILED;
5289	previous = code;
5290	*code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5291	*code++ = ptype;
5292	*code++ = pdata;
5293	}
5294	#else
5295
5296	/* If Unicode properties are not supported, \X, \P, and \p are not
5297	allowed. */
5298
5299	else if (-c == ESC_X \|\| -c == ESC_P \|\| -c == ESC_p)
5300	{
5301	*errorcodeptr = ERR45;
5302	goto FAILED;
5303	}
5304	#endif
5305
5306	/* For the rest (including \X when Unicode properties are supported), we
5307	can obtain the OP value by negating the escape value. */
5308
5309	else
5310	{
5311	previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5312	*code++ = -c;
5313	}
5314	continue;
5315	}
5316
5317	/* We have a data character whose value is in c. In UTF-8 mode it may have
5318	a value > 127. We set its representation in the length/buffer, and then
5319	handle it as a data character. */
5320
5321	#ifdef SUPPORT_UTF8
5322	if (utf8 && c > 127)
5323	mclength = _pcre_ord2utf8(c, mcbuffer);
5324	else
5325	#endif
5326
5327	{
5328	mcbuffer[0] = c;
5329	mclength = 1;
5330	}
5331	goto ONE_CHAR;
5332
5333
5334	/* ===================================================================*/
5335	/* Handle a literal character. It is guaranteed not to be whitespace or #
5336	when the extended flag is set. If we are in UTF-8 mode, it may be a
5337	multi-byte literal character. */
5338
5339	default:
5340	NORMAL_CHAR:
5341	mclength = 1;
5342	mcbuffer[0] = c;
5343
5344	#ifdef SUPPORT_UTF8
5345	if (utf8 && c >= 0xc0)
5346	{
5347	while ((ptr[1] & 0xc0) == 0x80)
5348	mcbuffer[mclength++] = *(++ptr);
5349	}
5350	#endif
5351
5352	/* At this point we have the character's bytes in mcbuffer, and the length
5353	in mclength. When not in UTF-8 mode, the length is always 1. */
5354
5355	ONE_CHAR:
5356	previous = code;
5357	*code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5358	for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5359
5360	/* Remember if \r or \n were seen */
5361
5362	if (mcbuffer[0] == '\r' \|\| mcbuffer[0] == '\n')
5363	cd->external_flags \|= PCRE_HASCRORLF;
5364
5365	/* Set the first and required bytes appropriately. If no previous first
5366	byte, set it from this character, but revert to none on a zero repeat.
5367	Otherwise, leave the firstbyte value alone, and don't change it on a zero
5368	repeat. */
5369
5370	if (firstbyte == REQ_UNSET)
5371	{
5372	zerofirstbyte = REQ_NONE;
5373	zeroreqbyte = reqbyte;
5374
5375	/* If the character is more than one byte long, we can set firstbyte
5376	only if it is not to be matched caselessly. */
5377
5378	if (mclength == 1 \|\| req_caseopt == 0)
5379	{
5380	firstbyte = mcbuffer[0] \| req_caseopt;
5381	if (mclength != 1) reqbyte = code[-1] \| cd->req_varyopt;
5382	}
5383	else firstbyte = reqbyte = REQ_NONE;
5384	}
5385
5386	/* firstbyte was previously set; we can set reqbyte only the length is
5387	1 or the matching is caseful. */
5388
5389	else
5390	{
5391	zerofirstbyte = firstbyte;
5392	zeroreqbyte = reqbyte;
5393	if (mclength == 1 \|\| req_caseopt == 0)
5394	reqbyte = code[-1] \| req_caseopt \| cd->req_varyopt;
5395	}
5396
5397	break; /* End of literal character handling */
5398	}
5399	} /* end of big loop */
5400
5401
5402	/* Control never reaches here by falling through, only by a goto for all the
5403	error states. Pass back the position in the pattern so that it can be displayed
5404	to the user for diagnosing the error. */
5405
5406	FAILED:
5407	*ptrptr = ptr;
5408	return FALSE;
5409	}
5410
5411
5412
5413
5414	/*************************************************
5415	* Compile sequence of alternatives *
5416	*************************************************/
5417
5418	/* On entry, ptr is pointing past the bracket character, but on return it
5419	points to the closing bracket, or vertical bar, or end of string. The code
5420	variable is pointing at the byte into which the BRA operator has been stored.
5421	If the ims options are changed at the start (for a (?ims: group) or during any
5422	branch, we need to insert an OP_OPT item at the start of every following branch
5423	to ensure they get set correctly at run time, and also pass the new options
5424	into every subsequent branch compile.
5425
5426	This function is used during the pre-compile phase when we are trying to find
5427	out the amount of memory needed, as well as during the real compile phase. The
5428	value of lengthptr distinguishes the two phases.
5429
5430	Arguments:
5431	options option bits, including any changes for this subpattern
5432	oldims previous settings of ims option bits
5433	codeptr -> the address of the current code pointer
5434	ptrptr -> the address of the current pattern pointer
5435	errorcodeptr -> pointer to error code variable
5436	lookbehind TRUE if this is a lookbehind assertion
5437	reset_bracount TRUE to reset the count for each branch
5438	skipbytes skip this many bytes at start (for brackets and OP_COND)
5439	firstbyteptr place to put the first required character, or a negative number
5440	reqbyteptr place to put the last required character, or a negative number
5441	bcptr pointer to the chain of currently open branches
5442	cd points to the data block with tables pointers etc.
5443	lengthptr NULL during the real compile phase
5444	points to length accumulator during pre-compile phase
5445
5446	Returns: TRUE on success
5447	*/
5448
5449	static BOOL
5450	compile_regex(int options, int oldims, uschar codeptr, const uschar ptrptr,
5451	int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5452	int firstbyteptr, int reqbyteptr, branch_chain bcptr, compile_data cd,
5453	int *lengthptr)
5454	{
5455	const uschar ptr = ptrptr;
5456	uschar code = codeptr;
5457	uschar *last_branch = code;
5458	uschar *start_bracket = code;
5459	uschar *reverse_count = NULL;
5460	int firstbyte, reqbyte;
5461	int branchfirstbyte, branchreqbyte;
5462	int length;
5463	int orig_bracount;
5464	int max_bracount;
5465	branch_chain bc;
5466
5467	bc.outer = bcptr;
5468	bc.current = code;
5469
5470	firstbyte = reqbyte = REQ_UNSET;
5471
5472	/* Accumulate the length for use in the pre-compile phase. Start with the
5473	length of the BRA and KET and any extra bytes that are required at the
5474	beginning. We accumulate in a local variable to save frequent testing of
5475	lenthptr for NULL. We cannot do this by looking at the value of code at the
5476	start and end of each alternative, because compiled items are discarded during
5477	the pre-compile phase so that the work space is not exceeded. */
5478
5479	length = 2 + 2*LINK_SIZE + skipbytes;
5480
5481	/* WARNING: If the above line is changed for any reason, you must also change
5482	the code that abstracts option settings at the start of the pattern and makes
5483	them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5484	pre-compile phase to find out whether anything has yet been compiled or not. */
5485
5486	/* Offset is set zero to mark that this bracket is still open */
5487
5488	PUT(code, 1, 0);
5489	code += 1 + LINK_SIZE + skipbytes;
5490
5491	/* Loop for each alternative branch */
5492
5493	orig_bracount = max_bracount = cd->bracount;
5494	for (;;)
5495	{
5496	/* For a (?\| group, reset the capturing bracket count so that each branch
5497	uses the same numbers. */
5498
5499	if (reset_bracount) cd->bracount = orig_bracount;
5500
5501	/* Handle a change of ims options at the start of the branch */
5502
5503	if ((options & PCRE_IMS) != oldims)
5504	{
5505	*code++ = OP_OPT;
5506	*code++ = options & PCRE_IMS;
5507	length += 2;
5508	}
5509
5510	/* Set up dummy OP_REVERSE if lookbehind assertion */
5511
5512	if (lookbehind)
5513	{
5514	*code++ = OP_REVERSE;
5515	reverse_count = code;
5516	PUTINC(code, 0, 0);
5517	length += 1 + LINK_SIZE;
5518	}
5519
5520	/* Now compile the branch; in the pre-compile phase its length gets added
5521	into the length. */
5522
5523	if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5524	&branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5525	{
5526	*ptrptr = ptr;
5527	return FALSE;
5528	}
5529
5530	/* Keep the highest bracket count in case (?\| was used and some branch
5531	has fewer than the rest. */
5532
5533	if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5534
5535	/* In the real compile phase, there is some post-processing to be done. */
5536
5537	if (lengthptr == NULL)
5538	{
5539	/* If this is the first branch, the firstbyte and reqbyte values for the
5540	branch become the values for the regex. */
5541
5542	if (*last_branch != OP_ALT)
5543	{
5544	firstbyte = branchfirstbyte;
5545	reqbyte = branchreqbyte;
5546	}
5547
5548	/* If this is not the first branch, the first char and reqbyte have to
5549	match the values from all the previous branches, except that if the
5550	previous value for reqbyte didn't have REQ_VARY set, it can still match,
5551	and we set REQ_VARY for the regex. */
5552
5553	else
5554	{
5555	/* If we previously had a firstbyte, but it doesn't match the new branch,
5556	we have to abandon the firstbyte for the regex, but if there was
5557	previously no reqbyte, it takes on the value of the old firstbyte. */
5558
5559	if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5560	{
5561	if (reqbyte < 0) reqbyte = firstbyte;
5562	firstbyte = REQ_NONE;
5563	}
5564
5565	/* If we (now or from before) have no firstbyte, a firstbyte from the
5566	branch becomes a reqbyte if there isn't a branch reqbyte. */
5567
5568	if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5569	branchreqbyte = branchfirstbyte;
5570
5571	/* Now ensure that the reqbytes match */
5572
5573	if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5574	reqbyte = REQ_NONE;
5575	else reqbyte \|= branchreqbyte; /* To "or" REQ_VARY */
5576	}
5577
5578	/* If lookbehind, check that this branch matches a fixed-length string, and
5579	put the length into the OP_REVERSE item. Temporarily mark the end of the
5580	branch with OP_END. */
5581
5582	if (lookbehind)
5583	{
5584	int fixed_length;
5585	*code = OP_END;
5586	fixed_length = find_fixedlength(last_branch, options);
5587	DPRINTF(("fixed length = %d\n", fixed_length));
5588	if (fixed_length < 0)
5589	{
5590	*errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5591	*ptrptr = ptr;
5592	return FALSE;
5593	}
5594	PUT(reverse_count, 0, fixed_length);
5595	}
5596	}
5597
5598	/* Reached end of expression, either ')' or end of pattern. In the real
5599	compile phase, go back through the alternative branches and reverse the chain
5600	of offsets, with the field in the BRA item now becoming an offset to the
5601	first alternative. If there are no alternatives, it points to the end of the
5602	group. The length in the terminating ket is always the length of the whole
5603	bracketed item. If any of the ims options were changed inside the group,
5604	compile a resetting op-code following, except at the very end of the pattern.
5605	Return leaving the pointer at the terminating char. */
5606
5607	if (*ptr != '\|')
5608	{
5609	if (lengthptr == NULL)
5610	{
5611	int branch_length = code - last_branch;
5612	do
5613	{
5614	int prev_length = GET(last_branch, 1);
5615	PUT(last_branch, 1, branch_length);
5616	branch_length = prev_length;
5617	last_branch -= branch_length;
5618	}
5619	while (branch_length > 0);
5620	}
5621
5622	/* Fill in the ket */
5623
5624	*code = OP_KET;
5625	PUT(code, 1, code - start_bracket);
5626	code += 1 + LINK_SIZE;
5627
5628	/* Resetting option if needed */
5629
5630	if ((options & PCRE_IMS) != oldims && *ptr == ')')
5631	{
5632	*code++ = OP_OPT;
5633	*code++ = oldims;
5634	length += 2;
5635	}
5636
5637	/* Retain the highest bracket number, in case resetting was used. */
5638
5639	cd->bracount = max_bracount;
5640
5641	/* Set values to pass back */
5642
5643	*codeptr = code;
5644	*ptrptr = ptr;
5645	*firstbyteptr = firstbyte;
5646	*reqbyteptr = reqbyte;
5647	if (lengthptr != NULL)
5648	{
5649	if (OFLOW_MAX - *lengthptr < length)
5650	{
5651	*errorcodeptr = ERR20;
5652	return FALSE;
5653	}
5654	*lengthptr += length;
5655	}
5656	return TRUE;
5657	}
5658
5659	/* Another branch follows. In the pre-compile phase, we can move the code
5660	pointer back to where it was for the start of the first branch. (That is,
5661	pretend that each branch is the only one.)
5662
5663	In the real compile phase, insert an ALT node. Its length field points back
5664	to the previous branch while the bracket remains open. At the end the chain
5665	is reversed. It's done like this so that the start of the bracket has a
5666	zero offset until it is closed, making it possible to detect recursion. */
5667
5668	if (lengthptr != NULL)
5669	{
5670	code = *codeptr + 1 + LINK_SIZE + skipbytes;
5671	length += 1 + LINK_SIZE;
5672	}
5673	else
5674	{
5675	*code = OP_ALT;
5676	PUT(code, 1, code - last_branch);
5677	bc.current = last_branch = code;
5678	code += 1 + LINK_SIZE;
5679	}
5680
5681	ptr++;
5682	}
5683	/* Control never reaches here */
5684	}
5685
5686
5687
5688
5689	/*************************************************
5690	* Check for anchored expression *
5691	*************************************************/
5692
5693	/* Try to find out if this is an anchored regular expression. Consider each
5694	alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5695	all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5696	it's anchored. However, if this is a multiline pattern, then only OP_SOD
5697	counts, since OP_CIRC can match in the middle.
5698
5699	We can also consider a regex to be anchored if OP_SOM starts all its branches.
5700	This is the code for \G, which means "match at start of match position, taking
5701	into account the match offset".
5702
5703	A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5704	because that will try the rest of the pattern at all possible matching points,
5705	so there is no point trying again.... er ....
5706
5707	.... except when the .* appears inside capturing parentheses, and there is a
5708	subsequent back reference to those parentheses. We haven't enough information
5709	to catch that case precisely.
5710
5711	At first, the best we could do was to detect when .* was in capturing brackets
5712	and the highest back reference was greater than or equal to that level.
5713	However, by keeping a bitmap of the first 31 back references, we can catch some
5714	of the more common cases more precisely.
5715
5716	Arguments:
5717	code points to start of expression (the bracket)
5718	options points to the options setting
5719	bracket_map a bitmap of which brackets we are inside while testing; this
5720	handles up to substring 31; after that we just have to take
5721	the less precise approach
5722	backref_map the back reference bitmap
5723
5724	Returns: TRUE or FALSE
5725	*/
5726
5727	static BOOL
5728	is_anchored(register const uschar code, int options, unsigned int bracket_map,
5729	unsigned int backref_map)
5730	{
5731	do {
5732	const uschar scode = first_significant_code(code + _pcre_OP_lengths[code],
5733	options, PCRE_MULTILINE, FALSE);
5734	register int op = *scode;
5735
5736	/* Non-capturing brackets */
5737
5738	if (op == OP_BRA)
5739	{
5740	if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5741	}
5742
5743	/* Capturing brackets */
5744
5745	else if (op == OP_CBRA)
5746	{
5747	int n = GET2(scode, 1+LINK_SIZE);
5748	int new_map = bracket_map \| ((n < 32)? (1 << n) : 1);
5749	if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5750	}
5751
5752	/* Other brackets */
5753
5754	else if (op == OP_ASSERT \|\| op == OP_ONCE \|\| op == OP_COND)
5755	{
5756	if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5757	}
5758
5759	/* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
5760	it isn't in brackets that are or may be referenced. */
5761
5762	else if ((op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR \|\|
5763	op == OP_TYPEPOSSTAR))
5764	{
5765	if (scode[1] != OP_ALLANY \|\| (bracket_map & backref_map) != 0)
5766	return FALSE;
5767	}
5768
5769	/* Check for explicit anchoring */
5770
5771	else if (op != OP_SOD && op != OP_SOM &&
5772	((*options & PCRE_MULTILINE) != 0 \|\| op != OP_CIRC))
5773	return FALSE;
5774	code += GET(code, 1);
5775	}
5776	while (code == OP_ALT); / Loop for each alternative */
5777	return TRUE;
5778	}
5779
5780
5781
5782	/*************************************************
5783	* Check for starting with ^ or .* *
5784	*************************************************/
5785
5786	/* This is called to find out if every branch starts with ^ or .* so that
5787	"first char" processing can be done to speed things up in multiline
5788	matching and for non-DOTALL patterns that start with .* (which must start at
5789	the beginning or after \n). As in the case of is_anchored() (see above), we
5790	have to take account of back references to capturing brackets that contain .*
5791	because in that case we can't make the assumption.
5792
5793	Arguments:
5794	code points to start of expression (the bracket)
5795	bracket_map a bitmap of which brackets we are inside while testing; this
5796	handles up to substring 31; after that we just have to take
5797	the less precise approach
5798	backref_map the back reference bitmap
5799
5800	Returns: TRUE or FALSE
5801	*/
5802
5803	static BOOL
5804	is_startline(const uschar *code, unsigned int bracket_map,
5805	unsigned int backref_map)
5806	{
5807	do {
5808	const uschar scode = first_significant_code(code + _pcre_OP_lengths[code],
5809	NULL, 0, FALSE);
5810	register int op = *scode;
5811
5812	/* Non-capturing brackets */
5813
5814	if (op == OP_BRA)
5815	{
5816	if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5817	}
5818
5819	/* Capturing brackets */
5820
5821	else if (op == OP_CBRA)
5822	{
5823	int n = GET2(scode, 1+LINK_SIZE);
5824	int new_map = bracket_map \| ((n < 32)? (1 << n) : 1);
5825	if (!is_startline(scode, new_map, backref_map)) return FALSE;
5826	}
5827
5828	/* Other brackets */
5829
5830	else if (op == OP_ASSERT \|\| op == OP_ONCE \|\| op == OP_COND)
5831	{ if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5832
5833	/* .* means "start at start or after \n" if it isn't in brackets that
5834	may be referenced. */
5835
5836	else if (op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR \|\| op == OP_TYPEPOSSTAR)
5837	{
5838	if (scode[1] != OP_ANY \|\| (bracket_map & backref_map) != 0) return FALSE;
5839	}
5840
5841	/* Check for explicit circumflex */
5842
5843	else if (op != OP_CIRC) return FALSE;
5844
5845	/* Move on to the next alternative */
5846
5847	code += GET(code, 1);
5848	}
5849	while (code == OP_ALT); / Loop for each alternative */
5850	return TRUE;
5851	}
5852
5853
5854
5855	/*************************************************
5856	* Check for asserted fixed first char *
5857	*************************************************/
5858
5859	/* During compilation, the "first char" settings from forward assertions are
5860	discarded, because they can cause conflicts with actual literals that follow.
5861	However, if we end up without a first char setting for an unanchored pattern,
5862	it is worth scanning the regex to see if there is an initial asserted first
5863	char. If all branches start with the same asserted char, or with a bracket all
5864	of whose alternatives start with the same asserted char (recurse ad lib), then
5865	we return that char, otherwise -1.
5866
5867	Arguments:
5868	code points to start of expression (the bracket)
5869	options pointer to the options (used to check casing changes)
5870	inassert TRUE if in an assertion
5871
5872	Returns: -1 or the fixed first char
5873	*/
5874
5875	static int
5876	find_firstassertedchar(const uschar code, int options, BOOL inassert)
5877	{
5878	register int c = -1;
5879	do {
5880	int d;
5881	const uschar *scode =
5882	first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5883	register int op = *scode;
5884
5885	switch(op)
5886	{
5887	default:
5888	return -1;
5889
5890	case OP_BRA:
5891	case OP_CBRA:
5892	case OP_ASSERT:
5893	case OP_ONCE:
5894	case OP_COND:
5895	if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5896	return -1;
5897	if (c < 0) c = d; else if (c != d) return -1;
5898	break;
5899
5900	case OP_EXACT: /* Fall through */
5901	scode += 2;
5902
5903	case OP_CHAR:
5904	case OP_CHARNC:
5905	case OP_PLUS:
5906	case OP_MINPLUS:
5907	case OP_POSPLUS:
5908	if (!inassert) return -1;
5909	if (c < 0)
5910	{
5911	c = scode[1];
5912	if ((*options & PCRE_CASELESS) != 0) c \|= REQ_CASELESS;
5913	}
5914	else if (c != scode[1]) return -1;
5915	break;
5916	}
5917
5918	code += GET(code, 1);
5919	}
5920	while (*code == OP_ALT);
5921	return c;
5922	}
5923
5924
5925
5926	/*************************************************
5927	* Compile a Regular Expression *
5928	*************************************************/
5929
5930	/* This function takes a string and returns a pointer to a block of store
5931	holding a compiled version of the expression. The original API for this
5932	function had no error code return variable; it is retained for backwards
5933	compatibility. The new function is given a new name.
5934
5935	Arguments:
5936	pattern the regular expression
5937	options various option bits
5938	errorcodeptr pointer to error code variable (pcre_compile2() only)
5939	can be NULL if you don't want a code value
5940	errorptr pointer to pointer to error text
5941	erroroffset ptr offset in pattern where error was detected
5942	tables pointer to character tables or NULL
5943
5944	Returns: pointer to compiled data block, or NULL on error,
5945	with errorptr and erroroffset set
5946	*/
5947
5948	PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
5949	pcre_compile(const char pattern, int options, const char *errorptr,
5950	int erroroffset, const unsigned char tables)
5951	{
5952	return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5953	}
5954
5955
5956	PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
5957	pcre_compile2(const char pattern, int options, int errorcodeptr,
5958	const char *errorptr, int erroroffset, const unsigned char *tables)
5959	{
5960	real_pcre *re;
5961	int length = 1; /* For final END opcode */
5962	int firstbyte, reqbyte, newline;
5963	int errorcode = 0;
5964	int skipatstart = 0;
5965	#ifdef SUPPORT_UTF8
5966	BOOL utf8;
5967	#endif
5968	size_t size;
5969	uschar *code;
5970	const uschar *codestart;
5971	const uschar *ptr;
5972	compile_data compile_block;
5973	compile_data *cd = &compile_block;
5974
5975	/* This space is used for "compiling" into during the first phase, when we are
5976	computing the amount of memory that is needed. Compiled items are thrown away
5977	as soon as possible, so that a fairly large buffer should be sufficient for
5978	this purpose. The same space is used in the second phase for remembering where
5979	to fill in forward references to subpatterns. */
5980
5981	uschar cworkspace[COMPILE_WORK_SIZE];
5982
5983	/* Set this early so that early errors get offset 0. */
5984
5985	ptr = (const uschar *)pattern;
5986
5987	/* We can't pass back an error message if errorptr is NULL; I guess the best we
5988	can do is just return NULL, but we can set a code value if there is a code
5989	pointer. */
5990
5991	if (errorptr == NULL)
5992	{
5993	if (errorcodeptr != NULL) *errorcodeptr = 99;
5994	return NULL;
5995	}
5996
5997	*errorptr = NULL;
5998	if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5999
6000	/* However, we can give a message for this error */
6001
6002	if (erroroffset == NULL)
6003	{
6004	errorcode = ERR16;
6005	goto PCRE_EARLY_ERROR_RETURN2;
6006	}
6007
6008	*erroroffset = 0;
6009
6010	/* Can't support UTF8 unless PCRE has been compiled to include the code. */
6011
6012	#ifdef SUPPORT_UTF8
6013	utf8 = (options & PCRE_UTF8) != 0;
6014	if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6015	(erroroffset = _pcre_valid_utf8((uschar )pattern, -1)) >= 0)
6016	{
6017	errorcode = ERR44;
6018	goto PCRE_EARLY_ERROR_RETURN2;
6019	}
6020	#else
6021	if ((options & PCRE_UTF8) != 0)
6022	{
6023	errorcode = ERR32;
6024	goto PCRE_EARLY_ERROR_RETURN;
6025	}
6026	#endif
6027
6028	if ((options & ~PUBLIC_OPTIONS) != 0)
6029	{
6030	errorcode = ERR17;
6031	goto PCRE_EARLY_ERROR_RETURN;
6032	}
6033
6034	/* Set up pointers to the individual character tables */
6035
6036	if (tables == NULL) tables = _pcre_default_tables;
6037	cd->lcc = tables + lcc_offset;
6038	cd->fcc = tables + fcc_offset;
6039	cd->cbits = tables + cbits_offset;
6040	cd->ctypes = tables + ctypes_offset;
6041
6042	/* Check for global one-time settings at the start of the pattern, and remember
6043	the offset for later. */
6044
6045	while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
6046	{
6047	int newnl = 0;
6048	int newbsr = 0;
6049
6050	if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
6051	{ skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6052	else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
6053	{ skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6054	else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
6055	{ skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6056	else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
6057	{ skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6058	else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
6059	{ skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6060
6061	else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
6062	{ skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6063	else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
6064	{ skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6065
6066	if (newnl != 0)
6067	options = (options & ~PCRE_NEWLINE_BITS) \| newnl;
6068	else if (newbsr != 0)
6069	options = (options & ~(PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE)) \| newbsr;
6070	else break;
6071	}
6072
6073	/* Check validity of \R options. */
6074
6075	switch (options & (PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE))
6076	{
6077	case 0:
6078	case PCRE_BSR_ANYCRLF:
6079	case PCRE_BSR_UNICODE:
6080	break;
6081	default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6082	}
6083
6084	/* Handle different types of newline. The three bits give seven cases. The
6085	current code allows for fixed one- or two-byte sequences, plus "any" and
6086	"anycrlf". */
6087
6088	switch (options & PCRE_NEWLINE_BITS)
6089	{
6090	case 0: newline = NEWLINE; break; /* Build-time default */
6091	case PCRE_NEWLINE_CR: newline = '\r'; break;
6092	case PCRE_NEWLINE_LF: newline = '\n'; break;
6093	case PCRE_NEWLINE_CR+
6094	PCRE_NEWLINE_LF: newline = ('\r' << 8) \| '\n'; break;
6095	case PCRE_NEWLINE_ANY: newline = -1; break;
6096	case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6097	default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6098	}
6099
6100	if (newline == -2)
6101	{
6102	cd->nltype = NLTYPE_ANYCRLF;
6103	}
6104	else if (newline < 0)
6105	{
6106	cd->nltype = NLTYPE_ANY;
6107	}
6108	else
6109	{
6110	cd->nltype = NLTYPE_FIXED;
6111	if (newline > 255)
6112	{
6113	cd->nllen = 2;
6114	cd->nl[0] = (newline >> 8) & 255;
6115	cd->nl[1] = newline & 255;
6116	}
6117	else
6118	{
6119	cd->nllen = 1;
6120	cd->nl[0] = newline;
6121	}
6122	}
6123
6124	/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6125	references to help in deciding whether (.*) can be treated as anchored or not.
6126	*/
6127
6128	cd->top_backref = 0;
6129	cd->backref_map = 0;
6130
6131	/* Reflect pattern for debugging output */
6132
6133	DPRINTF(("------------------------------------------------------------------\n"));
6134	DPRINTF(("%s\n", pattern));
6135
6136	/* Pretend to compile the pattern while actually just accumulating the length
6137	of memory required. This behaviour is triggered by passing a non-NULL final
6138	argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6139	to compile parts of the pattern into; the compiled code is discarded when it is
6140	no longer needed, so hopefully this workspace will never overflow, though there
6141	is a test for its doing so. */
6142
6143	cd->bracount = cd->final_bracount = 0;
6144	cd->names_found = 0;
6145	cd->name_entry_size = 0;
6146	cd->name_table = NULL;
6147	cd->start_workspace = cworkspace;
6148	cd->start_code = cworkspace;
6149	cd->hwm = cworkspace;
6150	cd->start_pattern = (const uschar *)pattern;
6151	cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6152	cd->req_varyopt = 0;
6153	cd->external_options = options;
6154	cd->external_flags = 0;
6155
6156	/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6157	don't need to look at the result of the function here. The initial options have
6158	been put into the cd block so that they can be changed if an option setting is
6159	found within the regex right at the beginning. Bringing initial option settings
6160	outside can help speed up starting point checks. */
6161
6162	ptr += skipatstart;
6163	code = cworkspace;
6164	*code = OP_BRA;
6165	(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6166	&code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6167	&length);
6168	if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6169
6170	DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6171	cd->hwm - cworkspace));
6172
6173	if (length > MAX_PATTERN_SIZE)
6174	{
6175	errorcode = ERR20;
6176	goto PCRE_EARLY_ERROR_RETURN;
6177	}
6178
6179	/* Compute the size of data block needed and get it, either from malloc or
6180	externally provided function. Integer overflow should no longer be possible
6181	because nowadays we limit the maximum value of cd->names_found and
6182	cd->name_entry_size. */
6183
6184	size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6185	re = (real_pcre *)(pcre_malloc)(size);
6186
6187	if (re == NULL)
6188	{
6189	errorcode = ERR21;
6190	goto PCRE_EARLY_ERROR_RETURN;
6191	}
6192
6193	/* Put in the magic number, and save the sizes, initial options, internal
6194	flags, and character table pointer. NULL is used for the default character
6195	tables. The nullpad field is at the end; it's there to help in the case when a
6196	regex compiled on a system with 4-byte pointers is run on another with 8-byte
6197	pointers. */
6198
6199	re->magic_number = MAGIC_NUMBER;
6200	re->size = size;
6201	re->options = cd->external_options;
6202	re->flags = cd->external_flags;
6203	re->dummy1 = 0;
6204	re->first_byte = 0;
6205	re->req_byte = 0;
6206	re->name_table_offset = sizeof(real_pcre);
6207	re->name_entry_size = cd->name_entry_size;
6208	re->name_count = cd->names_found;
6209	re->ref_count = 0;
6210	re->tables = (tables == _pcre_default_tables)? NULL : tables;
6211	re->nullpad = NULL;
6212
6213	/* The starting points of the name/number translation table and of the code are
6214	passed around in the compile data block. The start/end pattern and initial
6215	options are already set from the pre-compile phase, as is the name_entry_size
6216	field. Reset the bracket count and the names_found field. Also reset the hwm
6217	field; this time it's used for remembering forward references to subpatterns.
6218	*/
6219
6220	cd->final_bracount = cd->bracount; /* Save for checking forward references */
6221	cd->bracount = 0;
6222	cd->names_found = 0;
6223	cd->name_table = (uschar *)re + re->name_table_offset;
6224	codestart = cd->name_table + re->name_entry_size * re->name_count;
6225	cd->start_code = codestart;
6226	cd->hwm = cworkspace;
6227	cd->req_varyopt = 0;
6228	cd->had_accept = FALSE;
6229
6230	/* Set up a starting, non-extracting bracket, then compile the expression. On
6231	error, errorcode will be set non-zero, so we don't need to look at the result
6232	of the function here. */
6233
6234	ptr = (const uschar *)pattern + skipatstart;
6235	code = (uschar *)codestart;
6236	*code = OP_BRA;
6237	(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6238	&errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6239	re->top_bracket = cd->bracount;
6240	re->top_backref = cd->top_backref;
6241	re->flags = cd->external_flags;
6242
6243	if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6244
6245	/* If not reached end of pattern on success, there's an excess bracket. */
6246
6247	if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6248
6249	/* Fill in the terminating state and check for disastrous overflow, but
6250	if debugging, leave the test till after things are printed out. */
6251
6252	*code++ = OP_END;
6253
6254	#ifndef DEBUG
6255	if (code - codestart > length) errorcode = ERR23;
6256	#endif
6257
6258	/* Fill in any forward references that are required. */
6259
6260	while (errorcode == 0 && cd->hwm > cworkspace)
6261	{
6262	int offset, recno;
6263	const uschar *groupptr;
6264	cd->hwm -= LINK_SIZE;
6265	offset = GET(cd->hwm, 0);
6266	recno = GET(codestart, offset);
6267	groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6268	if (groupptr == NULL) errorcode = ERR53;
6269	else PUT(((uschar *)codestart), offset, groupptr - codestart);
6270	}
6271
6272	/* Give an error if there's a back reference to a non-existent capturing
6273	subpattern. */
6274
6275	if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6276
6277	/* Failed to compile, or error while post-processing */
6278
6279	if (errorcode != 0)
6280	{
6281	(pcre_free)(re);
6282	PCRE_EARLY_ERROR_RETURN:
6283	erroroffset = ptr - (const uschar )pattern;
6284	PCRE_EARLY_ERROR_RETURN2:
6285	*errorptr = find_error_text(errorcode);
6286	if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6287	return NULL;
6288	}
6289
6290	/* If the anchored option was not passed, set the flag if we can determine that
6291	the pattern is anchored by virtue of ^ characters or \A or anything else (such
6292	as starting with .* when DOTALL is set).
6293
6294	Otherwise, if we know what the first byte has to be, save it, because that
6295	speeds up unanchored matches no end. If not, see if we can set the
6296	PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6297	start with ^, and also when all branches start with .* for non-DOTALL matches.
6298	*/
6299
6300	if ((re->options & PCRE_ANCHORED) == 0)
6301	{
6302	int temp_options = re->options; /* May get changed during these scans */
6303	if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6304	re->options \|= PCRE_ANCHORED;
6305	else
6306	{
6307	if (firstbyte < 0)
6308	firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6309	if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6310	{
6311	int ch = firstbyte & 255;
6312	re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6313	cd->fcc[ch] == ch)? ch : firstbyte;
6314	re->flags \|= PCRE_FIRSTSET;
6315	}
6316	else if (is_startline(codestart, 0, cd->backref_map))
6317	re->flags \|= PCRE_STARTLINE;
6318	}
6319	}
6320
6321	/* For an anchored pattern, we use the "required byte" only if it follows a
6322	variable length item in the regex. Remove the caseless flag for non-caseable
6323	bytes. */
6324
6325	if (reqbyte >= 0 &&
6326	((re->options & PCRE_ANCHORED) == 0 \|\| (reqbyte & REQ_VARY) != 0))
6327	{
6328	int ch = reqbyte & 255;
6329	re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6330	cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6331	re->flags \|= PCRE_REQCHSET;
6332	}
6333
6334	/* Print out the compiled data if debugging is enabled. This is never the
6335	case when building a production library. */
6336
6337	#ifdef DEBUG
6338
6339	printf("Length = %d top_bracket = %d top_backref = %d\n",
6340	length, re->top_bracket, re->top_backref);
6341
6342	printf("Options=%08x\n", re->options);
6343
6344	if ((re->flags & PCRE_FIRSTSET) != 0)
6345	{
6346	int ch = re->first_byte & 255;
6347	const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6348	"" : " (caseless)";
6349	if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6350	else printf("First char = \\x%02x%s\n", ch, caseless);
6351	}
6352
6353	if ((re->flags & PCRE_REQCHSET) != 0)
6354	{
6355	int ch = re->req_byte & 255;
6356	const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6357	"" : " (caseless)";
6358	if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6359	else printf("Req char = \\x%02x%s\n", ch, caseless);
6360	}
6361
6362	pcre_printint(re, stdout, TRUE);
6363
6364	/* This check is done here in the debugging case so that the code that
6365	was compiled can be seen. */
6366
6367	if (code - codestart > length)
6368	{
6369	(pcre_free)(re);
6370	*errorptr = find_error_text(ERR23);
6371	erroroffset = ptr - (uschar )pattern;
6372	if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6373	return NULL;
6374	}
6375	#endif /* DEBUG */
6376
6377	return (pcre *)re;
6378	}
6379
6380	/* End of pcre_compile.c */

Note: See TracBrowser for help on using the repository browser.

Download in other formats: