monotone

monotone Mtn Source Tree

Root/pcre/pcre_compile.c

1/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40
41/* This module contains the external function pcre_compile(), along with
42supporting internal functions that are not used by other modules. */
43
44
45#include "pcre_config.h"
46
47#define NLBLOCK cd /* Block containing newline information */
48#define PSSTART start_pattern /* Field containing processed string start */
49#define PSEND end_pattern /* Field containing processed string end */
50
51#include "pcre_internal.h"
52
53
54/* When DEBUG is defined, we need the pcre_printint() function, which is also
55used by pcretest. DEBUG is not defined when building a production library. */
56
57#ifdef DEBUG
58#include "pcre_printint.src"
59#endif
60
61
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte-array bitmap a. Each argument and the whole expansion are parenthesized
so that calls such as SETBIT(map, x + 1) address the intended byte and bit, and
so that the result can be used safely inside a larger expression. Note that b
is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
65
66/* Maximum length value to check against when making sure that the integer that
67holds the compiled pattern length does not overflow. We make it a bit less than
68INT_MAX to allow for adding in group terminating bytes, so that we don't have
69to check them every time. */
70
71#define OFLOW_MAX (INT_MAX - 20)
72
73
74/*************************************************
75* Code parameters and static tables *
76*************************************************/
77
78/* This value specifies the size of stack workspace that is used during the
79first pre-compile phase that determines how much memory is required. The regex
80is partly compiled into this space, but the compiled parts are discarded as
81soon as they can be, so that hopefully there will never be an overrun. The code
82does, however, check for an overrun. The largest amount I've seen used is 218,
83so this number is very generous.
84
85The same workspace is used during the second, actual compile phase for
86remembering forward references to groups so that they can be filled in at the
87end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
88is 4 there is plenty of room. */
89
90#define COMPILE_WORK_SIZE (4096)
91
92
93/* Table for handling escaped characters in the range '0'-'z'. Positive returns
94are simple data values; negative values are for special things like \d and so
95on. Zero means further processing is needed (for things like \x), or the escape
96is invalid. */
97
98#ifndef EBCDIC /* This is the "normal" table for ASCII systems */
99static const short int escapes[] = {
100 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
101 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
102 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
103-ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
104-ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
105-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
106 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
107-ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
108-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
109 0, 0, -ESC_z /* x - z */
110};
111
112#else /* This is the "abnormal" table for EBCDIC systems */
113static const short int escapes[] = {
114/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
115/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
116/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
117/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
118/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
119/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
120/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
121/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
122/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
123/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
124/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
125/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
126/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
127/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
128/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
129/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
130/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
131/* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
132/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
133/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
134/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
135/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
136/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
137};
138#endif
139
140
141/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
142searched linearly. Put all the names into a single string, in order to reduce
143the number of relocations when a shared library is dynamically linked. */
144
145typedef struct verbitem {
146 int len;
147 int op;
148} verbitem;
149
150static const char verbnames[] =
151 "ACCEPT\0"
152 "COMMIT\0"
153 "F\0"
154 "FAIL\0"
155 "PRUNE\0"
156 "SKIP\0"
157 "THEN";
158
159static verbitem verbs[] = {
160 { 6, OP_ACCEPT },
161 { 6, OP_COMMIT },
162 { 1, OP_FAIL },
163 { 4, OP_FAIL },
164 { 5, OP_PRUNE },
165 { 4, OP_SKIP },
166 { 4, OP_THEN }
167};
168
169static int verbcount = sizeof(verbs)/sizeof(verbitem);
170
171
172/* Tables of names of POSIX character classes and their lengths. The names are
173now all in a single string, to reduce the number of relocations when a shared
174library is dynamically loaded. The list of lengths is terminated by a zero
175length entry. The first three must be alpha, lower, upper, as this is assumed
176for handling case independence. */
177
178static const char posix_names[] =
179 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
180 "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
181 "word\0" "xdigit";
182
183static const uschar posix_name_lengths[] = {
184 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
185
186/* Table of class bit maps for each POSIX class. Each class is formed from a
187base map, with an optional addition or removal of another map. Then, for some
188classes, there is some additional tweaking: for [:blank:] the vertical space
189characters are removed, and for [:alpha:] and [:alnum:] the underscore
190character is removed. The triples in the table consist of the base map offset,
191second map offset or -1 if no second map, and a non-negative value for map
192addition or a negative value for map subtraction (if there are two maps). The
193absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
194remove vertical space characters, 2 => remove underscore. */
195
196static const int posix_class_maps[] = {
197 cbit_word, cbit_digit, -2, /* alpha */
198 cbit_lower, -1, 0, /* lower */
199 cbit_upper, -1, 0, /* upper */
200 cbit_word, -1, 2, /* alnum - word without underscore */
201 cbit_print, cbit_cntrl, 0, /* ascii */
202 cbit_space, -1, 1, /* blank - a GNU extension */
203 cbit_cntrl, -1, 0, /* cntrl */
204 cbit_digit, -1, 0, /* digit */
205 cbit_graph, -1, 0, /* graph */
206 cbit_print, -1, 0, /* print */
207 cbit_punct, -1, 0, /* punct */
208 cbit_space, -1, 0, /* space */
209 cbit_word, -1, 0, /* word - a Perl extension */
210 cbit_xdigit,-1, 0 /* xdigit */
211};
212
213
214#define STRING(a) # a
215#define XSTRING(s) STRING(s)
216
217/* The texts of compile-time error messages. These are "char *" because they
218are passed to the outside world. Do not ever re-use any error number, because
219they are documented. Always add a new error instead. Messages marked DEAD below
220are no longer used. This used to be a table of strings, but in order to reduce
221the number of relocations needed when a shared library is loaded dynamically,
222it is now one long string. We cannot use a table of offsets, because the
223lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
224simply count through to the one we want - this isn't a performance issue
225because these strings are used only when there is a compilation error. */
226
227static const char error_texts[] =
228 "no error\0"
229 "\\ at end of pattern\0"
230 "\\c at end of pattern\0"
231 "unrecognized character follows \\\0"
232 "numbers out of order in {} quantifier\0"
233 /* 5 */
234 "number too big in {} quantifier\0"
235 "missing terminating ] for character class\0"
236 "invalid escape sequence in character class\0"
237 "range out of order in character class\0"
238 "nothing to repeat\0"
239 /* 10 */
240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
241 "internal error: unexpected repeat\0"
242 "unrecognized character after (? or (?-\0"
243 "POSIX named classes are supported only within a class\0"
244 "missing )\0"
245 /* 15 */
246 "reference to non-existent subpattern\0"
247 "erroffset passed as NULL\0"
248 "unknown option bit(s) set\0"
249 "missing ) after comment\0"
250 "parentheses nested too deeply\0" /** DEAD **/
251 /* 20 */
252 "regular expression is too large\0"
253 "failed to get memory\0"
254 "unmatched parentheses\0"
255 "internal error: code overflow\0"
256 "unrecognized character after (?<\0"
257 /* 25 */
258 "lookbehind assertion is not fixed length\0"
259 "malformed number or name after (?(\0"
260 "conditional group contains more than two branches\0"
261 "assertion expected after (?(\0"
262 "(?R or (?[+-]digits must be followed by )\0"
263 /* 30 */
264 "unknown POSIX class name\0"
265 "POSIX collating elements are not supported\0"
266 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
267 "spare error\0" /** DEAD **/
268 "character value in \\x{...} sequence is too large\0"
269 /* 35 */
270 "invalid condition (?(0)\0"
271 "\\C not allowed in lookbehind assertion\0"
272 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
273 "number after (?C is > 255\0"
274 "closing ) for (?C expected\0"
275 /* 40 */
276 "recursive call could loop indefinitely\0"
277 "unrecognized character after (?P\0"
278 "syntax error in subpattern name (missing terminator)\0"
279 "two named subpatterns have the same name\0"
280 "invalid UTF-8 string\0"
281 /* 45 */
282 "support for \\P, \\p, and \\X has not been compiled\0"
283 "malformed \\P or \\p sequence\0"
284 "unknown property name after \\P or \\p\0"
285 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
286 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
287 /* 50 */
288 "repeated subpattern is too long\0" /** DEAD **/
289 "octal value is greater than \\377 (not in UTF-8 mode)\0"
290 "internal error: overran compiling workspace\0"
291 "internal error: previously-checked referenced subpattern not found\0"
292 "DEFINE group contains more than one branch\0"
293 /* 55 */
294 "repeating a DEFINE group is not allowed\0"
295 "inconsistent NEWLINE options\0"
296 "\\g is not followed by a braced name or an optionally braced non-zero number\0"
297 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
298 "(*VERB) with an argument is not supported\0"
299 /* 60 */
300 "(*VERB) not recognized\0"
301 "number is too big\0"
302 "subpattern name expected\0"
303 "digit expected after (?+";
304
305
306/* Table to identify digits and hex digits. This is used when compiling
307patterns. Note that the tables in chartables are dependent on the locale, and
308may mark arbitrary characters as digits - but the PCRE compiling code expects
309to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
310a private table here. It costs 256 bytes, but it is a lot faster than doing
311character value tests (at least in some simple cases I timed), and in some
312applications one wants PCRE to compile efficiently as well as match
313efficiently.
314
315For convenience, we use the same bit definitions as in chartables:
316
317 0x04 decimal digit
318 0x08 hexadecimal digit
319
320Then we can use ctype_digit and ctype_xdigit in the code. */
321
322#ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
323static const unsigned char digitab[] =
324 {
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
331 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
332 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
333 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
337 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
357
358#else /* This is the "abnormal" case, for EBCDIC systems */
359static const unsigned char digitab[] =
360 {
361 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
362 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
363 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
369 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
377 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
378 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
379 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
380 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
385 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
388 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
391 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
392 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
393
394static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
395 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
396 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
397 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
398 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
399 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
403 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
404 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
405 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
406 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
407 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
408 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
409 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
410 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
411 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
412 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
413 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
414 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
415 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
416 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
417 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
418 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
419 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
420 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
421 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
422 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
423 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
424 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
425 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
426 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
427#endif
428
429
430/* Forward declaration to allow mutual recursion */
431
432static BOOL
433 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
434 int *, int *, branch_chain *, compile_data *, int *);
435
436
437
438/*************************************************
439* Find an error text *
440*************************************************/
441
442/* The error texts are now all in one long string, to save on relocations. As
443some of the text is of unknown length, we can't use a table of offsets.
444Instead, just count through the strings. This is not a performance issue
445because it happens only when there has been a compilation error.
446
447Argument: the error number
448Returns: pointer to the error string
449*/
450
451static const char *
452find_error_text(int n)
453{
454const char *s = error_texts;
455for (; n > 0; n--) while (*s++ != 0);
456return s;
457}
458
459
460/*************************************************
461* Handle escapes *
462*************************************************/
463
464/* This function is called when a \ has been encountered. It either returns a
465positive value for a simple escape such as \n, or a negative value which
466encodes one of the more complicated things such as \d. A backreference to group
467n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
468UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
469ptr is pointing at the \. On exit, it is on the final character of the escape
470sequence.
471
472Arguments:
473 ptrptr points to the pattern position pointer
474 errorcodeptr points to the errorcode variable
475 bracount number of previous extracting brackets
476 options the options bits
477 isclass TRUE if inside a character class
478
479Returns: zero or positive => a data character
480 negative => a special escape sequence
481 on error, errorcodeptr is set
482*/
483
484static int
485check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
486 int options, BOOL isclass)
487{
488BOOL utf8 = (options & PCRE_UTF8) != 0;
489const uschar *ptr = *ptrptr + 1;
490int c, i;
491
492GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
493ptr--; /* Set pointer back to the last byte */
494
495/* If backslash is at the end of the pattern, it's an error. */
496
497if (c == 0) *errorcodeptr = ERR1;
498
499/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
500in a table. A non-zero result is something that can be returned immediately.
501Otherwise further processing may be required. */
502
503#ifndef EBCDIC /* ASCII coding */
504else if (c < '0' || c > 'z') {} /* Not alphanumeric */
505else if ((i = escapes[c - '0']) != 0) c = i;
506
507#else /* EBCDIC coding */
508else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
509else if ((i = escapes[c - 0x48]) != 0) c = i;
510#endif
511
512/* Escapes that need further processing, or are illegal. */
513
514else
515 {
516 const uschar *oldptr;
517 BOOL braced, negated;
518
519 switch (c)
520 {
521 /* A number of Perl escapes are not handled by PCRE. We give an explicit
522 error. */
523
524 case 'l':
525 case 'L':
526 case 'N':
527 case 'u':
528 case 'U':
529 *errorcodeptr = ERR37;
530 break;
531
532 /* \g must be followed by a number, either plain or braced. If positive, it
533 is an absolute backreference. If negative, it is a relative backreference.
534 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
535 reference to a named group. This is part of Perl's movement towards a
536 unified syntax for back references. As this is synonymous with \k{name}, we
537 fudge it up by pretending it really was \k. */
538
539 case 'g':
540 if (ptr[1] == '{')
541 {
542 const uschar *p;
543 for (p = ptr+2; *p != 0 && *p != '}'; p++)
544 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
545 if (*p != 0 && *p != '}')
546 {
547 c = -ESC_k;
548 break;
549 }
550 braced = TRUE;
551 ptr++;
552 }
553 else braced = FALSE;
554
555 if (ptr[1] == '-')
556 {
557 negated = TRUE;
558 ptr++;
559 }
560 else negated = FALSE;
561
562 c = 0;
563 while ((digitab[ptr[1]] & ctype_digit) != 0)
564 c = c * 10 + *(++ptr) - '0';
565
566 if (c < 0)
567 {
568 *errorcodeptr = ERR61;
569 break;
570 }
571
572 if (c == 0 || (braced && *(++ptr) != '}'))
573 {
574 *errorcodeptr = ERR57;
575 break;
576 }
577
578 if (negated)
579 {
580 if (c > bracount)
581 {
582 *errorcodeptr = ERR15;
583 break;
584 }
585 c = bracount - (c - 1);
586 }
587
588 c = -(ESC_REF + c);
589 break;
590
591 /* The handling of escape sequences consisting of a string of digits
592 starting with one that is not zero is not straightforward. By experiment,
593 the way Perl works seems to be as follows:
594
595 Outside a character class, the digits are read as a decimal number. If the
596 number is less than 10, or if there are that many previous extracting
597 left brackets, then it is a back reference. Otherwise, up to three octal
598 digits are read to form an escaped byte. Thus \123 is likely to be octal
599 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
600 value is greater than 377, the least significant 8 bits are taken. Inside a
601 character class, \ followed by a digit is always an octal number. */
602
603 case '1': case '2': case '3': case '4': case '5':
604 case '6': case '7': case '8': case '9':
605
606 if (!isclass)
607 {
608 oldptr = ptr;
609 c -= '0';
610 while ((digitab[ptr[1]] & ctype_digit) != 0)
611 c = c * 10 + *(++ptr) - '0';
612 if (c < 0)
613 {
614 *errorcodeptr = ERR61;
615 break;
616 }
617 if (c < 10 || c <= bracount)
618 {
619 c = -(ESC_REF + c);
620 break;
621 }
622 ptr = oldptr; /* Put the pointer back and fall through */
623 }
624
625 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
626 generates a binary zero byte and treats the digit as a following literal.
627 Thus we have to pull back the pointer by one. */
628
629 if ((c = *ptr) >= '8')
630 {
631 ptr--;
632 c = 0;
633 break;
634 }
635
636 /* \0 always starts an octal number, but we may drop through to here with a
637 larger first octal digit. The original code used just to take the least
638 significant 8 bits of octal numbers (I think this is what early Perls used
639 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
640 than 3 octal digits. */
641
642 case '0':
643 c -= '0';
644 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
645 c = c * 8 + *(++ptr) - '0';
646 if (!utf8 && c > 255) *errorcodeptr = ERR51;
647 break;
648
649 /* \x is complicated. \x{ddd} is a character number which can be greater
650 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
651 treated as a data character. */
652
653 case 'x':
654 if (ptr[1] == '{')
655 {
656 const uschar *pt = ptr + 2;
657 int count = 0;
658
659 c = 0;
660 while ((digitab[*pt] & ctype_xdigit) != 0)
661 {
662 register int cc = *pt++;
663 if (c == 0 && cc == '0') continue; /* Leading zeroes */
664 count++;
665
666#ifndef EBCDIC /* ASCII coding */
667 if (cc >= 'a') cc -= 32; /* Convert to upper case */
668 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
669#else /* EBCDIC coding */
670 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
671 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
672#endif
673 }
674
675 if (*pt == '}')
676 {
677 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
678 ptr = pt;
679 break;
680 }
681
682 /* If the sequence of hex digits does not end with '}', then we don't
683 recognize this construct; fall through to the normal \x handling. */
684 }
685
686 /* Read just a single-byte hex-defined char */
687
688 c = 0;
689 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
690 {
691 int cc; /* Some compilers don't like ++ */
692 cc = *(++ptr); /* in initializers */
693#ifndef EBCDIC /* ASCII coding */
694 if (cc >= 'a') cc -= 32; /* Convert to upper case */
695 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
696#else /* EBCDIC coding */
697 if (cc <= 'z') cc += 64; /* Convert to upper case */
698 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
699#endif
700 }
701 break;
702
703 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
704 This coding is ASCII-specific, but then the whole concept of \cx is
705 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
706
707 case 'c':
708 c = *(++ptr);
709 if (c == 0)
710 {
711 *errorcodeptr = ERR2;
712 break;
713 }
714
715#ifndef EBCDIC /* ASCII coding */
716 if (c >= 'a' && c <= 'z') c -= 32;
717 c ^= 0x40;
718#else /* EBCDIC coding */
719 if (c >= 'a' && c <= 'z') c += 64;
720 c ^= 0xC0;
721#endif
722 break;
723
724 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
725 other alphanumeric following \ is an error if PCRE_EXTRA was set;
726 otherwise, for Perl compatibility, it is a literal. This code looks a bit
727 odd, but there used to be some cases other than the default, and there may
728 be again in future, so I haven't "optimized" it. */
729
730 default:
731 if ((options & PCRE_EXTRA) != 0) switch(c)
732 {
733 default:
734 *errorcodeptr = ERR3;
735 break;
736 }
737 break;
738 }
739 }
740
741*ptrptr = ptr;
742return c;
743}
744
745
746
747#ifdef SUPPORT_UCP
748/*************************************************
749* Handle \P and \p *
750*************************************************/
751
752/* This function is called after \P or \p has been encountered, provided that
753PCRE is compiled with support for Unicode properties. On entry, ptrptr is
754pointing at the P or p. On exit, it is pointing at the final character of the
755escape sequence.
756
757Argument:
758 ptrptr points to the pattern position pointer
759 negptr points to a boolean that is set TRUE for negation else FALSE
760 dptr points to an int that is set to the detailed property value
761 errorcodeptr points to the error code variable
762
763Returns: type value from ucp_type_table, or -1 for an invalid type
764*/
765
766static int
767get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
768{
769int c, i, bot, top;
770const uschar *ptr = *ptrptr;
771char name[32];
772
773c = *(++ptr);
774if (c == 0) goto ERROR_RETURN;
775
776*negptr = FALSE;
777
778/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
779negation. */
780
781if (c == '{')
782 {
783 if (ptr[1] == '^')
784 {
785 *negptr = TRUE;
786 ptr++;
787 }
788 for (i = 0; i < (int)sizeof(name) - 1; i++)
789 {
790 c = *(++ptr);
791 if (c == 0) goto ERROR_RETURN;
792 if (c == '}') break;
793 name[i] = c;
794 }
795 if (c !='}') goto ERROR_RETURN;
796 name[i] = 0;
797 }
798
799/* Otherwise there is just one following character */
800
801else
802 {
803 name[0] = c;
804 name[1] = 0;
805 }
806
807*ptrptr = ptr;
808
809/* Search for a recognized property name using binary chop */
810
811bot = 0;
812top = _pcre_utt_size;
813
814while (bot < top)
815 {
816 i = (bot + top) >> 1;
817 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
818 if (c == 0)
819 {
820 *dptr = _pcre_utt[i].value;
821 return _pcre_utt[i].type;
822 }
823 if (c > 0) bot = i + 1; else top = i;
824 }
825
826*errorcodeptr = ERR47;
827*ptrptr = ptr;
828return -1;
829
830ERROR_RETURN:
831*errorcodeptr = ERR46;
832*ptrptr = ptr;
833return -1;
834}
835#endif
836
837
838
839
840/*************************************************
841* Check for counted repeat *
842*************************************************/
843
844/* This function is called when a '{' is encountered in a place where it might
845start a quantifier. It looks ahead to see if it really is a quantifier or not.
846It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
847where the ddds are digits.
848
849Arguments:
850 p pointer to the first char after '{'
851
852Returns: TRUE or FALSE
853*/
854
855static BOOL
856is_counted_repeat(const uschar *p)
857{
858if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
859while ((digitab[*p] & ctype_digit) != 0) p++;
860if (*p == '}') return TRUE;
861
862if (*p++ != ',') return FALSE;
863if (*p == '}') return TRUE;
864
865if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
866while ((digitab[*p] & ctype_digit) != 0) p++;
867
868return (*p == '}');
869}
870
871
872
873/*************************************************
874* Read repeat counts *
875*************************************************/
876
877/* Read an item of the form {n,m} and return the values. This is called only
878after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
879so the syntax is guaranteed to be correct, but we need to check the values.
880
881Arguments:
882 p pointer to first char after '{'
883 minp pointer to int for min
884 maxp pointer to int for max
885 returned as -1 if no max
886 errorcodeptr points to error code variable
887
888Returns: pointer to '}' on success;
889 current ptr on error, with errorcodeptr set non-zero
890*/
891
892static const uschar *
893read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
894{
895int min = 0;
896int max = -1;
897
898/* Read the minimum value and do a paranoid check: a negative value indicates
899an integer overflow. */
900
901while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
902if (min < 0 || min > 65535)
903 {
904 *errorcodeptr = ERR5;
905 return p;
906 }
907
908/* Read the maximum value if there is one, and again do a paranoid on its size.
909Also, max must not be less than min. */
910
911if (*p == '}') max = min; else
912 {
913 if (*(++p) != '}')
914 {
915 max = 0;
916 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
917 if (max < 0 || max > 65535)
918 {
919 *errorcodeptr = ERR5;
920 return p;
921 }
922 if (max < min)
923 {
924 *errorcodeptr = ERR4;
925 return p;
926 }
927 }
928 }
929
930/* Fill in the required variables, and pass back the pointer to the terminating
931'}'. */
932
933*minp = min;
934*maxp = max;
935return p;
936}
937
938
939
940/*************************************************
941* Find forward referenced subpattern *
942*************************************************/
943
944/* This function scans along a pattern's text looking for capturing
945subpatterns, and counting them. If it finds a named pattern that matches the
946name it is given, it returns its number. Alternatively, if the name is NULL, it
947returns when it reaches a given numbered subpattern. This is used for forward
948references to subpatterns. We know that if (?P< is encountered, the name will
949be terminated by '>' because that is checked in the first pass.
950
951Arguments:
952 ptr current position in the pattern
953 count current count of capturing parens so far encountered
954 name name to seek, or NULL if seeking a numbered subpattern
955 lorn name length, or subpattern number if name is NULL
956 xmode TRUE if we are in /x mode
957
958Returns: the number of the named subpattern, or -1 if not found
959*/
960
961static int
962find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
963 BOOL xmode)
964{
965const uschar *thisname;
966
967for (; *ptr != 0; ptr++)
968 {
969 int term;
970
971 /* Skip over backslashed characters and also entire \Q...\E */
972
973 if (*ptr == '\\')
974 {
975 if (*(++ptr) == 0) return -1;
976 if (*ptr == 'Q') for (;;)
977 {
978 while (*(++ptr) != 0 && *ptr != '\\');
979 if (*ptr == 0) return -1;
980 if (*(++ptr) == 'E') break;
981 }
982 continue;
983 }
984
985 /* Skip over character classes */
986
987 if (*ptr == '[')
988 {
989 while (*(++ptr) != ']')
990 {
991 if (*ptr == 0) return -1;
992 if (*ptr == '\\')
993 {
994 if (*(++ptr) == 0) return -1;
995 if (*ptr == 'Q') for (;;)
996 {
997 while (*(++ptr) != 0 && *ptr != '\\');
998 if (*ptr == 0) return -1;
999 if (*(++ptr) == 'E') break;
1000 }
1001 continue;
1002 }
1003 }
1004 continue;
1005 }
1006
1007 /* Skip comments in /x mode */
1008
1009 if (xmode && *ptr == '#')
1010 {
1011 while (*(++ptr) != 0 && *ptr != '\n');
1012 if (*ptr == 0) return -1;
1013 continue;
1014 }
1015
1016 /* An opening parens must now be a real metacharacter */
1017
1018 if (*ptr != '(') continue;
1019 if (ptr[1] != '?' && ptr[1] != '*')
1020 {
1021 count++;
1022 if (name == NULL && count == lorn) return count;
1023 continue;
1024 }
1025
1026 ptr += 2;
1027 if (*ptr == 'P') ptr++; /* Allow optional P */
1028
1029 /* We have to disambiguate (?<! and (?<= from (?<name> */
1030
1031 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1032 *ptr != '\'')
1033 continue;
1034
1035 count++;
1036
1037 if (name == NULL && count == lorn) return count;
1038 term = *ptr++;
1039 if (term == '<') term = '>';
1040 thisname = ptr;
1041 while (*ptr != term) ptr++;
1042 if (name != NULL && lorn == ptr - thisname &&
1043 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1044 return count;
1045 }
1046
1047return -1;
1048}
1049
1050
1051
1052/*************************************************
1053* Find first significant op code *
1054*************************************************/
1055
1056/* This is called by several functions that scan a compiled expression looking
1057for a fixed first character, or an anchoring op code etc. It skips over things
1058that do not influence this. For some calls, a change of option is important.
1059For some calls, it makes sense to skip negative forward and all backward
1060assertions, and also the \b assertion; for others it does not.
1061
1062Arguments:
1063 code pointer to the start of the group
1064 options pointer to external options
1065 optbit the option bit whose changing is significant, or
1066 zero if none are
1067 skipassert TRUE if certain assertions are to be skipped
1068
1069Returns: pointer to the first significant opcode
1070*/
1071
1072static const uschar*
1073first_significant_code(const uschar *code, int *options, int optbit,
1074 BOOL skipassert)
1075{
1076for (;;)
1077 {
1078 switch ((int)*code)
1079 {
1080 case OP_OPT:
1081 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1082 *options = (int)code[1];
1083 code += 2;
1084 break;
1085
1086 case OP_ASSERT_NOT:
1087 case OP_ASSERTBACK:
1088 case OP_ASSERTBACK_NOT:
1089 if (!skipassert) return code;
1090 do code += GET(code, 1); while (*code == OP_ALT);
1091 code += _pcre_OP_lengths[*code];
1092 break;
1093
1094 case OP_WORD_BOUNDARY:
1095 case OP_NOT_WORD_BOUNDARY:
1096 if (!skipassert) return code;
1097 /* Fall through */
1098
1099 case OP_CALLOUT:
1100 case OP_CREF:
1101 case OP_RREF:
1102 case OP_DEF:
1103 code += _pcre_OP_lengths[*code];
1104 break;
1105
1106 default:
1107 return code;
1108 }
1109 }
1110/* Control never reaches here */
1111}
1112
1113
1114
1115
1116/*************************************************
1117* Find the fixed length of a pattern *
1118*************************************************/
1119
1120/* Scan a pattern and compute the fixed length of subject that will match it,
1121if the length is fixed. This is needed for dealing with backward assertions.
1122In UTF8 mode, the result is in characters rather than bytes.
1123
1124Arguments:
1125 code points to the start of the pattern (the bracket)
1126 options the compiling options
1127
1128Returns: the fixed length, or -1 if there is no fixed length,
1129 or -2 if \C was encountered
1130*/
1131
1132static int
1133find_fixedlength(uschar *code, int options)
1134{
1135int length = -1;
1136
1137register int branchlength = 0;
1138register uschar *cc = code + 1 + LINK_SIZE;
1139
1140/* Scan along the opcodes for this branch. If we get to the end of the
1141branch, check the length against that of the other branches. */
1142
1143for (;;)
1144 {
1145 int d;
1146 register int op = *cc;
1147 switch (op)
1148 {
1149 case OP_CBRA:
1150 case OP_BRA:
1151 case OP_ONCE:
1152 case OP_COND:
1153 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1154 if (d < 0) return d;
1155 branchlength += d;
1156 do cc += GET(cc, 1); while (*cc == OP_ALT);
1157 cc += 1 + LINK_SIZE;
1158 break;
1159
1160 /* Reached end of a branch; if it's a ket it is the end of a nested
1161 call. If it's ALT it is an alternation in a nested call. If it is
1162 END it's the end of the outer call. All can be handled by the same code. */
1163
1164 case OP_ALT:
1165 case OP_KET:
1166 case OP_KETRMAX:
1167 case OP_KETRMIN:
1168 case OP_END:
1169 if (length < 0) length = branchlength;
1170 else if (length != branchlength) return -1;
1171 if (*cc != OP_ALT) return length;
1172 cc += 1 + LINK_SIZE;
1173 branchlength = 0;
1174 break;
1175
1176 /* Skip over assertive subpatterns */
1177
1178 case OP_ASSERT:
1179 case OP_ASSERT_NOT:
1180 case OP_ASSERTBACK:
1181 case OP_ASSERTBACK_NOT:
1182 do cc += GET(cc, 1); while (*cc == OP_ALT);
1183 /* Fall through */
1184
1185 /* Skip over things that don't match chars */
1186
1187 case OP_REVERSE:
1188 case OP_CREF:
1189 case OP_RREF:
1190 case OP_DEF:
1191 case OP_OPT:
1192 case OP_CALLOUT:
1193 case OP_SOD:
1194 case OP_SOM:
1195 case OP_EOD:
1196 case OP_EODN:
1197 case OP_CIRC:
1198 case OP_DOLL:
1199 case OP_NOT_WORD_BOUNDARY:
1200 case OP_WORD_BOUNDARY:
1201 cc += _pcre_OP_lengths[*cc];
1202 break;
1203
1204 /* Handle literal characters */
1205
1206 case OP_CHAR:
1207 case OP_CHARNC:
1208 case OP_NOT:
1209 branchlength++;
1210 cc += 2;
1211#ifdef SUPPORT_UTF8
1212 if ((options & PCRE_UTF8) != 0)
1213 {
1214 while ((*cc & 0xc0) == 0x80) cc++;
1215 }
1216#endif
1217 break;
1218
1219 /* Handle exact repetitions. The count is already in characters, but we
1220 need to skip over a multibyte character in UTF8 mode. */
1221
1222 case OP_EXACT:
1223 branchlength += GET2(cc,1);
1224 cc += 4;
1225#ifdef SUPPORT_UTF8
1226 if ((options & PCRE_UTF8) != 0)
1227 {
1228 while((*cc & 0x80) == 0x80) cc++;
1229 }
1230#endif
1231 break;
1232
1233 case OP_TYPEEXACT:
1234 branchlength += GET2(cc,1);
1235 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1236 cc += 4;
1237 break;
1238
1239 /* Handle single-char matchers */
1240
1241 case OP_PROP:
1242 case OP_NOTPROP:
1243 cc += 2;
1244 /* Fall through */
1245
1246 case OP_NOT_DIGIT:
1247 case OP_DIGIT:
1248 case OP_NOT_WHITESPACE:
1249 case OP_WHITESPACE:
1250 case OP_NOT_WORDCHAR:
1251 case OP_WORDCHAR:
1252 case OP_ANY:
1253 branchlength++;
1254 cc++;
1255 break;
1256
1257 /* The single-byte matcher isn't allowed */
1258
1259 case OP_ANYBYTE:
1260 return -2;
1261
1262 /* Check a class for variable quantification */
1263
1264#ifdef SUPPORT_UTF8
1265 case OP_XCLASS:
1266 cc += GET(cc, 1) - 33;
1267 /* Fall through */
1268#endif
1269
1270 case OP_CLASS:
1271 case OP_NCLASS:
1272 cc += 33;
1273
1274 switch (*cc)
1275 {
1276 case OP_CRSTAR:
1277 case OP_CRMINSTAR:
1278 case OP_CRQUERY:
1279 case OP_CRMINQUERY:
1280 return -1;
1281
1282 case OP_CRRANGE:
1283 case OP_CRMINRANGE:
1284 if (GET2(cc,1) != GET2(cc,3)) return -1;
1285 branchlength += GET2(cc,1);
1286 cc += 5;
1287 break;
1288
1289 default:
1290 branchlength++;
1291 }
1292 break;
1293
1294 /* Anything else is variable length */
1295
1296 default:
1297 return -1;
1298 }
1299 }
1300/* Control never gets here */
1301}
1302
1303
1304
1305
1306/*************************************************
1307* Scan compiled regex for numbered bracket *
1308*************************************************/
1309
1310/* This little function scans through a compiled pattern until it finds a
1311capturing bracket with the given number.
1312
1313Arguments:
1314 code points to start of expression
1315 utf8 TRUE in UTF-8 mode
1316 number the required bracket number
1317
1318Returns: pointer to the opcode for the bracket, or NULL if not found
1319*/
1320
1321static const uschar *
1322find_bracket(const uschar *code, BOOL utf8, int number)
1323{
1324for (;;)
1325 {
1326 register int c = *code;
1327 if (c == OP_END) return NULL;
1328
1329 /* XCLASS is used for classes that cannot be represented just by a bit
1330 map. This includes negated single high-valued characters. The length in
1331 the table is zero; the actual length is stored in the compiled code. */
1332
1333 if (c == OP_XCLASS) code += GET(code, 1);
1334
1335 /* Handle capturing bracket */
1336
1337 else if (c == OP_CBRA)
1338 {
1339 int n = GET2(code, 1+LINK_SIZE);
1340 if (n == number) return (uschar *)code;
1341 code += _pcre_OP_lengths[c];
1342 }
1343
1344 /* Otherwise, we can get the item's length from the table, except that for
1345 repeated character types, we have to test for \p and \P, which have an extra
1346 two bytes of parameters. */
1347
1348 else
1349 {
1350 switch(c)
1351 {
1352 case OP_TYPESTAR:
1353 case OP_TYPEMINSTAR:
1354 case OP_TYPEPLUS:
1355 case OP_TYPEMINPLUS:
1356 case OP_TYPEQUERY:
1357 case OP_TYPEMINQUERY:
1358 case OP_TYPEPOSSTAR:
1359 case OP_TYPEPOSPLUS:
1360 case OP_TYPEPOSQUERY:
1361 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1362 break;
1363
1364 case OP_TYPEUPTO:
1365 case OP_TYPEMINUPTO:
1366 case OP_TYPEEXACT:
1367 case OP_TYPEPOSUPTO:
1368 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1369 break;
1370 }
1371
1372 /* Add in the fixed length from the table */
1373
1374 code += _pcre_OP_lengths[c];
1375
1376 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1377 a multi-byte character. The length in the table is a minimum, so we have to
1378 arrange to skip the extra bytes. */
1379
1380#ifdef SUPPORT_UTF8
1381 if (utf8) switch(c)
1382 {
1383 case OP_CHAR:
1384 case OP_CHARNC:
1385 case OP_EXACT:
1386 case OP_UPTO:
1387 case OP_MINUPTO:
1388 case OP_POSUPTO:
1389 case OP_STAR:
1390 case OP_MINSTAR:
1391 case OP_POSSTAR:
1392 case OP_PLUS:
1393 case OP_MINPLUS:
1394 case OP_POSPLUS:
1395 case OP_QUERY:
1396 case OP_MINQUERY:
1397 case OP_POSQUERY:
1398 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1399 break;
1400 }
1401#endif
1402 }
1403 }
1404}
1405
1406
1407
1408/*************************************************
1409* Scan compiled regex for recursion reference *
1410*************************************************/
1411
1412/* This little function scans through a compiled pattern until it finds an
1413instance of OP_RECURSE.
1414
1415Arguments:
1416 code points to start of expression
1417 utf8 TRUE in UTF-8 mode
1418
1419Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1420*/
1421
1422static const uschar *
1423find_recurse(const uschar *code, BOOL utf8)
1424{
1425for (;;)
1426 {
1427 register int c = *code;
1428 if (c == OP_END) return NULL;
1429 if (c == OP_RECURSE) return code;
1430
1431 /* XCLASS is used for classes that cannot be represented just by a bit
1432 map. This includes negated single high-valued characters. The length in
1433 the table is zero; the actual length is stored in the compiled code. */
1434
1435 if (c == OP_XCLASS) code += GET(code, 1);
1436
1437 /* Otherwise, we can get the item's length from the table, except that for
1438 repeated character types, we have to test for \p and \P, which have an extra
1439 two bytes of parameters. */
1440
1441 else
1442 {
1443 switch(c)
1444 {
1445 case OP_TYPESTAR:
1446 case OP_TYPEMINSTAR:
1447 case OP_TYPEPLUS:
1448 case OP_TYPEMINPLUS:
1449 case OP_TYPEQUERY:
1450 case OP_TYPEMINQUERY:
1451 case OP_TYPEPOSSTAR:
1452 case OP_TYPEPOSPLUS:
1453 case OP_TYPEPOSQUERY:
1454 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1455 break;
1456
1457 case OP_TYPEPOSUPTO:
1458 case OP_TYPEUPTO:
1459 case OP_TYPEMINUPTO:
1460 case OP_TYPEEXACT:
1461 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1462 break;
1463 }
1464
1465 /* Add in the fixed length from the table */
1466
1467 code += _pcre_OP_lengths[c];
1468
1469 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1470 by a multi-byte character. The length in the table is a minimum, so we have
1471 to arrange to skip the extra bytes. */
1472
1473#ifdef SUPPORT_UTF8
1474 if (utf8) switch(c)
1475 {
1476 case OP_CHAR:
1477 case OP_CHARNC:
1478 case OP_EXACT:
1479 case OP_UPTO:
1480 case OP_MINUPTO:
1481 case OP_POSUPTO:
1482 case OP_STAR:
1483 case OP_MINSTAR:
1484 case OP_POSSTAR:
1485 case OP_PLUS:
1486 case OP_MINPLUS:
1487 case OP_POSPLUS:
1488 case OP_QUERY:
1489 case OP_MINQUERY:
1490 case OP_POSQUERY:
1491 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1492 break;
1493 }
1494#endif
1495 }
1496 }
1497}
1498
1499
1500
1501/*************************************************
1502* Scan compiled branch for non-emptiness *
1503*************************************************/
1504
1505/* This function scans through a branch of a compiled pattern to see whether it
1506can match the empty string or not. It is called from could_be_empty()
1507below and from compile_branch() when checking for an unlimited repeat of a
1508group that can match nothing. Note that first_significant_code() skips over
1509backward and negative forward assertions when its final argument is TRUE. If we
1510hit an unclosed bracket, we return "empty" - this means we've struck an inner
1511bracket whose current branch will already have been scanned.
1512
1513Arguments:
1514 code points to start of search
1515 endcode points to where to stop
1516 utf8 TRUE if in UTF8 mode
1517
1518Returns: TRUE if what is matched could be empty
1519*/
1520
1521static BOOL
1522could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1523{
1524register int c;
1525for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1526 code < endcode;
1527 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1528 {
1529 const uschar *ccode;
1530
1531 c = *code;
1532
1533 /* Skip over forward assertions; the other assertions are skipped by
1534 first_significant_code() with a TRUE final argument. */
1535
1536 if (c == OP_ASSERT)
1537 {
1538 do code += GET(code, 1); while (*code == OP_ALT);
1539 c = *code;
1540 continue;
1541 }
1542
1543 /* Groups with zero repeats can of course be empty; skip them. */
1544
1545 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1546 {
1547 code += _pcre_OP_lengths[c];
1548 do code += GET(code, 1); while (*code == OP_ALT);
1549 c = *code;
1550 continue;
1551 }
1552
1553 /* For other groups, scan the branches. */
1554
1555 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1556 {
1557 BOOL empty_branch;
1558 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1559
1560 /* Scan a closed bracket */
1561
1562 empty_branch = FALSE;
1563 do
1564 {
1565 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1566 empty_branch = TRUE;
1567 code += GET(code, 1);
1568 }
1569 while (*code == OP_ALT);
1570 if (!empty_branch) return FALSE; /* All branches are non-empty */
1571 c = *code;
1572 continue;
1573 }
1574
1575 /* Handle the other opcodes */
1576
1577 switch (c)
1578 {
1579 /* Check for quantifiers after a class. XCLASS is used for classes that
1580 cannot be represented just by a bit map. This includes negated single
1581 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1582 actual length is stored in the compiled code, so we must update "code"
1583 here. */
1584
1585#ifdef SUPPORT_UTF8
1586 case OP_XCLASS:
1587 ccode = code += GET(code, 1);
1588 goto CHECK_CLASS_REPEAT;
1589#endif
1590
1591 case OP_CLASS:
1592 case OP_NCLASS:
1593 ccode = code + 33;
1594
1595#ifdef SUPPORT_UTF8
1596 CHECK_CLASS_REPEAT:
1597#endif
1598
1599 switch (*ccode)
1600 {
1601 case OP_CRSTAR: /* These could be empty; continue */
1602 case OP_CRMINSTAR:
1603 case OP_CRQUERY:
1604 case OP_CRMINQUERY:
1605 break;
1606
1607 default: /* Non-repeat => class must match */
1608 case OP_CRPLUS: /* These repeats aren't empty */
1609 case OP_CRMINPLUS:
1610 return FALSE;
1611
1612 case OP_CRRANGE:
1613 case OP_CRMINRANGE:
1614 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1615 break;
1616 }
1617 break;
1618
1619 /* Opcodes that must match a character */
1620
1621 case OP_PROP:
1622 case OP_NOTPROP:
1623 case OP_EXTUNI:
1624 case OP_NOT_DIGIT:
1625 case OP_DIGIT:
1626 case OP_NOT_WHITESPACE:
1627 case OP_WHITESPACE:
1628 case OP_NOT_WORDCHAR:
1629 case OP_WORDCHAR:
1630 case OP_ANY:
1631 case OP_ANYBYTE:
1632 case OP_CHAR:
1633 case OP_CHARNC:
1634 case OP_NOT:
1635 case OP_PLUS:
1636 case OP_MINPLUS:
1637 case OP_POSPLUS:
1638 case OP_EXACT:
1639 case OP_NOTPLUS:
1640 case OP_NOTMINPLUS:
1641 case OP_NOTPOSPLUS:
1642 case OP_NOTEXACT:
1643 case OP_TYPEPLUS:
1644 case OP_TYPEMINPLUS:
1645 case OP_TYPEPOSPLUS:
1646 case OP_TYPEEXACT:
1647 return FALSE;
1648
1649 /* These are going to continue, as they may be empty, but we have to
1650 fudge the length for the \p and \P cases. */
1651
1652 case OP_TYPESTAR:
1653 case OP_TYPEMINSTAR:
1654 case OP_TYPEPOSSTAR:
1655 case OP_TYPEQUERY:
1656 case OP_TYPEMINQUERY:
1657 case OP_TYPEPOSQUERY:
1658 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1659 break;
1660
1661 /* Same for these */
1662
1663 case OP_TYPEUPTO:
1664 case OP_TYPEMINUPTO:
1665 case OP_TYPEPOSUPTO:
1666 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1667 break;
1668
1669 /* End of branch */
1670
1671 case OP_KET:
1672 case OP_KETRMAX:
1673 case OP_KETRMIN:
1674 case OP_ALT:
1675 return TRUE;
1676
1677 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1678 MINUPTO, and POSUPTO may be followed by a multibyte character */
1679
1680#ifdef SUPPORT_UTF8
1681 case OP_STAR:
1682 case OP_MINSTAR:
1683 case OP_POSSTAR:
1684 case OP_QUERY:
1685 case OP_MINQUERY:
1686 case OP_POSQUERY:
1687 case OP_UPTO:
1688 case OP_MINUPTO:
1689 case OP_POSUPTO:
1690 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1691 break;
1692#endif
1693 }
1694 }
1695
1696return TRUE;
1697}
1698
1699
1700
1701/*************************************************
1702* Scan compiled regex for non-emptiness *
1703*************************************************/
1704
1705/* This function is called to check for left recursive calls. We want to check
1706the current branch of the current pattern to see if it could match the empty
1707string. If it could, we must look outwards for branches at other levels,
1708stopping when we pass beyond the bracket which is the subject of the recursion.
1709
1710Arguments:
1711 code points to start of the recursion
1712 endcode points to where to stop (current RECURSE item)
1713 bcptr points to the chain of current (unclosed) branch starts
1714 utf8 TRUE if in UTF-8 mode
1715
1716Returns: TRUE if what is matched could be empty
1717*/
1718
1719static BOOL
1720could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1721 BOOL utf8)
1722{
1723while (bcptr != NULL && bcptr->current >= code)
1724 {
1725 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1726 bcptr = bcptr->outer;
1727 }
1728return TRUE;
1729}
1730
1731
1732
1733/*************************************************
1734* Check for POSIX class syntax *
1735*************************************************/
1736
1737/* This function is called when the sequence "[:" or "[." or "[=" is
1738encountered in a character class. It checks whether this is followed by a
1739sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1740reach an unescaped ']' without the special preceding character, return FALSE.
1741
1742Originally, this function only recognized a sequence of letters between the
1743terminators, but it seems that Perl recognizes any sequence of characters,
1744though of course unknown POSIX names are subsequently rejected. Perl gives an
1745"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1746didn't consider this to be a POSIX class. Likewise for [:1234:].
1747
1748The problem in trying to be exactly like Perl is in the handling of escapes. We
1749have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1750class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1751below handles the special case of \], but does not try to do any other escape
1752processing. This makes it different from Perl for cases such as [:l\ower:]
1753where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1754"l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1755I think.
1756
1757Arguments:
1758 ptr pointer to the initial [
1759 endptr where to return the end pointer
1760
1761Returns: TRUE or FALSE
1762*/
1763
1764static BOOL
1765check_posix_syntax(const uschar *ptr, const uschar **endptr)
1766{
1767int terminator; /* Don't combine these lines; the Solaris cc */
1768terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1769for (++ptr; *ptr != 0; ptr++)
1770 {
1771 if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1772 {
1773 if (*ptr == ']') return FALSE;
1774 if (*ptr == terminator && ptr[1] == ']')
1775 {
1776 *endptr = ptr;
1777 return TRUE;
1778 }
1779 }
1780 }
1781return FALSE;
1782}
1783
1784
1785
1786
1787/*************************************************
1788* Check POSIX class name *
1789*************************************************/
1790
1791/* This function is called to check the name given in a POSIX-style class entry
1792such as [:alnum:].
1793
1794Arguments:
1795 ptr points to the first letter
1796 len the length of the name
1797
1798Returns: a value representing the name, or -1 if unknown
1799*/
1800
1801static int
1802check_posix_name(const uschar *ptr, int len)
1803{
1804const char *pn = posix_names;
1805register int yield = 0;
1806while (posix_name_lengths[yield] != 0)
1807 {
1808 if (len == posix_name_lengths[yield] &&
1809 strncmp((const char *)ptr, pn, len) == 0) return yield;
1810 pn += posix_name_lengths[yield] + 1;
1811 yield++;
1812 }
1813return -1;
1814}
1815
1816
1817/*************************************************
1818* Adjust OP_RECURSE items in repeated group *
1819*************************************************/
1820
1821/* OP_RECURSE items contain an offset from the start of the regex to the group
1822that is referenced. This means that groups can be replicated for fixed
1823repetition simply by copying (because the recursion is allowed to refer to
1824earlier groups that are outside the current group). However, when a group is
1825optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1826it, after it has been compiled. This means that any OP_RECURSE items within it
1827that refer to the group itself or any contained groups have to have their
1828offsets adjusted. That one of the jobs of this function. Before it is called,
1829the partially compiled regex must be temporarily terminated with OP_END.
1830
1831This function has been extended with the possibility of forward references for
1832recursions and subroutine calls. It must also check the list of such references
1833for the group we are dealing with. If it finds that one of the recursions in
1834the current group is on this list, it adjusts the offset in the list, not the
1835value in the reference (which is a group number).
1836
1837Arguments:
1838 group points to the start of the group
1839 adjust the amount by which the group is to be moved
1840 utf8 TRUE in UTF-8 mode
1841 cd contains pointers to tables etc.
1842 save_hwm the hwm forward reference pointer at the start of the group
1843
1844Returns: nothing
1845*/
1846
1847static void
1848adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1849 uschar *save_hwm)
1850{
1851uschar *ptr = group;
1852
1853while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1854 {
1855 int offset;
1856 uschar *hc;
1857
1858 /* See if this recursion is on the forward reference list. If so, adjust the
1859 reference. */
1860
1861 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1862 {
1863 offset = GET(hc, 0);
1864 if (cd->start_code + offset == ptr + 1)
1865 {
1866 PUT(hc, 0, offset + adjust);
1867 break;
1868 }
1869 }
1870
1871 /* Otherwise, adjust the recursion offset if it's after the start of this
1872 group. */
1873
1874 if (hc >= cd->hwm)
1875 {
1876 offset = GET(ptr, 1);
1877 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1878 }
1879
1880 ptr += 1 + LINK_SIZE;
1881 }
1882}
1883
1884
1885
1886/*************************************************
1887* Insert an automatic callout point *
1888*************************************************/
1889
1890/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1891callout points before each pattern item.
1892
1893Arguments:
1894 code current code pointer
1895 ptr current pattern pointer
1896 cd pointers to tables etc
1897
1898Returns: new code pointer
1899*/
1900
1901static uschar *
1902auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1903{
1904*code++ = OP_CALLOUT;
1905*code++ = 255;
1906PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1907PUT(code, LINK_SIZE, 0); /* Default length */
1908return code + 2*LINK_SIZE;
1909}
1910
1911
1912
1913/*************************************************
1914* Complete a callout item *
1915*************************************************/
1916
1917/* A callout item contains the length of the next item in the pattern, which
1918we can't fill in till after we have reached the relevant point. This is used
1919for both automatic and manual callouts.
1920
1921Arguments:
1922 previous_callout points to previous callout item
1923 ptr current pattern pointer
1924 cd pointers to tables etc
1925
1926Returns: nothing
1927*/
1928
1929static void
1930complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1931{
1932int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1933PUT(previous_callout, 2 + LINK_SIZE, length);
1934}
1935
1936
1937
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address.

A returned range begins at the other case of the first character, at or after
the start, that has one. It is extended for as long as the other cases of the
following characters carry on consecutively.

Arguments:
  cptr        points to starting character value; updated (only when TRUE
                is returned) to the first character not yet dealt with
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c = *cptr;
unsigned int first = NOTACHAR;
unsigned int expected;

/* Find the first character in the range that has another case. */

while (c <= d && (first = _pcre_ucp_othercase(c)) == NOTACHAR) c++;
if (c > d) return FALSE;

*ocptr = first;

/* Extend the range while the other cases remain consecutive. */

expected = first + 1;
c++;
while (c <= d && _pcre_ucp_othercase(c) == expected)
  {
  expected++;
  c++;
  }

*odptr = expected - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1983
1984
1985
1986/*************************************************
1987* Check if auto-possessifying is possible *
1988*************************************************/
1989
1990/* This function is called for unlimited repeats of certain items, to see
1991whether the next thing could possibly match the repeated item. If not, it makes
1992sense to automatically possessify the repeated item.
1993
1994Arguments:
1995 op_code the repeated op code
1996 item data for this item, depends on the opcode
1997 utf8 TRUE in UTF-8 mode
1998 utf8_char used for utf8 character bytes, NULL if not relevant
1999 ptr next character in pattern
2000 options options bits
2001 cd contains pointers to tables etc.
2002
2003Returns: TRUE if possessifying is wanted
2004*/
2005
2006static BOOL
2007check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2008 const uschar *ptr, int options, compile_data *cd)
2009{
2010int next;
2011
2012/* Skip whitespace and comments in extended mode */
2013
2014if ((options & PCRE_EXTENDED) != 0)
2015 {
2016 for (;;)
2017 {
2018 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2019 if (*ptr == '#')
2020 {
2021 while (*(++ptr) != 0)
2022 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2023 }
2024 else break;
2025 }
2026 }
2027
2028/* If the next item is one that we can handle, get its value. A non-negative
2029value is a character, a negative value is an escape value. */
2030
2031if (*ptr == '\\')
2032 {
2033 int temperrorcode = 0;
2034 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2035 if (temperrorcode != 0) return FALSE;
2036 ptr++; /* Point after the escape sequence */
2037 }
2038
2039else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2040 {
2041#ifdef SUPPORT_UTF8
2042 if (utf8) { GETCHARINC(next, ptr); } else
2043#endif
2044 next = *ptr++;
2045 }
2046
2047else return FALSE;
2048
2049/* Skip whitespace and comments in extended mode */
2050
2051if ((options & PCRE_EXTENDED) != 0)
2052 {
2053 for (;;)
2054 {
2055 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2056 if (*ptr == '#')
2057 {
2058 while (*(++ptr) != 0)
2059 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2060 }
2061 else break;
2062 }
2063 }
2064
2065/* If the next thing is itself optional, we have to give up. */
2066
2067if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2068 return FALSE;
2069
2070/* Now compare the next item with the previous opcode. If the previous is a
2071positive single character match, "item" either contains the character or, if
2072"item" is greater than 127 in utf8 mode, the character's bytes are in
2073utf8_char. */
2074
2075
2076/* Handle cases when the next item is a character. */
2077
2078if (next >= 0) switch(op_code)
2079 {
2080 case OP_CHAR:
2081#ifdef SUPPORT_UTF8
2082 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2083#endif
2084 return item != next;
2085
2086 /* For CHARNC (caseless character) we must check the other case. If we have
2087 Unicode property support, we can use it to test the other case of
2088 high-valued characters. */
2089
2090 case OP_CHARNC:
2091#ifdef SUPPORT_UTF8
2092 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2093#endif
2094 if (item == next) return FALSE;
2095#ifdef SUPPORT_UTF8
2096 if (utf8)
2097 {
2098 unsigned int othercase;
2099 if (next < 128) othercase = cd->fcc[next]; else
2100#ifdef SUPPORT_UCP
2101 othercase = _pcre_ucp_othercase((unsigned int)next);
2102#else
2103 othercase = NOTACHAR;
2104#endif
2105 return (unsigned int)item != othercase;
2106 }
2107 else
2108#endif /* SUPPORT_UTF8 */
2109 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2110
2111 /* For OP_NOT, "item" must be a single-byte character. */
2112
2113 case OP_NOT:
2114 if (next < 0) return FALSE; /* Not a character */
2115 if (item == next) return TRUE;
2116 if ((options & PCRE_CASELESS) == 0) return FALSE;
2117#ifdef SUPPORT_UTF8
2118 if (utf8)
2119 {
2120 unsigned int othercase;
2121 if (next < 128) othercase = cd->fcc[next]; else
2122#ifdef SUPPORT_UCP
2123 othercase = _pcre_ucp_othercase(next);
2124#else
2125 othercase = NOTACHAR;
2126#endif
2127 return (unsigned int)item == othercase;
2128 }
2129 else
2130#endif /* SUPPORT_UTF8 */
2131 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2132
2133 case OP_DIGIT:
2134 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2135
2136 case OP_NOT_DIGIT:
2137 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2138
2139 case OP_WHITESPACE:
2140 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2141
2142 case OP_NOT_WHITESPACE:
2143 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2144
2145 case OP_WORDCHAR:
2146 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2147
2148 case OP_NOT_WORDCHAR:
2149 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2150
2151 case OP_HSPACE:
2152 case OP_NOT_HSPACE:
2153 switch(next)
2154 {
2155 case 0x09:
2156 case 0x20:
2157 case 0xa0:
2158 case 0x1680:
2159 case 0x180e:
2160 case 0x2000:
2161 case 0x2001:
2162 case 0x2002:
2163 case 0x2003:
2164 case 0x2004:
2165 case 0x2005:
2166 case 0x2006:
2167 case 0x2007:
2168 case 0x2008:
2169 case 0x2009:
2170 case 0x200A:
2171 case 0x202f:
2172 case 0x205f:
2173 case 0x3000:
2174 return op_code != OP_HSPACE;
2175 default:
2176 return op_code == OP_HSPACE;
2177 }
2178
2179 case OP_VSPACE:
2180 case OP_NOT_VSPACE:
2181 switch(next)
2182 {
2183 case 0x0a:
2184 case 0x0b:
2185 case 0x0c:
2186 case 0x0d:
2187 case 0x85:
2188 case 0x2028:
2189 case 0x2029:
2190 return op_code != OP_VSPACE;
2191 default:
2192 return op_code == OP_VSPACE;
2193 }
2194
2195 default:
2196 return FALSE;
2197 }
2198
2199
2200/* Handle the case when the next item is \d, \s, etc. */
2201
2202switch(op_code)
2203 {
2204 case OP_CHAR:
2205 case OP_CHARNC:
2206#ifdef SUPPORT_UTF8
2207 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2208#endif
2209 switch(-next)
2210 {
2211 case ESC_d:
2212 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2213
2214 case ESC_D:
2215 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2216
2217 case ESC_s:
2218 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2219
2220 case ESC_S:
2221 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2222
2223 case ESC_w:
2224 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2225
2226 case ESC_W:
2227 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2228
2229 case ESC_h:
2230 case ESC_H:
2231 switch(item)
2232 {
2233 case 0x09:
2234 case 0x20:
2235 case 0xa0:
2236 case 0x1680:
2237 case 0x180e:
2238 case 0x2000:
2239 case 0x2001:
2240 case 0x2002:
2241 case 0x2003:
2242 case 0x2004:
2243 case 0x2005:
2244 case 0x2006:
2245 case 0x2007:
2246 case 0x2008:
2247 case 0x2009:
2248 case 0x200A:
2249 case 0x202f:
2250 case 0x205f:
2251 case 0x3000:
2252 return -next != ESC_h;
2253 default:
2254 return -next == ESC_h;
2255 }
2256
2257 case ESC_v:
2258 case ESC_V:
2259 switch(item)
2260 {
2261 case 0x0a:
2262 case 0x0b:
2263 case 0x0c:
2264 case 0x0d:
2265 case 0x85:
2266 case 0x2028:
2267 case 0x2029:
2268 return -next != ESC_v;
2269 default:
2270 return -next == ESC_v;
2271 }
2272
2273 default:
2274 return FALSE;
2275 }
2276
2277 case OP_DIGIT:
2278 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2279 next == -ESC_h || next == -ESC_v;
2280
2281 case OP_NOT_DIGIT:
2282 return next == -ESC_d;
2283
2284 case OP_WHITESPACE:
2285 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2286
2287 case OP_NOT_WHITESPACE:
2288 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2289
2290 case OP_HSPACE:
2291 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2292
2293 case OP_NOT_HSPACE:
2294 return next == -ESC_h;
2295
2296 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2297 case OP_VSPACE:
2298 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2299
2300 case OP_NOT_VSPACE:
2301 return next == -ESC_v;
2302
2303 case OP_WORDCHAR:
2304 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2305
2306 case OP_NOT_WORDCHAR:
2307 return next == -ESC_w || next == -ESC_d;
2308
2309 default:
2310 return FALSE;
2311 }
2312
2313/* Control does not reach here */
2314}
2315
2316
2317
2318/*************************************************
2319* Compile one branch *
2320*************************************************/
2321
/* Scan the pattern, compiling it into a vector. If the options are
2323changed during the branch, the pointer is used to change the external options
2324bits. This function is used during the pre-compile phase when we are trying
2325to find out the amount of memory needed, as well as during the real compile
2326phase. The value of lengthptr distinguishes the two phases.
2327
2328Arguments:
2329 optionsptr pointer to the option bits
2330 codeptr points to the pointer to the current code point
2331 ptrptr points to the current pattern pointer
2332 errorcodeptr points to error code variable
2333 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2334 reqbyteptr set to the last literal character required, else < 0
2335 bcptr points to current branch chain
2336 cd contains pointers to tables etc.
2337 lengthptr NULL during the real compile phase
2338 points to length accumulator during pre-compile phase
2339
2340Returns: TRUE on success
2341 FALSE, with *errorcodeptr set non-zero on error
2342*/
2343
2344static BOOL
2345compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2346 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2347 compile_data *cd, int *lengthptr)
2348{
2349int repeat_type, op_type;
2350int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2351int bravalue = 0;
2352int greedy_default, greedy_non_default;
2353int firstbyte, reqbyte;
2354int zeroreqbyte, zerofirstbyte;
2355int req_caseopt, reqvary, tempreqvary;
2356int options = *optionsptr;
2357int after_manual_callout = 0;
2358int length_prevgroup = 0;
2359register int c;
2360register uschar *code = *codeptr;
2361uschar *last_code = code;
2362uschar *orig_code = code;
2363uschar *tempcode;
2364BOOL inescq = FALSE;
2365BOOL groupsetfirstbyte = FALSE;
2366const uschar *ptr = *ptrptr;
2367const uschar *tempptr;
2368uschar *previous = NULL;
2369uschar *previous_callout = NULL;
2370uschar *save_hwm = NULL;
2371uschar classbits[32];
2372
2373#ifdef SUPPORT_UTF8
2374BOOL class_utf8;
2375BOOL utf8 = (options & PCRE_UTF8) != 0;
2376uschar *class_utf8data;
2377uschar *class_utf8data_base;
2378uschar utf8_char[6];
2379#else
2380BOOL utf8 = FALSE;
2381uschar *utf8_char = NULL;
2382#endif
2383
2384#ifdef DEBUG
2385if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2386#endif
2387
2388/* Set up the default and non-default settings for greediness */
2389
2390greedy_default = ((options & PCRE_UNGREEDY) != 0);
2391greedy_non_default = greedy_default ^ 1;
2392
2393/* Initialize no first byte, no required byte. REQ_UNSET means "no char
2394matching encountered yet". It gets changed to REQ_NONE if we hit something that
2395matches a non-fixed char first char; reqbyte just remains unset if we never
2396find one.
2397
2398When we hit a repeat whose minimum is zero, we may have to adjust these values
2399to take the zero repeat into account. This is implemented by setting them to
2400zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2401item types that can be repeated set these backoff variables appropriately. */
2402
2403firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2404
2405/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2406according to the current setting of the caseless flag. REQ_CASELESS is a bit
2407value > 255. It is added into the firstbyte or reqbyte variables to record the
2408case status of the value. This is used only for ASCII characters. */
2409
2410req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2411
2412/* Switch on next character until the end of the branch */
2413
2414for (;; ptr++)
2415 {
2416 BOOL negate_class;
2417 BOOL should_flip_negation;
2418 BOOL possessive_quantifier;
2419 BOOL is_quantifier;
2420 BOOL is_recurse;
2421 BOOL reset_bracount;
2422 int class_charcount;
2423 int class_lastchar;
2424 int newoptions;
2425 int recno;
2426 int refsign;
2427 int skipbytes;
2428 int subreqbyte;
2429 int subfirstbyte;
2430 int terminator;
2431 int mclength;
2432 uschar mcbuffer[8];
2433
2434 /* Get next byte in the pattern */
2435
2436 c = *ptr;
2437
2438 /* If we are in the pre-compile phase, accumulate the length used for the
2439 previous cycle of this loop. */
2440
2441 if (lengthptr != NULL)
2442 {
2443#ifdef DEBUG
2444 if (code > cd->hwm) cd->hwm = code; /* High water info */
2445#endif
2446 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2447 {
2448 *errorcodeptr = ERR52;
2449 goto FAILED;
2450 }
2451
2452 /* There is at least one situation where code goes backwards: this is the
2453 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2454 the class is simply eliminated. However, it is created first, so we have to
2455 allow memory for it. Therefore, don't ever reduce the length at this point.
2456 */
2457
2458 if (code < last_code) code = last_code;
2459
2460 /* Paranoid check for integer overflow */
2461
2462 if (OFLOW_MAX - *lengthptr < code - last_code)
2463 {
2464 *errorcodeptr = ERR20;
2465 goto FAILED;
2466 }
2467
2468 *lengthptr += code - last_code;
2469 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2470
2471 /* If "previous" is set and it is not at the start of the work space, move
2472 it back to there, in order to avoid filling up the work space. Otherwise,
2473 if "previous" is NULL, reset the current code pointer to the start. */
2474
2475 if (previous != NULL)
2476 {
2477 if (previous > orig_code)
2478 {
2479 memmove(orig_code, previous, code - previous);
2480 code -= previous - orig_code;
2481 previous = orig_code;
2482 }
2483 }
2484 else code = orig_code;
2485
2486 /* Remember where this code item starts so we can pick up the length
2487 next time round. */
2488
2489 last_code = code;
2490 }
2491
2492 /* In the real compile phase, just check the workspace used by the forward
2493 reference list. */
2494
2495 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2496 {
2497 *errorcodeptr = ERR52;
2498 goto FAILED;
2499 }
2500
2501 /* If in \Q...\E, check for the end; if not, we have a literal */
2502
2503 if (inescq && c != 0)
2504 {
2505 if (c == '\\' && ptr[1] == 'E')
2506 {
2507 inescq = FALSE;
2508 ptr++;
2509 continue;
2510 }
2511 else
2512 {
2513 if (previous_callout != NULL)
2514 {
2515 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2516 complete_callout(previous_callout, ptr, cd);
2517 previous_callout = NULL;
2518 }
2519 if ((options & PCRE_AUTO_CALLOUT) != 0)
2520 {
2521 previous_callout = code;
2522 code = auto_callout(code, ptr, cd);
2523 }
2524 goto NORMAL_CHAR;
2525 }
2526 }
2527
2528 /* Fill in length of a previous callout, except when the next thing is
2529 a quantifier. */
2530
2531 is_quantifier = c == '*' || c == '+' || c == '?' ||
2532 (c == '{' && is_counted_repeat(ptr+1));
2533
2534 if (!is_quantifier && previous_callout != NULL &&
2535 after_manual_callout-- <= 0)
2536 {
2537 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2538 complete_callout(previous_callout, ptr, cd);
2539 previous_callout = NULL;
2540 }
2541
2542 /* In extended mode, skip white space and comments */
2543
2544 if ((options & PCRE_EXTENDED) != 0)
2545 {
2546 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2547 if (c == '#')
2548 {
2549 while (*(++ptr) != 0)
2550 {
2551 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2552 }
2553 if (*ptr != 0) continue;
2554
2555 /* Else fall through to handle end of string */
2556 c = 0;
2557 }
2558 }
2559
2560 /* No auto callout for quantifiers. */
2561
2562 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2563 {
2564 previous_callout = code;
2565 code = auto_callout(code, ptr, cd);
2566 }
2567
2568 switch(c)
2569 {
2570 /* ===================================================================*/
2571 case 0: /* The branch terminates at string end */
2572 case '|': /* or | or ) */
2573 case ')':
2574 *firstbyteptr = firstbyte;
2575 *reqbyteptr = reqbyte;
2576 *codeptr = code;
2577 *ptrptr = ptr;
2578 if (lengthptr != NULL)
2579 {
2580 if (OFLOW_MAX - *lengthptr < code - last_code)
2581 {
2582 *errorcodeptr = ERR20;
2583 goto FAILED;
2584 }
2585 *lengthptr += code - last_code; /* To include callout length */
2586 DPRINTF((">> end branch\n"));
2587 }
2588 return TRUE;
2589
2590
2591 /* ===================================================================*/
2592 /* Handle single-character metacharacters. In multiline mode, ^ disables
2593 the setting of any following char as a first character. */
2594
2595 case '^':
2596 if ((options & PCRE_MULTILINE) != 0)
2597 {
2598 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2599 }
2600 previous = NULL;
2601 *code++ = OP_CIRC;
2602 break;
2603
2604 case '$':
2605 previous = NULL;
2606 *code++ = OP_DOLL;
2607 break;
2608
2609 /* There can never be a first char if '.' is first, whatever happens about
2610 repeats. The value of reqbyte doesn't change either. */
2611
2612 case '.':
2613 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2614 zerofirstbyte = firstbyte;
2615 zeroreqbyte = reqbyte;
2616 previous = code;
2617 *code++ = OP_ANY;
2618 break;
2619
2620
2621 /* ===================================================================*/
2622 /* Character classes. If the included characters are all < 256, we build a
2623 32-byte bitmap of the permitted characters, except in the special case
2624 where there is only one such character. For negated classes, we build the
2625 map as usual, then invert it at the end. However, we use a different opcode
2626 so that data characters > 255 can be handled correctly.
2627
2628 If the class contains characters outside the 0-255 range, a different
2629 opcode is compiled. It may optionally have a bit map for characters < 256,
    but those above are explicitly listed afterwards. A flag byte tells
2631 whether the bitmap is present, and whether this is a negated class or not.
2632 */
2633
2634 case '[':
2635 previous = code;
2636
2637 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2638 they are encountered at the top level, so we'll do that too. */
2639
2640 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2641 check_posix_syntax(ptr, &tempptr))
2642 {
2643 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2644 goto FAILED;
2645 }
2646
2647 /* If the first character is '^', set the negation flag and skip it. Also,
2648 if the first few characters (either before or after ^) are \Q\E or \E we
2649 skip them too. This makes for compatibility with Perl. */
2650
2651 negate_class = FALSE;
2652 for (;;)
2653 {
2654 c = *(++ptr);
2655 if (c == '\\')
2656 {
2657 if (ptr[1] == 'E') ptr++;
2658 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2659 else break;
2660 }
2661 else if (!negate_class && c == '^')
2662 negate_class = TRUE;
2663 else break;
2664 }
2665
2666 /* If a class contains a negative special such as \S, we need to flip the
2667 negation flag at the end, so that support for characters > 255 works
2668 correctly (they are all included in the class). */
2669
2670 should_flip_negation = FALSE;
2671
2672 /* Keep a count of chars with values < 256 so that we can optimize the case
2673 of just a single character (as long as it's < 256). However, For higher
2674 valued UTF-8 characters, we don't yet do any optimization. */
2675
2676 class_charcount = 0;
2677 class_lastchar = -1;
2678
2679 /* Initialize the 32-char bit map to all zeros. We build the map in a
2680 temporary bit of memory, in case the class contains only 1 character (less
2681 than 256), because in that case the compiled code doesn't use the bit map.
2682 */
2683
2684 memset(classbits, 0, 32 * sizeof(uschar));
2685
2686#ifdef SUPPORT_UTF8
2687 class_utf8 = FALSE; /* No chars >= 256 */
2688 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2689 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2690#endif
2691
2692 /* Process characters until ] is reached. By writing this as a "do" it
2693 means that an initial ] is taken as a data character. At the start of the
2694 loop, c contains the first byte of the character. */
2695
2696 if (c != 0) do
2697 {
2698 const uschar *oldptr;
2699
2700#ifdef SUPPORT_UTF8
2701 if (utf8 && c > 127)
2702 { /* Braces are required because the */
2703 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2704 }
2705
2706 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2707 data and reset the pointer. This is so that very large classes that
2708 contain a zillion UTF-8 characters no longer overwrite the work space
2709 (which is on the stack). */
2710
2711 if (lengthptr != NULL)
2712 {
2713 *lengthptr += class_utf8data - class_utf8data_base;
2714 class_utf8data = class_utf8data_base;
2715 }
2716
2717#endif
2718
2719 /* Inside \Q...\E everything is literal except \E */
2720
2721 if (inescq)
2722 {
2723 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2724 {
2725 inescq = FALSE; /* Reset literal state */
2726 ptr++; /* Skip the 'E' */
2727 continue; /* Carry on with next */
2728 }
2729 goto CHECK_RANGE; /* Could be range if \E follows */
2730 }
2731
2732 /* Handle POSIX class names. Perl allows a negation extension of the
2733 form [:^name:]. A square bracket that doesn't match the syntax is
2734 treated as a literal. We also recognize the POSIX constructions
2735 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2736 5.6 and 5.8 do. */
2737
2738 if (c == '[' &&
2739 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2740 check_posix_syntax(ptr, &tempptr))
2741 {
2742 BOOL local_negate = FALSE;
2743 int posix_class, taboffset, tabopt;
2744 register const uschar *cbits = cd->cbits;
2745 uschar pbits[32];
2746
2747 if (ptr[1] != ':')
2748 {
2749 *errorcodeptr = ERR31;
2750 goto FAILED;
2751 }
2752
2753 ptr += 2;
2754 if (*ptr == '^')
2755 {
2756 local_negate = TRUE;
2757 should_flip_negation = TRUE; /* Note negative special */
2758 ptr++;
2759 }
2760
2761 posix_class = check_posix_name(ptr, tempptr - ptr);
2762 if (posix_class < 0)
2763 {
2764 *errorcodeptr = ERR30;
2765 goto FAILED;
2766 }
2767
2768 /* If matching is caseless, upper and lower are converted to
2769 alpha. This relies on the fact that the class table starts with
2770 alpha, lower, upper as the first 3 entries. */
2771
2772 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2773 posix_class = 0;
2774
2775 /* We build the bit map for the POSIX class in a chunk of local store
2776 because we may be adding and subtracting from it, and we don't want to
2777 subtract bits that may be in the main map already. At the end we or the
2778 result into the bit map that is being built. */
2779
2780 posix_class *= 3;
2781
2782 /* Copy in the first table (always present) */
2783
2784 memcpy(pbits, cbits + posix_class_maps[posix_class],
2785 32 * sizeof(uschar));
2786
2787 /* If there is a second table, add or remove it as required. */
2788
2789 taboffset = posix_class_maps[posix_class + 1];
2790 tabopt = posix_class_maps[posix_class + 2];
2791
2792 if (taboffset >= 0)
2793 {
2794 if (tabopt >= 0)
2795 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2796 else
2797 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2798 }
2799
        /* Now see if we need to remove any special characters. An option
2801 value of 1 removes vertical space and 2 removes underscore. */
2802
2803 if (tabopt < 0) tabopt = -tabopt;
2804 if (tabopt == 1) pbits[1] &= ~0x3c;
2805 else if (tabopt == 2) pbits[11] &= 0x7f;
2806
2807 /* Add the POSIX table or its complement into the main table that is
2808 being built and we are done. */
2809
2810 if (local_negate)
2811 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2812 else
2813 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2814
2815 ptr = tempptr + 1;
2816 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2817 continue; /* End of POSIX syntax handling */
2818 }
2819
2820 /* Backslash may introduce a single character, or it may introduce one
2821 of the specials, which just set a flag. The sequence \b is a special
2822 case. Inside a class (and only there) it is treated as backspace.
2823 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2824 to 'or' into the one we are building. We assume they have more than one
2825 character in them, so set class_charcount bigger than one. */
2826
2827 if (c == '\\')
2828 {
2829 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2830 if (*errorcodeptr != 0) goto FAILED;
2831
2832 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2833 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2834 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2835 else if (-c == ESC_Q) /* Handle start of quoted string */
2836 {
2837 if (ptr[1] == '\\' && ptr[2] == 'E')
2838 {
2839 ptr += 2; /* avoid empty string */
2840 }
2841 else inescq = TRUE;
2842 continue;
2843 }
2844 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2845
2846 if (c < 0)
2847 {
2848 register const uschar *cbits = cd->cbits;
2849 class_charcount += 2; /* Greater than 1 is what matters */
2850
2851 /* Save time by not doing this in the pre-compile phase. */
2852
2853 if (lengthptr == NULL) switch (-c)
2854 {
2855 case ESC_d:
2856 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2857 continue;
2858
2859 case ESC_D:
2860 should_flip_negation = TRUE;
2861 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2862 continue;
2863
2864 case ESC_w:
2865 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2866 continue;
2867
2868 case ESC_W:
2869 should_flip_negation = TRUE;
2870 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2871 continue;
2872
2873 case ESC_s:
2874 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2875 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2876 continue;
2877
2878 case ESC_S:
2879 should_flip_negation = TRUE;
2880 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2881 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2882 continue;
2883
2884 default: /* Not recognized; fall through */
2885 break; /* Need "default" setting to stop compiler warning. */
2886 }
2887
2888 /* In the pre-compile phase, just do the recognition. */
2889
2890 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2891 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2892
2893 /* We need to deal with \H, \h, \V, and \v in both phases because
2894 they use extra memory. */
2895
2896 if (-c == ESC_h)
2897 {
2898 SETBIT(classbits, 0x09); /* VT */
2899 SETBIT(classbits, 0x20); /* SPACE */
2900 SETBIT(classbits, 0xa0); /* NSBP */
2901#ifdef SUPPORT_UTF8
2902 if (utf8)
2903 {
2904 class_utf8 = TRUE;
2905 *class_utf8data++ = XCL_SINGLE;
2906 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2907 *class_utf8data++ = XCL_SINGLE;
2908 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2909 *class_utf8data++ = XCL_RANGE;
2910 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2911 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2912 *class_utf8data++ = XCL_SINGLE;
2913 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2914 *class_utf8data++ = XCL_SINGLE;
2915 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2916 *class_utf8data++ = XCL_SINGLE;
2917 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2918 }
2919#endif
2920 continue;
2921 }
2922
2923 if (-c == ESC_H)
2924 {
2925 for (c = 0; c < 32; c++)
2926 {
2927 int x = 0xff;
2928 switch (c)
2929 {
2930 case 0x09/8: x ^= 1 << (0x09%8); break;
2931 case 0x20/8: x ^= 1 << (0x20%8); break;
2932 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2933 default: break;
2934 }
2935 classbits[c] |= x;
2936 }
2937
2938#ifdef SUPPORT_UTF8
2939 if (utf8)
2940 {
2941 class_utf8 = TRUE;
2942 *class_utf8data++ = XCL_RANGE;
2943 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2944 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2945 *class_utf8data++ = XCL_RANGE;
2946 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2947 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2948 *class_utf8data++ = XCL_RANGE;
2949 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2950 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2951 *class_utf8data++ = XCL_RANGE;
2952 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2953 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2954 *class_utf8data++ = XCL_RANGE;
2955 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2956 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2957 *class_utf8data++ = XCL_RANGE;
2958 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2959 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2960 *class_utf8data++ = XCL_RANGE;
2961 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2962 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2963 }
2964#endif
2965 continue;
2966 }
2967
2968 if (-c == ESC_v)
2969 {
2970 SETBIT(classbits, 0x0a); /* LF */
2971 SETBIT(classbits, 0x0b); /* VT */
2972 SETBIT(classbits, 0x0c); /* FF */
2973 SETBIT(classbits, 0x0d); /* CR */
2974 SETBIT(classbits, 0x85); /* NEL */
2975#ifdef SUPPORT_UTF8
2976 if (utf8)
2977 {
2978 class_utf8 = TRUE;
2979 *class_utf8data++ = XCL_RANGE;
2980 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2981 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2982 }
2983#endif
2984 continue;
2985 }
2986
2987 if (-c == ESC_V)
2988 {
2989 for (c = 0; c < 32; c++)
2990 {
2991 int x = 0xff;
2992 switch (c)
2993 {
2994 case 0x0a/8: x ^= 1 << (0x0a%8);
2995 x ^= 1 << (0x0b%8);
2996 x ^= 1 << (0x0c%8);
2997 x ^= 1 << (0x0d%8);
2998 break;
2999 case 0x85/8: x ^= 1 << (0x85%8); break;
3000 default: break;
3001 }
3002 classbits[c] |= x;
3003 }
3004
3005#ifdef SUPPORT_UTF8
3006 if (utf8)
3007 {
3008 class_utf8 = TRUE;
3009 *class_utf8data++ = XCL_RANGE;
3010 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3011 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3012 *class_utf8data++ = XCL_RANGE;
3013 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3014 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3015 }
3016#endif
3017 continue;
3018 }
3019
3020 /* We need to deal with \P and \p in both phases. */
3021
3022#ifdef SUPPORT_UCP
3023 if (-c == ESC_p || -c == ESC_P)
3024 {
3025 BOOL negated;
3026 int pdata;
3027 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3028 if (ptype < 0) goto FAILED;
3029 class_utf8 = TRUE;
3030 *class_utf8data++ = ((-c == ESC_p) != negated)?
3031 XCL_PROP : XCL_NOTPROP;
3032 *class_utf8data++ = ptype;
3033 *class_utf8data++ = pdata;
3034 class_charcount -= 2; /* Not a < 256 character */
3035 continue;
3036 }
3037#endif
3038 /* Unrecognized escapes are faulted if PCRE is running in its
3039 strict mode. By default, for compatibility with Perl, they are
3040 treated as literals. */
3041
3042 if ((options & PCRE_EXTRA) != 0)
3043 {
3044 *errorcodeptr = ERR7;
3045 goto FAILED;
3046 }
3047
3048 class_charcount -= 2; /* Undo the default count from above */
3049 c = *ptr; /* Get the final character and fall through */
3050 }
3051
3052 /* Fall through if we have a single character (c >= 0). This may be
3053 greater than 256 in UTF-8 mode. */
3054
3055 } /* End of backslash handling */
3056
3057 /* A single character may be followed by '-' to form a range. However,
3058 Perl does not permit ']' to be the end of the range. A '-' character
3059 at the end is treated as a literal. Perl ignores orphaned \E sequences
3060 entirely. The code for handling \Q and \E is messy. */
3061
3062 CHECK_RANGE:
3063 while (ptr[1] == '\\' && ptr[2] == 'E')
3064 {
3065 inescq = FALSE;
3066 ptr += 2;
3067 }
3068
3069 oldptr = ptr;
3070
3071 /* Remember \r or \n */
3072
3073 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3074
3075 /* Check for range */
3076
3077 if (!inescq && ptr[1] == '-')
3078 {
3079 int d;
3080 ptr += 2;
3081 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3082
3083 /* If we hit \Q (not followed by \E) at this point, go into escaped
3084 mode. */
3085
3086 while (*ptr == '\\' && ptr[1] == 'Q')
3087 {
3088 ptr += 2;
3089 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3090 inescq = TRUE;
3091 break;
3092 }
3093
3094 if (*ptr == 0 || (!inescq && *ptr == ']'))
3095 {
3096 ptr = oldptr;
3097 goto LONE_SINGLE_CHARACTER;
3098 }
3099
3100#ifdef SUPPORT_UTF8
3101 if (utf8)
3102 { /* Braces are required because the */
3103 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3104 }
3105 else
3106#endif
3107 d = *ptr; /* Not UTF-8 mode */
3108
3109 /* The second part of a range can be a single-character escape, but
3110 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3111 in such circumstances. */
3112
3113 if (!inescq && d == '\\')
3114 {
3115 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3116 if (*errorcodeptr != 0) goto FAILED;
3117
3118 /* \b is backspace; \X is literal X; \R is literal R; any other
3119 special means the '-' was literal */
3120
3121 if (d < 0)
3122 {
3123 if (d == -ESC_b) d = '\b';
3124 else if (d == -ESC_X) d = 'X';
3125 else if (d == -ESC_R) d = 'R'; else
3126 {
3127 ptr = oldptr;
3128 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3129 }
3130 }
3131 }
3132
3133 /* Check that the two values are in the correct order. Optimize
3134 one-character ranges */
3135
3136 if (d < c)
3137 {
3138 *errorcodeptr = ERR8;
3139 goto FAILED;
3140 }
3141
3142 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3143
3144 /* Remember \r or \n */
3145
3146 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3147
3148 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3149 matching, we have to use an XCLASS with extra data items. Caseless
3150 matching for characters > 127 is available only if UCP support is
3151 available. */
3152
3153#ifdef SUPPORT_UTF8
3154 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3155 {
3156 class_utf8 = TRUE;
3157
3158 /* With UCP support, we can find the other case equivalents of
3159 the relevant characters. There may be several ranges. Optimize how
3160 they fit with the basic range. */
3161
3162#ifdef SUPPORT_UCP
3163 if ((options & PCRE_CASELESS) != 0)
3164 {
3165 unsigned int occ, ocd;
3166 unsigned int cc = c;
3167 unsigned int origd = d;
3168 while (get_othercase_range(&cc, origd, &occ, &ocd))
3169 {
3170 if (occ >= (unsigned int)c &&
3171 ocd <= (unsigned int)d)
3172 continue; /* Skip embedded ranges */
3173
3174 if (occ < (unsigned int)c &&
3175 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3176 { /* if there is overlap, */
3177 c = occ; /* noting that if occ < c */
3178 continue; /* we can't have ocd > d */
3179 } /* because a subrange is */
3180 if (ocd > (unsigned int)d &&
3181 occ <= (unsigned int)d + 1) /* always shorter than */
3182 { /* the basic range. */
3183 d = ocd;
3184 continue;
3185 }
3186
3187 if (occ == ocd)
3188 {
3189 *class_utf8data++ = XCL_SINGLE;
3190 }
3191 else
3192 {
3193 *class_utf8data++ = XCL_RANGE;
3194 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3195 }
3196 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3197 }
3198 }
3199#endif /* SUPPORT_UCP */
3200
3201 /* Now record the original range, possibly modified for UCP caseless
3202 overlapping ranges. */
3203
3204 *class_utf8data++ = XCL_RANGE;
3205 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3206 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3207
3208 /* With UCP support, we are done. Without UCP support, there is no
3209 caseless matching for UTF-8 characters > 127; we can use the bit map
3210 for the smaller ones. */
3211
3212#ifdef SUPPORT_UCP
3213 continue; /* With next character in the class */
3214#else
3215 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3216
3217 /* Adjust upper limit and fall through to set up the map */
3218
3219 d = 127;
3220
3221#endif /* SUPPORT_UCP */
3222 }
3223#endif /* SUPPORT_UTF8 */
3224
3225 /* We use the bit map for all cases when not in UTF-8 mode; else
3226 ranges that lie entirely within 0-127 when there is UCP support; else
3227 for partial ranges without UCP support. */
3228
3229 class_charcount += d - c + 1;
3230 class_lastchar = d;
3231
3232 /* We can save a bit of time by skipping this in the pre-compile. */
3233
3234 if (lengthptr == NULL) for (; c <= d; c++)
3235 {
3236 classbits[c/8] |= (1 << (c&7));
3237 if ((options & PCRE_CASELESS) != 0)
3238 {
3239 int uc = cd->fcc[c]; /* flip case */
3240 classbits[uc/8] |= (1 << (uc&7));
3241 }
3242 }
3243
3244 continue; /* Go get the next char in the class */
3245 }
3246
3247 /* Handle a lone single character - we can get here for a normal
3248 non-escape char, or after \ that introduces a single character or for an
3249 apparent range that isn't. */
3250
3251 LONE_SINGLE_CHARACTER:
3252
3253 /* Handle a character that cannot go in the bit map */
3254
3255#ifdef SUPPORT_UTF8
3256 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3257 {
3258 class_utf8 = TRUE;
3259 *class_utf8data++ = XCL_SINGLE;
3260 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3261
3262#ifdef SUPPORT_UCP
3263 if ((options & PCRE_CASELESS) != 0)
3264 {
3265 unsigned int othercase;
3266 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3267 {
3268 *class_utf8data++ = XCL_SINGLE;
3269 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3270 }
3271 }
3272#endif /* SUPPORT_UCP */
3273
3274 }
3275 else
3276#endif /* SUPPORT_UTF8 */
3277
3278 /* Handle a single-byte character */
3279 {
3280 classbits[c/8] |= (1 << (c&7));
3281 if ((options & PCRE_CASELESS) != 0)
3282 {
3283 c = cd->fcc[c]; /* flip case */
3284 classbits[c/8] |= (1 << (c&7));
3285 }
3286 class_charcount++;
3287 class_lastchar = c;
3288 }
3289 }
3290
3291 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3292
3293 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3294
3295 if (c == 0) /* Missing terminating ']' */
3296 {
3297 *errorcodeptr = ERR6;
3298 goto FAILED;
3299 }
3300
3301
3302/* This code has been disabled because it would mean that \s counts as
3303an explicit \r or \n reference, and that's not really what is wanted. Now
3304we set the flag only if there is a literal "\r" or "\n" in the class. */
3305
3306#if 0
3307 /* Remember whether \r or \n are in this class */
3308
3309 if (negate_class)
3310 {
3311 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3312 }
3313 else
3314 {
3315 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3316 }
3317#endif
3318
3319
3320 /* If class_charcount is 1, we saw precisely one character whose value is
3321 less than 256. As long as there were no characters >= 128 and there was no
3322 use of \p or \P, in other words, no use of any XCLASS features, we can
3323 optimize.
3324
3325 In UTF-8 mode, we can optimize the negative case only if there were no
3326 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3327 operate on single-bytes only. This is an historical hangover. Maybe one day
3328 we can tidy these opcodes to handle multi-byte characters.
3329
3330 The optimization throws away the bit map. We turn the item into a
3331 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3332 that OP_NOT does not support multibyte characters. In the positive case, it
3333 can cause firstbyte to be set. Otherwise, there can be no first char if
3334 this item is first, whatever repeat count may follow. In the case of
3335 reqbyte, save the previous value for reinstating. */
3336
3337#ifdef SUPPORT_UTF8
3338 if (class_charcount == 1 && !class_utf8 &&
3339 (!utf8 || !negate_class || class_lastchar < 128))
3340#else
3341 if (class_charcount == 1)
3342#endif
3343 {
3344 zeroreqbyte = reqbyte;
3345
3346 /* The OP_NOT opcode works on one-byte characters only. */
3347
3348 if (negate_class)
3349 {
3350 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3351 zerofirstbyte = firstbyte;
3352 *code++ = OP_NOT;
3353 *code++ = class_lastchar;
3354 break;
3355 }
3356
3357 /* For a single, positive character, get the value into mcbuffer, and
3358 then we can handle this with the normal one-character code. */
3359
3360#ifdef SUPPORT_UTF8
3361 if (utf8 && class_lastchar > 127)
3362 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3363 else
3364#endif
3365 {
3366 mcbuffer[0] = class_lastchar;
3367 mclength = 1;
3368 }
3369 goto ONE_CHAR;
3370 } /* End of 1-char optimization */
3371
3372 /* The general case - not the one-char optimization. If this is the first
3373 thing in the branch, there can be no first char setting, whatever the
3374 repeat count. Any reqbyte setting must remain unchanged after any kind of
3375 repeat. */
3376
3377 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3378 zerofirstbyte = firstbyte;
3379 zeroreqbyte = reqbyte;
3380
3381 /* If there are characters with values > 255, we have to compile an
3382 extended class, with its own opcode, unless there was a negated special
3383 such as \S in the class, because in that case all characters > 255 are in
3384 the class, so any that were explicitly given as well can be ignored. If
3385 (when there are explicit characters > 255 that must be listed) there are no
3386 characters < 256, we can omit the bitmap in the actual compiled code. */
3387
3388#ifdef SUPPORT_UTF8
3389 if (class_utf8 && !should_flip_negation)
3390 {
3391 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3392 *code++ = OP_XCLASS;
3393 code += LINK_SIZE;
3394 *code = negate_class? XCL_NOT : 0;
3395
3396 /* If the map is required, move up the extra data to make room for it;
3397 otherwise just move the code pointer to the end of the extra data. */
3398
3399 if (class_charcount > 0)
3400 {
3401 *code++ |= XCL_MAP;
3402 memmove(code + 32, code, class_utf8data - code);
3403 memcpy(code, classbits, 32);
3404 code = class_utf8data + 32;
3405 }
3406 else code = class_utf8data;
3407
3408 /* Now fill in the complete length of the item */
3409
3410 PUT(previous, 1, code - previous);
3411 break; /* End of class handling */
3412 }
3413#endif
3414
3415 /* If there are no characters > 255, set the opcode to OP_CLASS or
3416 OP_NCLASS, depending on whether the whole class was negated and whether
3417 there were negative specials such as \S in the class. Then copy the 32-byte
3418 map into the code vector, negating it if necessary. */
3419
3420 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3421 if (negate_class)
3422 {
3423 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3424 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3425 }
3426 else
3427 {
3428 memcpy(code, classbits, 32);
3429 }
3430 code += 32;
3431 break;
3432
3433
3434 /* ===================================================================*/
3435 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3436 has been tested above. */
3437
3438 case '{':
3439 if (!is_quantifier) goto NORMAL_CHAR;
3440 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3441 if (*errorcodeptr != 0) goto FAILED;
3442 goto REPEAT;
3443
3444 case '*':
3445 repeat_min = 0;
3446 repeat_max = -1;
3447 goto REPEAT;
3448
3449 case '+':
3450 repeat_min = 1;
3451 repeat_max = -1;
3452 goto REPEAT;
3453
3454 case '?':
3455 repeat_min = 0;
3456 repeat_max = 1;
3457
3458 REPEAT:
3459 if (previous == NULL)
3460 {
3461 *errorcodeptr = ERR9;
3462 goto FAILED;
3463 }
3464
3465 if (repeat_min == 0)
3466 {
3467 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3468 reqbyte = zeroreqbyte; /* Ditto */
3469 }
3470
3471 /* Remember whether this is a variable length repeat */
3472
3473 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3474
3475 op_type = 0; /* Default single-char op codes */
3476 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3477
3478 /* Save start of previous item, in case we have to move it up to make space
3479 for an inserted OP_ONCE for the additional '+' extension. */
3480
3481 tempcode = previous;
3482
3483 /* If the next character is '+', we have a possessive quantifier. This
3484 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3485 If the next character is '?' this is a minimizing repeat, by default,
3486 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3487 repeat type to the non-default. */
3488
3489 if (ptr[1] == '+')
3490 {
3491 repeat_type = 0; /* Force greedy */
3492 possessive_quantifier = TRUE;
3493 ptr++;
3494 }
3495 else if (ptr[1] == '?')
3496 {
3497 repeat_type = greedy_non_default;
3498 ptr++;
3499 }
3500 else repeat_type = greedy_default;
3501
3502 /* If previous was a character match, abolish the item and generate a
3503 repeat item instead. If a char item has a minimum of more than one, ensure
3504 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3505 the first thing in a branch because the x will have gone into firstbyte
3506 instead. */
3507
3508 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3509 {
3510 /* Deal with UTF-8 characters that take up more than one byte. It's
3511 easier to write this out separately than try to macrify it. Use c to
3512 hold the length of the character in bytes, plus 0x80 to flag that it's a
3513 length rather than a small character. */
3514
3515#ifdef SUPPORT_UTF8
3516 if (utf8 && (code[-1] & 0x80) != 0)
3517 {
3518 uschar *lastchar = code - 1;
3519 while((*lastchar & 0xc0) == 0x80) lastchar--;
3520 c = code - lastchar; /* Length of UTF-8 character */
3521 memcpy(utf8_char, lastchar, c); /* Save the char */
3522 c |= 0x80; /* Flag c as a length */
3523 }
3524 else
3525#endif
3526
3527 /* Handle the case of a single byte - either with no UTF8 support, or
3528 with UTF-8 disabled, or for a UTF-8 character < 128. */
3529
3530 {
3531 c = code[-1];
3532 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3533 }
3534
3535 /* If the repetition is unlimited, it pays to see if the next thing on
3536 the line is something that cannot possibly match this character. If so,
3537 automatically possessifying this item gains some performance in the case
3538 where the match fails. */
3539
3540 if (!possessive_quantifier &&
3541 repeat_max < 0 &&
3542 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3543 options, cd))
3544 {
3545 repeat_type = 0; /* Force greedy */
3546 possessive_quantifier = TRUE;
3547 }
3548
3549 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3550 }
3551
3552 /* If previous was a single negated character ([^a] or similar), we use
3553 one of the special opcodes, replacing it. The code is shared with single-
3554 character repeats by setting op_type to add a suitable offset into
3555 repeat_type. We can also test for auto-possessification. OP_NOT is
3556 currently used only for single-byte chars. */
3557
3558 else if (*previous == OP_NOT)
3559 {
3560 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3561 c = previous[1];
3562 if (!possessive_quantifier &&
3563 repeat_max < 0 &&
3564 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3565 {
3566 repeat_type = 0; /* Force greedy */
3567 possessive_quantifier = TRUE;
3568 }
3569 goto OUTPUT_SINGLE_REPEAT;
3570 }
3571
3572 /* If previous was a character type match (\d or similar), abolish it and
3573 create a suitable repeat item. The code is shared with single-character
3574 repeats by setting op_type to add a suitable offset into repeat_type. Note
3575 that the Unicode property types will be present only when SUPPORT_UCP is
3576 defined, but we don't wrap the little bits of code here because it just
3577 makes it horribly messy. */
3578
3579 else if (*previous < OP_EODN)
3580 {
3581 uschar *oldcode;
3582 int prop_type, prop_value;
3583 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3584 c = *previous;
3585
3586 if (!possessive_quantifier &&
3587 repeat_max < 0 &&
3588 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3589 {
3590 repeat_type = 0; /* Force greedy */
3591 possessive_quantifier = TRUE;
3592 }
3593
3594 OUTPUT_SINGLE_REPEAT:
3595 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3596 {
3597 prop_type = previous[1];
3598 prop_value = previous[2];
3599 }
3600 else prop_type = prop_value = -1;
3601
3602 oldcode = code;
3603 code = previous; /* Usually overwrite previous item */
3604
3605 /* If the maximum is zero then the minimum must also be zero; Perl allows
3606 this case, so we do too - by simply omitting the item altogether. */
3607
3608 if (repeat_max == 0) goto END_REPEAT;
3609
3610 /* All real repeats make it impossible to handle partial matching (maybe
3611 one day we will be able to remove this restriction). */
3612
3613 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3614
3615 /* Combine the op_type with the repeat_type */
3616
3617 repeat_type += op_type;
3618
3619 /* A minimum of zero is handled either as the special case * or ?, or as
3620 an UPTO, with the maximum given. */
3621
3622 if (repeat_min == 0)
3623 {
3624 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3625 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3626 else
3627 {
3628 *code++ = OP_UPTO + repeat_type;
3629 PUT2INC(code, 0, repeat_max);
3630 }
3631 }
3632
3633 /* A repeat minimum of 1 is optimized into some special cases. If the
3634 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3635 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3636 one less than the maximum. */
3637
3638 else if (repeat_min == 1)
3639 {
3640 if (repeat_max == -1)
3641 *code++ = OP_PLUS + repeat_type;
3642 else
3643 {
3644 code = oldcode; /* leave previous item in place */
3645 if (repeat_max == 1) goto END_REPEAT;
3646 *code++ = OP_UPTO + repeat_type;
3647 PUT2INC(code, 0, repeat_max - 1);
3648 }
3649 }
3650
3651 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3652 handled as an EXACT followed by an UPTO. */
3653
3654 else
3655 {
3656 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3657 PUT2INC(code, 0, repeat_min);
3658
3659 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3660 we have to insert the character for the previous code. For a repeated
3661 Unicode property match, there are two extra bytes that define the
3662 required property. In UTF-8 mode, long characters have their length in
3663 c, with the 0x80 bit as a flag. */
3664
3665 if (repeat_max < 0)
3666 {
3667#ifdef SUPPORT_UTF8
3668 if (utf8 && c >= 128)
3669 {
3670 memcpy(code, utf8_char, c & 7);
3671 code += c & 7;
3672 }
3673 else
3674#endif
3675 {
3676 *code++ = c;
3677 if (prop_type >= 0)
3678 {
3679 *code++ = prop_type;
3680 *code++ = prop_value;
3681 }
3682 }
3683 *code++ = OP_STAR + repeat_type;
3684 }
3685
3686 /* Else insert an UPTO if the max is greater than the min, again
3687 preceded by the character, for the previously inserted code. If the
3688 UPTO is just for 1 instance, we can use QUERY instead. */
3689
3690 else if (repeat_max != repeat_min)
3691 {
3692#ifdef SUPPORT_UTF8
3693 if (utf8 && c >= 128)
3694 {
3695 memcpy(code, utf8_char, c & 7);
3696 code += c & 7;
3697 }
3698 else
3699#endif
3700 *code++ = c;
3701 if (prop_type >= 0)
3702 {
3703 *code++ = prop_type;
3704 *code++ = prop_value;
3705 }
3706 repeat_max -= repeat_min;
3707
3708 if (repeat_max == 1)
3709 {
3710 *code++ = OP_QUERY + repeat_type;
3711 }
3712 else
3713 {
3714 *code++ = OP_UPTO + repeat_type;
3715 PUT2INC(code, 0, repeat_max);
3716 }
3717 }
3718 }
3719
3720 /* The character or character type itself comes last in all cases. */
3721
3722#ifdef SUPPORT_UTF8
3723 if (utf8 && c >= 128)
3724 {
3725 memcpy(code, utf8_char, c & 7);
3726 code += c & 7;
3727 }
3728 else
3729#endif
3730 *code++ = c;
3731
3732 /* For a repeated Unicode property match, there are two extra bytes that
3733 define the required property. */
3734
3735#ifdef SUPPORT_UCP
3736 if (prop_type >= 0)
3737 {
3738 *code++ = prop_type;
3739 *code++ = prop_value;
3740 }
3741#endif
3742 }
3743
3744 /* If previous was a character class or a back reference, we put the repeat
3745 stuff after it, but just skip the item if the repeat was {0,0}. */
3746
3747 else if (*previous == OP_CLASS ||
3748 *previous == OP_NCLASS ||
3749#ifdef SUPPORT_UTF8
3750 *previous == OP_XCLASS ||
3751#endif
3752 *previous == OP_REF)
3753 {
3754 if (repeat_max == 0)
3755 {
3756 code = previous;
3757 goto END_REPEAT;
3758 }
3759
3760 /* All real repeats make it impossible to handle partial matching (maybe
3761 one day we will be able to remove this restriction). */
3762
3763 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3764
3765 if (repeat_min == 0 && repeat_max == -1)
3766 *code++ = OP_CRSTAR + repeat_type;
3767 else if (repeat_min == 1 && repeat_max == -1)
3768 *code++ = OP_CRPLUS + repeat_type;
3769 else if (repeat_min == 0 && repeat_max == 1)
3770 *code++ = OP_CRQUERY + repeat_type;
3771 else
3772 {
3773 *code++ = OP_CRRANGE + repeat_type;
3774 PUT2INC(code, 0, repeat_min);
3775 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3776 PUT2INC(code, 0, repeat_max);
3777 }
3778 }
3779
3780 /* If previous was a bracket group, we may have to replicate it in certain
3781 cases. */
3782
3783 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3784 *previous == OP_ONCE || *previous == OP_COND)
3785 {
3786 register int i;
3787 int ketoffset = 0;
3788 int len = code - previous;
3789 uschar *bralink = NULL;
3790
3791 /* Repeating a DEFINE group is pointless */
3792
3793 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3794 {
3795 *errorcodeptr = ERR55;
3796 goto FAILED;
3797 }
3798
3799 /* If the maximum repeat count is unlimited, find the end of the bracket
3800 by scanning through from the start, and compute the offset back to it
3801 from the current code pointer. There may be an OP_OPT setting following
3802 the final KET, so we can't find the end just by going back from the code
3803 pointer. */
3804
3805 if (repeat_max == -1)
3806 {
3807 register uschar *ket = previous;
3808 do ket += GET(ket, 1); while (*ket != OP_KET);
3809 ketoffset = code - ket;
3810 }
3811
3812 /* The case of a zero minimum is special because of the need to stick
3813 OP_BRAZERO in front of it, and because the group appears once in the
3814 data, whereas in other cases it appears the minimum number of times. For
3815 this reason, it is simplest to treat this case separately, as otherwise
3816 the code gets far too messy. There are several special subcases when the
3817 minimum is zero. */
3818
3819 if (repeat_min == 0)
3820 {
3821 /* If the maximum is also zero, we just omit the group from the output
3822 altogether. */
3823
3824 if (repeat_max == 0)
3825 {
3826 code = previous;
3827 goto END_REPEAT;
3828 }
3829
3830 /* If the maximum is 1 or unlimited, we just have to stick in the
3831 BRAZERO and do no more at this point. However, we do need to adjust
3832 any OP_RECURSE calls inside the group that refer to the group itself or
3833 any internal or forward referenced group, because the offset is from
3834 the start of the whole regex. Temporarily terminate the pattern while
3835 doing this. */
3836
3837 if (repeat_max <= 1)
3838 {
3839 *code = OP_END;
3840 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3841 memmove(previous+1, previous, len);
3842 code++;
3843 *previous++ = OP_BRAZERO + repeat_type;
3844 }
3845
3846 /* If the maximum is greater than 1 and limited, we have to replicate
3847 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3848 The first one has to be handled carefully because it's the original
3849 copy, which has to be moved up. The remainder can be handled by code
3850 that is common with the non-zero minimum case below. We have to
3851 adjust the value of repeat_max, since one less copy is required. Once
3852 again, we may have to adjust any OP_RECURSE calls inside the group. */
3853
3854 else
3855 {
3856 int offset;
3857 *code = OP_END;
3858 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3859 memmove(previous + 2 + LINK_SIZE, previous, len);
3860 code += 2 + LINK_SIZE;
3861 *previous++ = OP_BRAZERO + repeat_type;
3862 *previous++ = OP_BRA;
3863
3864 /* We chain together the bracket offset fields that have to be
3865 filled in later when the ends of the brackets are reached. */
3866
3867 offset = (bralink == NULL)? 0 : previous - bralink;
3868 bralink = previous;
3869 PUTINC(previous, 0, offset);
3870 }
3871
3872 repeat_max--;
3873 }
3874
3875 /* If the minimum is greater than zero, replicate the group as many
3876 times as necessary, and adjust the maximum to the number of subsequent
3877 copies that we need. If we set a first char from the group, and didn't
3878 set a required char, copy the latter from the former. If there are any
3879 forward reference subroutine calls in the group, there will be entries on
3880 the workspace list; replicate these with an appropriate increment. */
3881
3882 else
3883 {
3884 if (repeat_min > 1)
3885 {
3886 /* In the pre-compile phase, we don't actually do the replication. We
3887 just adjust the length as if we had. Do some paranoid checks for
3888 potential integer overflow. */
3889
3890 if (lengthptr != NULL)
3891 {
3892 int delta = (repeat_min - 1)*length_prevgroup;
3893 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3894 (double)INT_MAX ||
3895 OFLOW_MAX - *lengthptr < delta)
3896 {
3897 *errorcodeptr = ERR20;
3898 goto FAILED;
3899 }
3900 *lengthptr += delta;
3901 }
3902
3903 /* This is compiling for real */
3904
3905 else
3906 {
3907 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3908 for (i = 1; i < repeat_min; i++)
3909 {
3910 uschar *hc;
3911 uschar *this_hwm = cd->hwm;
3912 memcpy(code, previous, len);
3913 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3914 {
3915 PUT(cd->hwm, 0, GET(hc, 0) + len);
3916 cd->hwm += LINK_SIZE;
3917 }
3918 save_hwm = this_hwm;
3919 code += len;
3920 }
3921 }
3922 }
3923
3924 if (repeat_max > 0) repeat_max -= repeat_min;
3925 }
3926
3927 /* This code is common to both the zero and non-zero minimum cases. If
3928 the maximum is limited, it replicates the group in a nested fashion,
3929 remembering the bracket starts on a stack. In the case of a zero minimum,
3930 the first one was set up above. In all cases the repeat_max now specifies
3931 the number of additional copies needed. Again, we must remember to
3932 replicate entries on the forward reference list. */
3933
3934 if (repeat_max >= 0)
3935 {
3936 /* In the pre-compile phase, we don't actually do the replication. We
3937 just adjust the length as if we had. For each repetition we must add 1
3938 to the length for BRAZERO and for all but the last repetition we must
3939 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3940 paranoid checks to avoid integer overflow. */
3941
3942 if (lengthptr != NULL && repeat_max > 0)
3943 {
3944 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3945 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3946 if ((double)repeat_max *
3947 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3948 > (double)INT_MAX ||
3949 OFLOW_MAX - *lengthptr < delta)
3950 {
3951 *errorcodeptr = ERR20;
3952 goto FAILED;
3953 }
3954 *lengthptr += delta;
3955 }
3956
3957 /* This is compiling for real */
3958
3959 else for (i = repeat_max - 1; i >= 0; i--)
3960 {
3961 uschar *hc;
3962 uschar *this_hwm = cd->hwm;
3963
3964 *code++ = OP_BRAZERO + repeat_type;
3965
3966 /* All but the final copy start a new nesting, maintaining the
3967 chain of brackets outstanding. */
3968
3969 if (i != 0)
3970 {
3971 int offset;
3972 *code++ = OP_BRA;
3973 offset = (bralink == NULL)? 0 : code - bralink;
3974 bralink = code;
3975 PUTINC(code, 0, offset);
3976 }
3977
3978 memcpy(code, previous, len);
3979 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3980 {
3981 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3982 cd->hwm += LINK_SIZE;
3983 }
3984 save_hwm = this_hwm;
3985 code += len;
3986 }
3987
3988 /* Now chain through the pending brackets, and fill in their length
3989 fields (which are holding the chain links pro tem). */
3990
3991 while (bralink != NULL)
3992 {
3993 int oldlinkoffset;
3994 int offset = code - bralink + 1;
3995 uschar *bra = code - offset;
3996 oldlinkoffset = GET(bra, 1);
3997 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3998 *code++ = OP_KET;
3999 PUTINC(code, 0, offset);
4000 PUT(bra, 1, offset);
4001 }
4002 }
4003
4004 /* If the maximum is unlimited, set a repeater in the final copy. We
4005 can't just offset backwards from the current code point, because we
4006 don't know if there's been an options resetting after the ket. The
4007 correct offset was computed above.
4008
4009 Then, when we are doing the actual compile phase, check to see whether
4010 this group is a non-atomic one that could match an empty string. If so,
4011 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4012 that runtime checking can be done. [This check is also applied to
4013 atomic groups at runtime, but in a different way.] */
4014
4015 else
4016 {
4017 uschar *ketcode = code - ketoffset;
4018 uschar *bracode = ketcode - GET(ketcode, 1);
4019 *ketcode = OP_KETRMAX + repeat_type;
4020 if (lengthptr == NULL && *bracode != OP_ONCE)
4021 {
4022 uschar *scode = bracode;
4023 do
4024 {
4025 if (could_be_empty_branch(scode, ketcode, utf8))
4026 {
4027 *bracode += OP_SBRA - OP_BRA;
4028 break;
4029 }
4030 scode += GET(scode, 1);
4031 }
4032 while (*scode == OP_ALT);
4033 }
4034 }
4035 }
4036
4037 /* Else there's some kind of shambles */
4038
4039 else
4040 {
4041 *errorcodeptr = ERR11;
4042 goto FAILED;
4043 }
4044
4045 /* If the character following a repeat is '+', or if certain optimization
4046 tests above succeeded, possessive_quantifier is TRUE. For some of the
4047 simpler opcodes, there is a special alternative opcode for this. For
4048 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4049 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4050 but the special opcodes can optimize it a bit. The repeated item starts at
4051 tempcode, not at previous, which might be the first part of a string whose
4052 (former) last char we repeated.
4053
4054 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4055 an 'upto' may follow. We skip over an 'exact' item, and then test the
4056 length of what remains before proceeding. */
4057
4058 if (possessive_quantifier)
4059 {
4060 int len;
4061 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4062 *tempcode == OP_NOTEXACT)
4063 tempcode += _pcre_OP_lengths[*tempcode] +
4064 ((*tempcode == OP_TYPEEXACT &&
4065 (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4066 len = code - tempcode;
4067 if (len > 0) switch (*tempcode)
4068 {
4069 case OP_STAR: *tempcode = OP_POSSTAR; break;
4070 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4071 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4072 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4073
4074 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4075 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4076 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4077 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4078
4079 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4080 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4081 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4082 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4083
4084 default:
4085 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4086 code += 1 + LINK_SIZE;
4087 len += 1 + LINK_SIZE;
4088 tempcode[0] = OP_ONCE;
4089 *code++ = OP_KET;
4090 PUTINC(code, 0, len);
4091 PUT(tempcode, 1, len);
4092 break;
4093 }
4094 }
4095
4096 /* In all cases we no longer have a previous item. We also set the
4097 "follows varying string" flag for subsequently encountered reqbytes if
4098 it isn't already set and we have just passed a varying length item. */
4099
4100 END_REPEAT:
4101 previous = NULL;
4102 cd->req_varyopt |= reqvary;
4103 break;
4104
4105
4106 /* ===================================================================*/
4107 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4108 lookbehind or option setting or condition or all the other extended
4109 parenthesis forms. */
4110
4111 case '(':
4112 newoptions = options;
4113 skipbytes = 0;
4114 bravalue = OP_CBRA;
4115 save_hwm = cd->hwm;
4116 reset_bracount = FALSE;
4117
4118 /* First deal with various "verbs" that can be introduced by '*'. */
4119
4120 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4121 {
4122 int i, namelen;
4123 const char *vn = verbnames;
4124 const uschar *name = ++ptr;
4125 previous = NULL;
4126 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4127 if (*ptr == ':')
4128 {
4129 *errorcodeptr = ERR59; /* Not supported */
4130 goto FAILED;
4131 }
4132 if (*ptr != ')')
4133 {
4134 *errorcodeptr = ERR60;
4135 goto FAILED;
4136 }
4137 namelen = ptr - name;
4138 for (i = 0; i < verbcount; i++)
4139 {
4140 if (namelen == verbs[i].len &&
4141 strncmp((char *)name, vn, namelen) == 0)
4142 {
4143 *code = verbs[i].op;
4144 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4145 break;
4146 }
4147 vn += verbs[i].len + 1;
4148 }
4149 if (i < verbcount) continue;
4150 *errorcodeptr = ERR60;
4151 goto FAILED;
4152 }
4153
4154 /* Deal with the extended parentheses; all are introduced by '?', and the
4155 appearance of any of them means that this is not a capturing group. */
4156
4157 else if (*ptr == '?')
4158 {
4159 int i, set, unset, namelen;
4160 int *optset;
4161 const uschar *name;
4162 uschar *slot;
4163
4164 switch (*(++ptr))
4165 {
4166 case '#': /* Comment; skip to ket */
4167 ptr++;
4168 while (*ptr != 0 && *ptr != ')') ptr++;
4169 if (*ptr == 0)
4170 {
4171 *errorcodeptr = ERR18;
4172 goto FAILED;
4173 }
4174 continue;
4175
4176
4177 /* ------------------------------------------------------------ */
4178 case '|': /* Reset capture count for each branch */
4179 reset_bracount = TRUE;
4180 /* Fall through */
4181
4182 /* ------------------------------------------------------------ */
4183 case ':': /* Non-capturing bracket */
4184 bravalue = OP_BRA;
4185 ptr++;
4186 break;
4187
4188
4189 /* ------------------------------------------------------------ */
4190 case '(':
4191 bravalue = OP_COND; /* Conditional group */
4192
4193 /* A condition can be an assertion, a number (referring to a numbered
4194 group), a name (referring to a named group), or 'R', referring to
4195 recursion. R<digits> and R&name are also permitted for recursion tests.
4196
4197 There are several syntaxes for testing a named group: (?(name)) is used
4198 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4199
4200 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4201 be the recursive thing or the name 'R' (and similarly for 'R' followed
4202 by digits), and (b) a number could be a name that consists of digits.
4203 In both cases, we look for a name first; if not found, we try the other
4204 cases. */
4205
4206 /* For conditions that are assertions, check the syntax, and then exit
4207 the switch. This will take control down to where bracketed groups,
4208 including assertions, are processed. */
4209
4210 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4211 break;
4212
4213 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4214 below), and all need to skip 3 bytes at the start of the group. */
4215
4216 code[1+LINK_SIZE] = OP_CREF;
4217 skipbytes = 3;
4218 refsign = -1;
4219
4220 /* Check for a test for recursion in a named group. */
4221
4222 if (ptr[1] == 'R' && ptr[2] == '&')
4223 {
4224 terminator = -1;
4225 ptr += 2;
4226 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4227 }
4228
4229 /* Check for a test for a named group's having been set, using the Perl
4230 syntax (?(<name>) or (?('name') */
4231
4232 else if (ptr[1] == '<')
4233 {
4234 terminator = '>';
4235 ptr++;
4236 }
4237 else if (ptr[1] == '\'')
4238 {
4239 terminator = '\'';
4240 ptr++;
4241 }
4242 else
4243 {
4244 terminator = 0;
4245 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4246 }
4247
4248 /* We now expect to read a name; anything else is an error */
4249
4250 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4251 {
4252 ptr += 1; /* To get the right offset */
4253 *errorcodeptr = ERR28;
4254 goto FAILED;
4255 }
4256
4257 /* Read the name, but also get it as a number if it's all digits */
4258
4259 recno = 0;
4260 name = ++ptr;
4261 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4262 {
4263 if (recno >= 0)
4264 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4265 recno * 10 + *ptr - '0' : -1;
4266 ptr++;
4267 }
4268 namelen = ptr - name;
4269
4270 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4271 {
4272 ptr--; /* Error offset */
4273 *errorcodeptr = ERR26;
4274 goto FAILED;
4275 }
4276
4277 /* Do no further checking in the pre-compile phase. */
4278
4279 if (lengthptr != NULL) break;
4280
4281 /* In the real compile we do the work of looking for the actual
4282 reference. If the string started with "+" or "-" we require the rest to
4283 be digits, in which case recno will be set. */
4284
4285 if (refsign > 0)
4286 {
4287 if (recno <= 0)
4288 {
4289 *errorcodeptr = ERR58;
4290 goto FAILED;
4291 }
4292 recno = (refsign == '-')?
4293 cd->bracount - recno + 1 : recno +cd->bracount;
4294 if (recno <= 0 || recno > cd->final_bracount)
4295 {
4296 *errorcodeptr = ERR15;
4297 goto FAILED;
4298 }
4299 PUT2(code, 2+LINK_SIZE, recno);
4300 break;
4301 }
4302
4303 /* Otherwise (did not start with "+" or "-"), start by looking for the
4304 name. */
4305
4306 slot = cd->name_table;
4307 for (i = 0; i < cd->names_found; i++)
4308 {
4309 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4310 slot += cd->name_entry_size;
4311 }
4312
4313 /* Found a previous named subpattern */
4314
4315 if (i < cd->names_found)
4316 {
4317 recno = GET2(slot, 0);
4318 PUT2(code, 2+LINK_SIZE, recno);
4319 }
4320
4321 /* Search the pattern for a forward reference */
4322
4323 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4324 (options & PCRE_EXTENDED) != 0)) > 0)
4325 {
4326 PUT2(code, 2+LINK_SIZE, i);
4327 }
4328
4329 /* If terminator == 0 it means that the name followed directly after
4330 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4331 some further alternatives to try. For the cases where terminator != 0
4332 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4333 now checked all the possibilities, so give an error. */
4334
4335 else if (terminator != 0)
4336 {
4337 *errorcodeptr = ERR15;
4338 goto FAILED;
4339 }
4340
4341 /* Check for (?(R) for recursion. Allow digits after R to specify a
4342 specific group number. */
4343
4344 else if (*name == 'R')
4345 {
4346 recno = 0;
4347 for (i = 1; i < namelen; i++)
4348 {
4349 if ((digitab[name[i]] & ctype_digit) == 0)
4350 {
4351 *errorcodeptr = ERR15;
4352 goto FAILED;
4353 }
4354 recno = recno * 10 + name[i] - '0';
4355 }
4356 if (recno == 0) recno = RREF_ANY;
4357 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4358 PUT2(code, 2+LINK_SIZE, recno);
4359 }
4360
4361 /* Similarly, check for the (?(DEFINE) "condition", which is always
4362 false. */
4363
4364 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4365 {
4366 code[1+LINK_SIZE] = OP_DEF;
4367 skipbytes = 1;
4368 }
4369
4370 /* Check for the "name" actually being a subpattern number. We are
4371 in the second pass here, so final_bracount is set. */
4372
4373 else if (recno > 0 && recno <= cd->final_bracount)
4374 {
4375 PUT2(code, 2+LINK_SIZE, recno);
4376 }
4377
4378 /* Either an unidentified subpattern, or a reference to (?(0) */
4379
4380 else
4381 {
4382 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4383 goto FAILED;
4384 }
4385 break;
4386
4387
4388 /* ------------------------------------------------------------ */
4389 case '=': /* Positive lookahead */
4390 bravalue = OP_ASSERT;
4391 ptr++;
4392 break;
4393
4394
4395 /* ------------------------------------------------------------ */
4396 case '!': /* Negative lookahead */
4397 ptr++;
4398 if (*ptr == ')') /* Optimize (?!) */
4399 {
4400 *code++ = OP_FAIL;
4401 previous = NULL;
4402 continue;
4403 }
4404 bravalue = OP_ASSERT_NOT;
4405 break;
4406
4407
4408 /* ------------------------------------------------------------ */
4409 case '<': /* Lookbehind or named define */
4410 switch (ptr[1])
4411 {
4412 case '=': /* Positive lookbehind */
4413 bravalue = OP_ASSERTBACK;
4414 ptr += 2;
4415 break;
4416
4417 case '!': /* Negative lookbehind */
4418 bravalue = OP_ASSERTBACK_NOT;
4419 ptr += 2;
4420 break;
4421
4422 default: /* Could be name define, else bad */
4423 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4424 ptr++; /* Correct offset for error */
4425 *errorcodeptr = ERR24;
4426 goto FAILED;
4427 }
4428 break;
4429
4430
4431 /* ------------------------------------------------------------ */
4432 case '>': /* One-time brackets */
4433 bravalue = OP_ONCE;
4434 ptr++;
4435 break;
4436
4437
4438 /* ------------------------------------------------------------ */
4439 case 'C': /* Callout - may be followed by digits; */
4440 previous_callout = code; /* Save for later completion */
4441 after_manual_callout = 1; /* Skip one item before completing */
4442 *code++ = OP_CALLOUT;
4443 {
4444 int n = 0;
4445 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4446 n = n * 10 + *ptr - '0';
4447 if (*ptr != ')')
4448 {
4449 *errorcodeptr = ERR39;
4450 goto FAILED;
4451 }
4452 if (n > 255)
4453 {
4454 *errorcodeptr = ERR38;
4455 goto FAILED;
4456 }
4457 *code++ = n;
4458 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4459 PUT(code, LINK_SIZE, 0); /* Default length */
4460 code += 2 * LINK_SIZE;
4461 }
4462 previous = NULL;
4463 continue;
4464
4465
4466 /* ------------------------------------------------------------ */
4467 case 'P': /* Python-style named subpattern handling */
4468 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4469 {
4470 is_recurse = *ptr == '>';
4471 terminator = ')';
4472 goto NAMED_REF_OR_RECURSE;
4473 }
4474 else if (*ptr != '<') /* Test for Python-style definition */
4475 {
4476 *errorcodeptr = ERR41;
4477 goto FAILED;
4478 }
4479 /* Fall through to handle (?P< as (?< is handled */
4480
4481
4482 /* ------------------------------------------------------------ */
4483 DEFINE_NAME: /* Come here from (?< handling */
4484 case '\'':
4485 {
4486 terminator = (*ptr == '<')? '>' : '\'';
4487 name = ++ptr;
4488
4489 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4490 namelen = ptr - name;
4491
4492 /* In the pre-compile phase, just do a syntax check. */
4493
4494 if (lengthptr != NULL)
4495 {
4496 if (*ptr != terminator)
4497 {
4498 *errorcodeptr = ERR42;
4499 goto FAILED;
4500 }
4501 if (cd->names_found >= MAX_NAME_COUNT)
4502 {
4503 *errorcodeptr = ERR49;
4504 goto FAILED;
4505 }
4506 if (namelen + 3 > cd->name_entry_size)
4507 {
4508 cd->name_entry_size = namelen + 3;
4509 if (namelen > MAX_NAME_SIZE)
4510 {
4511 *errorcodeptr = ERR48;
4512 goto FAILED;
4513 }
4514 }
4515 }
4516
4517 /* In the real compile, create the entry in the table */
4518
4519 else
4520 {
4521 slot = cd->name_table;
4522 for (i = 0; i < cd->names_found; i++)
4523 {
4524 int crc = memcmp(name, slot+2, namelen);
4525 if (crc == 0)
4526 {
4527 if (slot[2+namelen] == 0)
4528 {
4529 if ((options & PCRE_DUPNAMES) == 0)
4530 {
4531 *errorcodeptr = ERR43;
4532 goto FAILED;
4533 }
4534 }
4535 else crc = -1; /* Current name is substring */
4536 }
4537 if (crc < 0)
4538 {
4539 memmove(slot + cd->name_entry_size, slot,
4540 (cd->names_found - i) * cd->name_entry_size);
4541 break;
4542 }
4543 slot += cd->name_entry_size;
4544 }
4545
4546 PUT2(slot, 0, cd->bracount + 1);
4547 memcpy(slot + 2, name, namelen);
4548 slot[2+namelen] = 0;
4549 }
4550 }
4551
4552 /* In both cases, count the number of names we've encountered. */
4553
4554 ptr++; /* Move past > or ' */
4555 cd->names_found++;
4556 goto NUMBERED_GROUP;
4557
4558
4559 /* ------------------------------------------------------------ */
4560 case '&': /* Perl recursion/subroutine syntax */
4561 terminator = ')';
4562 is_recurse = TRUE;
4563 /* Fall through */
4564
4565 /* We come here from the Python syntax above that handles both
4566 references (?P=name) and recursion (?P>name), as well as falling
4567 through from the Perl recursion syntax (?&name). We also come here from
4568 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4569 .NET syntax. */
4570
4571 NAMED_REF_OR_RECURSE:
4572 name = ++ptr;
4573 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4574 namelen = ptr - name;
4575
4576 /* In the pre-compile phase, do a syntax check and set a dummy
4577 reference number. */
4578
4579 if (lengthptr != NULL)
4580 {
4581 if (namelen == 0)
4582 {
4583 *errorcodeptr = ERR62;
4584 goto FAILED;
4585 }
4586 if (*ptr != terminator)
4587 {
4588 *errorcodeptr = ERR42;
4589 goto FAILED;
4590 }
4591 if (namelen > MAX_NAME_SIZE)
4592 {
4593 *errorcodeptr = ERR48;
4594 goto FAILED;
4595 }
4596 recno = 0;
4597 }
4598
4599 /* In the real compile, seek the name in the table. We check the name
4600 first, and then check that we have reached the end of the name in the
4601 table. That way, if the name is longer than any in the table,
4602 the comparison will fail without reading beyond the table entry. */
4603
4604 else
4605 {
4606 slot = cd->name_table;
4607 for (i = 0; i < cd->names_found; i++)
4608 {
4609 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4610 slot[2+namelen] == 0)
4611 break;
4612 slot += cd->name_entry_size;
4613 }
4614
4615 if (i < cd->names_found) /* Back reference */
4616 {
4617 recno = GET2(slot, 0);
4618 }
4619 else if ((recno = /* Forward back reference */
4620 find_parens(ptr, cd->bracount, name, namelen,
4621 (options & PCRE_EXTENDED) != 0)) <= 0)
4622 {
4623 *errorcodeptr = ERR15;
4624 goto FAILED;
4625 }
4626 }
4627
4628 /* In both phases, we can now go to the code that handles numerical
4629 recursion or backreferences. */
4630
4631 if (is_recurse) goto HANDLE_RECURSION;
4632 else goto HANDLE_REFERENCE;
4633
4634
4635 /* ------------------------------------------------------------ */
4636 case 'R': /* Recursion */
4637 ptr++; /* Same as (?0) */
4638 /* Fall through */
4639
4640
4641 /* ------------------------------------------------------------ */
4642 case '-': case '+':
4643 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4644 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4645 {
4646 const uschar *called;
4647
4648 if ((refsign = *ptr) == '+')
4649 {
4650 ptr++;
4651 if ((digitab[*ptr] & ctype_digit) == 0)
4652 {
4653 *errorcodeptr = ERR63;
4654 goto FAILED;
4655 }
4656 }
4657 else if (refsign == '-')
4658 {
4659 if ((digitab[ptr[1]] & ctype_digit) == 0)
4660 goto OTHER_CHAR_AFTER_QUERY;
4661 ptr++;
4662 }
4663
4664 recno = 0;
4665 while((digitab[*ptr] & ctype_digit) != 0)
4666 recno = recno * 10 + *ptr++ - '0';
4667
4668 if (*ptr != ')')
4669 {
4670 *errorcodeptr = ERR29;
4671 goto FAILED;
4672 }
4673
4674 if (refsign == '-')
4675 {
4676 if (recno == 0)
4677 {
4678 *errorcodeptr = ERR58;
4679 goto FAILED;
4680 }
4681 recno = cd->bracount - recno + 1;
4682 if (recno <= 0)
4683 {
4684 *errorcodeptr = ERR15;
4685 goto FAILED;
4686 }
4687 }
4688 else if (refsign == '+')
4689 {
4690 if (recno == 0)
4691 {
4692 *errorcodeptr = ERR58;
4693 goto FAILED;
4694 }
4695 recno += cd->bracount;
4696 }
4697
4698 /* Come here from code above that handles a named recursion */
4699
4700 HANDLE_RECURSION:
4701
4702 previous = code;
4703 called = cd->start_code;
4704
4705 /* When we are actually compiling, find the bracket that is being
4706 referenced. Temporarily end the regex in case it doesn't exist before
4707 this point. If we end up with a forward reference, first check that
4708 the bracket does occur later so we can give the error (and position)
4709 now. Then remember this forward reference in the workspace so it can
4710 be filled in at the end. */
4711
4712 if (lengthptr == NULL)
4713 {
4714 *code = OP_END;
4715 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4716
4717 /* Forward reference */
4718
4719 if (called == NULL)
4720 {
4721 if (find_parens(ptr, cd->bracount, NULL, recno,
4722 (options & PCRE_EXTENDED) != 0) < 0)
4723 {
4724 *errorcodeptr = ERR15;
4725 goto FAILED;
4726 }
4727 called = cd->start_code + recno;
4728 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4729 }
4730
4731 /* If not a forward reference, and the subpattern is still open,
4732 this is a recursive call. We check to see if this is a left
4733 recursion that could loop for ever, and diagnose that case. */
4734
4735 else if (GET(called, 1) == 0 &&
4736 could_be_empty(called, code, bcptr, utf8))
4737 {
4738 *errorcodeptr = ERR40;
4739 goto FAILED;
4740 }
4741 }
4742
4743 /* Insert the recursion/subroutine item, automatically wrapped inside
4744 "once" brackets. Set up a "previous group" length so that a
4745 subsequent quantifier will work. */
4746
4747 *code = OP_ONCE;
4748 PUT(code, 1, 2 + 2*LINK_SIZE);
4749 code += 1 + LINK_SIZE;
4750
4751 *code = OP_RECURSE;
4752 PUT(code, 1, called - cd->start_code);
4753 code += 1 + LINK_SIZE;
4754
4755 *code = OP_KET;
4756 PUT(code, 1, 2 + 2*LINK_SIZE);
4757 code += 1 + LINK_SIZE;
4758
4759 length_prevgroup = 3 + 3*LINK_SIZE;
4760 }
4761
4762 /* Can't determine a first byte now */
4763
4764 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4765 continue;
4766
4767
4768 /* ------------------------------------------------------------ */
4769 default: /* Other characters: check option setting */
4770 OTHER_CHAR_AFTER_QUERY:
4771 set = unset = 0;
4772 optset = &set;
4773
4774 while (*ptr != ')' && *ptr != ':')
4775 {
4776 switch (*ptr++)
4777 {
4778 case '-': optset = &unset; break;
4779
4780 case 'J': /* Record that it changed in the external options */
4781 *optset |= PCRE_DUPNAMES;
4782 cd->external_flags |= PCRE_JCHANGED;
4783 break;
4784
4785 case 'i': *optset |= PCRE_CASELESS; break;
4786 case 'm': *optset |= PCRE_MULTILINE; break;
4787 case 's': *optset |= PCRE_DOTALL; break;
4788 case 'x': *optset |= PCRE_EXTENDED; break;
4789 case 'U': *optset |= PCRE_UNGREEDY; break;
4790 case 'X': *optset |= PCRE_EXTRA; break;
4791
4792 default: *errorcodeptr = ERR12;
4793 ptr--; /* Correct the offset */
4794 goto FAILED;
4795 }
4796 }
4797
4798 /* Set up the changed option bits, but don't change anything yet. */
4799
4800 newoptions = (options | set) & (~unset);
4801
4802 /* If the options ended with ')' this is not the start of a nested
4803 group with option changes, so the options change at this level. If this
4804 item is right at the start of the pattern, the options can be
4805 abstracted and made external in the pre-compile phase, and ignored in
4806 the compile phase. This can be helpful when matching -- for instance in
4807 caseless checking of required bytes.
4808
4809 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4810 definitely *not* at the start of the pattern because something has been
4811 compiled. In the pre-compile phase, however, the code pointer can have
4812 that value after the start, because it gets reset as code is discarded
4813 during the pre-compile. However, this can happen only at top level - if
4814 we are within parentheses, the starting BRA will still be present. At
4815 any parenthesis level, the length value can be used to test if anything
4816 has been compiled at that level. Thus, a test for both these conditions
4817 is necessary to ensure we correctly detect the start of the pattern in
4818 both phases.
4819
4820 If we are not at the pattern start, compile code to change the ims
4821 options if this setting actually changes any of them. We also pass the
4822 new setting back so that it can be put at the start of any following
4823 branches, and when this group ends (if we are in a group), a resetting
4824 item can be compiled. */
4825
4826 if (*ptr == ')')
4827 {
4828 if (code == cd->start_code + 1 + LINK_SIZE &&
4829 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4830 {
4831 cd->external_options = newoptions;
4832 options = newoptions;
4833 }
4834 else
4835 {
4836 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4837 {
4838 *code++ = OP_OPT;
4839 *code++ = newoptions & PCRE_IMS;
4840 }
4841
4842 /* Change options at this level, and pass them back for use
4843 in subsequent branches. Reset the greedy defaults and the case
4844 value for firstbyte and reqbyte. */
4845
4846 *optionsptr = options = newoptions;
4847 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4848 greedy_non_default = greedy_default ^ 1;
4849 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4850 }
4851
4852 previous = NULL; /* This item can't be repeated */
4853 continue; /* It is complete */
4854 }
4855
4856 /* If the options ended with ':' we are heading into a nested group
4857 with possible change of options. Such groups are non-capturing and are
4858 not assertions of any kind. All we need to do is skip over the ':';
4859 the newoptions value is handled below. */
4860
4861 bravalue = OP_BRA;
4862 ptr++;
4863 } /* End of switch for character following (? */
4864 } /* End of (? handling */
4865
4866 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4867 all unadorned brackets become non-capturing and behave like (?:...)
4868 brackets. */
4869
4870 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4871 {
4872 bravalue = OP_BRA;
4873 }
4874
4875 /* Else we have a capturing group. */
4876
4877 else
4878 {
4879 NUMBERED_GROUP:
4880 cd->bracount += 1;
4881 PUT2(code, 1+LINK_SIZE, cd->bracount);
4882 skipbytes = 2;
4883 }
4884
4885 /* Process nested bracketed regex. Assertions may not be repeated, but
4886 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4887 non-register variable in order to be able to pass its address because some
4888 compilers complain otherwise. Pass in a new setting for the ims options if
4889 they have changed. */
4890
4891 previous = (bravalue >= OP_ONCE)? code : NULL;
4892 *code = bravalue;
4893 tempcode = code;
4894 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4895 length_prevgroup = 0; /* Initialize for pre-compile phase */
4896
4897 if (!compile_regex(
4898 newoptions, /* The complete new option state */
4899 options & PCRE_IMS, /* The previous ims option state */
4900 &tempcode, /* Where to put code (updated) */
4901 &ptr, /* Input pointer (updated) */
4902 errorcodeptr, /* Where to put an error message */
4903 (bravalue == OP_ASSERTBACK ||
4904 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4905 reset_bracount, /* True if (?| group */
4906 skipbytes, /* Skip over bracket number */
4907 &subfirstbyte, /* For possible first char */
4908 &subreqbyte, /* For possible last char */
4909 bcptr, /* Current branch chain */
4910 cd, /* Tables block */
4911 (lengthptr == NULL)? NULL : /* Actual compile phase */
4912 &length_prevgroup /* Pre-compile phase */
4913 ))
4914 goto FAILED;
4915
4916 /* At the end of compiling, code is still pointing to the start of the
4917 group, while tempcode has been updated to point past the end of the group
4918 and any option resetting that may follow it. The pattern pointer (ptr)
4919 is on the bracket. */
4920
4921 /* If this is a conditional bracket, check that there are no more than
4922 two branches in the group, or just one if it's a DEFINE group. We do this
4923 in the real compile phase, not in the pre-pass, where the whole group may
4924 not be available. */
4925
4926 if (bravalue == OP_COND && lengthptr == NULL)
4927 {
4928 uschar *tc = code;
4929 int condcount = 0;
4930
4931 do {
4932 condcount++;
4933 tc += GET(tc,1);
4934 }
4935 while (*tc != OP_KET);
4936
4937 /* A DEFINE group is never obeyed inline (the "condition" is always
4938 false). It must have only one branch. */
4939
4940 if (code[LINK_SIZE+1] == OP_DEF)
4941 {
4942 if (condcount > 1)
4943 {
4944 *errorcodeptr = ERR54;
4945 goto FAILED;
4946 }
4947 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4948 }
4949
4950 /* A "normal" conditional group. If there is just one branch, we must not
4951 make use of its firstbyte or reqbyte, because this is equivalent to an
4952 empty second branch. */
4953
4954 else
4955 {
4956 if (condcount > 2)
4957 {
4958 *errorcodeptr = ERR27;
4959 goto FAILED;
4960 }
4961 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4962 }
4963 }
4964
4965 /* Error if hit end of pattern */
4966
4967 if (*ptr != ')')
4968 {
4969 *errorcodeptr = ERR14;
4970 goto FAILED;
4971 }
4972
4973 /* In the pre-compile phase, update the length by the length of the group,
4974 less the brackets at either end. Then reduce the compiled code to just a
4975 set of non-capturing brackets so that it doesn't use much memory if it is
4976 duplicated by a quantifier.*/
4977
4978 if (lengthptr != NULL)
4979 {
4980 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4981 {
4982 *errorcodeptr = ERR20;
4983 goto FAILED;
4984 }
4985 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4986 *code++ = OP_BRA;
4987 PUTINC(code, 0, 1 + LINK_SIZE);
4988 *code++ = OP_KET;
4989 PUTINC(code, 0, 1 + LINK_SIZE);
4990 break; /* No need to waste time with special character handling */
4991 }
4992
4993 /* Otherwise update the main code pointer to the end of the group. */
4994
4995 code = tempcode;
4996
4997 /* For a DEFINE group, required and first character settings are not
4998 relevant. */
4999
5000 if (bravalue == OP_DEF) break;
5001
5002 /* Handle updating of the required and first characters for other types of
5003 group. Update for normal brackets of all kinds, and conditions with two
5004 branches (see code above). If the bracket is followed by a quantifier with
5005 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5006 zerofirstbyte outside the main loop so that they can be accessed for the
5007 back off. */
5008
5009 zeroreqbyte = reqbyte;
5010 zerofirstbyte = firstbyte;
5011 groupsetfirstbyte = FALSE;
5012
5013 if (bravalue >= OP_ONCE)
5014 {
5015 /* If we have not yet set a firstbyte in this branch, take it from the
5016 subpattern, remembering that it was set here so that a repeat of more
5017 than one can replicate it as reqbyte if necessary. If the subpattern has
5018 no firstbyte, set "none" for the whole branch. In both cases, a zero
5019 repeat forces firstbyte to "none". */
5020
5021 if (firstbyte == REQ_UNSET)
5022 {
5023 if (subfirstbyte >= 0)
5024 {
5025 firstbyte = subfirstbyte;
5026 groupsetfirstbyte = TRUE;
5027 }
5028 else firstbyte = REQ_NONE;
5029 zerofirstbyte = REQ_NONE;
5030 }
5031
5032 /* If firstbyte was previously set, convert the subpattern's firstbyte
5033 into reqbyte if there wasn't one, using the vary flag that was in
5034 existence beforehand. */
5035
5036 else if (subfirstbyte >= 0 && subreqbyte < 0)
5037 subreqbyte = subfirstbyte | tempreqvary;
5038
5039 /* If the subpattern set a required byte (or set a first byte that isn't
5040 really the first byte - see above), set it. */
5041
5042 if (subreqbyte >= 0) reqbyte = subreqbyte;
5043 }
5044
5045 /* For a forward assertion, we take the reqbyte, if set. This can be
5046 helpful if the pattern that follows the assertion doesn't set a different
5047 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5048 for an assertion, however, because it leads to an incorrect effect for patterns
5049 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5050 of a firstbyte. This is overcome by a scan at the end if there's no
5051 firstbyte, looking for an asserted first char. */