monotone

monotone Mtn Source Tree

Root/pcre/pcre_dfa_exec.c

1/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40
/* This module contains the external function pcre_dfa_exec(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
applications. */
45
46
47#include "pcre_config.h"
48
/* Aliases for names used in this file. They are defined ahead of the
pcre_internal.h include that follows. */

#define NLBLOCK md             /* Block containing newline information */
#define PSSTART start_subject  /* Field containing processed string start */
#define PSEND   end_subject    /* Field containing processed string end */
52
53#include "pcre_internal.h"
54
55
/* Padding used to indent debugging output. The debug format strings print it
with "%.*s" and a precision of rlevel*2-2, so the string must hold at least
that many spaces for the deepest level that is to be indented; a shorter string
silently caps the indentation at its own length. */

#define SP "                                        "
59
60
61
62/*************************************************
63* Code parameters and static tables *
64*************************************************/
65
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. One of them is added to the opcode value
when the opcode's argument is a property, extended-Unicode, any-newline,
horizontal-space or vertical-space item, and the sum is then used as a switch
case label. A gap of 20 between the blocks should be enough. The resulting
opcodes don't have to be less than 256 because they are never stored, so we
push them well clear of the normal opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
76
77
78/* This table identifies those opcodes that are followed immediately by a
79character that is to be tested in some way. This makes is possible to
80centralize the loading of these characters. In the case of Type * etc, the
81"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
82small value. ***NOTE*** If the start of this table is modified, the two tables
83that follow must also be modified. */
84
85static uschar coptable[] = {
86 0, /* End */
87 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
88 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
89 0, 0, /* Any, Anybyte */
90 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
91 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
92 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
93 1, /* Char */
94 1, /* Charnc */
95 1, /* not */
96 /* Positive single-char repeats */
97 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
98 3, 3, 3, /* upto, minupto, exact */
99 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
100 /* Negative single-char repeats - only for chars < 256 */
101 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
102 3, 3, 3, /* NOT upto, minupto, exact */
103 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
104 /* Positive type repeats */
105 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
106 3, 3, 3, /* Type upto, minupto, exact */
107 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
108 /* Character class & ref repeats */
109 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
110 0, 0, /* CRRANGE, CRMINRANGE */
111 0, /* CLASS */
112 0, /* NCLASS */
113 0, /* XCLASS - variable length */
114 0, /* REF */
115 0, /* RECURSE */
116 0, /* CALLOUT */
117 0, /* Alt */
118 0, /* Ket */
119 0, /* KetRmax */
120 0, /* KetRmin */
121 0, /* Assert */
122 0, /* Assert not */
123 0, /* Assert behind */
124 0, /* Assert behind not */
125 0, /* Reverse */
126 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
127 0, 0, 0, /* SBRA, SCBRA, SCOND */
128 0, /* CREF */
129 0, /* RREF */
130 0, /* DEF */
131 0, 0, /* BRAZERO, BRAMINZERO */
132 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
133 0, 0 /* FAIL, ACCEPT */
134};
135
136/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
137and \w */
138
139static uschar toptable1[] = {
140 0, 0, 0, 0, 0, 0,
141 ctype_digit, ctype_digit,
142 ctype_space, ctype_space,
143 ctype_word, ctype_word,
144 0 /* OP_ANY */
145};
146
147static uschar toptable2[] = {
148 0, 0, 0, 0, 0, 0,
149 ctype_digit, 0,
150 ctype_space, 0,
151 ctype_word, 0,
152 1 /* OP_ANY */
153};
154
155
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode; a negative value means
                                     "hold off entering the negated state until
                                     the characters counted in data have been
                                     skipped" */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data, for example the
                                     number of characters still to skip */
} stateblock;

/* The number of ints of workspace that one stateblock occupies. */

#define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
169
170
171#ifdef DEBUG
172/*************************************************
173* Print character string *
174*************************************************/
175
/* Character string printing function for debugging. Bytes that isprint()
accepts are written as themselves; every other byte is written as a backslash,
the letter x, and two lower-case hexadecimal digits.

Arguments:
  p           points to string (it is only read, never modified)
  length      number of bytes; nothing is printed when this is <= 0
  f           where to print

Returns:     nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
int c;
while (length-- > 0)
  {
  c = *p++;
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
198#endif
199
200
201
202/*************************************************
203* Execute a Regular Expression - DFA engine *
204*************************************************/
205
206/* This internal function applies a compiled pattern to a subject string,
207starting at a given point, using a DFA engine. This function is called from the
208external one, possibly multiple times if the pattern is not anchored. The
209function calls itself recursively for some kinds of subpattern.
210
211Arguments:
212 md the match_data block with fixed information
213 this_start_code the opening bracket of this subexpression's code
214 current_subject where we currently are in the subject string
215 start_offset start offset in the subject string
216 offsets vector to contain the matching string offsets
217 offsetcount size of same
218 workspace vector of workspace
219 wscount size of same
220 ims the current ims flags
221 rlevel function call recursion level
222 recursing regex recursive call level
223
Returns:            > 0 => number of matches whose offset pairs were stored in offsets
                    = 0 => matched, but offsets could not hold every offset pair
                     -1 => failed to match
                   < -1 => some kind of unexpected problem
228
229The following macros are used for adding states to the two state vectors (one
230for the current character, one for the following character). */
231
/* Append a state with opcode offset x and repeat count y to the list of states
for the current character. The ims field is taken from the local variable ims,
and the data field is not written. If the list already holds wscount states,
the function in which the macro is used returns PCRE_ERROR_DFA_WSSIZE. The body
is wrapped in do ... while (0) so that an invocation followed by a semicolon is
exactly one statement. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->ims    = ims; \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
242
/* Append a state with opcode offset x, repeat count y and extra data z to the
list of states for the current character. The ims field is taken from the local
variable ims. If the list already holds wscount states, the function in which
the macro is used returns PCRE_ERROR_DFA_WSSIZE. The body is wrapped in
do ... while (0) so that an invocation followed by a semicolon is exactly one
statement. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->ims    = ims; \
      next_active_state->data   = (z); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
254
/* Append a state with opcode offset x and repeat count y to the list of states
for the following character. The ims field is taken from the local variable
ims, and the data field is not written. If the list already holds wscount
states, the function in which the macro is used returns PCRE_ERROR_DFA_WSSIZE.
The body is wrapped in do ... while (0) so that an invocation followed by a
semicolon is exactly one statement. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->ims    = ims; \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
265
/* Append a state with opcode offset x, repeat count y and extra data z to the
list of states for the following character. The ims field is taken from the
local variable ims. If the list already holds wscount states, the function in
which the macro is used returns PCRE_ERROR_DFA_WSSIZE. The body is wrapped in
do ... while (0) so that an invocation followed by a semicolon is exactly one
statement. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->ims    = ims; \
      next_new_state->data   = (z); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
277
278/* And now, here is the code */
279
280static int
281internal_dfa_exec(
282 dfa_match_data *md,
283 const uschar *this_start_code,
284 const uschar *current_subject,
285 int start_offset,
286 int *offsets,
287 int offsetcount,
288 int *workspace,
289 int wscount,
290 int ims,
291 int rlevel,
292 int recursing)
293{
294stateblock *active_states, *new_states, *temp_states;
295stateblock *next_active_state, *next_new_state;
296
297const uschar *ctypes, *lcc, *fcc;
298const uschar *ptr;
299const uschar *end_code, *first_op;
300
301int active_count, new_count, match_count;
302
303/* Some fields in the md block are frequently referenced, so we load them into
304independent variables in the hope that this will perform better. */
305
306const uschar *start_subject = md->start_subject;
307const uschar *end_subject = md->end_subject;
308const uschar *start_code = md->start_code;
309
310#ifdef SUPPORT_UTF8
311BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
312#else
313BOOL utf8 = FALSE;
314#endif
315
316rlevel++;
317offsetcount &= (-2);
318
319wscount -= 2;
320wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
321 (2 * INTS_PER_STATEBLOCK);
322
323DPRINTF(("\n%.*s---------------------\n"
324 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
325 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
326
327ctypes = md->tables + ctypes_offset;
328lcc = md->tables + lcc_offset;
329fcc = md->tables + fcc_offset;
330
331match_count = PCRE_ERROR_NOMATCH; /* A negative number */
332
333active_states = (stateblock *)(workspace + 2);
334next_new_state = new_states = active_states + wscount;
335new_count = 0;
336
337first_op = this_start_code + 1 + LINK_SIZE +
338 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
339
340/* The first thing in any (sub) pattern is a bracket of some sort. Push all
341the alternative states onto the list, and find out where the end is. This
342makes is possible to use this function recursively, when we want to stop at a
343matching internal ket rather than at the end.
344
345If the first opcode in the first alternative is OP_REVERSE, we are dealing with
346a backward assertion. In that case, we have to find out the maximum amount to
347move back, and set up each alternative appropriately. */
348
349if (*first_op == OP_REVERSE)
350 {
351 int max_back = 0;
352 int gone_back;
353
354 end_code = this_start_code;
355 do
356 {
357 int back = GET(end_code, 2+LINK_SIZE);
358 if (back > max_back) max_back = back;
359 end_code += GET(end_code, 1);
360 }
361 while (*end_code == OP_ALT);
362
363 /* If we can't go back the amount required for the longest lookbehind
364 pattern, go back as far as we can; some alternatives may still be viable. */
365
366#ifdef SUPPORT_UTF8
367 /* In character mode we have to step back character by character */
368
369 if (utf8)
370 {
371 for (gone_back = 0; gone_back < max_back; gone_back++)
372 {
373 if (current_subject <= start_subject) break;
374 current_subject--;
375 while (current_subject > start_subject &&
376 (*current_subject & 0xc0) == 0x80)
377 current_subject--;
378 }
379 }
380 else
381#endif
382
383 /* In byte-mode we can do this quickly. */
384
385 {
386 gone_back = (current_subject - max_back < start_subject)?
387 current_subject - start_subject : max_back;
388 current_subject -= gone_back;
389 }
390
391 /* Now we can process the individual branches. */
392
393 end_code = this_start_code;
394 do
395 {
396 int back = GET(end_code, 2+LINK_SIZE);
397 if (back <= gone_back)
398 {
399 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
400 ADD_NEW_DATA(-bstate, 0, gone_back - back);
401 }
402 end_code += GET(end_code, 1);
403 }
404 while (*end_code == OP_ALT);
405 }
406
407/* This is the code for a "normal" subpattern (not a backward assertion). The
408start of a whole pattern is always one of these. If we are at the top level,
409we may be asked to restart matching from the same point that we reached for a
410previous partial match. We still have to scan through the top-level branches to
411find the end state. */
412
413else
414 {
415 end_code = this_start_code;
416
417 /* Restarting */
418
419 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
420 {
421 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
422 new_count = workspace[1];
423 if (!workspace[0])
424 memcpy(new_states, active_states, new_count * sizeof(stateblock));
425 }
426
427 /* Not restarting */
428
429 else
430 {
431 int length = 1 + LINK_SIZE +
432 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
433 do
434 {
435 ADD_NEW(end_code - start_code + length, 0);
436 end_code += GET(end_code, 1);
437 length = 1 + LINK_SIZE;
438 }
439 while (*end_code == OP_ALT);
440 }
441 }
442
443workspace[0] = 0; /* Bit indicating which vector is current */
444
445DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
446
447/* Loop for scanning the subject */
448
449ptr = current_subject;
450for (;;)
451 {
452 int i, j;
453 int clen, dlen;
454 unsigned int c, d;
455
456 /* Make the new state list into the active state list and empty the
457 new state list. */
458
459 temp_states = active_states;
460 active_states = new_states;
461 new_states = temp_states;
462 active_count = new_count;
463 new_count = 0;
464
465 workspace[0] ^= 1; /* Remember for the restarting feature */
466 workspace[1] = active_count;
467
468#ifdef DEBUG
469 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
470 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
471 printf("\"\n");
472
473 printf("%.*sActive states: ", rlevel*2-2, SP);
474 for (i = 0; i < active_count; i++)
475 printf("%d/%d ", active_states[i].offset, active_states[i].count);
476 printf("\n");
477#endif
478
479 /* Set the pointers for adding new states */
480
481 next_active_state = active_states + active_count;
482 next_new_state = new_states;
483
484 /* Load the current character from the subject outside the loop, as many
485 different states may want to look at it, and we assume that at least one
486 will. */
487
488 if (ptr < end_subject)
489 {
490 clen = 1; /* Number of bytes in the character */
491#ifdef SUPPORT_UTF8
492 if (utf8) { GETCHARLEN(c, ptr, clen); } else
493#endif /* SUPPORT_UTF8 */
494 c = *ptr;
495 }
496 else
497 {
498 clen = 0; /* This indicates the end of the subject */
499 c = NOTACHAR; /* This value should never actually be used */
500 }
501
502 /* Scan up the active states and act on each one. The result of an action
503 may be to add more states to the currently active list (e.g. on hitting a
504 parenthesis) or it may be to put states on the new list, for considering
505 when we move the character pointer on. */
506
507 for (i = 0; i < active_count; i++)
508 {
509 stateblock *current_state = active_states + i;
510 const uschar *code;
511 int state_offset = current_state->offset;
512 int count, codevalue;
513#ifdef SUPPORT_UCP
514 int chartype, script;
515#endif
516
517#ifdef DEBUG
518 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
519 if (clen == 0) printf("EOL\n");
520 else if (c > 32 && c < 127) printf("'%c'\n", c);
521 else printf("0x%02x\n", c);
522#endif
523
524 /* This variable is referred to implicity in the ADD_xxx macros. */
525
526 ims = current_state->ims;
527
528 /* A negative offset is a special case meaning "hold off going to this
529 (negated) state until the number of characters in the data field have
530 been skipped". */
531
532 if (state_offset < 0)
533 {
534 if (current_state->data > 0)
535 {
536 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
537 ADD_NEW_DATA(state_offset, current_state->count,
538 current_state->data - 1);
539 continue;
540 }
541 else
542 {
543 current_state->offset = state_offset = -state_offset;
544 }
545 }
546
547 /* Check for a duplicate state with the same count, and skip if found. */
548
549 for (j = 0; j < i; j++)
550 {
551 if (active_states[j].offset == state_offset &&
552 active_states[j].count == current_state->count)
553 {
554 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
555 goto NEXT_ACTIVE_STATE;
556 }
557 }
558
559 /* The state offset is the offset to the opcode */
560
561 code = start_code + state_offset;
562 codevalue = *code;
563
564 /* If this opcode is followed by an inline character, load it. It is
565 tempting to test for the presence of a subject character here, but that
566 is wrong, because sometimes zero repetitions of the subject are
567 permitted.
568
569 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
570 argument that is not a data character - but is always one byte long. We
571 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
572 this case. To keep the other cases fast, convert these ones to new opcodes.
573 */
574
575 if (coptable[codevalue] > 0)
576 {
577 dlen = 1;
578#ifdef SUPPORT_UTF8
579 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
580#endif /* SUPPORT_UTF8 */
581 d = code[coptable[codevalue]];
582 if (codevalue >= OP_TYPESTAR)
583 {
584 switch(d)
585 {
586 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
587 case OP_NOTPROP:
588 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
589 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
590 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
591 case OP_NOT_HSPACE:
592 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
593 case OP_NOT_VSPACE:
594 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
595 default: break;
596 }
597 }
598 }
599 else
600 {
601 dlen = 0; /* Not strictly necessary, but compilers moan */
602 d = NOTACHAR; /* if these variables are not set. */
603 }
604
605
606 /* Now process the individual opcodes */
607
608 switch (codevalue)
609 {
610
611/* ========================================================================== */
612 /* Reached a closing bracket. If not at the end of the pattern, carry
613 on with the next opcode. Otherwise, unless we have an empty string and
614 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
615 matches so we always have the longest first. */
616
617 case OP_KET:
618 case OP_KETRMIN:
619 case OP_KETRMAX:
620 if (code != end_code)
621 {
622 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
623 if (codevalue != OP_KET)
624 {
625 ADD_ACTIVE(state_offset - GET(code, 1), 0);
626 }
627 }
628 else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
629 {
630 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
631 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
632 match_count = 0;
633 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
634 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
635 if (offsetcount >= 2)
636 {
637 offsets[0] = current_subject - start_subject;
638 offsets[1] = ptr - start_subject;
639 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
640 offsets[1] - offsets[0], current_subject));
641 }
642 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
643 {
644 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
645 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
646 match_count, rlevel*2-2, SP));
647 return match_count;
648 }
649 }
650 break;
651
652/* ========================================================================== */
653 /* These opcodes add to the current list of states without looking
654 at the current character. */
655
656 /*-----------------------------------------------------------------*/
657 case OP_ALT:
658 do { code += GET(code, 1); } while (*code == OP_ALT);
659 ADD_ACTIVE(code - start_code, 0);
660 break;
661
662 /*-----------------------------------------------------------------*/
663 case OP_BRA:
664 case OP_SBRA:
665 do
666 {
667 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
668 code += GET(code, 1);
669 }
670 while (*code == OP_ALT);
671 break;
672
673 /*-----------------------------------------------------------------*/
674 case OP_CBRA:
675 case OP_SCBRA:
676 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
677 code += GET(code, 1);
678 while (*code == OP_ALT)
679 {
680 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
681 code += GET(code, 1);
682 }
683 break;
684
685 /*-----------------------------------------------------------------*/
686 case OP_BRAZERO:
687 case OP_BRAMINZERO:
688 ADD_ACTIVE(state_offset + 1, 0);
689 code += 1 + GET(code, 2);
690 while (*code == OP_ALT) code += GET(code, 1);
691 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
692 break;
693
694 /*-----------------------------------------------------------------*/
695 case OP_CIRC:
696 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
697 ((ims & PCRE_MULTILINE) != 0 &&
698 ptr != end_subject &&
699 WAS_NEWLINE(ptr)))
700 { ADD_ACTIVE(state_offset + 1, 0); }
701 break;
702
703 /*-----------------------------------------------------------------*/
704 case OP_EOD:
705 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
706 break;
707
708 /*-----------------------------------------------------------------*/
709 case OP_OPT:
710 ims = code[1];
711 ADD_ACTIVE(state_offset + 2, 0);
712 break;
713
714 /*-----------------------------------------------------------------*/
715 case OP_SOD:
716 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
717 break;
718
719 /*-----------------------------------------------------------------*/
720 case OP_SOM:
721 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
722 break;
723
724
725/* ========================================================================== */
726 /* These opcodes inspect the next subject character, and sometimes
727 the previous one as well, but do not have an argument. The variable
728 clen contains the length of the current character and is zero if we are
729 at the end of the subject. */
730
731 /*-----------------------------------------------------------------*/
732 case OP_ANY:
733 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
734 { ADD_NEW(state_offset + 1, 0); }
735 break;
736
737 /*-----------------------------------------------------------------*/
738 case OP_EODN:
739 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
740 { ADD_ACTIVE(state_offset + 1, 0); }
741 break;
742
743 /*-----------------------------------------------------------------*/
744 case OP_DOLL:
745 if ((md->moptions & PCRE_NOTEOL) == 0)
746 {
747 if (clen == 0 ||
748 (IS_NEWLINE(ptr) &&
749 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
750 ))
751 { ADD_ACTIVE(state_offset + 1, 0); }
752 }
753 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
754 { ADD_ACTIVE(state_offset + 1, 0); }
755 break;
756
757 /*-----------------------------------------------------------------*/
758
759 case OP_DIGIT:
760 case OP_WHITESPACE:
761 case OP_WORDCHAR:
762 if (clen > 0 && c < 256 &&
763 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
764 { ADD_NEW(state_offset + 1, 0); }
765 break;
766
767 /*-----------------------------------------------------------------*/
768 case OP_NOT_DIGIT:
769 case OP_NOT_WHITESPACE:
770 case OP_NOT_WORDCHAR:
771 if (clen > 0 && (c >= 256 ||
772 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
773 { ADD_NEW(state_offset + 1, 0); }
774 break;
775
776 /*-----------------------------------------------------------------*/
777 case OP_WORD_BOUNDARY:
778 case OP_NOT_WORD_BOUNDARY:
779 {
780 int left_word, right_word;
781
782 if (ptr > start_subject)
783 {
784 const uschar *temp = ptr - 1;
785#ifdef SUPPORT_UTF8
786 if (utf8) BACKCHAR(temp);
787#endif
788 GETCHARTEST(d, temp);
789 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
790 }
791 else left_word = 0;
792
793 if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
794 else right_word = 0;
795
796 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
797 { ADD_ACTIVE(state_offset + 1, 0); }
798 }
799 break;
800
801
802 /*-----------------------------------------------------------------*/
803 /* Check the next character by Unicode property. We will get here only
804 if the support is in the binary; otherwise a compile-time error occurs.
805 */
806
807#ifdef SUPPORT_UCP
808 case OP_PROP:
809 case OP_NOTPROP:
810 if (clen > 0)
811 {
812 BOOL OK;
813 int category = _pcre_ucp_findprop(c, &chartype, &script);
814 switch(code[1])
815 {
816 case PT_ANY:
817 OK = TRUE;
818 break;
819
820 case PT_LAMP:
821 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
822 break;
823
824 case PT_GC:
825 OK = category == code[2];
826 break;
827
828 case PT_PC:
829 OK = chartype == code[2];
830 break;
831
832 case PT_SC:
833 OK = script == code[2];
834 break;
835
836 /* Should never occur, but keep compilers from grumbling. */
837
838 default:
839 OK = codevalue != OP_PROP;
840 break;
841 }
842
843 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
844 }
845 break;
846#endif
847
848
849
850/* ========================================================================== */
851 /* These opcodes likewise inspect the subject character, but have an
852 argument that is not a data character. It is one of these opcodes:
853 OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
854 OP_NOT_WORDCHAR. The value is loaded into d. */
855
856 case OP_TYPEPLUS:
857 case OP_TYPEMINPLUS:
858 case OP_TYPEPOSPLUS:
859 count = current_state->count; /* Already matched */
860 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
861 if (clen > 0)
862 {
863 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
864 (c < 256 &&
865 (d != OP_ANY ||
866 (ims & PCRE_DOTALL) != 0 ||
867 !IS_NEWLINE(ptr)
868 ) &&
869 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
870 {
871 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
872 {
873 active_count--; /* Remove non-match possibility */
874 next_active_state--;
875 }
876 count++;
877 ADD_NEW(state_offset, count);
878 }
879 }
880 break;
881
882 /*-----------------------------------------------------------------*/
883 case OP_TYPEQUERY:
884 case OP_TYPEMINQUERY:
885 case OP_TYPEPOSQUERY:
886 ADD_ACTIVE(state_offset + 2, 0);
887 if (clen > 0)
888 {
889 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
890 (c < 256 &&
891 (d != OP_ANY ||
892 (ims & PCRE_DOTALL) != 0 ||
893 !IS_NEWLINE(ptr)
894 ) &&
895 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
896 {
897 if (codevalue == OP_TYPEPOSQUERY)
898 {
899 active_count--; /* Remove non-match possibility */
900 next_active_state--;
901 }
902 ADD_NEW(state_offset + 2, 0);
903 }
904 }
905 break;
906
907 /*-----------------------------------------------------------------*/
908 case OP_TYPESTAR:
909 case OP_TYPEMINSTAR:
910 case OP_TYPEPOSSTAR:
911 ADD_ACTIVE(state_offset + 2, 0);
912 if (clen > 0)
913 {
914 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
915 (c < 256 &&
916 (d != OP_ANY ||
917 (ims & PCRE_DOTALL) != 0 ||
918 !IS_NEWLINE(ptr)
919 ) &&
920 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
921 {
922 if (codevalue == OP_TYPEPOSSTAR)
923 {
924 active_count--; /* Remove non-match possibility */
925 next_active_state--;
926 }
927 ADD_NEW(state_offset, 0);
928 }
929 }
930 break;
931
932 /*-----------------------------------------------------------------*/
933 case OP_TYPEEXACT:
934 count = current_state->count; /* Number already matched */
935 if (clen > 0)
936 {
937 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
938 (c < 256 &&
939 (d != OP_ANY ||
940 (ims & PCRE_DOTALL) != 0 ||
941 !IS_NEWLINE(ptr)
942 ) &&
943 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
944 {
945 if (++count >= GET2(code, 1))
946 { ADD_NEW(state_offset + 4, 0); }
947 else
948 { ADD_NEW(state_offset, count); }
949 }
950 }
951 break;
952
953 /*-----------------------------------------------------------------*/
954 case OP_TYPEUPTO:
955 case OP_TYPEMINUPTO:
956 case OP_TYPEPOSUPTO:
957 ADD_ACTIVE(state_offset + 4, 0);
958 count = current_state->count; /* Number already matched */
959 if (clen > 0)
960 {
961 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
962 (c < 256 &&
963 (d != OP_ANY ||
964 (ims & PCRE_DOTALL) != 0 ||
965 !IS_NEWLINE(ptr)
966 ) &&
967 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
968 {
969 if (codevalue == OP_TYPEPOSUPTO)
970 {
971 active_count--; /* Remove non-match possibility */
972 next_active_state--;
973 }
974 if (++count >= GET2(code, 1))
975 { ADD_NEW(state_offset + 4, 0); }
976 else
977 { ADD_NEW(state_offset, count); }
978 }
979 }
980 break;
981
982/* ========================================================================== */
983 /* These are virtual opcodes that are used when something like
984 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
985 argument. It keeps the code above fast for the other cases. The argument
986 is in the d variable. */
987
988#ifdef SUPPORT_UCP
989 case OP_PROP_EXTRA + OP_TYPEPLUS:
990 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
991 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
992 count = current_state->count; /* Already matched */
993 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
994 if (clen > 0)
995 {
996 BOOL OK;
997 int category = _pcre_ucp_findprop(c, &chartype, &script);
998 switch(code[2])
999 {
1000 case PT_ANY:
1001 OK = TRUE;
1002 break;
1003
1004 case PT_LAMP:
1005 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1006 break;
1007
1008 case PT_GC:
1009 OK = category == code[3];
1010 break;
1011
1012 case PT_PC:
1013 OK = chartype == code[3];
1014 break;
1015
1016 case PT_SC:
1017 OK = script == code[3];
1018 break;
1019
1020 /* Should never occur, but keep compilers from grumbling. */
1021
1022 default:
1023 OK = codevalue != OP_PROP;
1024 break;
1025 }
1026
1027 if (OK == (d == OP_PROP))
1028 {
1029 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1030 {
1031 active_count--; /* Remove non-match possibility */
1032 next_active_state--;
1033 }
1034 count++;
1035 ADD_NEW(state_offset, count);
1036 }
1037 }
1038 break;
1039
1040 /*-----------------------------------------------------------------*/
1041 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1042 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1043 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1044 count = current_state->count; /* Already matched */
1045 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1046 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1047 {
1048 const uschar *nptr = ptr + clen;
1049 int ncount = 0;
1050 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1051 {
1052 active_count--; /* Remove non-match possibility */
1053 next_active_state--;
1054 }
1055 while (nptr < end_subject)
1056 {
1057 int nd;
1058 int ndlen = 1;
1059 GETCHARLEN(nd, nptr, ndlen);
1060 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1061 ncount++;
1062 nptr += ndlen;
1063 }
1064 count++;
1065 ADD_NEW_DATA(-state_offset, count, ncount);
1066 }
1067 break;
1068#endif
1069
1070 /*-----------------------------------------------------------------*/
1071 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1072 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1073 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1074 count = current_state->count; /* Already matched */
1075 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1076 if (clen > 0)
1077 {
1078 int ncount = 0;
1079 switch (c)
1080 {
1081 case 0x000b:
1082 case 0x000c:
1083 case 0x0085:
1084 case 0x2028:
1085 case 0x2029:
1086 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1087 goto ANYNL01;
1088
1089 case 0x000d:
1090 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1091 /* Fall through */
1092
1093 ANYNL01:
1094 case 0x000a:
1095 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1096 {
1097 active_count--; /* Remove non-match possibility */
1098 next_active_state--;
1099 }
1100 count++;
1101 ADD_NEW_DATA(-state_offset, count, ncount);
1102 break;
1103
1104 default:
1105 break;
1106 }
1107 }
1108 break;
1109
1110 /*-----------------------------------------------------------------*/
1111 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1112 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1113 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1114 count = current_state->count; /* Already matched */
1115 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1116 if (clen > 0)
1117 {
1118 BOOL OK;
1119 switch (c)
1120 {
1121 case 0x000a:
1122 case 0x000b:
1123 case 0x000c:
1124 case 0x000d:
1125 case 0x0085:
1126 case 0x2028:
1127 case 0x2029:
1128 OK = TRUE;
1129 break;
1130
1131 default:
1132 OK = FALSE;
1133 break;
1134 }
1135
1136 if (OK == (d == OP_VSPACE))
1137 {
1138 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1139 {
1140 active_count--; /* Remove non-match possibility */
1141 next_active_state--;
1142 }
1143 count++;
1144 ADD_NEW_DATA(-state_offset, count, 0);
1145 }
1146 }
1147 break;
1148
1149 /*-----------------------------------------------------------------*/
1150 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1151 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1152 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1153 count = current_state->count; /* Already matched */
1154 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1155 if (clen > 0)
1156 {
1157 BOOL OK;
1158 switch (c)
1159 {
1160 case 0x09: /* HT */
1161 case 0x20: /* SPACE */
1162 case 0xa0: /* NBSP */
1163 case 0x1680: /* OGHAM SPACE MARK */
1164 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1165 case 0x2000: /* EN QUAD */
1166 case 0x2001: /* EM QUAD */
1167 case 0x2002: /* EN SPACE */
1168 case 0x2003: /* EM SPACE */
1169 case 0x2004: /* THREE-PER-EM SPACE */
1170 case 0x2005: /* FOUR-PER-EM SPACE */
1171 case 0x2006: /* SIX-PER-EM SPACE */
1172 case 0x2007: /* FIGURE SPACE */
1173 case 0x2008: /* PUNCTUATION SPACE */
1174 case 0x2009: /* THIN SPACE */
1175 case 0x200A: /* HAIR SPACE */
1176 case 0x202f: /* NARROW NO-BREAK SPACE */
1177 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1178 case 0x3000: /* IDEOGRAPHIC SPACE */
1179 OK = TRUE;
1180 break;
1181
1182 default:
1183 OK = FALSE;
1184 break;
1185 }
1186
1187 if (OK == (d == OP_HSPACE))
1188 {
1189 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1190 {
1191 active_count--; /* Remove non-match possibility */
1192 next_active_state--;
1193 }
1194 count++;
1195 ADD_NEW_DATA(-state_offset, count, 0);
1196 }
1197 }
1198 break;
1199
1200 /*-----------------------------------------------------------------*/
1201#ifdef SUPPORT_UCP
1202 case OP_PROP_EXTRA + OP_TYPEQUERY:
1203 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1204 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1205 count = 4;
1206 goto QS1;
1207
1208 case OP_PROP_EXTRA + OP_TYPESTAR:
1209 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1210 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1211 count = 0;
1212
1213 QS1:
1214
1215 ADD_ACTIVE(state_offset + 4, 0);
1216 if (clen > 0)
1217 {
1218 BOOL OK;
1219 int category = _pcre_ucp_findprop(c, &chartype, &script);
1220 switch(code[2])
1221 {
1222 case PT_ANY:
1223 OK = TRUE;
1224 break;
1225
1226 case PT_LAMP:
1227 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1228 break;
1229
1230 case PT_GC:
1231 OK = category == code[3];
1232 break;
1233
1234 case PT_PC:
1235 OK = chartype == code[3];
1236 break;
1237
1238 case PT_SC:
1239 OK = script == code[3];
1240 break;
1241
1242 /* Should never occur, but keep compilers from grumbling. */
1243
1244 default:
1245 OK = codevalue != OP_PROP;
1246 break;
1247 }
1248
1249 if (OK == (d == OP_PROP))
1250 {
1251 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1252 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1253 {
1254 active_count--; /* Remove non-match possibility */
1255 next_active_state--;
1256 }
1257 ADD_NEW(state_offset + count, 0);
1258 }
1259 }
1260 break;
1261
1262 /*-----------------------------------------------------------------*/
1263 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1264 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1265 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1266 count = 2;
1267 goto QS2;
1268
1269 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1270 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1271 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1272 count = 0;
1273
1274 QS2:
1275
1276 ADD_ACTIVE(state_offset + 2, 0);
1277 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1278 {
1279 const uschar *nptr = ptr + clen;
1280 int ncount = 0;
1281 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1282 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1283 {
1284 active_count--; /* Remove non-match possibility */
1285 next_active_state--;
1286 }
1287 while (nptr < end_subject)
1288 {
1289 int nd;
1290 int ndlen = 1;
1291 GETCHARLEN(nd, nptr, ndlen);
1292 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1293 ncount++;
1294 nptr += ndlen;
1295 }
1296 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1297 }
1298 break;
1299#endif
1300
1301 /*-----------------------------------------------------------------*/
1302 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1303 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1304 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1305 count = 2;
1306 goto QS3;
1307
1308 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1309 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1310 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1311 count = 0;
1312
1313 QS3:
1314 ADD_ACTIVE(state_offset + 2, 0);
1315 if (clen > 0)
1316 {
1317 int ncount = 0;
1318 switch (c)
1319 {
1320 case 0x000b:
1321 case 0x000c:
1322 case 0x0085:
1323 case 0x2028:
1324 case 0x2029:
1325 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1326 goto ANYNL02;
1327
1328 case 0x000d:
1329 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1330 /* Fall through */
1331
1332 ANYNL02:
1333 case 0x000a:
1334 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1335 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1336 {
1337 active_count--; /* Remove non-match possibility */
1338 next_active_state--;
1339 }
1340 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1341 break;
1342
1343 default:
1344 break;
1345 }
1346 }
1347 break;
1348
1349 /*-----------------------------------------------------------------*/
1350 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1351 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1352 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1353 count = 2;
1354 goto QS4;
1355
1356 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1357 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1358 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1359 count = 0;
1360
1361 QS4:
1362 ADD_ACTIVE(state_offset + 2, 0);
1363 if (clen > 0)
1364 {
1365 BOOL OK;
1366 switch (c)
1367 {
1368 case 0x000a:
1369 case 0x000b:
1370 case 0x000c:
1371 case 0x000d:
1372 case 0x0085:
1373 case 0x2028:
1374 case 0x2029:
1375 OK = TRUE;
1376 break;
1377
1378 default:
1379 OK = FALSE;
1380 break;
1381 }
1382 if (OK == (d == OP_VSPACE))
1383 {
1384 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1385 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1386 {
1387 active_count--; /* Remove non-match possibility */
1388 next_active_state--;
1389 }
1390 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1391 }
1392 }
1393 break;
1394
1395 /*-----------------------------------------------------------------*/
1396 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1397 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1398 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1399 count = 2;
1400 goto QS5;
1401
1402 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1403 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1404 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1405 count = 0;
1406
1407 QS5:
1408 ADD_ACTIVE(state_offset + 2, 0);
1409 if (clen > 0)
1410 {
1411 BOOL OK;
1412 switch (c)
1413 {
1414 case 0x09: /* HT */
1415 case 0x20: /* SPACE */
1416 case 0xa0: /* NBSP */
1417 case 0x1680: /* OGHAM SPACE MARK */
1418 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1419 case 0x2000: /* EN QUAD */
1420 case 0x2001: /* EM QUAD */
1421 case 0x2002: /* EN SPACE */
1422 case 0x2003: /* EM SPACE */
1423 case 0x2004: /* THREE-PER-EM SPACE */
1424 case 0x2005: /* FOUR-PER-EM SPACE */
1425 case 0x2006: /* SIX-PER-EM SPACE */
1426 case 0x2007: /* FIGURE SPACE */
1427 case 0x2008: /* PUNCTUATION SPACE */
1428 case 0x2009: /* THIN SPACE */
1429 case 0x200A: /* HAIR SPACE */
1430 case 0x202f: /* NARROW NO-BREAK SPACE */
1431 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1432 case 0x3000: /* IDEOGRAPHIC SPACE */
1433 OK = TRUE;
1434 break;
1435
1436 default:
1437 OK = FALSE;
1438 break;
1439 }
1440
1441 if (OK == (d == OP_HSPACE))
1442 {
1443 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1444 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1445 {
1446 active_count--; /* Remove non-match possibility */
1447 next_active_state--;
1448 }
1449 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1450 }
1451 }
1452 break;
1453
1454 /*-----------------------------------------------------------------*/
1455#ifdef SUPPORT_UCP
1456 case OP_PROP_EXTRA + OP_TYPEEXACT:
1457 case OP_PROP_EXTRA + OP_TYPEUPTO:
1458 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1459 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1460 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1461 { ADD_ACTIVE(state_offset + 6, 0); }
1462 count = current_state->count; /* Number already matched */
1463 if (clen > 0)
1464 {
1465 BOOL OK;
1466 int category = _pcre_ucp_findprop(c, &chartype, &script);
1467 switch(code[4])
1468 {
1469 case PT_ANY:
1470 OK = TRUE;
1471 break;
1472
1473 case PT_LAMP:
1474 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1475 break;
1476
1477 case PT_GC:
1478 OK = category == code[5];
1479 break;
1480
1481 case PT_PC:
1482 OK = chartype == code[5];
1483 break;
1484
1485 case PT_SC:
1486 OK = script == code[5];
1487 break;
1488
1489 /* Should never occur, but keep compilers from grumbling. */
1490
1491 default:
1492 OK = codevalue != OP_PROP;
1493 break;
1494 }
1495
1496 if (OK == (d == OP_PROP))
1497 {
1498 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1499 {
1500 active_count--; /* Remove non-match possibility */
1501 next_active_state--;
1502 }
1503 if (++count >= GET2(code, 1))
1504 { ADD_NEW(state_offset + 6, 0); }
1505 else
1506 { ADD_NEW(state_offset, count); }
1507 }
1508 }
1509 break;
1510
1511 /*-----------------------------------------------------------------*/
1512 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1513 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1514 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1515 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1516 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1517 { ADD_ACTIVE(state_offset + 4, 0); }
1518 count = current_state->count; /* Number already matched */
1519 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1520 {
1521 const uschar *nptr = ptr + clen;
1522 int ncount = 0;
1523 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1524 {
1525 active_count--; /* Remove non-match possibility */
1526 next_active_state--;
1527 }
1528 while (nptr < end_subject)
1529 {
1530 int nd;
1531 int ndlen = 1;
1532 GETCHARLEN(nd, nptr, ndlen);
1533 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1534 ncount++;
1535 nptr += ndlen;
1536 }
1537 if (++count >= GET2(code, 1))
1538 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1539 else
1540 { ADD_NEW_DATA(-state_offset, count, ncount); }
1541 }
1542 break;
1543#endif
1544
1545 /*-----------------------------------------------------------------*/
1546 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1547 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1548 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1549 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1550 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1551 { ADD_ACTIVE(state_offset + 4, 0); }
1552 count = current_state->count; /* Number already matched */
1553 if (clen > 0)
1554 {
1555 int ncount = 0;
1556 switch (c)
1557 {
1558 case 0x000b:
1559 case 0x000c:
1560 case 0x0085:
1561 case 0x2028:
1562 case 0x2029:
1563 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1564 goto ANYNL03;
1565
1566 case 0x000d:
1567 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1568 /* Fall through */
1569
1570 ANYNL03:
1571 case 0x000a:
1572 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1573 {
1574 active_count--; /* Remove non-match possibility */
1575 next_active_state--;
1576 }
1577 if (++count >= GET2(code, 1))
1578 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1579 else
1580 { ADD_NEW_DATA(-state_offset, count, ncount); }
1581 break;
1582
1583 default:
1584 break;
1585 }
1586 }
1587 break;
1588
1589 /*-----------------------------------------------------------------*/
1590 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1591 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1592 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1593 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1594 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1595 { ADD_ACTIVE(state_offset + 4, 0); }
1596 count = current_state->count; /* Number already matched */
1597 if (clen > 0)
1598 {
1599 BOOL OK;
1600 switch (c)
1601 {
1602 case 0x000a:
1603 case 0x000b:
1604 case 0x000c:
1605 case 0x000d:
1606 case 0x0085:
1607 case 0x2028:
1608 case 0x2029:
1609 OK = TRUE;
1610 break;
1611
1612 default:
1613 OK = FALSE;
1614 }
1615
1616 if (OK == (d == OP_VSPACE))
1617 {
1618 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1619 {
1620 active_count--; /* Remove non-match possibility */
1621 next_active_state--;
1622 }
1623 if (++count >= GET2(code, 1))
1624 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1625 else
1626 { ADD_NEW_DATA(-state_offset, count, 0); }
1627 }
1628 }
1629 break;
1630
1631 /*-----------------------------------------------------------------*/
1632 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1633 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1634 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1635 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1636 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1637 { ADD_ACTIVE(state_offset + 4, 0); }
1638 count = current_state->count; /* Number already matched */
1639 if (clen > 0)
1640 {
1641 BOOL OK;
1642 switch (c)
1643 {
1644 case 0x09: /* HT */
1645 case 0x20: /* SPACE */
1646 case 0xa0: /* NBSP */
1647 case 0x1680: /* OGHAM SPACE MARK */
1648 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1649 case 0x2000: /* EN QUAD */
1650 case 0x2001: /* EM QUAD */
1651 case 0x2002: /* EN SPACE */
1652 case 0x2003: /* EM SPACE */
1653 case 0x2004: /* THREE-PER-EM SPACE */
1654 case 0x2005: /* FOUR-PER-EM SPACE */
1655 case 0x2006: /* SIX-PER-EM SPACE */
1656 case 0x2007: /* FIGURE SPACE */
1657 case 0x2008: /* PUNCTUATION SPACE */
1658 case 0x2009: /* THIN SPACE */
1659 case 0x200A: /* HAIR SPACE */
1660 case 0x202f: /* NARROW NO-BREAK SPACE */
1661 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1662 case 0x3000: /* IDEOGRAPHIC SPACE */
1663 OK = TRUE;
1664 break;
1665
1666 default:
1667 OK = FALSE;
1668 break;
1669 }
1670
1671 if (OK == (d == OP_HSPACE))
1672 {
1673 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1674 {
1675 active_count--; /* Remove non-match possibility */
1676 next_active_state--;
1677 }
1678 if (++count >= GET2(code, 1))
1679 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1680 else
1681 { ADD_NEW_DATA(-state_offset, count, 0); }
1682 }
1683 }
1684 break;
1685
1686/* ========================================================================== */
1687 /* These opcodes are followed by a character that is usually compared
1688 to the current subject character; it is loaded into d. We still get
1689 here even if there is no subject character, because in some cases zero
1690 repetitions are permitted. */
1691
1692 /*-----------------------------------------------------------------*/
1693 case OP_CHAR:
1694 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1695 break;
1696
1697 /*-----------------------------------------------------------------*/
1698 case OP_CHARNC:
1699 if (clen == 0) break;
1700
1701#ifdef SUPPORT_UTF8
1702 if (utf8)
1703 {
1704 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1705 {
1706 unsigned int othercase;
1707 if (c < 128) othercase = fcc[c]; else
1708
1709 /* If we have Unicode property support, we can use it to test the
1710 other case of the character. */
1711
1712#ifdef SUPPORT_UCP
1713 othercase = _pcre_ucp_othercase(c);
1714#else
1715 othercase = NOTACHAR;
1716#endif
1717
1718 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1719 }
1720 }
1721 else
1722#endif /* SUPPORT_UTF8 */
1723
1724 /* Non-UTF-8 mode */
1725 {
1726 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1727 }
1728 break;
1729
1730
1731#ifdef SUPPORT_UCP
1732 /*-----------------------------------------------------------------*/
1733 /* This is a tricky one because it can match more than one character.
1734 Find out how many characters to skip, and then set up a negative state
1735 to wait for them to pass before continuing. */
1736
1737 case OP_EXTUNI:
1738 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1739 {
1740 const uschar *nptr = ptr + clen;
1741 int ncount = 0;
1742 while (nptr < end_subject)
1743 {
1744 int nclen = 1;
1745 GETCHARLEN(c, nptr, nclen);
1746 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1747 ncount++;
1748 nptr += nclen;
1749 }
1750 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1751 }
1752 break;
1753#endif
1754
1755 /*-----------------------------------------------------------------*/
1756 /* This is tricky, like EXTUNI, because it too can match more than one
1757 character (when CR is followed by LF). In this case, set up a negative
1758 state to wait for one character to pass before continuing. */
1759
1760 case OP_ANYNL:
1761 if (clen > 0) switch(c)
1762 {
1763 case 0x000b:
1764 case 0x000c:
1765 case 0x0085:
1766 case 0x2028:
1767 case 0x2029:
1768 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1769
1770 case 0x000a:
1771 ADD_NEW(state_offset + 1, 0);
1772 break;
1773
1774 case 0x000d:
1775 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1776 {
1777 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1778 }
1779 else
1780 {
1781 ADD_NEW(state_offset + 1, 0);
1782 }
1783 break;
1784 }
1785 break;
1786
1787 /*-----------------------------------------------------------------*/
1788 case OP_NOT_VSPACE:
1789 if (clen > 0) switch(c)
1790 {
1791 case 0x000a:
1792 case 0x000b:
1793 case 0x000c:
1794 case 0x000d:
1795 case 0x0085:
1796 case 0x2028:
1797 case 0x2029:
1798 break;
1799
1800 default:
1801 ADD_NEW(state_offset + 1, 0);
1802 break;
1803 }
1804 break;
1805
1806 /*-----------------------------------------------------------------*/
1807 case OP_VSPACE:
1808 if (clen > 0) switch(c)
1809 {
1810 case 0x000a:
1811 case 0x000b:
1812 case 0x000c:
1813 case 0x000d:
1814 case 0x0085:
1815 case 0x2028:
1816 case 0x2029:
1817 ADD_NEW(state_offset + 1, 0);
1818 break;
1819
1820 default: break;
1821 }
1822 break;
1823
1824 /*-----------------------------------------------------------------*/
1825 case OP_NOT_HSPACE:
1826 if (clen > 0) switch(c)
1827 {
1828 case 0x09: /* HT */
1829 case 0x20: /* SPACE */
1830 case 0xa0: /* NBSP */
1831 case 0x1680: /* OGHAM SPACE MARK */
1832 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1833 case 0x2000: /* EN QUAD */
1834 case 0x2001: /* EM QUAD */
1835 case 0x2002: /* EN SPACE */
1836 case 0x2003: /* EM SPACE */
1837 case 0x2004: /* THREE-PER-EM SPACE */
1838 case 0x2005: /* FOUR-PER-EM SPACE */
1839 case 0x2006: /* SIX-PER-EM SPACE */
1840 case 0x2007: /* FIGURE SPACE */
1841 case 0x2008: /* PUNCTUATION SPACE */
1842 case 0x2009: /* THIN SPACE */
1843 case 0x200A: /* HAIR SPACE */
1844 case 0x202f: /* NARROW NO-BREAK SPACE */
1845 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1846 case 0x3000: /* IDEOGRAPHIC SPACE */
1847 break;
1848
1849 default:
1850 ADD_NEW(state_offset + 1, 0);
1851 break;
1852 }
1853 break;
1854
1855 /*-----------------------------------------------------------------*/
1856 case OP_HSPACE:
1857 if (clen > 0) switch(c)
1858 {
1859 case 0x09: /* HT */
1860 case 0x20: /* SPACE */
1861 case 0xa0: /* NBSP */
1862 case 0x1680: /* OGHAM SPACE MARK */
1863 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1864 case 0x2000: /* EN QUAD */
1865 case 0x2001: /* EM QUAD */
1866 case 0x2002: /* EN SPACE */
1867 case 0x2003: /* EM SPACE */
1868 case 0x2004: /* THREE-PER-EM SPACE */
1869 case 0x2005: /* FOUR-PER-EM SPACE */
1870 case 0x2006: /* SIX-PER-EM SPACE */
1871 case 0x2007: /* FIGURE SPACE */
1872 case 0x2008: /* PUNCTUATION SPACE */
1873 case 0x2009: /* THIN SPACE */
1874 case 0x200A: /* HAIR SPACE */
1875 case 0x202f: /* NARROW NO-BREAK SPACE */
1876 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1877 case 0x3000: /* IDEOGRAPHIC SPACE */
1878 ADD_NEW(state_offset + 1, 0);
1879 break;
1880 }
1881 break;
1882
1883 /*-----------------------------------------------------------------*/
1884 /* Match a negated single character. This is only used for one-byte
1885 characters, that is, we know that d < 256. The character we are
1886 checking (c) can be multibyte. */
1887
1888 case OP_NOT:
1889 if (clen > 0)
1890 {
1891 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1892 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1893 }
1894 break;
1895
1896 /*-----------------------------------------------------------------*/
1897 case OP_PLUS:
1898 case OP_MINPLUS:
1899 case OP_POSPLUS:
1900 case OP_NOTPLUS:
1901 case OP_NOTMINPLUS:
1902 case OP_NOTPOSPLUS:
1903 count = current_state->count; /* Already matched */
1904 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1905 if (clen > 0)
1906 {
1907 unsigned int otherd = NOTACHAR;
1908 if ((ims & PCRE_CASELESS) != 0)
1909 {
1910#ifdef SUPPORT_UTF8
1911 if (utf8 && d >= 128)
1912 {
1913#ifdef SUPPORT_UCP
1914 otherd = _pcre_ucp_othercase(d);
1915#endif /* SUPPORT_UCP */
1916 }
1917 else
1918#endif /* SUPPORT_UTF8 */
1919 otherd = fcc[d];
1920 }
1921 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1922 {
1923 if (count > 0 &&
1924 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1925 {
1926 active_count--; /* Remove non-match possibility */
1927 next_active_state--;
1928 }
1929 count++;
1930 ADD_NEW(state_offset, count);
1931 }
1932 }
1933 break;
1934
1935 /*-----------------------------------------------------------------*/
1936 case OP_QUERY:
1937 case OP_MINQUERY:
1938 case OP_POSQUERY:
1939 case OP_NOTQUERY:
1940 case OP_NOTMINQUERY:
1941 case OP_NOTPOSQUERY:
1942 ADD_ACTIVE(state_offset + dlen + 1, 0);
1943 if (clen > 0)
1944 {
1945 unsigned int otherd = NOTACHAR;
1946 if ((ims & PCRE_CASELESS) != 0)
1947 {
1948#ifdef SUPPORT_UTF8
1949 if (utf8 && d >= 128)
1950 {
1951#ifdef SUPPORT_UCP
1952 otherd = _pcre_ucp_othercase(d);
1953#endif /* SUPPORT_UCP */
1954 }
1955 else
1956#endif /* SUPPORT_UTF8 */
1957 otherd = fcc[d];
1958 }
1959 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1960 {
1961 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1962 {
1963 active_count--; /* Remove non-match possibility */
1964 next_active_state--;
1965 }
1966 ADD_NEW(state_offset + dlen + 1, 0);
1967 }
1968 }
1969 break;
1970
1971 /*-----------------------------------------------------------------*/
1972 case OP_STAR:
1973 case OP_MINSTAR:
1974 case OP_POSSTAR:
1975 case OP_NOTSTAR:
1976 case OP_NOTMINSTAR:
1977 case OP_NOTPOSSTAR:
1978 ADD_ACTIVE(state_offset + dlen + 1, 0);
1979 if (clen > 0)
1980 {
1981 unsigned int otherd = NOTACHAR;
1982 if ((ims & PCRE_CASELESS) != 0)
1983 {
1984#ifdef SUPPORT_UTF8
1985 if (utf8 && d >= 128)
1986 {
1987#ifdef SUPPORT_UCP
1988 otherd = _pcre_ucp_othercase(d);
1989#endif /* SUPPORT_UCP */
1990 }
1991 else
1992#endif /* SUPPORT_UTF8 */
1993 otherd = fcc[d];
1994 }
1995 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1996 {
1997 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1998 {
1999 active_count--; /* Remove non-match possibility */
2000 next_active_state--;
2001 }
2002 ADD_NEW(state_offset, 0);
2003 }
2004 }
2005 break;
2006
2007 /*-----------------------------------------------------------------*/
2008 case OP_EXACT:
2009 case OP_NOTEXACT:
2010 count = current_state->count; /* Number already matched */
2011 if (clen > 0)
2012 {
2013 unsigned int otherd = NOTACHAR;
2014 if ((ims & PCRE_CASELESS) != 0)
2015 {
2016#ifdef SUPPORT_UTF8
2017 if (utf8 && d >= 128)
2018 {
2019#ifdef SUPPORT_UCP
2020 otherd = _pcre_ucp_othercase(d);
2021#endif /* SUPPORT_UCP */
2022 }
2023 else
2024#endif /* SUPPORT_UTF8 */
2025 otherd = fcc[d];
2026 }
2027 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2028 {
2029 if (++count >= GET2(code, 1))
2030 { ADD_NEW(state_offset + dlen + 3, 0); }
2031 else
2032 { ADD_NEW(state_offset, count); }
2033 }
2034 }
2035 break;
2036
2037 /*-----------------------------------------------------------------*/
2038 case OP_UPTO:
2039 case OP_MINUPTO:
2040 case OP_POSUPTO:
2041 case OP_NOTUPTO:
2042 case OP_NOTMINUPTO:
2043 case OP_NOTPOSUPTO:
2044 ADD_ACTIVE(state_offset + dlen + 3, 0);
2045 count = current_state->count; /* Number already matched */
2046 if (clen > 0)
2047 {
2048 unsigned int otherd = NOTACHAR;
2049 if ((ims & PCRE_CASELESS) != 0)
2050 {
2051#ifdef SUPPORT_UTF8
2052 if (utf8 && d >= 128)
2053 {
2054#ifdef SUPPORT_UCP
2055 otherd = _pcre_ucp_othercase(d);
2056#endif /* SUPPORT_UCP */
2057 }
2058 else
2059#endif /* SUPPORT_UTF8 */
2060 otherd = fcc[d];
2061 }
2062 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2063 {
2064 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2065 {
2066 active_count--; /* Remove non-match possibility */
2067 next_active_state--;
2068 }
2069 if (++count >= GET2(code, 1))
2070 { ADD_NEW(state_offset + dlen + 3, 0); }
2071 else
2072 { ADD_NEW(state_offset, count); }
2073 }
2074 }
2075 break;
2076
2077
2078/* ========================================================================== */
2079 /* These are the class-handling opcodes */
2080
2081 case OP_CLASS:
2082 case OP_NCLASS:
2083 case OP_XCLASS:
2084 {
2085 BOOL isinclass = FALSE;
2086 int next_state_offset;
2087 const uschar *ecode;
2088
2089 /* For a simple class, there is always just a 32-byte table, and we
2090 can set isinclass from it. */
2091
2092 if (codevalue != OP_XCLASS)
2093 {
2094 ecode = code + 33;
2095 if (clen > 0)
2096 {
2097 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2098 ((code[1 + c/8] & (1 << (c&7))) != 0);
2099 }
2100 }
2101
2102 /* An extended class may have a table or a list of single characters,
2103 ranges, or both, and it may be positive or negative. There's a
2104 function that sorts all this out. */
2105
2106 else
2107 {
2108 ecode = code + GET(code, 1);
2109 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2110 }
2111
2112 /* At this point, isinclass is set for all kinds of class, and ecode
2113 points to the byte after the end of the class. If there is a
2114 quantifier, this is where it will be. */
2115
2116 next_state_offset = ecode - start_code;
2117
2118 switch (*ecode)
2119 {
2120 case OP_CRSTAR:
2121 case OP_CRMINSTAR:
2122 ADD_ACTIVE(next_state_offset + 1, 0);
2123 if (isinclass) { ADD_NEW(state_offset, 0); }
2124 break;
2125
2126 case OP_CRPLUS:
2127 case OP_CRMINPLUS:
2128 count = current_state->count; /* Already matched */
2129 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2130 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2131 break;
2132
2133 case OP_CRQUERY:
2134 case OP_CRMINQUERY:
2135 ADD_ACTIVE(next_state_offset + 1, 0);
2136 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2137 break;
2138
2139 case OP_CRRANGE:
2140 case OP_CRMINRANGE:
2141 count = current_state->count; /* Already matched */
2142 if (count >= GET2(ecode, 1))
2143 { ADD_ACTIVE(next_state_offset + 5, 0); }
2144 if (isinclass)
2145 {
2146 int max = GET2(ecode, 3);
2147 if (++count >= max && max != 0) /* Max 0 => no limit */
2148 { ADD_NEW(next_state_offset + 5, 0); }
2149 else
2150 { ADD_NEW(state_offset, count); }
2151 }
2152 break;
2153
2154 default:
2155 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2156 break;
2157 }
2158 }
2159 break;
2160
2161/* ========================================================================== */
2162 /* These are the opcodes for fancy brackets of various kinds. We have
2163 to use recursion in order to handle them. */
2164
2165 case OP_ASSERT:
2166 case OP_ASSERT_NOT:
2167 case OP_ASSERTBACK:
2168 case OP_ASSERTBACK_NOT:
2169 {
2170 int rc;
2171 int local_offsets[2];
2172 int local_workspace[1000];
2173 const uschar *endasscode = code + GET(code, 1);
2174
2175 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2176
2177 rc = internal_dfa_exec(
2178 md, /* static match data */
2179 code, /* this subexpression's code */
2180 ptr, /* where we currently are */
2181 ptr - start_subject, /* start offset */
2182 local_offsets, /* offset vector */
2183 sizeof(local_offsets)/sizeof(int), /* size of same */
2184 local_workspace, /* workspace vector */
2185 sizeof(local_workspace)/sizeof(int), /* size of same */
2186 ims, /* the current ims flags */
2187 rlevel, /* function recursion level */
2188 recursing); /* pass on regex recursion */
2189
2190 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2191 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2192 }
2193 break;
2194
2195 /*-----------------------------------------------------------------*/
2196 case OP_COND:
2197 case OP_SCOND:
2198 {
2199 int local_offsets[1000];
2200 int local_workspace[1000];
2201 int condcode = code[LINK_SIZE+1];
2202
2203 /* Back reference conditions are not supported */
2204
2205 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2206
2207 /* The DEFINE condition is always false */
2208
2209 if (condcode == OP_DEF)
2210 {
2211 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2212 }
2213
2214 /* The only supported version of OP_RREF is for the value RREF_ANY,
2215 which means "test if in any recursion". We can't test for specifically
2216 recursed groups. */
2217
2218 else if (condcode == OP_RREF)
2219 {
2220 int value = GET2(code, LINK_SIZE+2);
2221 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2222 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2223 else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2224 }
2225
2226 /* Otherwise, the condition is an assertion */
2227
2228 else
2229 {
2230 int rc;
2231 const uschar *asscode = code + LINK_SIZE + 1;
2232 const uschar *endasscode = asscode + GET(asscode, 1);
2233
2234 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2235
2236 rc = internal_dfa_exec(
2237 md, /* fixed match data */
2238 asscode, /* this subexpression's code */
2239 ptr, /* where we currently are */
2240 ptr - start_subject, /* start offset */
2241 local_offsets, /* offset vector */
2242 sizeof(local_offsets)/sizeof(int), /* size of same */
2243 local_workspace, /* workspace vector */
2244 sizeof(local_workspace)/sizeof(int), /* size of same */
2245 ims, /* the current ims flags */
2246 rlevel, /* function recursion level */
2247 recursing); /* pass on regex recursion */
2248
2249 if ((rc >= 0) ==
2250 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2251 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2252 else
2253 { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2254 }
2255 }
2256 break;
2257
2258 /*-----------------------------------------------------------------*/
2259 case OP_RECURSE:
2260 {
2261 int local_offsets[1000];
2262 int local_workspace[1000];
2263 int rc;
2264
2265 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2266 recursing + 1));
2267
2268 rc = internal_dfa_exec(
2269 md, /* fixed match data */
2270 start_code + GET(code, 1), /* this subexpression's code */
2271 ptr, /* where we currently are */
2272 ptr - start_subject, /* start offset */
2273 local_offsets, /* offset vector */
2274 sizeof(local_offsets)/sizeof(int), /* size of same */
2275 local_workspace, /* workspace vector */
2276 sizeof(local_workspace)/sizeof(int), /* size of same */
2277 ims, /* the current ims flags */
2278 rlevel, /* function recursion level */
2279 recursing + 1); /* regex recurse level */
2280
2281 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2282 recursing + 1, rc));
2283
2284 /* Ran out of internal offsets */
2285
2286 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2287
2288 /* For each successful matched substring, set up the next state with a
2289 count of characters to skip before trying it. Note that the count is in
2290 characters, not bytes. */
2291
2292 if (rc > 0)
2293 {
2294 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2295 {
2296 const uschar *p = start_subject + local_offsets[rc];
2297 const uschar *pp = start_subject + local_offsets[rc+1];
2298 int charcount = local_offsets[rc+1] - local_offsets[rc];
2299 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2300 if (charcount > 0)
2301 {
2302 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2303 }
2304 else
2305 {
2306 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2307 }
2308 }
2309 }
2310 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2311 }
2312 break;
2313
2314 /*-----------------------------------------------------------------*/
2315 case OP_ONCE:
2316 {
2317 int local_offsets[2];
2318 int local_workspace[1000];
2319
2320 int rc = internal_dfa_exec(
2321 md, /* fixed match data */
2322 code, /* this subexpression's code */
2323 ptr, /* where we currently are */
2324 ptr - start_subject, /* start offset */
2325 local_offsets, /* offset vector */
2326 sizeof(local_offsets)/sizeof(int), /* size of same */
2327 local_workspace, /* workspace vector */
2328 sizeof(local_workspace)/sizeof(int), /* size of same */
2329 ims, /* the current ims flags */
2330 rlevel, /* function recursion level */
2331 recursing); /* pass on regex recursion */
2332
2333 if (rc >= 0)
2334 {
2335 const uschar *end_subpattern = code;
2336 int charcount = local_offsets[1] - local_offsets[0];
2337 int next_state_offset, repeat_state_offset;
2338
2339 do { end_subpattern += GET(end_subpattern, 1); }
2340 while (*end_subpattern == OP_ALT);
2341 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2342
2343 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2344 arrange for the repeat state also to be added to the relevant list.
2345 Calculate the offset, or set -1 for no repeat. */
2346
2347 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2348 *end_subpattern == OP_KETRMIN)?
2349 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2350
2351 /* If we have matched an empty string, add the next state at the
2352 current character pointer. This is important so that the duplicate
2353 checking kicks in, which is what breaks infinite loops that match an
2354 empty string. */
2355
2356 if (charcount == 0)
2357 {
2358 ADD_ACTIVE(next_state_offset, 0);
2359 }
2360
2361 /* Optimization: if there are no more active states, and there
2362 are no new states yet set up, then skip over the subject string
2363 right here, to save looping. Otherwise, set up the new state to swing
2364 into action when the end of the substring is reached. */
2365
2366 else if (i + 1 >= active_count && new_count == 0)
2367 {
2368 ptr += charcount;
2369 clen = 0;
2370 ADD_NEW(next_state_offset, 0);
2371
2372 /* If we are adding a repeat state at the new character position,
2373 we must fudge things so that it is the only current state.
2374 Otherwise, it might be a duplicate of one we processed before, and
2375 that would cause it to be skipped. */
2376
2377 if (repeat_state_offset >= 0)
2378 {
2379 next_active_state = active_states;
2380 active_count = 0;
2381 i = -1;
2382 ADD_ACTIVE(repeat_state_offset, 0);
2383 }
2384 }
2385 else
2386 {
2387 const uschar *p = start_subject + local_offsets[0];
2388 const uschar *pp = start_subject + local_offsets[1];
2389 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2390 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2391 if (repeat_state_offset >= 0)
2392 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2393 }
2394
2395 }
2396 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2397 }
2398 break;
2399
2400
2401/* ========================================================================== */
2402 /* Handle callouts */
2403
2404 case OP_CALLOUT:
2405 if (pcre_callout != NULL)
2406 {
2407 int rrc;
2408 pcre_callout_block cb;
2409 cb.version = 1; /* Version 1 of the callout block */
2410 cb.callout_number = code[1];
2411 cb.offset_vector = offsets;
2412 cb.subject = (PCRE_SPTR)start_subject;
2413 cb.subject_length = end_subject - start_subject;
2414 cb.start_match = current_subject - start_subject;
2415 cb.current_position = ptr - start_subject;
2416 cb.pattern_position = GET(code, 2);
2417 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2418 cb.capture_top = 1;
2419 cb.capture_last = -1;
2420 cb.callout_data = md->callout_data;
2421 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2422 if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2423 }
2424 break;
2425
2426
2427/* ========================================================================== */
2428 default: /* Unsupported opcode */
2429 return PCRE_ERROR_DFA_UITEM;
2430 }
2431
2432 NEXT_ACTIVE_STATE: continue;
2433
2434 } /* End of loop scanning active states */
2435
2436 /* We have finished the processing at the current subject character. If no
2437 new states have been set for the next character, we have found all the
2438 matches that we are going to find. If we are at the top level and partial
2439 matching has been requested, check for appropriate conditions. */
2440
2441 if (new_count <= 0)
2442 {
2443 if (match_count < 0 && /* No matches found */
2444 rlevel == 1 && /* Top level match function */
2445 (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2446 ptr >= end_subject && /* Reached end of subject */
2447 ptr > current_subject) /* Matched non-empty string */
2448 {
2449 if (offsetcount >= 2)
2450 {
2451 offsets[0] = current_subject - start_subject;
2452 offsets[1] = end_subject - start_subject;
2453 }
2454 match_count = PCRE_ERROR_PARTIAL;
2455 }
2456
2457 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2458 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2459 rlevel*2-2, SP));
2460 break; /* In effect, "return", but see the comment below */
2461 }
2462
2463 /* One or more states are active for the next character. */
2464
2465 ptr += clen; /* Advance to next subject character */
2466 } /* Loop to move along the subject string */
2467
2468/* Control gets here from "break" a few lines above. We do it this way because
2469if we use "return" above, we have compiler trouble. Some compilers warn if
2470there's nothing here because they think the function doesn't return a value. On
2471the other hand, if we put a dummy statement here, some more clever compilers
2472complain that it can't be reached. Sigh. */
2473
2474return match_count;
2475}
2476
2477
2478
2479
2480/*************************************************
2481* Execute a Regular Expression - DFA engine *
2482*************************************************/
2483
2484/* This external function applies a compiled re to a subject string using a DFA
2485engine. This function calls the internal function multiple times if the pattern
2486is not anchored.
2487
2488Arguments:
2489 argument_re points to the compiled expression
2490 extra_data points to extra data or is NULL
2491 subject points to the subject string
2492 length length of subject string (may contain binary zeros)
2493 start_offset where to start in the subject string
2494 options option bits
2495 offsets vector of match offsets
2496 offsetcount size of same
2497 workspace workspace vector
2498 wscount size of same
2499
2500Returns: > 0 => number of match offset pairs placed in offsets
2501 = 0 => offsets overflowed; longest matches are present
2502 -1 => failed to match
2503 < -1 => some kind of unexpected problem
2504*/
2505
2506PCRE_EXP_DEFN int
2507pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2508 const char *subject, int length, int start_offset, int options, int *offsets,
2509 int offsetcount, int *workspace, int wscount)
2510{
2511real_pcre *re = (real_pcre *)argument_re;
2512dfa_match_data match_block;
2513dfa_match_data *md = &match_block;
2514BOOL utf8, anchored, startline, firstline;
2515const uschar *current_subject, *end_subject, *lcc;
2516
2517pcre_study_data internal_study;
2518const pcre_study_data *study = NULL;
2519real_pcre internal_re;
2520
2521const uschar *req_byte_ptr;
2522const uschar *start_bits = NULL;
2523BOOL first_byte_caseless = FALSE;
2524BOOL req_byte_caseless = FALSE;
2525int first_byte = -1;
2526int req_byte = -1;
2527int req_byte2 = -1;
2528int newline;
2529
2530/* Plausibility checks */
2531
2532if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2533if (re == NULL || subject == NULL || workspace == NULL ||
2534 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2535if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2536if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2537
2538/* We need to find the pointer to any study data before we test for byte
2539flipping, so we scan the extra_data block first. This may set two fields in the
2540match block, so we must initialize them beforehand. However, the other fields
2541in the match block must not be set until after the byte flipping. */
2542
2543md->tables = re->tables;
2544md->callout_data = NULL;
2545
2546if (extra_data != NULL)
2547 {
2548 unsigned int flags = extra_data->flags;
2549 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2550 study = (const pcre_study_data *)extra_data->study_data;
2551 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2552 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2553 return PCRE_ERROR_DFA_UMLIMIT;
2554 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2555 md->callout_data = extra_data->callout_data;
2556 if ((flags & PCRE_EXTRA_TABLES) != 0)
2557 md->tables = extra_data->tables;
2558 }
2559
2560/* Check that the first field in the block is the magic number. If it is not,
2561test for a regex that was compiled on a host of opposite endianness. If this is
2562the case, flipped values are put in internal_re and internal_study if there was
2563study data too. */
2564
2565if (re->magic_number != MAGIC_NUMBER)
2566 {
2567 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2568 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2569 if (study != NULL) study = &internal_study;
2570 }
2571
2572/* Set some local values */
2573
2574current_subject = (const unsigned char *)subject + start_offset;
2575end_subject = (const unsigned char *)subject + length;
2576req_byte_ptr = current_subject - 1;
2577
2578#ifdef SUPPORT_UTF8
2579utf8 = (re->options & PCRE_UTF8) != 0;
2580#else
2581utf8 = FALSE;
2582#endif
2583
2584anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2585 (re->options & PCRE_ANCHORED) != 0;
2586
2587/* The remaining fixed data for passing around. */
2588
2589md->start_code = (const uschar *)argument_re +
2590 re->name_table_offset + re->name_count * re->name_entry_size;
2591md->start_subject = (const unsigned char *)subject;
2592md->end_subject = end_subject;
2593md->moptions = options;
2594md->poptions = re->options;
2595
2596/* If the BSR option is not set at match time, copy what was set
2597at compile time. */
2598
2599if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2600 {
2601 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2602 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2603#ifdef BSR_ANYCRLF
2604 else md->moptions |= PCRE_BSR_ANYCRLF;
2605#endif
2606 }
2607
2608/* Handle different types of newline. The three bits give eight cases. If
2609nothing is set at run time, whatever was used at compile time applies. */
2610
2611switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2612 PCRE_NEWLINE_BITS)
2613 {
2614 case 0: newline = NEWLINE; break; /* Compile-time default */
2615 case PCRE_NEWLINE_CR: newline = '\r'; break;
2616 case PCRE_NEWLINE_LF: newline = '\n'; break;
2617 case PCRE_NEWLINE_CR+
2618 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2619 case PCRE_NEWLINE_ANY: newline = -1; break;
2620 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2621 default: return PCRE_ERROR_BADNEWLINE;
2622 }
2623
2624if (newline == -2)
2625 {
2626 md->nltype = NLTYPE_ANYCRLF;
2627 }
2628else if (newline < 0)
2629 {
2630 md->nltype = NLTYPE_ANY;
2631 }
2632else
2633 {
2634 md->nltype = NLTYPE_FIXED;
2635 if (newline > 255)
2636 {
2637 md->nllen = 2;
2638 md->nl[0] = (newline >> 8) & 255;
2639 md->nl[1] = newline & 255;
2640 }
2641 else
2642 {
2643 md->nllen = 1;
2644 md->nl[0] = newline;
2645 }
2646 }
2647
2648/* Check a UTF-8 string if required. Unfortunately there's no way of passing
2649back the character offset. */
2650
2651#ifdef SUPPORT_UTF8
2652if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2653 {
2654 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2655 return PCRE_ERROR_BADUTF8;
2656 if (start_offset > 0 && start_offset < length)
2657 {
2658 int tb = ((uschar *)subject)[start_offset];
2659 if (tb > 127)
2660 {
2661 tb &= 0xc0;
2662 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2663 }
2664 }
2665 }
2666#endif
2667
2668/* If the exec call supplied NULL for tables, use the inbuilt ones. This
2669is a feature that makes it possible to save compiled regex and re-use them
2670in other programs later. */
2671
2672if (md->tables == NULL) md->tables = _pcre_default_tables;
2673
2674/* The lower casing table and the "must be at the start of a line" flag are
2675used in a loop when finding where to start. */
2676
2677lcc = md->tables + lcc_offset;
2678startline = (re->flags & PCRE_STARTLINE) != 0;
2679firstline = (re->options & PCRE_FIRSTLINE) != 0;
2680
2681/* Set up the first character to match, if available. The first_byte value is
2682never set for an anchored regular expression, but the anchoring may be forced
2683at run time, so we have to test for anchoring. The first char may be unset for
2684an unanchored pattern, of course. If there's no first char and the pattern was
2685studied, there may be a bitmap of possible first characters. */
2686
2687if (!anchored)
2688 {
2689 if ((re->flags & PCRE_FIRSTSET) != 0)
2690 {
2691 first_byte = re->first_byte & 255;
2692 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2693 first_byte = lcc[first_byte];
2694 }
2695 else
2696 {
2697 if (startline && study != NULL &&
2698 (study->options & PCRE_STUDY_MAPPED) != 0)
2699 start_bits = study->start_bits;
2700 }
2701 }
2702
2703/* For anchored or unanchored matches, there may be a "last known required
2704character" set. */
2705
2706if ((re->flags & PCRE_REQCHSET) != 0)
2707 {
2708 req_byte = re->req_byte & 255;
2709 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2710 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2711 }
2712
2713/* Call the main matching function, looping for a non-anchored regex after a
2714failed match. Unless restarting, optimize by moving to the first match
2715character if possible, when not anchored. Then unless wanting a partial match,
2716check for a required later character. */
2717
2718for (;;)
2719 {
2720 int rc;
2721
2722 if ((options & PCRE_DFA_RESTART) == 0)
2723 {
2724 const uschar *save_end_subject = end_subject;
2725
2726 /* Advance to a unique first char if possible. If firstline is TRUE, the
2727 start of the match is constrained to the first line of a multiline string.
2728 Implement this by temporarily adjusting end_subject so that we stop
2729 scanning at a newline. If the match fails at the newline, later code breaks
2730 this loop. */
2731
2732 if (firstline)
2733 {
2734 const uschar *t = current_subject;
2735 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2736 end_subject = t;
2737 }
2738
2739 if (first_byte >= 0)
2740 {
2741 if (first_byte_caseless)
2742 while (current_subject < end_subject &&
2743 lcc[*current_subject] != first_byte)
2744 current_subject++;
2745 else
2746 while (current_subject < end_subject && *current_subject != first_byte)
2747 current_subject++;
2748 }
2749
2750 /* Or to just after a linebreak for a multiline match if possible */
2751
2752 else if (startline)
2753 {
2754 if (current_subject > md->start_subject + start_offset)
2755 {
2756 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2757 current_subject++;
2758
2759 /* If we have just passed a CR and the newline option is ANY or
2760 ANYCRLF, and we are now at a LF, advance the match position by one more
2761 character. */
2762
2763 if (current_subject[-1] == '\r' &&
2764 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2765 current_subject < end_subject &&
2766 *current_subject == '\n')
2767 current_subject++;
2768 }
2769 }
2770
2771 /* Or to a non-unique first char after study */
2772
2773 else if (start_bits != NULL)
2774 {
2775 while (current_subject < end_subject)
2776 {
2777 register unsigned int c = *current_subject;
2778 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2779 else break;
2780 }
2781 }
2782
2783 /* Restore fudged end_subject */
2784
2785 end_subject = save_end_subject;
2786 }
2787
2788 /* If req_byte is set, we know that that character must appear in the subject
2789 for the match to succeed. If the first character is set, req_byte must be
2790 later in the subject; otherwise the test starts at the match point. This
2791 optimization can save a huge amount of work in patterns with nested unlimited
2792 repeats that aren't going to match. Writing separate code for cased/caseless
2793 versions makes it go faster, as does using an autoincrement and backing off
2794 on a match.
2795
2796 HOWEVER: when the subject string is very, very long, searching to its end can
2797 take a long time, and give bad performance on quite ordinary patterns. This
2798 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2799 don't do this when the string is sufficiently long.
2800
2801 ALSO: this processing is disabled when partial matching is requested.
2802 */
2803
2804 if (req_byte >= 0 &&
2805 end_subject - current_subject < REQ_BYTE_MAX &&
2806 (options & PCRE_PARTIAL) == 0)
2807 {
2808 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2809
2810 /* We don't need to repeat the search if we haven't yet reached the
2811 place we found it at last time. */
2812
2813 if (p > req_byte_ptr)
2814 {
2815 if (req_byte_caseless)
2816 {
2817 while (p < end_subject)
2818 {
2819 register int pp = *p++;
2820 if (pp == req_byte || pp == req_byte2) { p--; break; }
2821 }
2822 }
2823 else
2824 {
2825 while (p < end_subject)
2826 {
2827 if (*p++ == req_byte) { p--; break; }
2828 }
2829 }
2830
2831 /* If we can't find the required character, break the matching loop,
2832 which will cause a return or PCRE_ERROR_NOMATCH. */
2833
2834 if (p >= end_subject) break;
2835
2836 /* If we have found the required character, save the point where we
2837 found it, so that we don't search again next time round the loop if
2838 the start hasn't passed this character yet. */
2839
2840 req_byte_ptr = p;
2841 }
2842 }
2843
2844 /* OK, now we can do the business */
2845
2846 rc = internal_dfa_exec(
2847 md, /* fixed match data */
2848 md->start_code, /* this subexpression's code */
2849 current_subject, /* where we currently are */
2850 start_offset, /* start offset in subject */
2851 offsets, /* offset vector */
2852 offsetcount, /* size of same */
2853 workspace, /* workspace vector */
2854 wscount, /* size of same */
2855 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2856 0, /* function recurse level */
2857 0); /* regex recurse level */
2858
2859 /* Anything other than "no match" means we are done, always; otherwise, carry
2860 on only if not anchored. */
2861
2862 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2863
2864 /* Advance to the next subject character unless we are at the end of a line
2865 and firstline is set. */
2866
2867 if (firstline && IS_NEWLINE(current_subject)) break;
2868 current_subject++;
2869 if (utf8)
2870 {
2871 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2872 current_subject++;
2873 }
2874 if (current_subject > end_subject) break;
2875
2876 /* If we have just passed a CR and we are now at a LF, and the pattern does
2877 not contain any explicit matches for \r or \n, and the newline option is CRLF
2878 or ANY or ANYCRLF, advance the match position by one more character. */
2879
2880 if (current_subject[-1] == '\r' &&
2881 current_subject < end_subject &&
2882 *current_subject == '\n' &&
2883 (re->flags & PCRE_HASCRORLF) == 0 &&
2884 (md->nltype == NLTYPE_ANY ||
2885 md->nltype == NLTYPE_ANYCRLF ||
2886 md->nllen == 2))
2887 current_subject++;
2888
2889 } /* "Bumpalong" loop */
2890
2891return PCRE_ERROR_NOMATCH;
2892}
2893
2894/* End of pcre_dfa_exec.c */

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status