monotone

monotone Mtn Source Tree

Root/pcre/pcre_exec.c

1/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40
41/* This module contains pcre_exec(), the externally visible function that does
42pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43possible. There are also some static supporting functions. */
44
45#include "pcre_config.h"
46
47#define NLBLOCK md /* Block containing newline information */
48#define PSSTART start_subject /* Field containing processed string start */
49#define PSEND end_subject /* Field containing processed string end */
50
51#include "pcre_internal.h"
52
53/* Undefine some potentially clashing cpp symbols */
54
55#undef min
56#undef max
57
58/* Flag bits for the match() function */
59
60#define match_condassert 0x01 /* Evaluating the assertion that forms a group's condition; match() returns at the end of that assertion */
61#define match_cbegroup 0x02 /* Start of an unlimited repeat group that can match an empty string */
62
63/* Non-error returns from the match() function. Error returns are externally
64defined PCRE_ERROR_xxx codes, which are all negative. */
65
66#define MATCH_MATCH 1 /* The pattern matched from this point */
67#define MATCH_NOMATCH 0 /* No match from this point; the caller may backtrack */
68
69/* Special internal returns from the match() function. Make them sufficiently
70negative to avoid the external error codes. */
/* Each one is returned by the handler of the correspondingly named opcode
(OP_COMMIT, OP_PRUNE, OP_SKIP, OP_THEN) when the rest of the pattern after
that opcode gives MATCH_NOMATCH. */
71
72#define MATCH_COMMIT (-999) /* Failed after passing OP_COMMIT */
73#define MATCH_PRUNE (-998) /* Failed after passing OP_PRUNE */
74#define MATCH_SKIP (-997) /* Failed after passing OP_SKIP; md->start_match_ptr holds the subject position at the OP_SKIP */
75#define MATCH_THEN (-996) /* Failed after passing OP_THEN; bracket alternation loops treat this like MATCH_NOMATCH */
76
/* Upper bound on the number of offset-vector ints that are saved on the stack
around a recursive call; when the offset vector is bigger than this, malloc is
used instead. Keep the value a multiple of 3, because the offset vector is
always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX 30

/* Minimum and maximum repeat counts for the common repeats. In the table of
maxima, 0 stands for "no upper limit".
NOTE(review): the six slots presumably correspond to the six simple repeat
opcodes in opcode order - confirm against the code that indexes these tables,
which is outside this part of the file. */

static const char rep_min[6] =
  { 0, 0, 1, 1, 0, 0 };
static const char rep_max[6] =
  { 0, 0, 0, 0, 1, 1 };
87
88
89
90#ifdef DEBUG
91/*************************************************
92* Debugging function to print chars *
93*************************************************/
94
95/* Print a sequence of chars in printable format, stopping at the end of the
96subject if the requested.
97
98Arguments:
99 p points to characters
100 length number to print
101 is_subject TRUE if printing from within md->start_subject
102 md pointer to matching data block, if is_subject is TRUE
103
104Returns: nothing
105*/
106
107static void
108pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
109{
110unsigned int c;
111if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
112while (length-- > 0)
113 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
114}
115#endif
116
117
118
119/*************************************************
120* Match a back-reference *
121*************************************************/
122
123/* If a back reference hasn't been set, the length that is passed is greater
124than the number of characters left in the string, so the match fails.
125
126Arguments:
127 offset index into the offset vector
128 eptr points into the subject
129 length length to be matched
130 md points to match data block
131 ims the ims flags
132
133Returns: TRUE if matched
134*/
135
136static BOOL
137match_ref(int offset, register USPTR eptr, int length, match_data *md,
138 unsigned long int ims)
139{
140USPTR p = md->start_subject + md->offset_vector[offset];
141
142#ifdef DEBUG
143if (eptr >= md->end_subject)
144 printf("matching subject <null>");
145else
146 {
147 printf("matching subject ");
148 pchars(eptr, length, TRUE, md);
149 }
150printf(" against backref ");
151pchars(p, length, FALSE, md);
152printf("\n");
153#endif
154
155/* Always fail if not enough characters left */
156
157if (length > md->end_subject - eptr) return FALSE;
158
159/* Separate the caselesss case for speed */
160
161if ((ims & PCRE_CASELESS) != 0)
162 {
163 while (length-- > 0)
164 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
165 }
166else
167 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
168
169return TRUE;
170}
171
172
173
174/***************************************************************************
175****************************************************************************
176 RECURSION IN THE match() FUNCTION
177
178The match() function is highly recursive, though not every recursive call
179increases the recursive depth. Nevertheless, some regular expressions can cause
180it to recurse to a great depth. I was writing for Unix, so I just let it call
181itself recursively. This uses the stack for saving everything that has to be
182saved for a recursive call. On Unix, the stack can be large, and this works
183fine.
184
185It turns out that on some non-Unix-like systems there are problems with
186programs that use a lot of stack. (This despite the fact that every last chip
187has oodles of memory these days, and techniques for extending the stack have
188been known for decades.) So....
189
190There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
191calls by keeping local variables that need to be preserved in blocks of memory
192obtained from malloc() instead of on the stack. Macros are used to
193achieve this so that the actual code doesn't look very different to what it
194always used to.
195
196The original heap-recursive code used longjmp(). However, it seems that this
197can be very slow on some operating systems. Following a suggestion from Stan
198Switzer, the use of longjmp() has been abolished, at the cost of having to
199provide a unique number for each call to RMATCH. There is no way of generating
200a sequence of numbers at compile time in C. I have given them names, to make
201them stand out more clearly.
202
203Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
204FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
205tests. Furthermore, not using longjmp() means that local dynamic variables
206don't have indeterminate values; this has meant that the frame size can be
207reduced because the result can be "passed back" by straight setting of the
208variable instead of being passed in the frame.
209****************************************************************************
210***************************************************************************/
211
/* Identifying numbers for the RMATCH call sites, one per call. Each RMATCH
invocation passes one of these as its final argument; the heap-based version of
the macro records it in the frame's Xwhere field and plants a label L_<name> at
the point to resume from. When this list is changed, the code at HEAP_RETURN
below must be updated in sync. */

enum {
  RM1 = 1,
  RM2,  RM3,  RM4,  RM5,  RM6,  RM7,
  RM8,  RM9,  RM10, RM11, RM12, RM13,
  RM14, RM15, RM16, RM17, RM18, RM19,
  RM20, RM21, RM22, RM23, RM24, RM25,
  RM26, RM27, RM28, RM29, RM30, RM31,
  RM32, RM33, RM34, RM35, RM36, RM37,
  RM38, RM39, RM40, RM41, RM42, RM43,
  RM44, RM45, RM46, RM47, RM48, RM49,
  RM50, RM51, RM52, RM53, RM54
};
221
222/* These versions of the macros use the stack, as normal. There are debugging
223versions and production versions. Note that the "rw" argument of RMATCH isn't
224actually used in this definition. */
225
226#ifndef NO_RECURSE
227#define REGISTER register
228
229#ifdef DEBUG
/* Debug build. RMATCH traces the calling line, calls match() recursively with
the recursion depth increased by one, and leaves the result in rrc. mstart is
not a macro argument; it is picked up from the scope in which the macro is
expanded. The expansion is a complete brace-enclosed block. */
230#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
231 { \
232 printf("match() called in line %d\n", __LINE__); \
233 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
234 printf("to line %d\n", __LINE__); \
235 }
/* Debug build. RRETURN traces the value and the line, then returns. The
argument appears twice in the expansion, so it is evaluated twice. */
236#define RRETURN(ra) \
237 { \
238 printf("match() returned %d from line %d ", ra, __LINE__); \
239 return ra; \
240 }
241#else
/* Production build. RMATCH is a plain recursive call whose result is assigned
to rrc, and RRETURN is a plain return. Neither expansion is wrapped in braces
or ends with a semicolon, so the terminating ';' comes from the place where the
macro is used. */
242#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
243 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
244#define RRETURN(ra) return ra
245#endif
246
247#else
248
249
250/* These versions of the macros manage a private stack on the heap. Note that
251the "rd" argument of RMATCH isn't actually used in this definition. It's the md
252argument of match(), which never changes. */
253
254#define REGISTER
255
/* Heap-based replacement for a recursive call to match(). A new frame is
obtained from pcre_stack_malloc and filled in with the "arguments" of the call;
the current frame records in Xwhere which call site (rw) to resume at, and
control jumps to HEAP_RECURSE to run the function body on the new frame. The
label L_<rw> planted here is where execution continues when the "call"
returns, with the result in rrc.

If the frame cannot be allocated, the current invocation gives up with
PCRE_ERROR_NOMEMORY by way of RRETURN. Being a negative error code, it is
passed back unchanged through every outer invocation. Without this check the
NULL pointer would be dereferenced by the assignments that follow.

  ra  subject pointer (eptr) for the new invocation
  rb  code pointer (ecode)
  rc  offset_top
  rd  md - not used, because it never changes
  re  ims
  rf  eptrb
  rg  flags
  rw  one of the RMn call-site identifiers

mstart is not an argument; it is copied from the current frame. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
275
/* Heap-based replacement for "return" from match(). The current frame is
unlinked and released with pcre_stack_free. If there is a calling frame, the
result is placed in rrc and control goes to HEAP_RETURN, which resumes the
caller; if the released frame was the top-level one (its Xprevframe was NULL),
this is a real return from the function.

Note that ra is evaluated only after "frame" has been moved to the calling
frame and the old frame has been freed, so the argument must not rely on any
value that lives in the frame being returned from. The arguments visible in
this file are constants or rrc, which is an ordinary local variable. */
276#define RRETURN(ra)\
277 {\
278 heapframe *newframe = frame;\
279 frame = newframe->Xprevframe;\
280 (pcre_stack_free)(newframe);\
281 if (frame != NULL)\
282 {\
283 rrc = ra;\
284 goto HEAP_RETURN;\
285 }\
286 return ra;\
287 }
288
289
290/* Structure for remembering the local variables in a private frame */
291
292typedef struct heapframe {
293 struct heapframe *Xprevframe;
294
295 /* Function arguments that may change */
296
297 const uschar *Xeptr;
298 const uschar *Xecode;
299 const uschar *Xmstart;
300 int Xoffset_top;
301 long int Xims;
302 eptrblock *Xeptrb;
303 int Xflags;
304 unsigned int Xrdepth;
305
306 /* Function local variables */
307
308 const uschar *Xcallpat;
309 const uschar *Xcharptr;
310 const uschar *Xdata;
311 const uschar *Xnext;
312 const uschar *Xpp;
313 const uschar *Xprev;
314 const uschar *Xsaved_eptr;
315
316 recursion_info Xnew_recursive;
317
318 BOOL Xcur_is_word;
319 BOOL Xcondition;
320 BOOL Xprev_is_word;
321
322 unsigned long int Xoriginal_ims;
323
324#ifdef SUPPORT_UCP
325 int Xprop_type;
326 int Xprop_value;
327 int Xprop_fail_result;
328 int Xprop_category;
329 int Xprop_chartype;
330 int Xprop_script;
331 int Xoclength;
332 uschar Xocchars[8];
333#endif
334
335 int Xctype;
336 unsigned int Xfc;
337 int Xfi;
338 int Xlength;
339 int Xmax;
340 int Xmin;
341 int Xnumber;
342 int Xoffset;
343 int Xop;
344 int Xsave_capture_last;
345 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
346 int Xstacksave[REC_STACK_SAVE_MAX];
347
348 eptrblock Xnewptrb;
349
350 /* Where to jump back to */
351
352 int Xwhere;
353
354} heapframe;
355
356#endif
357
358
359/***************************************************************************
360***************************************************************************/
361
362
363
364/*************************************************
365* Match from current position *
366*************************************************/
367
368/* This function is called recursively in many circumstances. Whenever it
369returns a negative (error) response, the outer incarnation must also return the
370same response.
371
372Performance note: It might be tempting to extract commonly used fields from the
373md structure (e.g. utf8, end_subject) into individual variables to improve
374performance. Tests using gcc on a SPARC disproved this; in the first case, it
375made performance worse.
376
377Arguments:
378 eptr pointer to current character in subject
379 ecode pointer to current position in compiled code
380 mstart pointer to the current match start position (can be modified
381 by encountering \K)
382 offset_top current top pointer
383 md pointer to "static" info for the match
384 ims current /i, /m, and /s options
385 eptrb pointer to chain of blocks containing eptr at start of
386 brackets - for testing for empty matches
387 flags can contain
388 match_condassert - this is an assertion condition
389 match_cbegroup - this is the start of an unlimited repeat
390 group that can match an empty string
391 rdepth the recursion depth
392
393Returns: MATCH_MATCH if matched ) these values are >= 0
394 MATCH_NOMATCH if failed to match )
395 a negative PCRE_ERROR_xxx value if aborted by an error condition
396 (e.g. stopped by repeated call or recursion limit)
397*/
398
399static int
400match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
401 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
402 int flags, unsigned int rdepth)
403{
404/* These variables do not need to be preserved over recursion in this function,
405so they can be ordinary variables in all cases. Mark some of them with
406"register" because they are used a lot in loops. */
407
408register int rrc; /* Returns from recursive calls */
409register int i; /* Used for loops not involving calls to RMATCH() */
410register unsigned int c; /* Character values not kept over RMATCH() calls */
411register BOOL utf8; /* Local copy of UTF-8 flag for speed */
412
413BOOL minimize, possessive; /* Quantifier options */
414
415/* When recursion is not being used, all "local" variables that have to be
416preserved over calls to RMATCH() are part of a "frame" which is obtained from
417heap storage. Set up the top-level frame here; others are obtained from the
418heap whenever RMATCH() does a "recursion". See the macro definitions above. */
419
420#ifdef NO_RECURSE
421heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
422frame->Xprevframe = NULL; /* Marks the top level */
423
424/* Copy in the original argument variables */
425
426frame->Xeptr = eptr;
427frame->Xecode = ecode;
428frame->Xmstart = mstart;
429frame->Xoffset_top = offset_top;
430frame->Xims = ims;
431frame->Xeptrb = eptrb;
432frame->Xflags = flags;
433frame->Xrdepth = rdepth;
434
435/* This is where control jumps back to to effect "recursion" */
436
437HEAP_RECURSE:
438
439/* Macros make the argument variables come from the current frame */
440
441#define eptr frame->Xeptr
442#define ecode frame->Xecode
443#define mstart frame->Xmstart
444#define offset_top frame->Xoffset_top
445#define ims frame->Xims
446#define eptrb frame->Xeptrb
447#define flags frame->Xflags
448#define rdepth frame->Xrdepth
449
450/* Ditto for the local variables */
451
452#ifdef SUPPORT_UTF8
453#define charptr frame->Xcharptr
454#endif
455#define callpat frame->Xcallpat
456#define data frame->Xdata
457#define next frame->Xnext
458#define pp frame->Xpp
459#define prev frame->Xprev
460#define saved_eptr frame->Xsaved_eptr
461
462#define new_recursive frame->Xnew_recursive
463
464#define cur_is_word frame->Xcur_is_word
465#define condition frame->Xcondition
466#define prev_is_word frame->Xprev_is_word
467
468#define original_ims frame->Xoriginal_ims
469
470#ifdef SUPPORT_UCP
471#define prop_type frame->Xprop_type
472#define prop_value frame->Xprop_value
473#define prop_fail_result frame->Xprop_fail_result
474#define prop_category frame->Xprop_category
475#define prop_chartype frame->Xprop_chartype
476#define prop_script frame->Xprop_script
477#define oclength frame->Xoclength
478#define occhars frame->Xocchars
479#endif
480
481#define ctype frame->Xctype
482#define fc frame->Xfc
483#define fi frame->Xfi
484#define length frame->Xlength
485#define max frame->Xmax
486#define min frame->Xmin
487#define number frame->Xnumber
488#define offset frame->Xoffset
489#define op frame->Xop
490#define save_capture_last frame->Xsave_capture_last
491#define save_offset1 frame->Xsave_offset1
492#define save_offset2 frame->Xsave_offset2
493#define save_offset3 frame->Xsave_offset3
494#define stacksave frame->Xstacksave
495
496#define newptrb frame->Xnewptrb
497
498/* When recursion is being used, local variables are allocated on the stack and
499get preserved during recursion in the normal way. In this environment, fi and
500i, and fc and c, can be the same variables. */
501
502#else /* NO_RECURSE not defined */
503#define fi i
504#define fc c
505
506
507#ifdef SUPPORT_UTF8 /* Many of these variables are used only */
508const uschar *charptr; /* in small blocks of the code. My normal */
509#endif /* style of coding would have declared */
510const uschar *callpat; /* them within each of those blocks. */
511const uschar *data; /* However, in order to accommodate the */
512const uschar *next; /* version of this code that uses an */
513USPTR pp; /* external "stack" implemented on the */
514const uschar *prev; /* heap, it is easier to declare them all */
515USPTR saved_eptr; /* here, so the declarations can be cut */
516 /* out in a block. The only declarations */
517recursion_info new_recursive; /* within blocks below are for variables */
518 /* that do not have to be preserved over */
519BOOL cur_is_word; /* a recursive call to RMATCH(). */
520BOOL condition;
521BOOL prev_is_word;
522
523unsigned long int original_ims;
524
525#ifdef SUPPORT_UCP
526int prop_type;
527int prop_value;
528int prop_fail_result;
529int prop_category;
530int prop_chartype;
531int prop_script;
532int oclength;
533uschar occhars[8];
534#endif
535
536int ctype;
537int length;
538int max;
539int min;
540int number;
541int offset;
542int op;
543int save_capture_last;
544int save_offset1, save_offset2, save_offset3;
545int stacksave[REC_STACK_SAVE_MAX];
546
547eptrblock newptrb;
548#endif /* NO_RECURSE */
549
550/* These statements are here to stop the compiler complaining about uninitialized
551variables. */
552
553#ifdef SUPPORT_UCP
554prop_value = 0;
555prop_fail_result = 0;
556#endif
557
558
559/* This label is used for tail recursion, which is used in a few cases even
560when NO_RECURSE is not defined, in order to reduce the amount of stack that is
561used. Thanks to Ian Taylor for noticing this possibility and sending the
562original patch. */
563
564TAIL_RECURSE:
565
566/* OK, now we can get on with the real code of the function. Recursive calls
567are specified by the macro RMATCH and RRETURN is used to return. When
568NO_RECURSE is *not* defined, these just turn into a recursive call to match()
569and a "return", respectively (possibly with some debugging if DEBUG is
570defined). However, RMATCH isn't like a function call because it's quite a
571complicated macro. It has to be used in one particular way. This shouldn't,
572however, impact performance when true recursion is being used. */
573
574#ifdef SUPPORT_UTF8
575utf8 = md->utf8; /* Local copy of the flag */
576#else
577utf8 = FALSE;
578#endif
579
580/* First check that we haven't called match() too many times, or that we
581haven't exceeded the recursive call limit. */
582
583if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
584if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
585
586original_ims = ims; /* Save for resetting on ')' */
587
588/* At the start of a group with an unlimited repeat that may match an empty
589string, the match_cbegroup flag is set. When this is the case, add the current
590subject pointer to the chain of such remembered pointers, to be checked when we
591hit the closing ket, in order to break infinite loops that match no characters.
592When match() is called in other circumstances, don't add to the chain. The
593match_cbegroup flag must NOT be used with tail recursion, because the memory
594block that is used is on the stack, so a new one may be required for each
595match(). */
596
597if ((flags & match_cbegroup) != 0)
598 {
599 newptrb.epb_saved_eptr = eptr;
600 newptrb.epb_prev = eptrb;
601 eptrb = &newptrb;
602 }
603
604/* Now start processing the opcodes. */
605
606for (;;)
607 {
608 minimize = possessive = FALSE;
609 op = *ecode;
610
611 /* For partial matching, remember if we ever hit the end of the subject after
612 matching at least one subject character. */
613
614 if (md->partial &&
615 eptr >= md->end_subject &&
616 eptr > mstart)
617 md->hitend = TRUE;
618
619 switch(op)
620 {
621 case OP_FAIL:
622 RRETURN(MATCH_NOMATCH);
623
624 case OP_PRUNE:
625 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
626 ims, eptrb, flags, RM51);
627 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
628 RRETURN(MATCH_PRUNE);
629
630 case OP_COMMIT:
631 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
632 ims, eptrb, flags, RM52);
633 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
634 RRETURN(MATCH_COMMIT);
635
636 case OP_SKIP:
637 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
638 ims, eptrb, flags, RM53);
639 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
640 md->start_match_ptr = eptr; /* Pass back current position */
641 RRETURN(MATCH_SKIP);
642
643 case OP_THEN:
644 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
645 ims, eptrb, flags, RM54);
646 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
647 RRETURN(MATCH_THEN);
648
649 /* Handle a capturing bracket. If there is space in the offset vector, save
650 the current subject position in the working slot at the top of the vector.
651 We mustn't change the current values of the data slot, because they may be
652 set from a previous iteration of this group, and be referred to by a
653 reference inside the group.
654
655 If the bracket fails to match, we need to restore this value and also the
656 values of the final offsets, in case they were set by a previous iteration
657 of the same bracket.
658
659 If there isn't enough space in the offset vector, treat this as if it were
660 a non-capturing bracket. Don't worry about setting the flag for the error
661 case here; that is handled in the code for KET. */
662
663 case OP_CBRA:
664 case OP_SCBRA:
665 number = GET2(ecode, 1+LINK_SIZE);
666 offset = number << 1;
667
668#ifdef DEBUG
669 printf("start bracket %d\n", number);
670 printf("subject=");
671 pchars(eptr, 16, TRUE, md);
672 printf("\n");
673#endif
674
675 if (offset < md->offset_max)
676 {
677 save_offset1 = md->offset_vector[offset];
678 save_offset2 = md->offset_vector[offset+1];
679 save_offset3 = md->offset_vector[md->offset_end - number];
680 save_capture_last = md->capture_last;
681
682 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
683 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
684
685 flags = (op == OP_SCBRA)? match_cbegroup : 0;
686 do
687 {
688 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
689 ims, eptrb, flags, RM1);
690 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
691 md->capture_last = save_capture_last;
692 ecode += GET(ecode, 1);
693 }
694 while (*ecode == OP_ALT);
695
696 DPRINTF(("bracket %d failed\n", number));
697
698 md->offset_vector[offset] = save_offset1;
699 md->offset_vector[offset+1] = save_offset2;
700 md->offset_vector[md->offset_end - number] = save_offset3;
701
702 RRETURN(MATCH_NOMATCH);
703 }
704
705 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
706 as a non-capturing bracket. */
707
708 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
709 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
710
711 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
712
713 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
714 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
715
716 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
717 final alternative within the brackets, we would return the result of a
718 recursive call to match() whatever happened. We can reduce stack usage by
719 turning this into a tail recursion, except in the case when match_cbegroup
720 is set.*/
721
722 case OP_BRA:
723 case OP_SBRA:
724 DPRINTF(("start non-capturing bracket\n"));
725 flags = (op >= OP_SBRA)? match_cbegroup : 0;
726 for (;;)
727 {
728 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
729 {
730 if (flags == 0) /* Not a possibly empty group */
731 {
732 ecode += _pcre_OP_lengths[*ecode];
733 DPRINTF(("bracket 0 tail recursion\n"));
734 goto TAIL_RECURSE;
735 }
736
737 /* Possibly empty group; can't use tail recursion. */
738
739 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
740 eptrb, flags, RM48);
741 RRETURN(rrc);
742 }
743
744 /* For non-final alternatives, continue the loop for a NOMATCH result;
745 otherwise return. */
746
747 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
748 eptrb, flags, RM2);
749 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
750 ecode += GET(ecode, 1);
751 }
752 /* Control never reaches here. */
753
754 /* Conditional group: compilation checked that there are no more than
755 two branches. If the condition is false, skipping the first branch takes us
756 past the end if there is only one branch, but that's OK because that is
757 exactly what going to the ket would do. As there is only one branch to be
758 obeyed, we can use tail recursion to avoid using another stack frame. */
759
760 case OP_COND:
761 case OP_SCOND:
762 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
763 {
764 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
765 condition = md->recursive != NULL &&
766 (offset == RREF_ANY || offset == md->recursive->group_num);
767 ecode += condition? 3 : GET(ecode, 1);
768 }
769
770 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
771 {
772 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
773 condition = offset < offset_top && md->offset_vector[offset] >= 0;
774 ecode += condition? 3 : GET(ecode, 1);
775 }
776
777 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
778 {
779 condition = FALSE;
780 ecode += GET(ecode, 1);
781 }
782
783 /* The condition is an assertion. Call match() to evaluate it - setting
784 the final argument match_condassert causes it to stop at the end of an
785 assertion. */
786
787 else
788 {
789 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
790 match_condassert, RM3);
791 if (rrc == MATCH_MATCH)
792 {
793 condition = TRUE;
794 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
795 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
796 }
797 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
798 {
799 RRETURN(rrc); /* Need braces because of following else */
800 }
801 else
802 {
803 condition = FALSE;
804 ecode += GET(ecode, 1);
805 }
806 }
807
808 /* We are now at the branch that is to be obeyed. As there is only one,
809 we can use tail recursion to avoid using another stack frame, except when
810 match_cbegroup is required for an unlimited repeat of a possibly empty
811 group. If the second alternative doesn't exist, we can just plough on. */
812
813 if (condition || *ecode == OP_ALT)
814 {
815 ecode += 1 + LINK_SIZE;
816 if (op == OP_SCOND) /* Possibly empty group */
817 {
818 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
819 RRETURN(rrc);
820 }
821 else /* Group must match something */
822 {
823 flags = 0;
824 goto TAIL_RECURSE;
825 }
826 }
827 else /* Condition false & no 2nd alternative */
828 {
829 ecode += 1 + LINK_SIZE;
830 }
831 break;
832
833
834 /* End of the pattern, either real or forced. If we are in a top-level
835 recursion, we should restore the offsets appropriately and continue from
836 after the call. */
837
838 case OP_ACCEPT:
839 case OP_END:
840 if (md->recursive != NULL && md->recursive->group_num == 0)
841 {
842 recursion_info *rec = md->recursive;
843 DPRINTF(("End of pattern in a (?0) recursion\n"));
844 md->recursive = rec->prevrec;
845 memmove(md->offset_vector, rec->offset_save,
846 rec->saved_max * sizeof(int));
847 mstart = rec->save_start;
848 ims = original_ims;
849 ecode = rec->after_call;
850 break;
851 }
852
853 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
854 string - backtracking will then try other alternatives, if any. */
855
856 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
857 md->end_match_ptr = eptr; /* Record where we ended */
858 md->end_offset_top = offset_top; /* and how many extracts were taken */
859 md->start_match_ptr = mstart; /* and the start (\K can modify) */
860 RRETURN(MATCH_MATCH);
861
862 /* Change option settings */
863
864 case OP_OPT:
865 ims = ecode[1];
866 ecode += 2;
867 DPRINTF(("ims set to %02lx\n", ims));
868 break;
869
870 /* Assertion brackets. Check the alternative branches in turn - the
871 matching won't pass the KET for an assertion. If any one branch matches,
872 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
873 start of each branch to move the current point backwards, so the code at
874 this level is identical to the lookahead case. */
875
876 case OP_ASSERT:
877 case OP_ASSERTBACK:
878 do
879 {
880 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
881 RM4);
882 if (rrc == MATCH_MATCH) break;
883 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
884 ecode += GET(ecode, 1);
885 }
886 while (*ecode == OP_ALT);
887 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
888
889 /* If checking an assertion for a condition, return MATCH_MATCH. */
890
891 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
892
893 /* Continue from after the assertion, updating the offsets high water
894 mark, since extracts may have been taken during the assertion. */
895
896 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
897 ecode += 1 + LINK_SIZE;
898 offset_top = md->end_offset_top;
899 continue;
900
901 /* Negative assertion: all branches must fail to match */
902
903 case OP_ASSERT_NOT:
904 case OP_ASSERTBACK_NOT:
905 do
906 {
907 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
908 RM5);
909 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
910 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
911 ecode += GET(ecode,1);
912 }
913 while (*ecode == OP_ALT);
914
915 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
916
917 ecode += 1 + LINK_SIZE;
918 continue;
919
920 /* Move the subject pointer back. This occurs only at the start of
921 each branch of a lookbehind assertion. If we are too close to the start to
922 move back, this match function fails. When working with UTF-8 we move
923 back a number of characters, not bytes. */
924
925 case OP_REVERSE:
926#ifdef SUPPORT_UTF8
927 if (utf8)
928 {
929 i = GET(ecode, 1);
930 while (i-- > 0)
931 {
932 eptr--;
933 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
934 BACKCHAR(eptr);
935 }
936 }
937 else
938#endif
939
940 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
941
942 {
943 eptr -= GET(ecode, 1);
944 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
945 }
946
947 /* Skip to next op code */
948
949 ecode += 1 + LINK_SIZE;
950 break;
951
952 /* The callout item calls an external function, if one is provided, passing
953 details of the match so far. This is mainly for debugging, though the
954 function is able to force a failure. */
955
956 case OP_CALLOUT:
957 if (pcre_callout != NULL)
958 {
959 pcre_callout_block cb;
960 cb.version = 1; /* Version 1 of the callout block */
961 cb.callout_number = ecode[1];
962 cb.offset_vector = md->offset_vector;
963 cb.subject = (PCRE_SPTR)md->start_subject;
964 cb.subject_length = md->end_subject - md->start_subject;
965 cb.start_match = mstart - md->start_subject;
966 cb.current_position = eptr - md->start_subject;
967 cb.pattern_position = GET(ecode, 2);
968 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
969 cb.capture_top = offset_top/2;
970 cb.capture_last = md->capture_last;
971 cb.callout_data = md->callout_data;
972 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
973 if (rrc < 0) RRETURN(rrc);
974 }
975 ecode += 2 + 2*LINK_SIZE;
976 break;
977
978 /* Recursion either matches the current regex, or some subexpression. The
979 offset data is the offset to the starting bracket from the start of the
980 whole pattern. (This is so that it works from duplicated subpatterns.)
981
982 If there are any capturing brackets started but not finished, we have to
983 save their starting points and reinstate them after the recursion. However,
984 we don't know how many such there are (offset_top records the completed
985 total) so we just have to save all the potential data. There may be up to
986 65535 such values, which is too large to put on the stack, but using malloc
987 for small numbers seems expensive. As a compromise, the stack is used when
988 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
989 is used. If the malloc fails, the saved offsets would be incomplete, so
990 the recursion is not attempted: the match is abandoned at this point and
991 PCRE_ERROR_NOMEMORY is returned.
992
993 There are also other values that have to be saved. We use a chained
994 sequence of blocks that actually live on the stack. Thanks to Robin Houston
995 for the original version of this logic. */
996
997 case OP_RECURSE:
998 {
999 callpat = md->start_code + GET(ecode, 1);
1000 new_recursive.group_num = (callpat == md->start_code)? 0 :
1001 GET2(callpat, 1 + LINK_SIZE);
1002
1003 /* Add to "recursing stack" */
1004
1005 new_recursive.prevrec = md->recursive;
1006 md->recursive = &new_recursive;
1007
1008 /* Find where to continue from afterwards */
1009
1010 ecode += 1 + LINK_SIZE;
1011 new_recursive.after_call = ecode;
1012
1013 /* Now save the offset data. */
1014
1015 new_recursive.saved_max = md->offset_end;
1016 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1017 new_recursive.offset_save = stacksave;
1018 else
1019 {
1020 new_recursive.offset_save =
1021 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1022 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1023 }
1024
1025 memcpy(new_recursive.offset_save, md->offset_vector,
1026 new_recursive.saved_max * sizeof(int));
1027 new_recursive.save_start = mstart;
1028 mstart = eptr;
1029
1030 /* OK, now we can do the recursion. For each top-level alternative we
1031 restore the offset and recursion data. */
1032
1033 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1034 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1035 do
1036 {
1037 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1038 md, ims, eptrb, flags, RM6);
1039 if (rrc == MATCH_MATCH)
1040 {
1041 DPRINTF(("Recursion matched\n"));
1042 md->recursive = new_recursive.prevrec;
1043 if (new_recursive.offset_save != stacksave)
1044 (pcre_free)(new_recursive.offset_save);
1045 RRETURN(MATCH_MATCH);
1046 }
1047 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1048 {
1049 DPRINTF(("Recursion gave error %d\n", rrc));
1050 RRETURN(rrc);
1051 }
1052
1053 md->recursive = &new_recursive;
1054 memcpy(md->offset_vector, new_recursive.offset_save,
1055 new_recursive.saved_max * sizeof(int));
1056 callpat += GET(callpat, 1);
1057 }
1058 while (*callpat == OP_ALT);
1059
1060 DPRINTF(("Recursion didn't match\n"));
1061 md->recursive = new_recursive.prevrec;
1062 if (new_recursive.offset_save != stacksave)
1063 (pcre_free)(new_recursive.offset_save);
1064 RRETURN(MATCH_NOMATCH);
1065 }
1066 /* Control never reaches here */
1067
1068 /* "Once" brackets are like assertion brackets except that after a match,
1069 the point in the subject string is not moved back. Thus there can never be
1070 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1071 Check the alternative branches in turn - the matching won't pass the KET
1072 for this kind of subpattern. If any one branch matches, we carry on as at
1073 the end of a normal bracket, leaving the subject pointer. */
1074
1075 case OP_ONCE:
1076 prev = ecode;
1077 saved_eptr = eptr;
1078
1079 do
1080 {
1081 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1082 if (rrc == MATCH_MATCH) break;
1083 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1084 ecode += GET(ecode,1);
1085 }
1086 while (*ecode == OP_ALT);
1087
1088 /* If we hit the end of the group (which could be repeated), fail */
1089
1090 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1091
1092 /* Continue as from after the assertion, updating the offsets high water
1093 mark, since extracts may have been taken. */
1094
1095 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1096
1097 offset_top = md->end_offset_top;
1098 eptr = md->end_match_ptr;
1099
1100 /* For a non-repeating ket, just continue at this level. This also
1101 happens for a repeating ket if no characters were matched in the group.
1102 This is the forcible breaking of infinite loops as implemented in Perl
1103 5.005. If there is an options reset, it will get obeyed in the normal
1104 course of events. */
1105
1106 if (*ecode == OP_KET || eptr == saved_eptr)
1107 {
1108 ecode += 1+LINK_SIZE;
1109 break;
1110 }
1111
1112 /* The repeating kets try the rest of the pattern or restart from the
1113 preceding bracket, in the appropriate order. The second "call" of match()
1114 uses tail recursion, to avoid using another stack frame. We need to reset
1115 any options that changed within the bracket before re-running it, so
1116 check the next opcode. */
1117
1118 if (ecode[1+LINK_SIZE] == OP_OPT)
1119 {
1120 ims = (ims & ~PCRE_IMS) | ecode[4];
1121 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1122 }
1123
1124 if (*ecode == OP_KETRMIN)
1125 {
1126 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1127 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1128 ecode = prev;
1129 flags = 0;
1130 goto TAIL_RECURSE;
1131 }
1132 else /* OP_KETRMAX */
1133 {
1134 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1135 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1136 ecode += 1 + LINK_SIZE;
1137 flags = 0;
1138 goto TAIL_RECURSE;
1139 }
1140 /* Control never gets here */
1141
1142 /* An alternation is the end of a branch; scan along to find the end of the
1143 bracketed group and go to there. */
1144
1145 case OP_ALT:
1146 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1147 break;
1148
1149 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1150 that it may occur zero times. It may repeat infinitely, or not at all -
1151 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1152 repeat limits are compiled as a number of copies, with the optional ones
1153 preceded by BRAZERO or BRAMINZERO. */
1154
1155 case OP_BRAZERO:
1156 {
1157 next = ecode+1;
1158 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1159 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1160 do next += GET(next,1); while (*next == OP_ALT);
1161 ecode = next + 1 + LINK_SIZE;
1162 }
1163 break;
1164
1165 case OP_BRAMINZERO:
1166 {
1167 next = ecode+1;
1168 do next += GET(next, 1); while (*next == OP_ALT);
1169 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1170 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1171 ecode++;
1172 }
1173 break;
1174
1175 /* End of a group, repeated or non-repeating. */
1176
1177 case OP_KET:
1178 case OP_KETRMIN:
1179 case OP_KETRMAX:
1180 prev = ecode - GET(ecode, 1);
1181
1182 /* If this was a group that remembered the subject start, in order to break
1183 infinite repeats of empty string matches, retrieve the subject start from
1184 the chain. Otherwise, set it NULL. */
1185
1186 if (*prev >= OP_SBRA)
1187 {
1188 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1189 eptrb = eptrb->epb_prev; /* Backup to previous group */
1190 }
1191 else saved_eptr = NULL;
1192
1193 /* If we are at the end of an assertion group, stop matching and return
1194 MATCH_MATCH, but record the current high water mark for use by positive
1195 assertions. Do this also for the "once" (atomic) groups. */
1196
1197 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1198 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1199 *prev == OP_ONCE)
1200 {
1201 md->end_match_ptr = eptr; /* For ONCE */
1202 md->end_offset_top = offset_top;
1203 RRETURN(MATCH_MATCH);
1204 }
1205
1206 /* For capturing groups we have to check the group number back at the start
1207 and if necessary complete handling an extraction by setting the offsets and
1208 bumping the high water mark. Note that whole-pattern recursion is coded as
1209 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1210 when the OP_END is reached. Other recursion is handled here. */
1211
1212 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1213 {
1214 number = GET2(prev, 1+LINK_SIZE);
1215 offset = number << 1;
1216
1217#ifdef DEBUG
1218 printf("end bracket %d", number);
1219 printf("\n");
1220#endif
1221
1222 md->capture_last = number;
1223 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1224 {
1225 md->offset_vector[offset] =
1226 md->offset_vector[md->offset_end - number];
1227 md->offset_vector[offset+1] = eptr - md->start_subject;
1228 if (offset_top <= offset) offset_top = offset + 2;
1229 }
1230
1231 /* Handle a recursively called group. Restore the offsets
1232 appropriately and continue from after the call. */
1233
1234 if (md->recursive != NULL && md->recursive->group_num == number)
1235 {
1236 recursion_info *rec = md->recursive;
1237 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1238 md->recursive = rec->prevrec;
1239 mstart = rec->save_start;
1240 memcpy(md->offset_vector, rec->offset_save,
1241 rec->saved_max * sizeof(int));
1242 ecode = rec->after_call;
1243 ims = original_ims;
1244 break;
1245 }
1246 }
1247
1248 /* For both capturing and non-capturing groups, reset the value of the ims
1249 flags, in case they got changed during the group. */
1250
1251 ims = original_ims;
1252 DPRINTF(("ims reset to %02lx\n", ims));
1253
1254 /* For a non-repeating ket, just continue at this level. This also
1255 happens for a repeating ket if no characters were matched in the group.
1256 This is the forcible breaking of infinite loops as implemented in Perl
1257 5.005. If there is an options reset, it will get obeyed in the normal
1258 course of events. */
1259
1260 if (*ecode == OP_KET || eptr == saved_eptr)
1261 {
1262 ecode += 1 + LINK_SIZE;
1263 break;
1264 }
1265
1266 /* The repeating kets try the rest of the pattern or restart from the
1267 preceding bracket, in the appropriate order. In the second case, we can use
1268 tail recursion to avoid using another stack frame, unless we have an
1269 unlimited repeat of a group that can match an empty string. */
1270
1271 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1272
1273 if (*ecode == OP_KETRMIN)
1274 {
1275 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1276 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1277 if (flags != 0) /* Could match an empty string */
1278 {
1279 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1280 RRETURN(rrc);
1281 }
1282 ecode = prev;
1283 goto TAIL_RECURSE;
1284 }
1285 else /* OP_KETRMAX */
1286 {
1287 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1288 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1289 ecode += 1 + LINK_SIZE;
1290 flags = 0;
1291 goto TAIL_RECURSE;
1292 }
1293 /* Control never gets here */
1294
1295 /* Start of subject unless notbol, or after internal newline if multiline */
1296
1297 case OP_CIRC:
1298 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1299 if ((ims & PCRE_MULTILINE) != 0)
1300 {
1301 if (eptr != md->start_subject &&
1302 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1303 RRETURN(MATCH_NOMATCH);
1304 ecode++;
1305 break;
1306 }
1307 /* ... else fall through */
1308
1309 /* Start of subject assertion */
1310
1311 case OP_SOD:
1312 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1313 ecode++;
1314 break;
1315
1316 /* Start of match assertion */
1317
1318 case OP_SOM:
1319 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1320 ecode++;
1321 break;
1322
1323 /* Reset the start of match point */
1324
1325 case OP_SET_SOM:
1326 mstart = eptr;
1327 ecode++;
1328 break;
1329
1330 /* Assert before internal newline if multiline, or before a terminating
1331 newline unless endonly is set, else end of subject unless noteol is set. */
1332
1333 case OP_DOLL:
1334 if ((ims & PCRE_MULTILINE) != 0)
1335 {
1336 if (eptr < md->end_subject)
1337 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1338 else
1339 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1340 ecode++;
1341 break;
1342 }
1343 else
1344 {
1345 if (md->noteol) RRETURN(MATCH_NOMATCH);
1346 if (!md->endonly)
1347 {
1348 if (eptr != md->end_subject &&
1349 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1350 RRETURN(MATCH_NOMATCH);
1351 ecode++;
1352 break;
1353 }
1354 }
1355 /* ... else fall through for endonly */
1356
1357 /* End of subject assertion (\z) */
1358
1359 case OP_EOD:
1360 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1361 ecode++;
1362 break;
1363
1364 /* End of subject or ending \n assertion (\Z) */
1365
1366 case OP_EODN:
1367 if (eptr != md->end_subject &&
1368 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1369 RRETURN(MATCH_NOMATCH);
1370 ecode++;
1371 break;
1372
1373 /* Word boundary assertions */
1374
1375 case OP_NOT_WORD_BOUNDARY:
1376 case OP_WORD_BOUNDARY:
1377 {
1378
1379 /* Find out if the previous and current characters are "word" characters.
1380 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1381 be "non-word" characters. */
1382
1383#ifdef SUPPORT_UTF8
1384 if (utf8)
1385 {
1386 if (eptr == md->start_subject) prev_is_word = FALSE; else
1387 {
1388 const uschar *lastptr = eptr - 1;
1389 while((*lastptr & 0xc0) == 0x80) lastptr--;
1390 GETCHAR(c, lastptr);
1391 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1392 }
1393 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1394 {
1395 GETCHAR(c, eptr);
1396 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1397 }
1398 }
1399 else
1400#endif
1401
1402 /* More streamlined when not in UTF-8 mode */
1403
1404 {
1405 prev_is_word = (eptr != md->start_subject) &&
1406 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1407 cur_is_word = (eptr < md->end_subject) &&
1408 ((md->ctypes[*eptr] & ctype_word) != 0);
1409 }
1410
1411 /* Now see if the situation is what we want */
1412
1413 if ((*ecode++ == OP_WORD_BOUNDARY)?
1414 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1415 RRETURN(MATCH_NOMATCH);
1416 }
1417 break;
1418
1419 /* Match a single character type; inline for speed */
1420
1421 case OP_ANY:
1422 if ((ims & PCRE_DOTALL) == 0)
1423 {
1424 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1425 }
1426 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1427 if (utf8)
1428 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1429 ecode++;
1430 break;
1431
1432 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1433 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1434
1435 case OP_ANYBYTE:
1436 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1437 ecode++;
1438 break;
1439
1440 case OP_NOT_DIGIT:
1441 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1442 GETCHARINCTEST(c, eptr);
1443 if (
1444#ifdef SUPPORT_UTF8
1445 c < 256 &&
1446#endif
1447 (md->ctypes[c] & ctype_digit) != 0
1448 )
1449 RRETURN(MATCH_NOMATCH);
1450 ecode++;
1451 break;
1452
1453 case OP_DIGIT:
1454 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1455 GETCHARINCTEST(c, eptr);
1456 if (
1457#ifdef SUPPORT_UTF8
1458 c >= 256 ||
1459#endif
1460 (md->ctypes[c] & ctype_digit) == 0
1461 )
1462 RRETURN(MATCH_NOMATCH);
1463 ecode++;
1464 break;
1465
1466 case OP_NOT_WHITESPACE:
1467 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1468 GETCHARINCTEST(c, eptr);
1469 if (
1470#ifdef SUPPORT_UTF8
1471 c < 256 &&
1472#endif
1473 (md->ctypes[c] & ctype_space) != 0
1474 )
1475 RRETURN(MATCH_NOMATCH);
1476 ecode++;
1477 break;
1478
1479 case OP_WHITESPACE:
1480 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1481 GETCHARINCTEST(c, eptr);
1482 if (
1483#ifdef SUPPORT_UTF8
1484 c >= 256 ||
1485#endif
1486 (md->ctypes[c] & ctype_space) == 0
1487 )
1488 RRETURN(MATCH_NOMATCH);
1489 ecode++;
1490 break;
1491
1492 case OP_NOT_WORDCHAR:
1493 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1494 GETCHARINCTEST(c, eptr);
1495 if (
1496#ifdef SUPPORT_UTF8
1497 c < 256 &&
1498#endif
1499 (md->ctypes[c] & ctype_word) != 0
1500 )
1501 RRETURN(MATCH_NOMATCH);
1502 ecode++;
1503 break;
1504
1505 case OP_WORDCHAR:
1506 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1507 GETCHARINCTEST(c, eptr);
1508 if (
1509#ifdef SUPPORT_UTF8
1510 c >= 256 ||
1511#endif
1512 (md->ctypes[c] & ctype_word) == 0
1513 )
1514 RRETURN(MATCH_NOMATCH);
1515 ecode++;
1516 break;
1517
1518 case OP_ANYNL:
1519 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1520 GETCHARINCTEST(c, eptr);
1521 switch(c)
1522 {
1523 default: RRETURN(MATCH_NOMATCH);
1524 case 0x000d:
1525 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1526 break;
1527
1528 case 0x000a:
1529 break;
1530
1531 case 0x000b:
1532 case 0x000c:
1533 case 0x0085:
1534 case 0x2028:
1535 case 0x2029:
1536 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1537 break;
1538 }
1539 ecode++;
1540 break;
1541
1542 case OP_NOT_HSPACE:
1543 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1544 GETCHARINCTEST(c, eptr);
1545 switch(c)
1546 {
1547 default: break;
1548 case 0x09: /* HT */
1549 case 0x20: /* SPACE */
1550 case 0xa0: /* NBSP */
1551 case 0x1680: /* OGHAM SPACE MARK */
1552 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1553 case 0x2000: /* EN QUAD */
1554 case 0x2001: /* EM QUAD */
1555 case 0x2002: /* EN SPACE */
1556 case 0x2003: /* EM SPACE */
1557 case 0x2004: /* THREE-PER-EM SPACE */
1558 case 0x2005: /* FOUR-PER-EM SPACE */
1559 case 0x2006: /* SIX-PER-EM SPACE */
1560 case 0x2007: /* FIGURE SPACE */
1561 case 0x2008: /* PUNCTUATION SPACE */
1562 case 0x2009: /* THIN SPACE */
1563 case 0x200A: /* HAIR SPACE */
1564 case 0x202f: /* NARROW NO-BREAK SPACE */
1565 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1566 case 0x3000: /* IDEOGRAPHIC SPACE */
1567 RRETURN(MATCH_NOMATCH);
1568 }
1569 ecode++;
1570 break;
1571
1572 case OP_HSPACE:
1573 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1574 GETCHARINCTEST(c, eptr);
1575 switch(c)
1576 {
1577 default: RRETURN(MATCH_NOMATCH);
1578 case 0x09: /* HT */
1579 case 0x20: /* SPACE */
1580 case 0xa0: /* NBSP */
1581 case 0x1680: /* OGHAM SPACE MARK */
1582 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1583 case 0x2000: /* EN QUAD */
1584 case 0x2001: /* EM QUAD */
1585 case 0x2002: /* EN SPACE */
1586 case 0x2003: /* EM SPACE */
1587 case 0x2004: /* THREE-PER-EM SPACE */
1588 case 0x2005: /* FOUR-PER-EM SPACE */
1589 case 0x2006: /* SIX-PER-EM SPACE */
1590 case 0x2007: /* FIGURE SPACE */
1591 case 0x2008: /* PUNCTUATION SPACE */
1592 case 0x2009: /* THIN SPACE */
1593 case 0x200A: /* HAIR SPACE */
1594 case 0x202f: /* NARROW NO-BREAK SPACE */
1595 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1596 case 0x3000: /* IDEOGRAPHIC SPACE */
1597 break;
1598 }
1599 ecode++;
1600 break;
1601
1602 case OP_NOT_VSPACE:
1603 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1604 GETCHARINCTEST(c, eptr);
1605 switch(c)
1606 {
1607 default: break;
1608 case 0x0a: /* LF */
1609 case 0x0b: /* VT */
1610 case 0x0c: /* FF */
1611 case 0x0d: /* CR */
1612 case 0x85: /* NEL */
1613 case 0x2028: /* LINE SEPARATOR */
1614 case 0x2029: /* PARAGRAPH SEPARATOR */
1615 RRETURN(MATCH_NOMATCH);
1616 }
1617 ecode++;
1618 break;
1619
1620 case OP_VSPACE:
1621 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1622 GETCHARINCTEST(c, eptr);
1623 switch(c)
1624 {
1625 default: RRETURN(MATCH_NOMATCH);
1626 case 0x0a: /* LF */
1627 case 0x0b: /* VT */
1628 case 0x0c: /* FF */
1629 case 0x0d: /* CR */
1630 case 0x85: /* NEL */
1631 case 0x2028: /* LINE SEPARATOR */
1632 case 0x2029: /* PARAGRAPH SEPARATOR */
1633 break;
1634 }
1635 ecode++;
1636 break;
1637
1638#ifdef SUPPORT_UCP
1639 /* Check the next character by Unicode property. We will get here only
1640 if the support is in the binary; otherwise a compile-time error occurs. */
1641
1642 case OP_PROP:
1643 case OP_NOTPROP:
1644 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1645 GETCHARINCTEST(c, eptr);
1646 {
1647 int chartype, script;
1648 int category = _pcre_ucp_findprop(c, &chartype, &script);
1649
1650 switch(ecode[1])
1651 {
1652 case PT_ANY:
1653 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1654 break;
1655
1656 case PT_LAMP:
1657 if ((chartype == ucp_Lu ||
1658 chartype == ucp_Ll ||
1659 chartype == ucp_Lt) == (op == OP_NOTPROP))
1660 RRETURN(MATCH_NOMATCH);
1661 break;
1662
1663 case PT_GC:
1664 if ((ecode[2] != category) == (op == OP_PROP))
1665 RRETURN(MATCH_NOMATCH);
1666 break;
1667
1668 case PT_PC:
1669 if ((ecode[2] != chartype) == (op == OP_PROP))
1670 RRETURN(MATCH_NOMATCH);
1671 break;
1672
1673 case PT_SC:
1674 if ((ecode[2] != script) == (op == OP_PROP))
1675 RRETURN(MATCH_NOMATCH);
1676 break;
1677
1678 default:
1679 RRETURN(PCRE_ERROR_INTERNAL);
1680 }
1681
1682 ecode += 3;
1683 }
1684 break;
1685
1686 /* Match an extended Unicode sequence. We will get here only if the support
1687 is in the binary; otherwise a compile-time error occurs. */
1688
1689 case OP_EXTUNI:
1690 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1691 GETCHARINCTEST(c, eptr);
1692 {
1693 int chartype, script;
1694 int category = _pcre_ucp_findprop(c, &chartype, &script);
1695 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1696 while (eptr < md->end_subject)
1697 {
1698 int len = 1;
1699 if (!utf8) c = *eptr; else
1700 {
1701 GETCHARLEN(c, eptr, len);
1702 }
1703 category = _pcre_ucp_findprop(c, &chartype, &script);
1704 if (category != ucp_M) break;
1705 eptr += len;
1706 }
1707 }
1708 ecode++;
1709 break;
1710#endif
1711
1712
1713 /* Match a back reference, possibly repeatedly. Look past the end of the
1714 item to see if there is repeat information following. The code is similar
1715 to that for character classes, but repeated for efficiency. Then obey
1716 similar code to character type repeats - written out again for speed.
1717 However, if the referenced string is the empty string, always treat
1718 it as matched, any number of times (otherwise there could be infinite
1719 loops). */
1720
1721 case OP_REF:
1722 {
1723 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1724 ecode += 3; /* Advance past item */
1725
1726 /* If the reference is unset, set the length to be longer than the amount
1727 of subject left; this ensures that every attempt at a match fails. We
1728 can't just fail here, because of the possibility of quantifiers with zero
1729 minima. */
1730
1731 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1732 md->end_subject - eptr + 1 :
1733 md->offset_vector[offset+1] - md->offset_vector[offset];
1734
1735 /* Set up for repetition, or handle the non-repeated case */
1736
1737 switch (*ecode)
1738 {
1739 case OP_CRSTAR:
1740 case OP_CRMINSTAR:
1741 case OP_CRPLUS:
1742 case OP_CRMINPLUS:
1743 case OP_CRQUERY:
1744 case OP_CRMINQUERY:
1745 c = *ecode++ - OP_CRSTAR;
1746 minimize = (c & 1) != 0;
1747 min = rep_min[c]; /* Pick up values from tables; */
1748 max = rep_max[c]; /* zero for max => infinity */
1749 if (max == 0) max = INT_MAX;
1750 break;
1751
1752 case OP_CRRANGE:
1753 case OP_CRMINRANGE:
1754 minimize = (*ecode == OP_CRMINRANGE);
1755 min = GET2(ecode, 1);
1756 max = GET2(ecode, 3);
1757 if (max == 0) max = INT_MAX;
1758 ecode += 5;
1759 break;
1760
1761 default: /* No repeat follows */
1762 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1763 eptr += length;
1764 continue; /* With the main loop */
1765 }
1766
1767 /* If the length of the reference is zero, just continue with the
1768 main loop. */
1769
1770 if (length == 0) continue;
1771
1772 /* First, ensure the minimum number of matches are present. We get back
1773 the length of the reference string explicitly rather than passing the
1774 address of eptr, so that eptr can be a register variable. */
1775
1776 for (i = 1; i <= min; i++)
1777 {
1778 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1779 eptr += length;
1780 }
1781
1782 /* If min = max, continue at the same level without recursion.
1783 They are not both allowed to be zero. */
1784
1785 if (min == max) continue;
1786
1787 /* If minimizing, keep trying and advancing the pointer */
1788
1789 if (minimize)
1790 {
1791 for (fi = min;; fi++)
1792 {
1793 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1794 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1795 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1796 RRETURN(MATCH_NOMATCH);
1797 eptr += length;
1798 }
1799 /* Control never gets here */
1800 }
1801
1802 /* If maximizing, find the longest string and work backwards */
1803
1804 else
1805 {
1806 pp = eptr;
1807 for (i = min; i < max; i++)
1808 {
1809 if (!match_ref(offset, eptr, length, md, ims)) break;
1810 eptr += length;
1811 }
1812 while (eptr >= pp)
1813 {
1814 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1815 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1816 eptr -= length;
1817 }
1818 RRETURN(MATCH_NOMATCH);
1819 }
1820 }
1821 /* Control never gets here */
1822
1823
1824
1825 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1826 used when all the characters in the class have values in the range 0-255,
1827 and either the matching is caseful, or the characters are in the range
1828 0-127 when UTF-8 processing is enabled. The only difference between
1829 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1830 encountered.
1831
1832 First, look past the end of the item to see if there is repeat information
1833 following. Then obey similar code to character type repeats - written out
1834 again for speed. */
1835
1836 case OP_NCLASS:
1837 case OP_CLASS:
1838 {
1839 data = ecode + 1; /* Save for matching */
1840 ecode += 33; /* Advance past the item */
1841
1842 switch (*ecode)
1843 {
1844 case OP_CRSTAR:
1845 case OP_CRMINSTAR:
1846 case OP_CRPLUS:
1847 case OP_CRMINPLUS:
1848 case OP_CRQUERY:
1849 case OP_CRMINQUERY:
1850 c = *ecode++ - OP_CRSTAR;
1851 minimize = (c & 1) != 0;
1852 min = rep_min[c]; /* Pick up values from tables; */
1853 max = rep_max[c]; /* zero for max => infinity */
1854 if (max == 0) max = INT_MAX;
1855 break;
1856
1857 case OP_CRRANGE:
1858 case OP_CRMINRANGE:
1859 minimize = (*ecode == OP_CRMINRANGE);
1860 min = GET2(ecode, 1);
1861 max = GET2(ecode, 3);
1862 if (max == 0) max = INT_MAX;
1863 ecode += 5;
1864 break;
1865
1866 default: /* No repeat follows */
1867 min = max = 1;
1868 break;
1869 }
1870
1871 /* First, ensure the minimum number of matches are present. */
1872
1873#ifdef SUPPORT_UTF8
1874 /* UTF-8 mode */
1875 if (utf8)
1876 {
1877 for (i = 1; i <= min; i++)
1878 {
1879 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1880 GETCHARINC(c, eptr);
1881 if (c > 255)
1882 {
1883 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1884 }
1885 else
1886 {
1887 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1888 }
1889 }
1890 }
1891 else
1892#endif
1893 /* Not UTF-8 mode */
1894 {
1895 for (i = 1; i <= min; i++)
1896 {
1897 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1898 c = *eptr++;
1899 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1900 }
1901 }
1902
1903 /* If max == min we can continue with the main loop without the
1904 need to recurse. */
1905
1906 if (min == max) continue;
1907
1908 /* If minimizing, keep testing the rest of the expression and advancing
1909 the pointer while it matches the class. */
1910
1911 if (minimize)
1912 {
1913#ifdef SUPPORT_UTF8
1914 /* UTF-8 mode */
1915 if (utf8)
1916 {
1917 for (fi = min;; fi++)
1918 {
1919 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1920 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1921 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1922 GETCHARINC(c, eptr);
1923 if (c > 255)
1924 {
1925 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1926 }
1927 else
1928 {
1929 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1930 }
1931 }
1932 }
1933 else
1934#endif
1935 /* Not UTF-8 mode */
1936 {
1937 for (fi = min;; fi++)
1938 {
1939 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1940 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1941 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1942 c = *eptr++;
1943 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1944 }
1945 }
1946 /* Control never gets here */
1947 }
1948
1949 /* If maximizing, find the longest possible run, then work backwards. */
1950
1951 else
1952 {
1953 pp = eptr;
1954
1955#ifdef SUPPORT_UTF8
1956 /* UTF-8 mode */
1957 if (utf8)
1958 {
1959 for (i = min; i < max; i++)
1960 {
1961 int len = 1;
1962 if (eptr >= md->end_subject) break;
1963 GETCHARLEN(c, eptr, len);
1964 if (c > 255)
1965 {
1966 if (op == OP_CLASS) break;
1967 }
1968 else
1969 {
1970 if ((data[c/8] & (1 << (c&7))) == 0) break;
1971 }
1972 eptr += len;
1973 }
1974 for (;;)
1975 {
1976 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1977 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1978 if (eptr-- == pp) break; /* Stop if tried at original pos */
1979 BACKCHAR(eptr);
1980 }
1981 }
1982 else
1983#endif
1984 /* Not UTF-8 mode */
1985 {
1986 for (i = min; i < max; i++)
1987 {
1988 if (eptr >= md->end_subject) break;
1989 c = *eptr;
1990 if ((data[c/8] & (1 << (c&7))) == 0) break;
1991 eptr++;
1992 }
1993 while (eptr >= pp)
1994 {
1995 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
1996 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1997 eptr--;
1998 }
1999 }
2000
2001 RRETURN(MATCH_NOMATCH);
2002 }
2003 }
2004 /* Control never gets here */
2005
2006
2007 /* Match an extended character class. This opcode is encountered only
2008 in UTF-8 mode, because that's the only time it is compiled. */
2009
#ifdef SUPPORT_UTF8
case OP_XCLASS:
  {
  /* Note where the class data starts, then move ecode beyond the whole item
  so that it addresses whatever follows, which may be a repeat operator. */

  data = ecode + 1 + LINK_SIZE;
  ecode += GET(ecode, 1);

  /* Work out the repeat limits. A maximum of zero, whether it comes from the
  tables or from the compiled code, stands for "no upper limit". */

  switch (*ecode)
    {
    case OP_CRSTAR:
    case OP_CRMINSTAR:
    case OP_CRPLUS:
    case OP_CRMINPLUS:
    case OP_CRQUERY:
    case OP_CRMINQUERY:
    c = *ecode++ - OP_CRSTAR;          /* Offset into the repeat tables */
    min = rep_min[c];
    max = rep_max[c];
    minimize = ((c & 1) != 0);         /* Odd offsets are the minimizers */
    if (max == 0) max = INT_MAX;
    break;

    case OP_CRRANGE:
    case OP_CRMINRANGE:
    minimize = (*ecode == OP_CRMINRANGE);
    min = GET2(ecode, 1);
    max = GET2(ecode, 3);
    ecode += 5;
    if (max == 0) max = INT_MAX;
    break;

    default:                           /* The class is not repeated */
    min = 1;
    max = 1;
    break;
    }

  /* The mandatory part: each of the first min characters has to exist and
  has to be a member of the class. */

  for (i = 1; i <= min; i++)
    {
    if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
    GETCHARINC(c, eptr);
    if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
    }

  /* A fixed count needs no backtracking point, so carry on in the main
  loop without recursing. */

  if (max == min) continue;

  /* Greedy repeat: swallow as many class members as permitted, then try the
  rest of the pattern at each position, stepping back one character at a
  time until the position where the optional part began has been tried. */

  if (!minimize)
    {
    pp = eptr;
    for (i = min; i < max; i++)
      {
      int clen = 1;
      if (eptr >= md->end_subject) break;
      GETCHARLEN(c, eptr, clen);
      if (!_pcre_xclass(c, data)) break;
      eptr += clen;
      }

    for (;;)
      {
      RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
      if (eptr-- == pp) break;         /* Original position already tried */
      if (utf8) BACKCHAR(eptr);
      }

    RRETURN(MATCH_NOMATCH);
    }

  /* Lazy repeat: try the rest of the pattern first, and take one more class
  member only when that fails. */

  for (fi = min;; fi++)
    {
    RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
    if (rrc != MATCH_NOMATCH) RRETURN(rrc);
    if (fi >= max) RRETURN(MATCH_NOMATCH);
    if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
    GETCHARINC(c, eptr);
    if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
    }

  /* Control never gets here */
  }
#endif    /* End of XCLASS */
2101
2102 /* Match a single character, casefully */
2103
2104 case OP_CHAR:
2105#ifdef SUPPORT_UTF8
2106 if (utf8)
2107 {
2108 length = 1;
2109 ecode++;
2110 GETCHARLEN(fc, ecode, length);
2111 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2112 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2113 }
2114 else
2115#endif
2116
2117 /* Non-UTF-8 mode */
2118 {
2119 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2120 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2121 ecode += 2;
2122 }
2123 break;
2124
2125 /* Match a single character, caselessly */
2126
2127 case OP_CHARNC:
2128#ifdef SUPPORT_UTF8
2129 if (utf8)
2130 {
2131 length = 1;
2132 ecode++;
2133 GETCHARLEN(fc, ecode, length);
2134
2135 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2136
2137 /* If the pattern character's value is < 128, we have only one byte, and
2138 can use the fast lookup table. */
2139
2140 if (fc < 128)
2141 {
2142 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2143 }
2144
2145 /* Otherwise we must pick up the subject character */
2146
2147 else
2148 {
2149 unsigned int dc;
2150 GETCHARINC(dc, eptr);
2151 ecode += length;
2152
2153 /* If we have Unicode property support, we can use it to test the other
2154 case of the character, if there is one. */
2155
2156 if (fc != dc)
2157 {
2158#ifdef SUPPORT_UCP
2159 if (dc != _pcre_ucp_othercase(fc))
2160#endif
2161 RRETURN(MATCH_NOMATCH);
2162 }
2163 }
2164 }
2165 else
2166#endif /* SUPPORT_UTF8 */
2167
2168 /* Non-UTF-8 mode */
2169 {
2170 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2171 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2172 ecode += 2;
2173 }
2174 break;
2175
2176 /* Match a single character repeatedly. */
2177
2178 case OP_EXACT:
2179 min = max = GET2(ecode, 1);
2180 ecode += 3;
2181 goto REPEATCHAR;
2182
2183 case OP_POSUPTO:
2184 possessive = TRUE;
2185 /* Fall through */
2186
2187 case OP_UPTO:
2188 case OP_MINUPTO:
2189 min = 0;
2190 max = GET2(ecode, 1);
2191 minimize = *ecode == OP_MINUPTO;
2192 ecode += 3;
2193 goto REPEATCHAR;
2194
2195 case OP_POSSTAR:
2196 possessive = TRUE;
2197 min = 0;
2198 max = INT_MAX;
2199 ecode++;
2200 goto REPEATCHAR;
2201
2202 case OP_POSPLUS:
2203 possessive = TRUE;
2204 min = 1;
2205 max = INT_MAX;
2206 ecode++;
2207 goto REPEATCHAR;
2208
2209 case OP_POSQUERY:
2210 possessive = TRUE;
2211 min = 0;
2212 max = 1;
2213 ecode++;
2214 goto REPEATCHAR;
2215
2216 case OP_STAR:
2217 case OP_MINSTAR:
2218 case OP_PLUS:
2219 case OP_MINPLUS:
2220 case OP_QUERY:
2221 case OP_MINQUERY:
2222 c = *ecode++ - OP_STAR;
2223 minimize = (c & 1) != 0;
2224 min = rep_min[c]; /* Pick up values from tables; */
2225 max = rep_max[c]; /* zero for max => infinity */
2226 if (max == 0) max = INT_MAX;
2227
2228 /* Common code for all repeated single-character matches. We can give
2229 up quickly if there are fewer than the minimum number of characters left in
2230 the subject. */
2231
2232 REPEATCHAR:
2233#ifdef SUPPORT_UTF8
2234 if (utf8)
2235 {
2236 length = 1;
2237 charptr = ecode;
2238 GETCHARLEN(fc, ecode, length);
2239 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2240 ecode += length;
2241
2242 /* Handle multibyte character matching specially here. There is
2243 support for caseless matching if UCP support is present. */
2244
2245 if (length > 1)
2246 {
2247#ifdef SUPPORT_UCP
2248 unsigned int othercase;
2249 if ((ims & PCRE_CASELESS) != 0 &&
2250 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2251 oclength = _pcre_ord2utf8(othercase, occhars);
2252 else oclength = 0;
2253#endif /* SUPPORT_UCP */
2254
2255 for (i = 1; i <= min; i++)
2256 {
2257 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2258#ifdef SUPPORT_UCP
2259 /* Need braces because of following else */
2260 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2261 else
2262 {
2263 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2264 eptr += oclength;
2265 }
2266#else /* without SUPPORT_UCP */
2267 else { RRETURN(MATCH_NOMATCH); }
2268#endif /* SUPPORT_UCP */
2269 }
2270
2271 if (min == max) continue;
2272
2273 if (minimize)
2274 {
2275 for (fi = min;; fi++)
2276 {
2277 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2278 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2279 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2280 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2281#ifdef SUPPORT_UCP
2282 /* Need braces because of following else */
2283 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2284 else
2285 {
2286 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2287 eptr += oclength;
2288 }
2289#else /* without SUPPORT_UCP */
2290 else { RRETURN (MATCH_NOMATCH); }
2291#endif /* SUPPORT_UCP */
2292 }
2293 /* Control never gets here */
2294 }
2295
2296 else /* Maximize */
2297 {
2298 pp = eptr;
2299 for (i = min; i < max; i++)
2300 {
2301 if (eptr > md->end_subject - length) break;
2302 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2303#ifdef SUPPORT_UCP
2304 else if (oclength == 0) break;
2305 else
2306 {
2307 if (memcmp(eptr, occhars, oclength) != 0) break;
2308 eptr += oclength;
2309 }
2310#else /* without SUPPORT_UCP */
2311 else break;
2312#endif /* SUPPORT_UCP */
2313 }
2314
2315 if (possessive) continue;
2316 for(;;)
2317 {
2318 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2319 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2320 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2321#ifdef SUPPORT_UCP
2322 eptr--;
2323 BACKCHAR(eptr);
2324#else /* without SUPPORT_UCP */
2325 eptr -= length;
2326#endif /* SUPPORT_UCP */
2327 }
2328 }
2329 /* Control never gets here */
2330 }
2331
2332 /* If the length of a UTF-8 character is 1, we fall through here, and
2333 obey the code as for non-UTF-8 characters below, though in this case the
2334 value of fc will always be < 128. */
2335 }
2336 else
2337#endif /* SUPPORT_UTF8 */
2338
2339 /* When not in UTF-8 mode, load a single-byte character. */
2340 {
2341 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2342 fc = *ecode++;
2343 }
2344
2345 /* The value of fc at this point is always less than 256, though we may or
2346 may not be in UTF-8 mode. The code is duplicated for the caseless and
2347 caseful cases, for speed, since matching characters is likely to be quite
2348 common. First, ensure the minimum number of matches are present. If min =
2349 max, continue at the same level without recursing. Otherwise, if
2350 minimizing, keep trying the rest of the expression and advancing one
2351 matching character if failing, up to the maximum. Alternatively, if
2352 maximizing, find the maximum number of characters and work backwards. */
2353
2354 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2355 max, eptr));
2356
2357 if ((ims & PCRE_CASELESS) != 0)
2358 {
2359 fc = md->lcc[fc];
2360 for (i = 1; i <= min; i++)
2361 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2362 if (min == max) continue;
2363 if (minimize)
2364 {
2365 for (fi = min;; fi++)
2366 {
2367 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2368 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2369 if (fi >= max || eptr >= md->end_subject ||
2370 fc != md->lcc[*eptr++])
2371 RRETURN(MATCH_NOMATCH);
2372 }
2373 /* Control never gets here */
2374 }
2375 else /* Maximize */
2376 {
2377 pp = eptr;
2378 for (i = min; i < max; i++)
2379 {
2380 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2381 eptr++;
2382 }
2383 if (possessive) continue;
2384 while (eptr >= pp)
2385 {
2386 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2387 eptr--;
2388 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2389 }
2390 RRETURN(MATCH_NOMATCH);
2391 }
2392 /* Control never gets here */
2393 }
2394
2395 /* Caseful comparisons (includes all multi-byte characters) */
2396
2397 else
2398 {
2399 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2400 if (min == max) continue;
2401 if (minimize)
2402 {
2403 for (fi = min;; fi++)
2404 {
2405 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2406 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2407 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2408 RRETURN(MATCH_NOMATCH);
2409 }
2410 /* Control never gets here */
2411 }
2412 else /* Maximize */
2413 {
2414 pp = eptr;
2415 for (i = min; i < max; i++)
2416 {
2417 if (eptr >= md->end_subject || fc != *eptr) break;
2418 eptr++;
2419 }
2420 if (possessive) continue;
2421 while (eptr >= pp)
2422 {
2423 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2424 eptr--;
2425 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2426 }
2427 RRETURN(MATCH_NOMATCH);
2428 }
2429 }
2430 /* Control never gets here */
2431
2432 /* Match a negated single one-byte character. The character we are
2433 checking can be multibyte. */
2434
2435 case OP_NOT:
2436 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2437 ecode++;
2438 GETCHARINCTEST(c, eptr);
2439 if ((ims & PCRE_CASELESS) != 0)
2440 {
2441#ifdef SUPPORT_UTF8
2442 if (c < 256)
2443#endif
2444 c = md->lcc[c];
2445 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2446 }
2447 else
2448 {
2449 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2450 }
2451 break;
2452
2453 /* Match a negated single one-byte character repeatedly. This is almost a
2454 repeat of the code for a repeated single character, but I haven't found a
2455 nice way of commoning these up that doesn't require a test of the
2456 positive/negative option for each character match. Maybe that wouldn't add
2457 very much to the time taken, but character matching *is* what this is all
2458 about... */
2459
2460 case OP_NOTEXACT:
2461 min = max = GET2(ecode, 1);
2462 ecode += 3;
2463 goto REPEATNOTCHAR;
2464
2465 case OP_NOTUPTO:
2466 case OP_NOTMINUPTO:
2467 min = 0;
2468 max = GET2(ecode, 1);
2469 minimize = *ecode == OP_NOTMINUPTO;
2470 ecode += 3;
2471 goto REPEATNOTCHAR;
2472
2473 case OP_NOTPOSSTAR:
2474 possessive = TRUE;
2475 min = 0;
2476 max = INT_MAX;
2477 ecode++;
2478 goto REPEATNOTCHAR;
2479
2480 case OP_NOTPOSPLUS:
2481 possessive = TRUE;
2482 min = 1;
2483 max = INT_MAX;
2484 ecode++;
2485 goto REPEATNOTCHAR;
2486
2487 case OP_NOTPOSQUERY:
2488 possessive = TRUE;
2489 min = 0;
2490 max = 1;
2491 ecode++;
2492 goto REPEATNOTCHAR;
2493
2494 case OP_NOTPOSUPTO:
2495 possessive = TRUE;
2496 min = 0;
2497 max = GET2(ecode, 1);
2498 ecode += 3;
2499 goto REPEATNOTCHAR;
2500
2501 case OP_NOTSTAR:
2502 case OP_NOTMINSTAR:
2503 case OP_NOTPLUS:
2504 case OP_NOTMINPLUS:
2505 case OP_NOTQUERY:
2506 case OP_NOTMINQUERY:
2507 c = *ecode++ - OP_NOTSTAR;
2508 minimize = (c & 1) != 0;
2509 min = rep_min[c]; /* Pick up values from tables; */
2510 max = rep_max[c]; /* zero for max => infinity */
2511 if (max == 0) max = INT_MAX;
2512
2513 /* Common code for all repeated single-byte matches. We can give up quickly
2514 if there are fewer than the minimum number of bytes left in the
2515 subject. */
2516
2517 REPEATNOTCHAR:
2518 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2519 fc = *ecode++;
2520
2521 /* The code is duplicated for the caseless and caseful cases, for speed,
2522 since matching characters is likely to be quite common. First, ensure the
2523 minimum number of matches are present. If min = max, continue at the same
2524 level without recursing. Otherwise, if minimizing, keep trying the rest of
2525 the expression and advancing one matching character if failing, up to the
2526 maximum. Alternatively, if maximizing, find the maximum number of
2527 characters and work backwards. */
2528
2529 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2530 max, eptr));
2531
2532 if ((ims & PCRE_CASELESS) != 0)
2533 {
2534 fc = md->lcc[fc];
2535
2536#ifdef SUPPORT_UTF8
2537 /* UTF-8 mode */
2538 if (utf8)
2539 {
2540 register unsigned int d;
2541 for (i = 1; i <= min; i++)
2542 {
2543 GETCHARINC(d, eptr);
2544 if (d < 256) d = md->lcc[d];
2545 if (fc == d) RRETURN(MATCH_NOMATCH);
2546 }
2547 }
2548 else
2549#endif
2550
2551 /* Not UTF-8 mode */
2552 {
2553 for (i = 1; i <= min; i++)
2554 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2555 }
2556
2557 if (min == max) continue;
2558
2559 if (minimize)
2560 {
2561#ifdef SUPPORT_UTF8
2562 /* UTF-8 mode */
2563 if (utf8)
2564 {
2565 register unsigned int d;
2566 for (fi = min;; fi++)
2567 {
2568 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2569 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2570 GETCHARINC(d, eptr);
2571 if (d < 256) d = md->lcc[d];
2572 if (fi >= max || eptr >= md->end_subject || fc == d)
2573 RRETURN(MATCH_NOMATCH);
2574 }
2575 }
2576 else
2577#endif
2578 /* Not UTF-8 mode */
2579 {
2580 for (fi = min;; fi++)
2581 {
2582 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2583 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2584 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2585 RRETURN(MATCH_NOMATCH);
2586 }
2587 }
2588 /* Control never gets here */
2589 }
2590
2591 /* Maximize case */
2592
2593 else
2594 {
2595 pp = eptr;
2596
2597#ifdef SUPPORT_UTF8
2598 /* UTF-8 mode */
2599 if (utf8)
2600 {
2601 register unsigned int d;
2602 for (i = min; i < max; i++)
2603 {
2604 int len = 1;
2605 if (eptr >= md->end_subject) break;
2606 GETCHARLEN(d, eptr, len);
2607 if (d < 256) d = md->lcc[d];
2608 if (fc == d) break;
2609 eptr += len;
2610 }
2611 if (possessive) continue;
2612 for(;;)
2613 {
2614 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2615 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2616 if (eptr-- == pp) break; /* Stop if tried at original pos */
2617 BACKCHAR(eptr);
2618 }
2619 }
2620 else
2621#endif
2622 /* Not UTF-8 mode */
2623 {
2624 for (i = min; i < max; i++)
2625 {
2626 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2627 eptr++;
2628 }
2629 if (possessive) continue;
2630 while (eptr >= pp)
2631 {
2632 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2633 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2634 eptr--;
2635 }
2636 }
2637
2638 RRETURN(MATCH_NOMATCH);
2639 }
2640 /* Control never gets here */
2641 }
2642
2643 /* Caseful comparisons */
2644
2645 else
2646 {
2647#ifdef SUPPORT_UTF8
2648 /* UTF-8 mode */
2649 if (utf8)
2650 {
2651 register unsigned int d;
2652 for (i = 1; i <= min; i++)
2653 {
2654 GETCHARINC(d, eptr);
2655 if (fc == d) RRETURN(MATCH_NOMATCH);
2656 }
2657 }
2658 else
2659#endif
2660 /* Not UTF-8 mode */
2661 {
2662 for (i = 1; i <= min; i++)
2663 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2664 }
2665
2666 if (min == max) continue;
2667
2668 if (minimize)
2669 {
2670#ifdef SUPPORT_UTF8
2671 /* UTF-8 mode */
2672 if (utf8)
2673 {
2674 register unsigned int d;
2675 for (fi = min;; fi++)
2676 {
2677 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2678 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2679 GETCHARINC(d, eptr);
2680 if (fi >= max || eptr >= md->end_subject || fc == d)
2681 RRETURN(MATCH_NOMATCH);
2682 }
2683 }
2684 else
2685#endif
2686 /* Not UTF-8 mode */
2687 {
2688 for (fi = min;; fi++)
2689 {
2690 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2691 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2692 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2693 RRETURN(MATCH_NOMATCH);
2694 }
2695 }
2696 /* Control never gets here */
2697 }
2698
2699 /* Maximize case */
2700
2701 else
2702 {
2703 pp = eptr;
2704
2705#ifdef SUPPORT_UTF8
2706 /* UTF-8 mode */
2707 if (utf8)
2708 {
2709 register unsigned int d;
2710 for (i = min; i < max; i++)
2711 {
2712 int len = 1;
2713 if (eptr >= md->end_subject) break;
2714 GETCHARLEN(d, eptr, len);
2715 if (fc == d) break;
2716 eptr += len;
2717 }
2718 if (possessive) continue;
2719 for(;;)
2720 {
2721 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2722 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2723 if (eptr-- == pp) break; /* Stop if tried at original pos */
2724 BACKCHAR(eptr);
2725 }
2726 }
2727 else
2728#endif
2729 /* Not UTF-8 mode */
2730 {
2731 for (i = min; i < max; i++)
2732 {
2733 if (eptr >= md->end_subject || fc == *eptr) break;
2734 eptr++;
2735 }
2736 if (possessive) continue;
2737 while (eptr >= pp)
2738 {
2739 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2740 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2741 eptr--;
2742 }
2743 }
2744
2745 RRETURN(MATCH_NOMATCH);
2746 }
2747 }
2748 /* Control never gets here */
2749
2750 /* Match a single character type repeatedly; several different opcodes
2751 share code. This is very similar to the code for single characters, but we
2752 repeat it in the interests of efficiency. */
2753
2754 case OP_TYPEEXACT:
2755 min = max = GET2(ecode, 1);
2756 minimize = TRUE;
2757 ecode += 3;
2758 goto REPEATTYPE;
2759
2760 case OP_TYPEUPTO:
2761 case OP_TYPEMINUPTO:
2762 min = 0;
2763 max = GET2(ecode, 1);
2764 minimize = *ecode == OP_TYPEMINUPTO;
2765 ecode += 3;
2766 goto REPEATTYPE;
2767
2768 case OP_TYPEPOSSTAR:
2769 possessive = TRUE;
2770 min = 0;
2771 max = INT_MAX;
2772 ecode++;
2773 goto REPEATTYPE;
2774
2775 case OP_TYPEPOSPLUS:
2776 possessive = TRUE;
2777 min = 1;
2778 max = INT_MAX;
2779 ecode++;
2780 goto REPEATTYPE;
2781
2782 case OP_TYPEPOSQUERY:
2783 possessive = TRUE;
2784 min = 0;
2785 max = 1;
2786 ecode++;
2787 goto REPEATTYPE;
2788
2789 case OP_TYPEPOSUPTO:
2790 possessive = TRUE;
2791 min = 0;
2792 max = GET2(ecode, 1);
2793 ecode += 3;
2794 goto REPEATTYPE;
2795
2796 case OP_TYPESTAR:
2797 case OP_TYPEMINSTAR:
2798 case OP_TYPEPLUS:
2799 case OP_TYPEMINPLUS:
2800 case OP_TYPEQUERY:
2801 case OP_TYPEMINQUERY:
2802 c = *ecode++ - OP_TYPESTAR;
2803 minimize = (c & 1) != 0;
2804 min = rep_min[c]; /* Pick up values from tables; */
2805 max = rep_max[c]; /* zero for max => infinity */
2806 if (max == 0) max = INT_MAX;
2807
2808 /* Common code for all repeated single character type matches. Note that
2809 in UTF-8 mode, '.' matches a character of any length, but for the other
2810 character types, the valid characters are all one-byte long. */
2811
2812 REPEATTYPE:
2813 ctype = *ecode++; /* Code for the character type */
2814
2815#ifdef SUPPORT_UCP
2816 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2817 {
2818 prop_fail_result = ctype == OP_NOTPROP;
2819 prop_type = *ecode++;
2820 prop_value = *ecode++;
2821 }
2822 else prop_type = -1;
2823#endif
2824
2825 /* First, ensure the minimum number of matches are present. Use inline
2826 code for maximizing the speed, and do the type test once at the start
2827 (i.e. keep it out of the loop). Also we can test that there are at least
2828 the minimum number of bytes before we start. This isn't as effective in
2829 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2830 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2831 and single-bytes. */
2832
2833 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2834 if (min > 0)
2835 {
2836#ifdef SUPPORT_UCP
2837 if (prop_type >= 0)
2838 {
2839 switch(prop_type)
2840 {
2841 case PT_ANY:
2842 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2843 for (i = 1; i <= min; i++)
2844 {
2845 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2846 GETCHARINCTEST(c, eptr);
2847 }
2848 break;
2849
2850 case PT_LAMP:
2851 for (i = 1; i <= min; i++)
2852 {
2853 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2854 GETCHARINCTEST(c, eptr);
2855 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2856 if ((prop_chartype == ucp_Lu ||
2857 prop_chartype == ucp_Ll ||
2858 prop_chartype == ucp_Lt) == prop_fail_result)
2859 RRETURN(MATCH_NOMATCH);
2860 }
2861 break;
2862
2863 case PT_GC:
2864 for (i = 1; i <= min; i++)
2865 {
2866 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2867 GETCHARINCTEST(c, eptr);
2868 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2869 if ((prop_category == prop_value) == prop_fail_result)
2870 RRETURN(MATCH_NOMATCH);
2871 }
2872 break;
2873
2874 case PT_PC:
2875 for (i = 1; i <= min; i++)
2876 {
2877 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2878 GETCHARINCTEST(c, eptr);
2879 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2880 if ((prop_chartype == prop_value) == prop_fail_result)
2881 RRETURN(MATCH_NOMATCH);
2882 }
2883 break;
2884
2885 case PT_SC:
2886 for (i = 1; i <= min; i++)
2887 {
2888 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2889 GETCHARINCTEST(c, eptr);
2890 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2891 if ((prop_script == prop_value) == prop_fail_result)
2892 RRETURN(MATCH_NOMATCH);
2893 }
2894 break;
2895
2896 default:
2897 RRETURN(PCRE_ERROR_INTERNAL);
2898 }
2899 }
2900
2901 /* Match extended Unicode sequences. We will get here only if the
2902 support is in the binary; otherwise a compile-time error occurs. */
2903
2904 else if (ctype == OP_EXTUNI)
2905 {
2906 for (i = 1; i <= min; i++)
2907 {
2908 GETCHARINCTEST(c, eptr);
2909 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2910 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2911 while (eptr < md->end_subject)
2912 {
2913 int len = 1;
2914 if (!utf8) c = *eptr; else
2915 {
2916 GETCHARLEN(c, eptr, len);
2917 }
2918 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2919 if (prop_category != ucp_M) break;
2920 eptr += len;
2921 }
2922 }
2923 }
2924
2925 else
2926#endif /* SUPPORT_UCP */
2927
2928/* Handle all other cases when the coding is UTF-8 */
2929
2930#ifdef SUPPORT_UTF8
2931 if (utf8) switch(ctype)
2932 {
2933 case OP_ANY:
2934 for (i = 1; i <= min; i++)
2935 {
2936 if (eptr >= md->end_subject ||
2937 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2938 RRETURN(MATCH_NOMATCH);
2939 eptr++;
2940 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2941 }
2942 break;
2943
2944 case OP_ANYBYTE:
2945 eptr += min;
2946 break;
2947
2948 case OP_ANYNL:
2949 for (i = 1; i <= min; i++)
2950 {
2951 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2952 GETCHARINC(c, eptr);
2953 switch(c)
2954 {
2955 default: RRETURN(MATCH_NOMATCH);
2956 case 0x000d:
2957 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2958 break;
2959
2960 case 0x000a:
2961 break;
2962
2963 case 0x000b:
2964 case 0x000c:
2965 case 0x0085:
2966 case 0x2028:
2967 case 0x2029:
2968 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2969 break;
2970 }
2971 }
2972 break;
2973
2974 case OP_NOT_HSPACE:
2975 for (i = 1; i <= min; i++)
2976 {
2977 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2978 GETCHARINC(c, eptr);
2979 switch(c)
2980 {
2981 default: break;
2982 case 0x09: /* HT */
2983 case 0x20: /* SPACE */
2984 case 0xa0: /* NBSP */
2985 case 0x1680: /* OGHAM SPACE MARK */
2986 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2987 case 0x2000: /* EN QUAD */
2988 case 0x2001: /* EM QUAD */
2989 case 0x2002: /* EN SPACE */
2990 case 0x2003: /* EM SPACE */
2991 case 0x2004: /* THREE-PER-EM SPACE */
2992 case 0x2005: /* FOUR-PER-EM SPACE */
2993 case 0x2006: /* SIX-PER-EM SPACE */
2994 case 0x2007: /* FIGURE SPACE */
2995 case 0x2008: /* PUNCTUATION SPACE */
2996 case 0x2009: /* THIN SPACE */
2997 case 0x200A: /* HAIR SPACE */
2998 case 0x202f: /* NARROW NO-BREAK SPACE */
2999 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3000 case 0x3000: /* IDEOGRAPHIC SPACE */
3001 RRETURN(MATCH_NOMATCH);
3002 }
3003 }
3004 break;
3005
3006 case OP_HSPACE:
3007 for (i = 1; i <= min; i++)
3008 {
3009 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3010 GETCHARINC(c, eptr);
3011 switch(c)
3012 {
3013 default: RRETURN(MATCH_NOMATCH);
3014 case 0x09: /* HT */
3015 case 0x20: /* SPACE */
3016 case 0xa0: /* NBSP */
3017 case 0x1680: /* OGHAM SPACE MARK */
3018 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3019 case 0x2000: /* EN QUAD */
3020 case 0x2001: /* EM QUAD */
3021 case 0x2002: /* EN SPACE */
3022 case 0x2003: /* EM SPACE */
3023 case 0x2004: /* THREE-PER-EM SPACE */
3024 case 0x2005: /* FOUR-PER-EM SPACE */
3025 case 0x2006: /* SIX-PER-EM SPACE */
3026 case 0x2007: /* FIGURE SPACE */
3027 case 0x2008: /* PUNCTUATION SPACE */
3028 case 0x2009: /* THIN SPACE */
3029 case 0x200A: /* HAIR SPACE */
3030 case 0x202f: /* NARROW NO-BREAK SPACE */
3031 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3032 case 0x3000: /* IDEOGRAPHIC SPACE */
3033 break;
3034 }
3035 }
3036 break;
3037
3038 case OP_NOT_VSPACE:
3039 for (i = 1; i <= min; i++)
3040 {
3041 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3042 GETCHARINC(c, eptr);
3043 switch(c)
3044 {
3045 default: break;
3046 case 0x0a: /* LF */
3047 case 0x0b: /* VT */
3048 case 0x0c: /* FF */
3049 case 0x0d: /* CR */
3050 case 0x85: /* NEL */
3051 case 0x2028: /* LINE SEPARATOR */
3052 case 0x2029: /* PARAGRAPH SEPARATOR */
3053 RRETURN(MATCH_NOMATCH);
3054 }
3055 }
3056 break;
3057
3058 case OP_VSPACE:
3059 for (i = 1; i <= min; i++)
3060 {
3061 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3062 GETCHARINC(c, eptr);
3063 switch(c)
3064 {
3065 default: RRETURN(MATCH_NOMATCH);
3066 case 0x0a: /* LF */
3067 case 0x0b: /* VT */
3068 case 0x0c: /* FF */
3069 case 0x0d: /* CR */
3070 case 0x85: /* NEL */
3071 case 0x2028: /* LINE SEPARATOR */
3072 case 0x2029: /* PARAGRAPH SEPARATOR */
3073 break;
3074 }
3075 }
3076 break;
3077
3078 case OP_NOT_DIGIT:
3079 for (i = 1; i <= min; i++)
3080 {
3081 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3082 GETCHARINC(c, eptr);
3083 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3084 RRETURN(MATCH_NOMATCH);
3085 }
3086 break;
3087
3088 case OP_DIGIT:
3089 for (i = 1; i <= min; i++)
3090 {
3091 if (eptr >= md->end_subject ||
3092 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3093 RRETURN(MATCH_NOMATCH);
3094 /* No need to skip more bytes - we know it's a 1-byte character */
3095 }
3096 break;
3097
3098 case OP_NOT_WHITESPACE:
3099 for (i = 1; i <= min; i++)
3100 {
3101 if (eptr >= md->end_subject ||
3102 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3103 RRETURN(MATCH_NOMATCH);
3104 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3105 }
3106 break;
3107
3108 case OP_WHITESPACE:
3109 for (i = 1; i <= min; i++)
3110 {
3111 if (eptr >= md->end_subject ||
3112 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3113 RRETURN(MATCH_NOMATCH);
3114 /* No need to skip more bytes - we know it's a 1-byte character */
3115 }
3116 break;
3117
3118 case OP_NOT_WORDCHAR:
3119 for (i = 1; i <= min; i++)
3120 {
3121 if (eptr >= md->end_subject ||
3122 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3123 RRETURN(MATCH_NOMATCH);
3124 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3125 }
3126 break;
3127
3128 case OP_WORDCHAR:
3129 for (i = 1; i <= min; i++)
3130 {
3131 if (eptr >= md->end_subject ||
3132 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3133 RRETURN(MATCH_NOMATCH);
3134 /* No need to skip more bytes - we know it's a 1-byte character */
3135 }
3136 break;
3137
3138 default:
3139 RRETURN(PCRE_ERROR_INTERNAL);
3140 } /* End switch(ctype) */
3141
3142 else
3143#endif /* SUPPORT_UTF8 */
3144
3145 /* Code for the non-UTF-8 case for minimum matching of operators other
3146 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3147 number of bytes present, as this was tested above. */
3148
3149 switch(ctype)
3150 {
3151 case OP_ANY:
3152 if ((ims & PCRE_DOTALL) == 0)
3153 {
3154 for (i = 1; i <= min; i++)
3155 {
3156 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3157 eptr++;
3158 }
3159 }
3160 else eptr += min;
3161 break;
3162
3163 case OP_ANYBYTE:
3164 eptr += min;
3165 break;
3166
3167 /* Because of the CRLF case, we can't assume the minimum number of
3168 bytes are present in this case. */
3169
3170 case OP_ANYNL:
3171 for (i = 1; i <= min; i++)
3172 {
3173 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3174 switch(*eptr++)
3175 {
3176 default: RRETURN(MATCH_NOMATCH);
3177 case 0x000d:
3178 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3179 break;
3180 case 0x000a:
3181 break;
3182
3183 case 0x000b:
3184 case 0x000c:
3185 case 0x0085:
3186 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3187 break;
3188 }
3189 }
3190 break;
3191
3192 case OP_NOT_HSPACE:
3193 for (i = 1; i <= min; i++)
3194 {
3195 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3196 switch(*eptr++)
3197 {
3198 default: break;
3199 case 0x09: /* HT */
3200 case 0x20: /* SPACE */
3201 case 0xa0: /* NBSP */
3202 RRETURN(MATCH_NOMATCH);
3203 }
3204 }
3205 break;
3206
3207 case OP_HSPACE:
3208 for (i = 1; i <= min; i++)
3209 {
3210 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3211 switch(*eptr++)
3212 {
3213 default: RRETURN(MATCH_NOMATCH);
3214 case 0x09: /* HT */
3215 case 0x20: /* SPACE */
3216 case 0xa0: /* NBSP */
3217 break;
3218 }
3219 }
3220 break;
3221
3222 case OP_NOT_VSPACE:
3223 for (i = 1; i <= min; i++)
3224 {
3225 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3226 switch(*eptr++)
3227 {
3228 default: break;
3229 case 0x0a: /* LF */
3230 case 0x0b: /* VT */
3231 case 0x0c: /* FF */
3232 case 0x0d: /* CR */
3233 case 0x85: /* NEL */
3234 RRETURN(MATCH_NOMATCH);
3235 }
3236 }
3237 break;
3238
3239 case OP_VSPACE:
3240 for (i = 1; i <= min; i++)
3241 {
3242 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3243 switch(*eptr++)
3244 {
3245 default: RRETURN(MATCH_NOMATCH);
3246 case 0x0a: /* LF */
3247 case 0x0b: /* VT */
3248 case 0x0c: /* FF */
3249 case 0x0d: /* CR */
3250 case 0x85: /* NEL */
3251 break;
3252 }
3253 }
3254 break;
3255
3256 case OP_NOT_DIGIT:
3257 for (i = 1; i <= min; i++)
3258 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3259 break;
3260
3261 case OP_DIGIT:
3262 for (i = 1; i <= min; i++)
3263 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3264 break;
3265
3266 case OP_NOT_WHITESPACE:
3267 for (i = 1; i <= min; i++)
3268 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3269 break;
3270
3271 case OP_WHITESPACE:
3272 for (i = 1; i <= min; i++)
3273 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3274 break;
3275
3276 case OP_NOT_WORDCHAR:
3277 for (i = 1; i <= min; i++)
3278 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3279 RRETURN(MATCH_NOMATCH);
3280 break;
3281
3282 case OP_WORDCHAR:
3283 for (i = 1; i <= min; i++)
3284 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3285 RRETURN(MATCH_NOMATCH);
3286 break;
3287
3288 default:
3289 RRETURN(PCRE_ERROR_INTERNAL);
3290 }
3291 }
3292
3293 /* If min = max, continue at the same level without recursing */
3294
3295 if (min == max) continue;
3296
3297 /* If minimizing, we have to test the rest of the pattern before each
3298 subsequent match. Again, separate the UTF-8 case for speed, and also
3299 separate the UCP cases. */
3300
3301 if (minimize)
3302 {
3303#ifdef SUPPORT_UCP
3304 if (prop_type >= 0)
3305 {
3306 switch(prop_type)
3307 {
3308 case PT_ANY:
3309 for (fi = min;; fi++)
3310 {
3311 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3312 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3313 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3314 GETCHARINC(c, eptr);
3315 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3316 }
3317 /* Control never gets here */
3318
3319 case PT_LAMP:
3320 for (fi = min;; fi++)
3321 {
3322 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3323 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3324 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3325 GETCHARINC(c, eptr);
3326 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3327 if ((prop_chartype == ucp_Lu ||
3328 prop_chartype == ucp_Ll ||
3329 prop_chartype == ucp_Lt) == prop_fail_result)
3330 RRETURN(MATCH_NOMATCH);
3331 }
3332 /* Control never gets here */
3333
3334 case PT_GC:
3335 for (fi = min;; fi++)
3336 {
3337 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3338 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3339 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3340 GETCHARINC(c, eptr);
3341 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3342 if ((prop_category == prop_value) == prop_fail_result)
3343 RRETURN(MATCH_NOMATCH);
3344 }
3345 /* Control never gets here */
3346
3347 case PT_PC:
3348 for (fi = min;; fi++)
3349 {
3350 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3351 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3352 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3353 GETCHARINC(c, eptr);
3354 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3355 if ((prop_chartype == prop_value) == prop_fail_result)
3356 RRETURN(MATCH_NOMATCH);
3357 }
3358 /* Control never gets here */
3359
3360 case PT_SC:
3361 for (fi = min;; fi++)
3362 {
3363 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3364 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3365 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3366 GETCHARINC(c, eptr);
3367 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3368 if ((prop_script == prop_value) == prop_fail_result)
3369 RRETURN(MATCH_NOMATCH);
3370 }
3371 /* Control never gets here */
3372
3373 default:
3374 RRETURN(PCRE_ERROR_INTERNAL);
3375 }
3376 }
3377
3378 /* Match extended Unicode sequences. We will get here only if the
3379 support is in the binary; otherwise a compile-time error occurs. */
3380
3381 else if (ctype == OP_EXTUNI)
3382 {
3383 for (fi = min;; fi++)
3384 {
3385 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3386 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3387 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3388 GETCHARINCTEST(c, eptr);
3389 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3390 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3391 while (eptr < md->end_subject)
3392 {
3393 int len = 1;
3394 if (!utf8) c = *eptr; else
3395 {
3396 GETCHARLEN(c, eptr, len);
3397 }
3398 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3399 if (prop_category != ucp_M) break;
3400 eptr += len;
3401 }
3402 }
3403 }
3404
3405 else
3406#endif /* SUPPORT_UCP */
3407
3408#ifdef SUPPORT_UTF8
3409 /* UTF-8 mode */
3410 if (utf8)
3411 {
3412 for (fi = min;; fi++)
3413 {
3414 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3415 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3416 if (fi >= max || eptr >= md->end_subject ||
3417 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3418 IS_NEWLINE(eptr)))
3419 RRETURN(MATCH_NOMATCH);
3420
3421 GETCHARINC(c, eptr);
3422 switch(ctype)
3423 {
3424 case OP_ANY: /* This is the DOTALL case */
3425 break;
3426
3427 case OP_ANYBYTE:
3428 break;
3429
3430 case OP_ANYNL:
3431 switch(c)
3432 {
3433 default: RRETURN(MATCH_NOMATCH);
3434 case 0x000d:
3435 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3436 break;
3437 case 0x000a:
3438 break;
3439
3440 case 0x000b:
3441 case 0x000c:
3442 case 0x0085:
3443 case 0x2028:
3444 case 0x2029:
3445 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3446 break;
3447 }
3448 break;
3449
3450 case OP_NOT_HSPACE:
3451 switch(c)
3452 {
3453 default: break;
3454 case 0x09: /* HT */
3455 case 0x20: /* SPACE */
3456 case 0xa0: /* NBSP */
3457 case 0x1680: /* OGHAM SPACE MARK */
3458 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3459 case 0x2000: /* EN QUAD */
3460 case 0x2001: /* EM QUAD */
3461 case 0x2002: /* EN SPACE */
3462 case 0x2003: /* EM SPACE */
3463 case 0x2004: /* THREE-PER-EM SPACE */
3464 case 0x2005: /* FOUR-PER-EM SPACE */
3465 case 0x2006: /* SIX-PER-EM SPACE */
3466 case 0x2007: /* FIGURE SPACE */
3467 case 0x2008: /* PUNCTUATION SPACE */
3468 case 0x2009: /* THIN SPACE */
3469 case 0x200A: /* HAIR SPACE */
3470 case 0x202f: /* NARROW NO-BREAK SPACE */
3471 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3472 case 0x3000: /* IDEOGRAPHIC SPACE */
3473 RRETURN(MATCH_NOMATCH);
3474 }
3475 break;
3476
3477 case OP_HSPACE:
3478 switch(c)
3479 {
3480 default: RRETURN(MATCH_NOMATCH);
3481 case 0x09: /* HT */
3482 case 0x20: /* SPACE */
3483 case 0xa0: /* NBSP */
3484 case 0x1680: /* OGHAM SPACE MARK */
3485 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3486 case 0x2000: /* EN QUAD */
3487 case 0x2001: /* EM QUAD */
3488 case 0x2002: /* EN SPACE */
3489 case 0x2003: /* EM SPACE */
3490 case 0x2004: /* THREE-PER-EM SPACE */
3491 case 0x2005: /* FOUR-PER-EM SPACE */
3492 case 0x2006: /* SIX-PER-EM SPACE */
3493 case 0x2007: /* FIGURE SPACE */
3494 case 0x2008: /* PUNCTUATION SPACE */
3495 case 0x2009: /* THIN SPACE */
3496 case 0x200A: /* HAIR SPACE */
3497 case 0x202f: /* NARROW NO-BREAK SPACE */
3498 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3499 case 0x3000: /* IDEOGRAPHIC SPACE */
3500 break;
3501 }
3502 break;
3503
3504 case OP_NOT_VSPACE:
3505 switch(c)
3506 {
3507 default: break;
3508 case 0x0a: /* LF */
3509 case 0x0b: /* VT */
3510 case 0x0c: /* FF */
3511 case 0x0d: /* CR */
3512 case 0x85: /* NEL */
3513 case 0x2028: /* LINE SEPARATOR */
3514 case 0x2029: /* PARAGRAPH SEPARATOR */
3515 RRETURN(MATCH_NOMATCH);
3516 }
3517 break;
3518
3519 case OP_VSPACE:
3520 switch(c)
3521 {
3522 default: RRETURN(MATCH_NOMATCH);
3523 case 0x0a: /* LF */
3524 case 0x0b: /* VT */
3525 case 0x0c: /* FF */
3526 case 0x0d: /* CR */
3527 case 0x85: /* NEL */
3528 case 0x2028: /* LINE SEPARATOR */
3529 case 0x2029: /* PARAGRAPH SEPARATOR */
3530 break;
3531 }
3532 break;
3533
3534 case OP_NOT_DIGIT:
3535 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3536 RRETURN(MATCH_NOMATCH);
3537 break;
3538
3539 case OP_DIGIT:
3540 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3541 RRETURN(MATCH_NOMATCH);
3542 break;
3543
3544 case OP_NOT_WHITESPACE:
3545 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3546 RRETURN(MATCH_NOMATCH);
3547 break;
3548
3549 case OP_WHITESPACE:
3550 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3551 RRETURN(MATCH_NOMATCH);
3552 break;
3553
3554 case OP_NOT_WORDCHAR:
3555 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3556 RRETURN(MATCH_NOMATCH);
3557 break;
3558
3559 case OP_WORDCHAR:
3560 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3561 RRETURN(MATCH_NOMATCH);
3562 break;
3563
3564 default:
3565 RRETURN(PCRE_ERROR_INTERNAL);
3566 }
3567 }
3568 }
3569 else
3570#endif
3571 /* Not UTF-8 mode */
3572 {
3573 for (fi = min;; fi++)
3574 {
3575 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3576 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3577 if (fi >= max || eptr >= md->end_subject ||
3578 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3579 RRETURN(MATCH_NOMATCH);
3580
3581 c = *eptr++;
3582 switch(ctype)
3583 {
3584 case OP_ANY: /* This is the DOTALL case */
3585 break;
3586
3587 case OP_ANYBYTE:
3588 break;
3589
3590 case OP_ANYNL:
3591 switch(c)
3592 {
3593 default: RRETURN(MATCH_NOMATCH);
3594 case 0x000d:
3595 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3596 break;
3597
3598 case 0x000a:
3599 break;
3600
3601 case 0x000b:
3602 case 0x000c:
3603 case 0x0085:
3604 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3605 break;
3606 }
3607 break;
3608
3609 case OP_NOT_HSPACE:
3610 switch(c)
3611 {
3612 default: break;
3613 case 0x09: /* HT */
3614 case 0x20: /* SPACE */
3615 case 0xa0: /* NBSP */
3616 RRETURN(MATCH_NOMATCH);
3617 }
3618 break;
3619
3620 case OP_HSPACE:
3621 switch(c)
3622 {
3623 default: RRETURN(MATCH_NOMATCH);
3624 case 0x09: /* HT */
3625 case 0x20: /* SPACE */
3626 case 0xa0: /* NBSP */
3627 break;
3628 }
3629 break;
3630
3631 case OP_NOT_VSPACE:
3632 switch(c)
3633 {
3634 default: break;
3635 case 0x0a: /* LF */
3636 case 0x0b: /* VT */
3637 case 0x0c: /* FF */
3638 case 0x0d: /* CR */
3639 case 0x85: /* NEL */
3640 RRETURN(MATCH_NOMATCH);
3641 }
3642 break;
3643
3644 case OP_VSPACE:
3645 switch(c)
3646 {
3647 default: RRETURN(MATCH_NOMATCH);
3648 case 0x0a: /* LF */
3649 case 0x0b: /* VT */
3650 case 0x0c: /* FF */
3651 case 0x0d: /* CR */
3652 case 0x85: /* NEL */
3653 break;
3654 }
3655 break;
3656
3657 case OP_NOT_DIGIT:
3658 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3659 break;
3660
3661 case OP_DIGIT:
3662 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3663 break;
3664
3665 case OP_NOT_WHITESPACE:
3666 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3667 break;
3668
3669 case OP_WHITESPACE:
3670 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3671 break;
3672
3673 case OP_NOT_WORDCHAR:
3674 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3675 break;
3676
3677 case OP_WORDCHAR:
3678 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3679 break;
3680
3681 default:
3682 RRETURN(PCRE_ERROR_INTERNAL);
3683 }
3684 }
3685 }
3686 /* Control never gets here */
3687 }
3688
3689 /* If maximizing, it is worth using inline code for speed, doing the type
3690 test once at the start (i.e. keep it out of the loop). Again, keep the
3691 UTF-8 and UCP stuff separate. */
3692
3693 else
3694 {
3695 pp = eptr; /* Remember where we started */
3696
3697#ifdef SUPPORT_UCP
3698 if (prop_type >= 0)
3699 {
3700 switch(prop_type)
3701 {
3702 case PT_ANY:
3703 for (i = min; i < max; i++)
3704 {
3705 int len = 1;
3706 if (eptr >= md->end_subject) break;
3707 GETCHARLEN(c, eptr, len);
3708 if (prop_fail_result) break;
3709 eptr+= len;
3710 }
3711 break;
3712
3713 case PT_LAMP:
3714 for (i = min; i < max; i++)
3715 {
3716 int len = 1;
3717 if (eptr >= md->end_subject) break;
3718 GETCHARLEN(c, eptr, len);
3719 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3720 if ((prop_chartype == ucp_Lu ||
3721 prop_chartype == ucp_Ll ||
3722 prop_chartype == ucp_Lt) == prop_fail_result)
3723 break;
3724 eptr+= len;
3725 }
3726 break;
3727
3728 case PT_GC:
3729 for (i = min; i < max; i++)
3730 {
3731 int len = 1;
3732 if (eptr >= md->end_subject) break;
3733 GETCHARLEN(c, eptr, len);
3734 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3735 if ((prop_category == prop_value) == prop_fail_result)
3736 break;
3737 eptr+= len;
3738 }
3739 break;
3740
3741 case PT_PC:
3742 for (i = min; i < max; i++)
3743 {
3744 int len = 1;
3745 if (eptr >= md->end_subject) break;
3746 GETCHARLEN(c, eptr, len);
3747 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3748 if ((prop_chartype == prop_value) == prop_fail_result)
3749 break;
3750 eptr+= len;
3751 }
3752 break;
3753
3754 case PT_SC:
3755 for (i = min; i < max; i++)
3756 {
3757 int len = 1;
3758 if (eptr >= md->end_subject) break;
3759 GETCHARLEN(c, eptr, len);
3760 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3761 if ((prop_script == prop_value) == prop_fail_result)
3762 break;
3763 eptr+= len;
3764 }
3765 break;
3766 }
3767
3768 /* eptr is now past the end of the maximum run */
3769
3770 if (possessive) continue;
3771 for(;;)
3772 {
3773 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3774 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3775 if (eptr-- == pp) break; /* Stop if tried at original pos */
3776 if (utf8) BACKCHAR(eptr);
3777 }
3778 }
3779
3780 /* Match extended Unicode sequences. We will get here only if the
3781 support is in the binary; otherwise a compile-time error occurs. */
3782
3783 else if (ctype == OP_EXTUNI)
3784 {
3785 for (i = min; i < max; i++)
3786 {
3787 if (eptr >= md->end_subject) break;
3788 GETCHARINCTEST(c, eptr);
3789 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3790 if (prop_category == ucp_M) break;
3791 while (eptr < md->end_subject)
3792 {
3793 int len = 1;
3794 if (!utf8) c = *eptr; else
3795 {
3796 GETCHARLEN(c, eptr, len);
3797 }
3798 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3799 if (prop_category != ucp_M) break;
3800 eptr += len;
3801 }
3802 }
3803
3804 /* eptr is now past the end of the maximum run */
3805
3806 if (possessive) continue;
3807 for(;;)
3808 {
3809 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3810 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3811 if (eptr-- == pp) break; /* Stop if tried at original pos */
3812 for (;;) /* Move back over one extended */
3813 {
3814 int len = 1;
3815 if (!utf8) c = *eptr; else
3816 {
3817 BACKCHAR(eptr);
3818 GETCHARLEN(c, eptr, len);
3819 }
3820 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3821 if (prop_category != ucp_M) break;
3822 eptr--;
3823 }
3824 }
3825 }
3826
3827 else
3828#endif /* SUPPORT_UCP */
3829
3830#ifdef SUPPORT_UTF8
3831 /* UTF-8 mode */
3832
3833 if (utf8)
3834 {
3835 switch(ctype)
3836 {
3837 case OP_ANY:
3838 if (max < INT_MAX)
3839 {
3840 if ((ims & PCRE_DOTALL) == 0)
3841 {
3842 for (i = min; i < max; i++)
3843 {
3844 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3845 eptr++;
3846 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3847 }
3848 }
3849 else
3850 {
3851 for (i = min; i < max; i++)
3852 {
3853 if (eptr >= md->end_subject) break;
3854 eptr++;
3855 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3856 }
3857 }
3858 }
3859
3860 /* Handle unlimited UTF-8 repeat */
3861
3862 else
3863 {
3864 if ((ims & PCRE_DOTALL) == 0)
3865 {
3866 for (i = min; i < max; i++)
3867 {
3868 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3869 eptr++;
3870 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3871 }
3872 }
3873 else
3874 {
3875 eptr = md->end_subject;
3876 }
3877 }
3878 break;
3879
3880 /* The byte case is the same as non-UTF8 */
3881
3882 case OP_ANYBYTE:
3883 c = max - min;
3884 if (c > (unsigned int)(md->end_subject - eptr))
3885 c = md->end_subject - eptr;
3886 eptr += c;
3887 break;
3888
3889 case OP_ANYNL:
3890 for (i = min; i < max; i++)
3891 {
3892 int len = 1;
3893 if (eptr >= md->end_subject) break;
3894 GETCHARLEN(c, eptr, len);
3895 if (c == 0x000d)
3896 {
3897 if (++eptr >= md->end_subject) break;
3898 if (*eptr == 0x000a) eptr++;
3899 }
3900 else
3901 {
3902 if (c != 0x000a &&
3903 (md->bsr_anycrlf ||
3904 (c != 0x000b && c != 0x000c &&
3905 c != 0x0085 && c != 0x2028 && c != 0x2029)))
3906 break;
3907 eptr += len;
3908 }
3909 }
3910 break;
3911
3912 case OP_NOT_HSPACE:
3913 case OP_HSPACE:
3914 for (i = min; i < max; i++)
3915 {
3916 BOOL gotspace;
3917 int len = 1;
3918 if (eptr >= md->end_subject) break;
3919 GETCHARLEN(c, eptr, len);
3920 switch(c)
3921 {
3922 default: gotspace = FALSE; break;
3923 case 0x09: /* HT */
3924 case 0x20: /* SPACE */
3925 case 0xa0: /* NBSP */
3926 case 0x1680: /* OGHAM SPACE MARK */
3927 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3928 case 0x2000: /* EN QUAD */
3929 case 0x2001: /* EM QUAD */
3930 case 0x2002: /* EN SPACE */
3931 case 0x2003: /* EM SPACE */
3932 case 0x2004: /* THREE-PER-EM SPACE */
3933 case 0x2005: /* FOUR-PER-EM SPACE */
3934 case 0x2006: /* SIX-PER-EM SPACE */
3935 case 0x2007: /* FIGURE SPACE */
3936 case 0x2008: /* PUNCTUATION SPACE */
3937 case 0x2009: /* THIN SPACE */
3938 case 0x200A: /* HAIR SPACE */
3939 case 0x202f: /* NARROW NO-BREAK SPACE */
3940 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3941 case 0x3000: /* IDEOGRAPHIC SPACE */
3942 gotspace = TRUE;
3943 break;
3944 }
3945 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3946 eptr += len;
3947 }
3948 break;
3949
3950 case OP_NOT_VSPACE:
3951 case OP_VSPACE:
3952 for (i = min; i < max; i++)
3953 {
3954 BOOL gotspace;
3955 int len = 1;
3956 if (eptr >= md->end_subject) break;
3957 GETCHARLEN(c, eptr, len);
3958 switch(c)
3959 {
3960 default: gotspace = FALSE; break;
3961 case 0x0a: /* LF */
3962 case 0x0b: /* VT */
3963 case 0x0c: /* FF */
3964 case 0x0d: /* CR */
3965 case 0x85: /* NEL */
3966 case 0x2028: /* LINE SEPARATOR */
3967 case 0x2029: /* PARAGRAPH SEPARATOR */
3968 gotspace = TRUE;
3969 break;
3970 }
3971 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3972 eptr += len;
3973 }
3974 break;
3975
3976 case OP_NOT_DIGIT:
3977 for (i = min; i < max; i++)
3978 {
3979 int len = 1;
3980 if (eptr >= md->end_subject) break;
3981 GETCHARLEN(c, eptr, len);
3982 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3983 eptr+= len;
3984 }
3985 break;
3986
3987 case OP_DIGIT:
3988 for (i = min; i < max; i++)
3989 {
3990 int len = 1;
3991 if (eptr >= md->end_subject) break;
3992 GETCHARLEN(c, eptr, len);
3993 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3994 eptr+= len;
3995 }
3996 break;
3997
3998 case OP_NOT_WHITESPACE:
3999 for (i = min; i < max; i++)
4000 {
4001 int len = 1;
4002 if (eptr >= md->end_subject) break;
4003 GETCHARLEN(c, eptr, len);
4004 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4005 eptr+= len;
4006 }
4007 break;
4008
4009 case OP_WHITESPACE:
4010 for (i = min; i < max; i++)
4011 {
4012 int len = 1;
4013 if (eptr >= md->end_subject) break;
4014 GETCHARLEN(c, eptr, len);
4015 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4016 eptr+= len;
4017 }
4018 break;
4019
4020 case OP_NOT_WORDCHAR:
4021 for (i = min; i < max; i++)
4022 {
4023 int len = 1;
4024 if (eptr >= md->end_subject) break;
4025 GETCHARLEN(c, eptr, len);
4026 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4027 eptr+= len;
4028 }
4029 break;
4030
4031 case OP_WORDCHAR:
4032 for (i = min; i < max; i++)
4033 {
4034 int len = 1;
4035 if (eptr >= md->end_subject) break;
4036 GETCHARLEN(c, eptr, len);
4037 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4038 eptr+= len;
4039 }
4040 break;
4041
4042 default:
4043 RRETURN(PCRE_ERROR_INTERNAL);
4044 }
4045
4046 /* eptr is now past the end of the maximum run */
4047
4048 if (possessive) continue;
4049 for(;;)
4050 {
4051 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4052 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4053 if (eptr-- == pp) break; /* Stop if tried at original pos */
4054 BACKCHAR(eptr);
4055 }
4056 }
4057 else
4058#endif /* SUPPORT_UTF8 */
4059
4060 /* Not UTF-8 mode */
4061 {
4062 switch(ctype)
4063 {
4064 case OP_ANY:
4065 if ((ims & PCRE_DOTALL) == 0)
4066 {
4067 for (i = min; i < max; i++)
4068 {
4069 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4070 eptr++;
4071 }
4072 break;
4073 }
4074 /* For DOTALL case, fall through and treat as \C */
4075
4076 case OP_ANYBYTE:
4077 c = max - min;
4078 if (c > (unsigned int)(md->end_subject - eptr))
4079 c = md->end_subject - eptr;
4080 eptr += c;
4081 break;
4082
4083 case OP_ANYNL:
4084 for (i = min; i < max; i++)
4085 {
4086 if (eptr >= md->end_subject) break;
4087 c = *eptr;
4088 if (c == 0x000d)
4089 {
4090 if (++eptr >= md->end_subject) break;
4091 if (*eptr == 0x000a) eptr++;
4092 }
4093 else
4094 {
4095 if (c != 0x000a &&
4096 (md->bsr_anycrlf ||
4097 (c != 0x000b && c != 0x000c && c != 0x0085)))
4098 break;
4099 eptr++;
4100 }
4101 }
4102 break;
4103
4104 case OP_NOT_HSPACE:
4105 for (i = min; i < max; i++)
4106 {
4107 if (eptr >= md->end_subject) break;
4108 c = *eptr;
4109 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4110 eptr++;
4111 }
4112 break;
4113
4114 case OP_HSPACE:
4115 for (i = min; i < max; i++)
4116 {
4117 if (eptr >= md->end_subject) break;
4118 c = *eptr;
4119 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4120 eptr++;
4121 }
4122 break;
4123
4124 case OP_NOT_VSPACE:
4125 for (i = min; i < max; i++)
4126 {
4127 if (eptr >= md->end_subject) break;
4128 c = *eptr;
4129 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4130 break;
4131 eptr++;
4132 }
4133 break;
4134
4135 case OP_VSPACE:
4136 for (i = min; i < max; i++)
4137 {
4138 if (eptr >= md->end_subject) break;
4139 c = *eptr;
4140 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4141 break;
4142 eptr++;
4143 }
4144 break;
4145
4146 case OP_NOT_DIGIT:
4147 for (i = min; i < max; i++)
4148 {
4149 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4150 break;
4151 eptr++;
4152 }
4153 break;
4154
4155 case OP_DIGIT:
4156 for (i = min; i < max; i++)
4157 {
4158 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4159 break;
4160 eptr++;
4161 }
4162 break;
4163
4164 case OP_NOT_WHITESPACE:
4165 for (i = min; i < max; i++)
4166 {
4167 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4168 break;
4169 eptr++;
4170 }
4171 break;
4172
4173 case OP_WHITESPACE:
4174 for (i = min; i < max; i++)
4175 {
4176 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4177 break;
4178 eptr++;
4179 }
4180 break;
4181
4182 case OP_NOT_WORDCHAR:
4183 for (i = min; i < max; i++)
4184 {
4185 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4186 break;
4187 eptr++;
4188 }
4189 break;
4190
4191 case OP_WORDCHAR:
4192 for (i = min; i < max; i++)
4193 {
4194 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4195 break;
4196 eptr++;
4197 }
4198 break;
4199
4200 default:
4201 RRETURN(PCRE_ERROR_INTERNAL);
4202 }
4203
4204 /* eptr is now past the end of the maximum run */
4205
4206 if (possessive) continue;
4207 while (eptr >= pp)
4208 {
4209 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4210 eptr--;
4211 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4212 }
4213 }
4214
4215 /* Get here if we can't make it match with any permitted repetitions */
4216
4217 RRETURN(MATCH_NOMATCH);
4218 }
4219 /* Control never gets here */
4220
4221 /* There's been some horrible disaster. Arrival here can only mean there is
4222 something seriously wrong in the code above or the OP_xxx definitions. */
4223
4224 default:
4225 DPRINTF(("Unknown opcode %d\n", *ecode));
4226 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4227 }
4228
4229 /* Do not stick any code in here without much thought; it is assumed
4230 that "continue" in the code above comes out to here to repeat the main
4231 loop. */
4232
4233 } /* End of main loop */
4234/* Control never reaches here */
4235
4236
4237/* When compiling to use the heap rather than the stack for recursive calls to
4238match(), the RRETURN() macro jumps here. The number that is saved in
4239frame->Xwhere indicates which label we actually want to return to. */
4240
4241#ifdef NO_RECURSE
4242#define LBL(val) case val: goto L_RM##val;
4243HEAP_RETURN:
4244switch (frame->Xwhere)
4245 {
4246 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4247 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4248 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4249 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4250 LBL(53) LBL(54)
4251#ifdef SUPPORT_UTF8
4252 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4253 LBL(32) LBL(34) LBL(42) LBL(46)
4254#ifdef SUPPORT_UCP
4255 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4256#endif /* SUPPORT_UCP */
4257#endif /* SUPPORT_UTF8 */
4258 default:
4259 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4260 return PCRE_ERROR_INTERNAL;
4261 }
4262#undef LBL
4263#endif /* NO_RECURSE */
4264}
4265
4266
4267/***************************************************************************
4268****************************************************************************
4269 RECURSION IN THE match() FUNCTION
4270
4271Undefine all the macros that were defined above to handle this. */
4272
4273#ifdef NO_RECURSE /* heap-instead-of-stack recursion build: release the helper macros used by match() */
4274#undef eptr /* eptr, ecode, offset_top, ims, eptrb: handed by match() to every RMATCH() call */
4275#undef ecode
4276#undef mstart
4277#undef offset_top
4278#undef ims
4279#undef eptrb
4280#undef flags
4281
4282#undef callpat
4283#undef charptr
4284#undef data
4285#undef next
4286#undef pp /* match() stores the start of a maximizing run in pp */
4287#undef prev
4288#undef saved_eptr
4289
4290#undef new_recursive
4291
4292#undef cur_is_word
4293#undef condition
4294#undef prev_is_word
4295
4296#undef original_ims
4297
4298#undef ctype /* ctype, max, min: repeat type and bounds used by match()'s repeat loops */
4299#undef length
4300#undef max
4301#undef min
4302#undef number
4303#undef offset
4304#undef op
4305#undef save_capture_last
4306#undef save_offset1
4307#undef save_offset2
4308#undef save_offset3
4309#undef stacksave
4310
4311#undef newptrb
4312/* Needed because later code reuses these names: pcre_exec() below declares its own length, ims and flags. */
4313#endif /* NO_RECURSE */
4314
4315/* These two are defined as macros in both cases (they sit outside the NO_RECURSE conditional), so they are always undefined */
4316
4317#undef fc
4318#undef fi /* fi is the loop counter of match()'s minimizing-repeat loops */
4319
4320/***************************************************************************
4321***************************************************************************/
4322
4323
4324
4325/*************************************************
4326* Execute a Regular Expression *
4327*************************************************/
4328
4329/* This function applies a compiled re to a subject string and picks out
4330portions of the string if it matches. Two elements in the vector are set for
4331each substring: the offsets to the start and end of the substring.
4332
4333Arguments:
4334 argument_re points to the compiled expression
4335 extra_data points to extra data or is NULL
4336 subject points to the subject string
4337 length length of subject string (may contain binary zeros)
4338 start_offset where to start in the subject string
4339 options option bits
4340 offsets points to a vector of ints to be filled in with offsets
4341 offsetcount the number of elements in the vector
4342
4343Returns: > 0 => success; value is the number of elements filled in
4344 = 0 => success, but offsets is not big enough
4345 -1 => failed to match
4346 < -1 => some kind of unexpected problem
4347*/
4348
4349PCRE_EXP_DEFN int
4350pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4351 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4352 int offsetcount)
4353{
4354int rc, resetcount, ocount;
4355int first_byte = -1;
4356int req_byte = -1;
4357int req_byte2 = -1;
4358int newline;
4359unsigned long int ims;
4360BOOL using_temporary_offsets = FALSE;
4361BOOL anchored;
4362BOOL startline;
4363BOOL firstline;
4364BOOL first_byte_caseless = FALSE;
4365BOOL req_byte_caseless = FALSE;
4366BOOL utf8;
4367match_data match_block;
4368match_data *md = &match_block;
4369const uschar *tables;
4370const uschar *start_bits = NULL;
4371USPTR start_match = (USPTR)subject + start_offset;
4372USPTR end_subject;
4373USPTR req_byte_ptr = start_match - 1;
4374
4375pcre_study_data internal_study;
4376const pcre_study_data *study;
4377
4378real_pcre internal_re;
4379const real_pcre *external_re = (const real_pcre *)argument_re;
4380const real_pcre *re = external_re;
4381
4382/* Plausibility checks */
4383
4384if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4385if (re == NULL || subject == NULL ||
4386 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4387if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4388
4389/* Fish out the optional data from the extra_data structure, first setting
4390the default values. */
4391
4392study = NULL;
4393md->match_limit = MATCH_LIMIT;
4394md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4395md->callout_data = NULL;
4396
4397/* The table pointer is always in native byte order. */
4398
4399tables = external_re->tables;
4400
4401if (extra_data != NULL)
4402 {
4403 register unsigned int flags = extra_data->flags;
4404 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4405 study = (const pcre_study_data *)extra_data->study_data;
4406 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4407 md->match_limit = extra_data->match_limit;
4408 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4409 md->match_limit_recursion = extra_data->match_limit_recursion;
4410 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4411 md->callout_data = extra_data->callout_data;
4412 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4413 }
4414
4415/* If the exec call supplied NULL for tables, use the inbuilt ones. This
4416is a feature that makes it possible to save compiled regex and re-use them
4417in other programs later. */
4418
4419if (tables == NULL) tables = _pcre_default_tables;
4420
4421/* Check that the first field in the block is the magic number. If it is not,
4422test for a regex that was compiled on a host of opposite endianness. If this is
4423the case, flipped values are put in internal_re and internal_study if there was
4424study data too. */
4425
4426if (re->magic_number != MAGIC_NUMBER)
4427 {
4428 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4429 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4430 if (study != NULL) study = &internal_study;
4431 }
4432
4433/* Set up other data */
4434
4435anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4436startline = (re->flags & PCRE_STARTLINE) != 0;
4437firstline = (re->options & PCRE_FIRSTLINE) != 0;
4438
4439/* The code starts after the real_pcre block and the capture name table. */
4440
4441md->start_code = (const uschar *)external_re + re->name_table_offset +
4442 re->name_count * re->name_entry_size;
4443
4444md->start_subject = (USPTR)subject;
4445md->start_offset = start_offset;
4446md->end_subject = md->start_subject + length;
4447end_subject = md->end_subject;
4448
4449md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4450utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4451
4452md->notbol = (options & PCRE_NOTBOL) != 0;
4453md->noteol = (options & PCRE_NOTEOL) != 0;
4454md->notempty = (options & PCRE_NOTEMPTY) != 0;
4455md->partial = (options & PCRE_PARTIAL) != 0;
4456md->hitend = FALSE;
4457
4458md->recursive = NULL; /* No recursion at top level */
4459
4460md->lcc = tables + lcc_offset;
4461md->ctypes = tables + ctypes_offset;
4462
4463/* Handle different \R options. */
4464
4465switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4466 {
4467 case 0:
4468 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4469 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4470 else
4471#ifdef BSR_ANYCRLF
4472 md->bsr_anycrlf = TRUE;
4473#else
4474 md->bsr_anycrlf = FALSE;
4475#endif
4476 break;
4477
4478 case PCRE_BSR_ANYCRLF:
4479 md->bsr_anycrlf = TRUE;
4480 break;
4481
4482 case PCRE_BSR_UNICODE:
4483 md->bsr_anycrlf = FALSE;
4484 break;
4485
4486 default: return PCRE_ERROR_BADNEWLINE;
4487 }
4488
4489/* Handle different types of newline. The three bits give eight cases. If
4490nothing is set at run time, whatever was used at compile time applies. */
4491
4492switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4493 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4494 {
4495 case 0: newline = NEWLINE; break; /* Compile-time default */
4496 case PCRE_NEWLINE_CR: newline = '\r'; break;
4497 case PCRE_NEWLINE_LF: newline = '\n'; break;
4498 case PCRE_NEWLINE_CR+
4499 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4500 case PCRE_NEWLINE_ANY: newline = -1; break;
4501 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4502 default: return PCRE_ERROR_BADNEWLINE;
4503 }
4504
4505if (newline == -2)
4506 {
4507 md->nltype = NLTYPE_ANYCRLF;
4508 }
4509else if (newline < 0)
4510 {
4511 md->nltype = NLTYPE_ANY;
4512 }
4513else
4514 {
4515 md->nltype = NLTYPE_FIXED;
4516 if (newline > 255)
4517 {
4518 md->nllen = 2;
4519 md->nl[0] = (newline >> 8) & 255;
4520 md->nl[1] = newline & 255;
4521 }
4522 else
4523 {
4524 md->nllen = 1;
4525 md->nl[0] = newline;
4526 }
4527 }
4528
4529/* Partial matching is supported only for a restricted set of regexes at the
4530moment. */
4531
4532if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4533 return PCRE_ERROR_BADPARTIAL;
4534
4535/* Check a UTF-8 string if required. Unfortunately there's no way of passing
4536back the character offset. */
4537
4538#ifdef SUPPORT_UTF8
4539if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4540 {
4541 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4542 return PCRE_ERROR_BADUTF8;
4543 if (start_offset > 0 && start_offset < length)
4544 {
4545 int tb = ((uschar *)subject)[start_offset];
4546 if (tb > 127)
4547 {
4548 tb &= 0xc0;
4549 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4550 }
4551 }
4552 }
4553#endif
4554
4555/* The ims options can vary during the matching as a result of the presence
4556of (?ims) items in the pattern. They are kept in a local variable so that
4557restoring at the exit of a group is easy. */
4558
4559ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4560
4561/* If the expression has got more back references than the offsets supplied can
4562hold, we get a temporary chunk of working store to use during the matching.
4563Otherwise, we can use the vector supplied, rounding down its size to a multiple
4564of 3. */
4565
4566ocount = offsetcount - (offsetcount % 3);
4567
4568if (re->top_backref > 0 && re->top_backref >= ocount/3)
4569 {
4570 ocount = re->top_backref * 3 + 3;
4571 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4572 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4573 using_temporary_offsets = TRUE;
4574 DPRINTF(("Got memory to hold back references\n"));
4575 }
4576else md->offset_vector = offsets;
4577
4578md->offset_end = ocount;
4579md->offset_max = (2*ocount)/3;
4580md->offset_overflow = FALSE;
4581md->capture_last = -1;
4582
4583/* Compute the minimum number of offsets that we need to reset each time. Doing
4584this makes a huge difference to execution time when there aren't many brackets
4585in the pattern. */
4586
4587resetcount = 2 + re->top_bracket * 2;
4588if (resetcount > offsetcount) resetcount = ocount;
4589
4590/* Reset the working variable associated with each extraction. These should
4591never be used unless previously set, but they get saved and restored, and so we
4592initialize them to avoid reading uninitialized locations. */
4593
4594if (md->offset_vector != NULL)
4595 {
4596 register int *iptr = md->offset_vector + ocount;
4597 register int *iend = iptr - resetcount/2 + 1;
4598 while (--iptr >= iend) *iptr = -1;
4599 }
4600
4601/* Set up the first character to match, if available. The first_byte value is
4602never set for an anchored regular expression, but the anchoring may be forced
4603at run time, so we have to test for anchoring. The first char may be unset for
4604an unanchored pattern, of course. If there's no first char and the pattern was
4605studied, there may be a bitmap of possible first characters. */
4606
4607if (!anchored)
4608 {
4609 if ((re->flags & PCRE_FIRSTSET) != 0)
4610 {
4611 first_byte = re->first_byte & 255;
4612 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4613 first_byte = md->lcc[first_byte];
4614 }
4615 else
4616 if (!startline && study != NULL &&
4617 (study->options & PCRE_STUDY_MAPPED) != 0)
4618 start_bits = study->start_bits;
4619 }
4620
4621/* For anchored or unanchored matches, there may be a "last known required
4622character" set. */
4623
4624if ((re->flags & PCRE_REQCHSET) != 0)
4625 {
4626 req_byte = re->req_byte & 255;
4627 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4628 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4629 }
4630
4631
4632/* ==========================================================================*/
4633
4634/* Loop for handling unanchored repeated matching attempts; for anchored regexes
4635the loop runs just once. */
4636
4637for(;;)
4638 {
4639 USPTR save_end_subject = end_subject;
4640 USPTR new_start_match;
4641
4642 /* Reset the maximum number of extractions we might see. */
4643
4644 if (md->offset_vector != NULL)
4645 {
4646 register int *iptr = md->offset_vector;
4647 register int *iend = iptr + resetcount;
4648 while (iptr < iend) *iptr++ = -1;
4649 }
4650
4651 /* Advance to a unique first char if possible. If firstline is TRUE, the
4652 start of the match is constrained to the first line of a multiline string.
4653 That is, the match must be before or at the first newline. Implement this by
4654 temporarily adjusting end_subject so that we stop scanning at a newline. If
4655 the match fails at the newline, later code breaks this loop. */
4656
4657 if (firstline)
4658 {
4659 USPTR t = start_match;
4660 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4661 end_subject = t;
4662 }
4663
4664 /* Now test for a unique first byte */
4665
4666 if (first_byte >= 0)
4667 {
4668 if (first_byte_caseless)
4669 while (start_match < end_subject &&
4670 md->lcc[*start_match] != first_byte)
4671 { NEXTCHAR(start_match); }
4672 else
4673 while (start_match < end_subject && *start_match != first_byte)
4674 { NEXTCHAR(start_match); }
4675 }
4676
4677 /* Or to just after a linebreak for a multiline match if possible */
4678
4679 else if (startline)
4680 {
4681 if (start_match > md->start_subject + start_offset)
4682 {
4683 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4684 { NEXTCHAR(start_match); }
4685
4686 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4687 and we are now at a LF, advance the match position by one more character.
4688 */
4689
4690 if (start_match[-1] == '\r' &&
4691 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4692 start_match < end_subject &&
4693 *start_match == '\n')
4694 start_match++;
4695 }
4696 }
4697
4698 /* Or to a non-unique first char after study */
4699
4700 else if (start_bits != NULL)
4701 {
4702 while (start_match < end_subject)
4703 {
4704 register unsigned int c = *start_match;
4705 if ((start_bits[c/8] & (1 << (c&7))) == 0)
4706 { NEXTCHAR(start_match); }
4707 else break;
4708 }
4709 }
4710
4711 /* Restore fudged end_subject */
4712
4713 end_subject = save_end_subject;
4714
4715#ifdef DEBUG /* Sigh. Some compilers never learn. */
4716 printf(">>>> Match against: ");
4717 pchars(start_match, end_subject - start_match, TRUE, md);
4718 printf("\n");
4719#endif
4720
4721 /* If req_byte is set, we know that that character must appear in the subject
4722 for the match to succeed. If the first character is set, req_byte must be
4723 later in the subject; otherwise the test starts at the match point. This
4724 optimization can save a huge amount of backtracking in patterns with nested
4725 unlimited repeats that aren't going to match. Writing separate code for
4726 cased/caseless versions makes it go faster, as does using an autoincrement
4727 and backing off on a match.
4728
4729 HOWEVER: when the subject string is very, very long, searching to its end can
4730 take a long time, and give bad performance on quite ordinary patterns. This
4731 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4732 string... so we don't do this when the string is sufficiently long.
4733
4734 ALSO: this processing is disabled when partial matching is requested.
4735 */
4736
4737 if (req_byte >= 0 &&
4738 end_subject - start_match < REQ_BYTE_MAX &&
4739 !md->partial)
4740 {
4741 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4742
4743 /* We don't need to repeat the search if we haven't yet reached the
4744 place we found it at last time. */
4745
4746 if (p > req_byte_ptr)
4747 {
4748 if (req_byte_caseless)
4749 {
4750 while (p < end_subject)
4751 {
4752 register int pp = *p++;
4753 if (pp == req_byte || pp == req_byte2) { p--; break; }
4754 }
4755 }
4756 else
4757 {
4758 while (p < end_subject)
4759 {
4760 if (*p++ == req_byte) { p--; break; }
4761 }
4762 }
4763
4764 /* If we can't find the required character, break the matching loop,
4765 forcing a match failure. */
4766
4767 if (p >= end_subject)
4768 {
4769 rc = MATCH_NOMATCH;
4770 break;
4771 }
4772
4773 /* If we have found the required character, save the point where we
4774 found it, so that we don't search again next time round the loop if
4775 the start hasn't passed this character yet. */
4776
4777 req_byte_ptr = p;
4778 }
4779 }
4780
4781 /* OK, we can now run the match. */
4782
4783 md->start_match_ptr = start_match;
4784 md->match_call_count = 0;
4785 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4786
4787 switch(rc)
4788 {
4789 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4790 exactly like PRUNE. */
4791
4792 case MATCH_NOMATCH:
4793 case MATCH_PRUNE:
4794 case MATCH_THEN:
4795 new_start_match = start_match + 1;
4796#ifdef SUPPORT_UTF8
4797 if (utf8)
4798 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4799 new_start_match++;
4800#endif
4801 break;
4802
4803 /* SKIP passes back the next starting point explicitly. */
4804
4805 case MATCH_SKIP:
4806 new_start_match = md->start_match_ptr;
4807 break;
4808
4809 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4810
4811 case MATCH_COMMIT:
4812 rc = MATCH_NOMATCH;
4813 goto ENDLOOP;
4814
4815 /* Any other return is some kind of error. */
4816
4817 default:
4818 goto ENDLOOP;
4819 }
4820
4821 /* Control reaches here for the various types of "no match at this point"
4822 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4823
4824 rc = MATCH_NOMATCH;
4825
4826 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4827 newline in the subject (though it may continue over the newline). Therefore,
4828 if we have just failed to match, starting at a newline, do not continue. */
4829
4830 if (firstline && IS_NEWLINE(start_match)) break;
4831
4832 /* Advance to new matching position */
4833
4834 start_match = new_start_match;
4835
4836 /* Break the loop if the pattern is anchored or if we have passed the end of
4837 the subject. */
4838
4839 if (anchored || start_match > end_subject) break;
4840
4841 /* If we have just passed a CR and we are now at a LF, and the pattern does
4842 not contain any explicit matches for \r or \n, and the newline option is CRLF
4843 or ANY or ANYCRLF, advance the match position by one more character. */
4844
4845 if (start_match[-1] == '\r' &&
4846 start_match < end_subject &&
4847 *start_match == '\n' &&
4848 (re->flags & PCRE_HASCRORLF) == 0 &&
4849 (md->nltype == NLTYPE_ANY ||
4850 md->nltype == NLTYPE_ANYCRLF ||
4851 md->nllen == 2))
4852 start_match++;
4853
4854 } /* End of for(;;) "bumpalong" loop */
4855
4856/* ==========================================================================*/
4857
4858/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4859conditions is true:
4860
4861(1) The pattern is anchored or the match was forced to fail by (*COMMIT);
4862
4863(2) We are past the end of the subject;
4864
4865(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4866 this option requests that a match occur at or before the first newline in
4867 the subject.
4868
4869When we have a match and the offset vector is big enough to deal with any
4870backreferences, captured substring offsets will already be set up. In the case
4871where we had to get some local store to hold offsets for backreference
4872processing, copy those that we can. In this case there need not be overflow if
4873certain parts of the pattern were not used, even though there are more
4874capturing parentheses than vector slots. */
4875
4876ENDLOOP:
4877
4878if (rc == MATCH_MATCH)
4879 {
4880 if (using_temporary_offsets)
4881 {
4882 if (offsetcount >= 4)
4883 {
4884 memcpy(offsets + 2, md->offset_vector + 2,
4885 (offsetcount - 2) * sizeof(int));
4886 DPRINTF(("Copied offsets from temporary memory\n"));
4887 }
4888 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4889 DPRINTF(("Freeing temporary memory\n"));
4890 (pcre_free)(md->offset_vector);
4891 }
4892
4893 /* Set the return code to the number of captured strings, or 0 if there are
4894 too many to fit into the vector. */
4895
4896 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4897
4898 /* If there is space, set up the whole thing as substring 0. The value of
4899 md->start_match_ptr might be modified if \K was encountered on the successful
4900 matching path. */
4901
4902 if (offsetcount < 2) rc = 0; else
4903 {
4904 offsets[0] = md->start_match_ptr - md->start_subject;
4905 offsets[1] = md->end_match_ptr - md->start_subject;
4906 }
4907
4908 DPRINTF((">>>> returning %d\n", rc));
4909 return rc;
4910 }
4911
4912/* Control gets here if there has been an error, or if the overall match
4913attempt has failed at all permitted starting positions. */
4914
4915if (using_temporary_offsets)
4916 {
4917 DPRINTF(("Freeing temporary memory\n"));
4918 (pcre_free)(md->offset_vector);
4919 }
4920
4921if (rc != MATCH_NOMATCH)
4922 {
4923 DPRINTF((">>>> error: returning %d\n", rc));
4924 return rc;
4925 }
4926else if (md->partial && md->hitend)
4927 {
4928 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4929 return PCRE_ERROR_PARTIAL;
4930 }
4931else
4932 {
4933 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4934 return PCRE_ERROR_NOMATCH;
4935 }
4936}
4937
4938/* End of pcre_exec.c */

Archive Download this file

Branches

Tags