monotone

monotone Mtn Source Tree

Root/globish.cc

1// Copyright (C) 2005 Nathaniel Smith <njs@pobox.com>
2// Copyright (C) 2007 Zack Weinberg <zackw@panix.com>
3//
4// This program is made available under the GNU GPL version 2.0 or
5// greater. See the accompanying file COPYING for details.
6//
7// This program is distributed WITHOUT ANY WARRANTY; without even the
8// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
9// PURPOSE.
10
11#include "base.hh"
12#include "sanity.hh"
13#include "globish.hh"
14#include "option.hh" // for arg_type
15#include "numeric_vocab.hh"
16
17#include <iterator>
18#include <ostream>
19
20using std::string;
21using std::vector;
22using std::back_inserter;
23using std::back_insert_iterator;
24
25// The algorithm here is originally from pdksh 5. That implementation uses
26// the high bit of unsigned chars as a quotation flag. We can't do that,
27// because we need to be utf8 clean. Instead, we copy the string and
28// replace "live" metacharacters with single bytes from the
29// control-character range. This is why bytes <= 0x1f are not allowed in the
30// pattern.
31
32enum metachar {
33 META_STAR = 1, // *
34 META_QUES, // ?
35 META_CC_BRA, // [
36 META_CC_INV_BRA, // [^ or [!
37 META_CC_KET, // ] (matches either of the above two)
38 META_ALT_BRA, // {
39 META_ALT_OR, // , (when found inside unquoted { ... })
40 META_ALT_KET, // }
41};
42
43// Compile a character class.
44
45static string::const_iterator
46compile_charclass(string const & pat, string::const_iterator p,
47 back_insert_iterator<string> & to)
48{
49 string in_class;
50 char bra = (char)META_CC_BRA;
51
52 p++;
53 N(p != pat.end(),
54 F("invalid pattern '%s': unmatched '['") % pat);
55
56 if (*p == '!' || *p == '^')
57 {
58 bra = (char)META_CC_INV_BRA;
59 p++;
60 N(p != pat.end(),
61 F("invalid pattern '%s': unmatched '['") % pat);
62 }
63
64 while (p != pat.end() && *p != ']')
65 {
66 if (*p == '\\')
67 {
68 p++;
69 if (p == pat.end())
70 break;
71 }
72 // A dash at the beginning or end of the pattern is literal.
73 else if (*p == '-'
74 && in_class.size() != 0
75 && p+1 != pat.end()
76 && p[1] != ']')
77 {
78 p++;
79 if (*p == '\\')
80 p++;
81 if (p == pat.end())
82 break;
83
84 // the cast is needed because boost::format will not obey the %x
85 // if given a 'char'.
86 N((widen<unsigned int, char>(*p)) >= ' ',
87 F("invalid pattern '%s': control character 0x%02x is not allowed")
88 % pat % (widen<unsigned int, char>(*p)));
89
90 unsigned int start = widen<unsigned int, char>(in_class.end()[-1]);
91 unsigned int stop = widen<unsigned int, char>(*p);
92
93 N(start != stop,
94 F("invalid pattern '%s': "
95 "one-element character ranges are not allowed") % pat);
96 N(start < stop,
97 F("invalid pattern '%s': "
98 "endpoints of a character range must be in "
99 "ascending numeric order") % pat);
100 N(start < 0x80 && stop < 0x80,
101 F("invalid pattern '%s': cannot use non-ASCII characters "
102 "in classes") % pat);
103
104 L(FL("expanding range from %X (%c) to %X (%c)")
105 % (start+1) % (char)(start+1) % stop % (char)stop);
106
107 for (unsigned int r = start + 1; r < stop; r++)
108 in_class.push_back((char)r);
109 }
110 else
111 N(*p != '[', F("syntax error in '%s': "
112 "character classes may not be nested") % pat);
113
114 N((widen<unsigned int, char>(*p)) >= ' ',
115 F("invalid pattern '%s': control character 0x%02x is not allowed")
116 % pat % (widen<unsigned int, char>(*p)));
117
118 N((widen<unsigned int, char>(*p)) < 0x80,
119 F("invalid pattern '%s': cannot use non-ASCII characters in classes")
120 % pat);
121
122 in_class.push_back(*p);
123 p++;
124 }
125
126 N(p != pat.end(),
127 F("invalid pattern '%s': unmatched '['") % pat);
128
129 N(in_class.size() != 0,
130 F("invalid pattern '%s': empty character class") % pat);
131
132 // minor optimization: one-element non-inverted character class becomes
133 // the character.
134 if (bra == (char)META_CC_BRA && in_class.size() == 1)
135 *to++ = in_class[0];
136 else
137 {
138 *to++ = bra;
139 std::sort(in_class.begin(), in_class.end());
140 std::copy(in_class.begin(), in_class.end(), to);
141 *to++ = (char)META_CC_KET;
142 }
143 return p;
144}
145
146// Compile one fragment of a glob pattern.
147
148static void
149compile_frag(string const & pat, back_insert_iterator<string> & to)
150{
151 unsigned int brace_depth = 0;
152
153 for (string::const_iterator p = pat.begin(); p != pat.end(); p++)
154 switch (*p)
155 {
156 default:
157 N((widen<unsigned int, char>(*p)) >= ' ',
158 F("invalid pattern '%s': control character 0x%02x is not allowed")
159 % pat % (widen<unsigned int, char>(*p)));
160
161 *to++ = *p;
162 break;
163
164 case '*':
165 // optimization: * followed by any sequence of ?s and *s is
166 // equivalent to the number of ?s that appeared in the sequence,
167 // followed by a single star. the latter can be matched without
168 // nearly as much backtracking.
169
170 for (p++; p != pat.end(); p++)
171 {
172 if (*p == '?')
173 *to++ = META_QUES;
174 else if (*p != '*')
175 break;
176 }
177
178 p--;
179 *to++ = META_STAR;
180 break;
181
182 case '?':
183 *to++ = META_QUES;
184 break;
185
186 case '\\':
187 p++;
188 N(p != pat.end(),
189 F("invalid pattern '%s': un-escaped \\ at end") % pat);
190
191 N((widen<unsigned int, char>(*p)) >= ' ',
192 F("invalid pattern '%s': control character 0x%02x is not allowed")
193 % pat % (widen<unsigned int, char>(*p)));
194
195 *to++ = *p;
196 break;
197
198 case '[':
199 p = compile_charclass(pat, p, to);
200 break;
201
202 case ']':
203 N(false, F("invalid pattern '%s': unmatched ']'") % pat);
204
205 case '{':
206 // There's quite a bit of optimization we could be doing on
207 // alternatives, but it's hairy, especially if you get into
208 // nested alternatives; so we're not doing any of it now.
209 // (Look at emacs's regexp-opt.el for inspiration.)
210 brace_depth++;
211 N(brace_depth < 6,
212 F("invalid pattern '%s': braces nested too deeply") % pat);
213 *to++ = META_ALT_BRA;
214 break;
215
216 case ',':
217 if (brace_depth > 0)
218 *to++ = META_ALT_OR;
219 else
220 *to++ = ',';
221 break;
222
223 case '}':
224 N(brace_depth > 0,
225 F("invalid pattern '%s': unmatched '}'") % pat);
226 brace_depth--;
227 *to++ = META_ALT_KET;
228 break;
229 }
230
231 N(brace_depth == 0,
232 F("invalid pattern '%s': unmatched '{'") % pat);
233}
234
235// common code used by the constructors.
236
237static inline string
238compile(string const & pat)
239{
240 string s;
241 back_insert_iterator<string> to = back_inserter(s);
242 compile_frag(pat, to);
243 return s;
244}
245
246static inline string
247compile(vector<arg_type>::const_iterator const & beg,
248 vector<arg_type>::const_iterator const & end)
249{
250 if (end - beg == 0)
251 return "";
252 if (end - beg == 1)
253 return compile((*beg)());
254
255 string s;
256 back_insert_iterator<string> to = back_inserter(s);
257
258 *to++ = META_ALT_BRA;
259 vector<arg_type>::const_iterator i = beg;
260 for (;;)
261 {
262 compile_frag((*i)(), to);
263 i++;
264 if (i == end)
265 break;
266 *to++ = META_ALT_OR;
267 }
268 *to++ = META_ALT_KET;
269 return s;
270}
271
272globish::globish(string const & p) : compiled_pattern(compile(p)) {}
273globish::globish(char const * p) : compiled_pattern(compile(p)) {}
274
275globish::globish(vector<arg_type> const & p)
276 : compiled_pattern(compile(p.begin(), p.end())) {}
277globish::globish(vector<arg_type>::const_iterator const & beg,
278 vector<arg_type>::const_iterator const & end)
279 : compiled_pattern(compile(beg, end)) {}
280
281// Debugging.
282
283static string
284decode(string::const_iterator p, string::const_iterator end)
285{
286 string s;
287 for (; p != end; p++)
288 switch (*p)
289 {
290 case META_STAR: s.push_back('*'); break;
291 case META_QUES: s.push_back('?'); break;
292 case META_CC_BRA: s.push_back('['); break;
293 case META_CC_KET: s.push_back(']'); break;
294 case META_CC_INV_BRA: s.push_back('[');
295 s.push_back('!'); break;
296
297 case META_ALT_BRA: s.push_back('{'); break;
298 case META_ALT_KET: s.push_back('}'); break;
299 case META_ALT_OR: s.push_back(','); break;
300
301 // Some of these are only special in certain contexts,
302 // but it does no harm to escape them always.
303 case '[': case ']': case '-': case '!': case '^':
304 case '{': case '}': case ',':
305 case '*': case '?': case '\\':
306 s.push_back('\\');
307 // fall through
308 default:
309 s.push_back(*p);
310 }
311 return s;
312}
313
314string
315globish::operator()() const
316{
317 return decode(compiled_pattern.begin(), compiled_pattern.end());
318}
319
320template <> void dump(globish const & g, string & s)
321{
322 s = g();
323}
324
325std::ostream & operator<<(std::ostream & o, globish const & g)
326{
327 return o << g();
328}
329
330// Matching.
331
332static string::const_iterator
333find_next_subpattern(string::const_iterator p,
334 string::const_iterator pe,
335 bool want_alternatives)
336{
337 unsigned int depth = 1;
338 for (; p != pe; p++)
339 switch (*p)
340 {
341 default: break;
342
343 case META_ALT_BRA:
344 depth++; break;
345
346 case META_ALT_KET:
347 depth--;
348 if (depth == 0)
349 return p+1;
350
351 case META_ALT_OR:
352 if (depth == 1 && want_alternatives)
353 return p+1;
354 }
355
356 I(false);
357}
358
359
360static bool
361do_match(string::const_iterator sb, string::const_iterator se,
362 string::const_iterator p, string::const_iterator pe)
363{
364 unsigned int sc, pc;
365 string::const_iterator s(sb);
366
367 if (global_sanity.debug_p()) // decode() is expensive
368 L(FL("subpattern: '%s' against '%s'") % string(s,se) % decode(p,pe));
369
370 while (p < pe)
371 {
372 pc = widen<unsigned int, char>(*p++);
373 if(s < se) {
374 sc = widen<unsigned int, char>(*s);
375 s++;
376 } else {
377 sc = 0;
378 }
379 switch (pc)
380 {
381 default: // literal
382 if (sc != pc)
383 return false;
384 break;
385
386 case META_QUES: // any single character
387 if (sc == 0)
388 return false;
389 break;
390
391 case META_CC_BRA: // any of these characters
392 {
393 bool matched = false;
394 I(p < pe);
395 I(*p != META_CC_KET);
396 do
397 {
398 if (widen<unsigned int, char>(*p) == sc)
399 matched = true;
400 p++;
401 I(p < pe);
402 }
403 while (*p != META_CC_KET);
404 if (!matched)
405 return false;
406 }
407 p++;
408 break;
409
410 case META_CC_INV_BRA: // any but these characters
411 I(p < pe);
412 I(*p != META_CC_KET);
413 do
414 {
415 if (widen<unsigned int, char>(*p) == sc)
416 return false;
417 p++;
418 I(p < pe);
419 }
420 while (*p != META_CC_KET);
421 p++;
422 break;
423
424 case META_STAR: // zero or more arbitrary characters
425 if (p == pe)
426 return true; // star at end always matches, if we get that far
427
428 pc = widen<unsigned int, char>(*p);
429 // If the next character in p is not magic, we can only match
430 // starting from places in s where that character appears.
431 if (pc >= ' ')
432 {
433 if (global_sanity.debug_p())
434 L(FL("after *: looking for '%c' in '%c%s'")
435 % (char)pc % (char)sc % string(s, se));
436 p++;
437 for (;;)
438 {
439 if (sc == pc && do_match(s, se, p, pe))
440 return true;
441 if (s >= se)
442 break;
443 sc = widen<unsigned int, char>(*s++);
444 }
445 }
446 else
447 {
448 if (global_sanity.debug_p())
449 L(FL("metacharacter after *: doing it the slow way"));
450 s--;
451 do
452 {
453 if (do_match(s, se, p, pe))
454 return true;
455 s++;
456 }
457 while (s < se);
458 }
459 return false;
460
461 case META_ALT_BRA:
462 {
463 string::const_iterator prest, psub, pnext;
464 string::const_iterator srest;
465
466 prest = find_next_subpattern(p, pe, false);
467 psub = p;
468 if(s > sb) {
469 s--;
470 }
471 do
472 {
473 pnext = find_next_subpattern(psub, pe, true);
474 srest = (prest == pe ? se : s);
475 for (; srest < se; srest++)
476 {
477 if (do_match(s, srest, psub, pnext - 1)
478 && do_match(srest, se, prest, pe))
479 return true;
480 }
481 // try the empty target too
482 if (do_match(s, srest, psub, pnext - 1)
483 && do_match(srest, se, prest, pe))
484 return true;
485
486 psub = pnext;
487 }
488 while (pnext < prest);
489 return false;
490 }
491 }
492 }
493 return s == se;
494}
495
496bool globish::matches(string const & target) const
497{
498 bool result;
499
500 // The empty pattern matches nothing.
501 if (compiled_pattern.empty())
502 result = false;
503 else
504 result = do_match (target.begin(), target.end(),
505 compiled_pattern.begin(), compiled_pattern.end());
506
507 L(FL("matching '%s' against '%s': %s")
508 % target % (*this)() % (result ? "matches" : "does not match"));
509 return result;
510}
511
512#ifdef BUILD_UNIT_TESTS
513#include "unit_tests.hh"
514
515UNIT_TEST(globish, syntax)
516{
517 struct tcase
518 {
519 char const * in;
520 char const * out;
521 };
522 tcase const good[] = {
523 { "a", "a" },
524 { "\\a", "a" },
525 { "[a]", "a" },
526 { "[!a]", "[!a]" },
527 { "[^a]", "[!a]" },
528 { "[\\!a]", "[\\!a]" },
529 { "[\\^a]", "[\\^a]" },
530 { "[ab]", "[ab]" },
531 { "[a-b]", "[ab]" },
532 { "[a-c]", "[abc]" },
533 { "[ac-]", "[\\-ac]" },
534 { "[-ac]", "[\\-ac]" },
535 { "[+-/]", "[+\\,\\-./]" },
536
537 { "\xC2\xA1", "\xC2\xA1" }, // U+00A1 in UTF8
538
539 { "*", "*" },
540 { "\\*", "\\*" },
541 { "[*]", "\\*" },
542 { "?", "?" },
543 { "\\?", "\\?" },
544 { "[?]", "\\?" },
545 { ",", "\\," },
546 { "\\,", "\\," },
547 { "[,]", "\\," },
548 { "\\{", "\\{" },
549 { "[{]", "\\{" },
550 { "[}]", "\\}" },
551 { "\\[", "\\[" },
552 { "\\]", "\\]" },
553 { "\\\\", "\\\\" },
554
555 { "**", "*" },
556 { "*?", "?*" },
557 { "*???*?*", "????*" },
558 { "*a?*?b*", "*a??*b*" },
559
560 { "{a,b,c}d", "{a,b,c}d" },
561 { "foo{a,{b,c},?*}d", "foo{a,{b,c},?*}d" },
562 { "\\a\\b\\|\\{\\*", "ab|\\{\\*" },
563 { ".+$^{}", ".+$\\^{}" },
564 { "\\.\\+\\$\\^\\(\\)", ".+$\\^()" },
565 { 0, 0 }
566 };
567
568 char const * const bad[] = {
569 "[",
570 "[!",
571 "[\\",
572 "[\\]",
573 "[foo",
574 "[!foo",
575 "foo]",
576 "[\003]",
577 "[a-a]",
578 "[f-a]",
579 "[]",
580 "[\xC2\xA1]",
581 "[\xC2\xA1\xC2\xA2]",
582 "[\xC2\xA1-\xC2\xA2]",
583 "[-\xC2\xA1]",
584 "[[]",
585 "[]",
586
587 "\003",
588 "foo\\",
589 "{foo",
590 "{foo,bar{baz,quux}",
591 "foo}",
592 "foo,bar{baz,quux}}",
593 "{{{{{{{{{{a,b},c},d},e},f},g},h},i},j},k}",
594 0
595 };
596 char const dummy[] = "";
597
598 for (tcase const * p = good; p->in; p++)
599 {
600 globish g(p->in);
601 string s;
602 dump(g, s);
603 L(FL("globish syntax: %s -> %s [expect %s]") % p->in % s % p->out);
604 UNIT_TEST_CHECK(s == p->out);
605 }
606
607 for (char const * const * p = bad; *p; p++)
608 {
609 L(FL("globish syntax: invalid %s") % *p);
610 UNIT_TEST_CHECK_THROW(I(globish(*p).matches(dummy)), informative_failure);
611 }
612}
613
614UNIT_TEST(globish, from_vector)
615{
616 vector<arg_type> v;
617 v.push_back(arg_type("a"));
618 v.push_back(arg_type("b"));
619 v.push_back(arg_type("c"));
620 globish combined(v);
621 string s;
622 dump(combined, s);
623 UNIT_TEST_CHECK(s == "{a,b,c}");
624}
625
626UNIT_TEST(globish, simple_matches)
627{
628 UNIT_TEST_CHECK(globish("abc").matches("abc"));
629 UNIT_TEST_CHECK(!globish("abc").matches("aac"));
630
631 UNIT_TEST_CHECK(globish("a[bc]d").matches("abd"));
632 UNIT_TEST_CHECK(globish("a[bc]d").matches("acd"));
633 UNIT_TEST_CHECK(!globish("a[bc]d").matches("and"));
634 UNIT_TEST_CHECK(!globish("a[bc]d").matches("ad"));
635 UNIT_TEST_CHECK(!globish("a[bc]d").matches("abbd"));
636
637 UNIT_TEST_CHECK(globish("a[!bc]d").matches("and"));
638 UNIT_TEST_CHECK(globish("a[!bc]d").matches("a#d"));
639 UNIT_TEST_CHECK(!globish("a[!bc]d").matches("abd"));
640 UNIT_TEST_CHECK(!globish("a[!bc]d").matches("acd"));
641 UNIT_TEST_CHECK(!globish("a[!bc]d").matches("ad"));
642 UNIT_TEST_CHECK(!globish("a[!bc]d").matches("abbd"));
643
644 UNIT_TEST_CHECK(globish("a?c").matches("abc"));
645 UNIT_TEST_CHECK(globish("a?c").matches("aac"));
646 UNIT_TEST_CHECK(globish("a?c").matches("a%c"));
647 UNIT_TEST_CHECK(!globish("a?c").matches("a%d"));
648 UNIT_TEST_CHECK(!globish("a?c").matches("d%d"));
649 UNIT_TEST_CHECK(!globish("a?c").matches("d%c"));
650 UNIT_TEST_CHECK(!globish("a?c").matches("a%%d"));
651
652 UNIT_TEST_CHECK(globish("a*c").matches("ac"));
653 UNIT_TEST_CHECK(globish("a*c").matches("abc"));
654 UNIT_TEST_CHECK(globish("a*c").matches("abac"));
655 UNIT_TEST_CHECK(globish("a*c").matches("abbcc"));
656 UNIT_TEST_CHECK(globish("a*c").matches("abcbbc"));
657 UNIT_TEST_CHECK(!globish("a*c").matches("abcbb"));
658 UNIT_TEST_CHECK(!globish("a*c").matches("abcb"));
659 UNIT_TEST_CHECK(!globish("a*c").matches("aba"));
660 UNIT_TEST_CHECK(!globish("a*c").matches("ab"));
661
662 UNIT_TEST_CHECK(globish("*.bak").matches(".bak"));
663 UNIT_TEST_CHECK(globish("*.bak").matches("a.bak"));
664 UNIT_TEST_CHECK(globish("*.bak").matches("foo.bak"));
665 UNIT_TEST_CHECK(globish("*.bak").matches(".bak.bak"));
666 UNIT_TEST_CHECK(globish("*.bak").matches("fwibble.bak.bak"));
667
668 UNIT_TEST_CHECK(globish("a*b*[cd]").matches("abc"));
669 UNIT_TEST_CHECK(globish("a*b*[cd]").matches("abcd"));
670 UNIT_TEST_CHECK(globish("a*b*[cd]").matches("aabrd"));
671 UNIT_TEST_CHECK(globish("a*b*[cd]").matches("abbbbbbbccd"));
672 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("ab"));
673 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("abde"));
674 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("aaaaaaab"));
675 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("axxxxd"));
676 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("adb"));
677}
678
679UNIT_TEST(globish, complex_matches)
680{ {
681 globish_matcher m(globish("{a,b}?*\\*|"), globish("*c*"));
682 UNIT_TEST_CHECK(m("aq*|"));
683 UNIT_TEST_CHECK(m("bq*|"));
684 UNIT_TEST_CHECK(!m("bc*|"));
685 UNIT_TEST_CHECK(!m("bq|"));
686 UNIT_TEST_CHECK(!m("b*|"));
687 UNIT_TEST_CHECK(!m(""));
688 }
689 {
690 globish_matcher m(globish("{a,\\\\,b*}"), globish("*c*"));
691 UNIT_TEST_CHECK(m("a"));
692 UNIT_TEST_CHECK(!m("ab"));
693 UNIT_TEST_CHECK(m("\\"));
694 UNIT_TEST_CHECK(!m("\\\\"));
695 UNIT_TEST_CHECK(m("b"));
696 UNIT_TEST_CHECK(m("bfoobar"));
697 UNIT_TEST_CHECK(!m("bfoobarcfoobar"));
698 }
699 {
700 globish_matcher m(globish("*"), globish(""));
701 UNIT_TEST_CHECK(m("foo"));
702 UNIT_TEST_CHECK(m(""));
703 }
704 {
705 globish_matcher m(globish("{foo}"), globish(""));
706 UNIT_TEST_CHECK(m("foo"));
707 UNIT_TEST_CHECK(!m("bar"));
708 }
709}
710
711#endif // BUILD_UNIT_TESTS
712
713// Local Variables:
714// mode: C++
715// fill-column: 76
716// c-file-style: "gnu"
717// indent-tabs-mode: nil
718// End:
719// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status