monotone

monotone Mtn Source Tree

Root/globish.cc

1// Copyright (C) 2005 Nathaniel Smith <njs@pobox.com>
2// Copyright (C) 2007 Zack Weinberg <zackw@panix.com>
3//
4// This program is made available under the GNU GPL version 2.0 or
5// greater. See the accompanying file COPYING for details.
6//
7// This program is distributed WITHOUT ANY WARRANTY; without even the
8// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
9// PURPOSE.
10
11#include "base.hh"
12#include "sanity.hh"
13#include "globish.hh"
14#include "option.hh" // for arg_type
15#include "numeric_vocab.hh"
16
17#include <iterator>
18#include <ostream>
19
20using std::string;
21using std::vector;
22using std::back_inserter;
23using std::back_insert_iterator;
24
25// The algorithm here is originally from pdksh 5. That implementation uses
26// the high bit of unsigned chars as a quotation flag. We can't do that,
27// because we need to be utf8 clean. Instead, we copy the string and
28// replace "live" metacharacters with single bytes from the
29// control-character range. This is why bytes <= 0x1f are not allowed in the
30// pattern.
31
32enum metachar {
33 META_STAR = 1, // *
34 META_QUES, // ?
35 META_CC_BRA, // [
36 META_CC_INV_BRA, // [^ or [!
37 META_CC_KET, // ] (matches either of the above two)
38 META_ALT_BRA, // {
39 META_ALT_OR, // , (when found inside unquoted { ... })
40 META_ALT_KET, // }
41};
42
43// Compile a character class.
44
45static string::const_iterator
46compile_charclass(string const & pat, string::const_iterator p,
47 back_insert_iterator<string> & to)
48{
49 string in_class;
50 char bra = (char)META_CC_BRA;
51
52 p++;
53 N(p != pat.end(),
54 F("invalid pattern '%s': unmatched '['") % pat);
55
56 if (*p == '!' || *p == '^')
57 {
58 bra = (char)META_CC_INV_BRA;
59 p++;
60 N(p != pat.end(),
61 F("invalid pattern '%s': unmatched '['") % pat);
62 }
63
64 while (p != pat.end() && *p != ']')
65 {
66 if (*p == '\\')
67 {
68 p++;
69 if (p == pat.end())
70 break;
71 }
72 // A dash at the beginning or end of the pattern is literal.
73 else if (*p == '-'
74 && !in_class.empty()
75 && p+1 != pat.end()
76 && p[1] != ']')
77 {
78 p++;
79 if (*p == '\\')
80 p++;
81 if (p == pat.end())
82 break;
83
84 // the cast is needed because boost::format will not obey the %x
85 // if given a 'char'.
86 N((widen<unsigned int, char>(*p)) >= ' ',
87 F("invalid pattern '%s': control character 0x%02x is not allowed")
88 % pat % (widen<unsigned int, char>(*p)));
89
90 unsigned int start = widen<unsigned int, char>(in_class.end()[-1]);
91 unsigned int stop = widen<unsigned int, char>(*p);
92
93 N(start != stop,
94 F("invalid pattern '%s': "
95 "one-element character ranges are not allowed") % pat);
96 N(start < stop,
97 F("invalid pattern '%s': "
98 "endpoints of a character range must be in "
99 "ascending numeric order") % pat);
100 N(start < 0x80 && stop < 0x80,
101 F("invalid pattern '%s': cannot use non-ASCII characters "
102 "in classes") % pat);
103
104 L(FL("expanding range from %X (%c) to %X (%c)")
105 % (start+1) % (char)(start+1) % stop % (char)stop);
106
107 for (unsigned int r = start + 1; r < stop; r++)
108 in_class.push_back((char)r);
109 }
110 else
111 N(*p != '[', F("syntax error in '%s': "
112 "character classes may not be nested") % pat);
113
114 N((widen<unsigned int, char>(*p)) >= ' ',
115 F("invalid pattern '%s': control character 0x%02x is not allowed")
116 % pat % (widen<unsigned int, char>(*p)));
117
118 N((widen<unsigned int, char>(*p)) < 0x80,
119 F("invalid pattern '%s': cannot use non-ASCII characters in classes")
120 % pat);
121
122 in_class.push_back(*p);
123 p++;
124 }
125
126 N(p != pat.end(),
127 F("invalid pattern '%s': unmatched '['") % pat);
128
129 N(!in_class.empty(),
130 F("invalid pattern '%s': empty character class") % pat);
131
132 // minor optimization: one-element non-inverted character class becomes
133 // the character.
134 if (bra == (char)META_CC_BRA && in_class.size() == 1)
135 *to++ = in_class[0];
136 else
137 {
138 *to++ = bra;
139 std::sort(in_class.begin(), in_class.end());
140 std::copy(in_class.begin(), in_class.end(), to);
141 *to++ = (char)META_CC_KET;
142 }
143 return p;
144}
145
146// Compile one fragment of a glob pattern.
147
148static void
149compile_frag(string const & pat, back_insert_iterator<string> & to,
150 made_from_t made_from = made_from_local)
151{
152 unsigned int brace_depth = 0;
153
154 for (string::const_iterator p = pat.begin(); p != pat.end(); p++)
155 switch (*p)
156 {
157 default:
158 N((widen<unsigned int, char>(*p)) >= ' ',
159 F("invalid pattern '%s': control character 0x%02x is not allowed")
160 % pat % (widen<unsigned int, char>(*p)));
161
162 *to++ = *p;
163 break;
164
165 case '*':
166 // optimization: * followed by any sequence of ?s and *s is
167 // equivalent to the number of ?s that appeared in the sequence,
168 // followed by a single star. the latter can be matched without
169 // nearly as much backtracking.
170
171 for (p++; p != pat.end(); p++)
172 {
173 if (*p == '?')
174 *to++ = META_QUES;
175 else if (*p != '*')
176 break;
177 }
178
179 p--;
180 *to++ = META_STAR;
181 break;
182
183 case '?':
184 *to++ = META_QUES;
185 break;
186
187 case '\\':
188 p++;
189 N(p != pat.end(),
190 F("invalid pattern '%s': un-escaped \\ at end") % pat);
191
192 N((widen<unsigned int, char>(*p)) >= ' ',
193 F("invalid pattern '%s': control character 0x%02x is not allowed")
194 % pat % (widen<unsigned int, char>(*p)));
195
196 *to++ = *p;
197 break;
198
199 case '[':
200 p = compile_charclass(pat, p, to);
201 break;
202
203 case ']':
204 N(false, F("invalid pattern '%s': unmatched ']'") % pat);
205
206 case '{':
207 // There's quite a bit of optimization we could be doing on
208 // alternatives, but it's hairy, especially if you get into
209 // nested alternatives; so we're not doing any of it now.
210 // (Look at emacs's regexp-opt.el for inspiration.)
211 brace_depth++;
212 N(brace_depth < 6,
213 F("invalid pattern '%s': braces nested too deeply") % pat);
214 *to++ = META_ALT_BRA;
215 break;
216
217 case ',':
218 if (brace_depth > 0)
219 *to++ = META_ALT_OR;
220 else
221 *to++ = ',';
222 break;
223
224 case '}':
225 N(brace_depth > 0,
226 F("invalid pattern '%s': unmatched '}'") % pat);
227 brace_depth--;
228 *to++ = META_ALT_KET;
229 break;
230 }
231
232 N(brace_depth == 0,
233 F("invalid pattern '%s': unmatched '{'") % pat);
234}
235
236// common code used by the constructors.
237
238static inline string
239compile(string const & pat, made_from_t made_from = made_from_local)
240{
241 string s;
242 back_insert_iterator<string> to = back_inserter(s);
243 compile_frag(pat, to, made_from);
244 return s;
245}
246
247static inline string
248compile(vector<arg_type>::const_iterator const & beg,
249 vector<arg_type>::const_iterator const & end)
250{
251 if (end - beg == 0)
252 return "";
253 if (end - beg == 1)
254 return compile((*beg)());
255
256 string s;
257 back_insert_iterator<string> to = back_inserter(s);
258
259 *to++ = META_ALT_BRA;
260 vector<arg_type>::const_iterator i = beg;
261 for (;;)
262 {
263 compile_frag((*i)(), to);
264 i++;
265 if (i == end)
266 break;
267 *to++ = META_ALT_OR;
268 }
269 *to++ = META_ALT_KET;
270 return s;
271}
272
273globish::globish(string const & p, made_from_t made_from)
274 : compiled_pattern(compile(p, made_from)) {}
275globish::globish(char const * p, made_from_t made_from)
276 : compiled_pattern(compile(p, made_from)) {}
277
278globish::globish(vector<arg_type> const & p)
279 : compiled_pattern(compile(p.begin(), p.end())) {}
280globish::globish(vector<arg_type>::const_iterator const & beg,
281 vector<arg_type>::const_iterator const & end)
282 : compiled_pattern(compile(beg, end)) {}
283
284// Debugging.
285
286static string
287decode(string::const_iterator p, string::const_iterator end)
288{
289 string s;
290 for (; p != end; p++)
291 switch (*p)
292 {
293 case META_STAR: s.push_back('*'); break;
294 case META_QUES: s.push_back('?'); break;
295 case META_CC_BRA: s.push_back('['); break;
296 case META_CC_KET: s.push_back(']'); break;
297 case META_CC_INV_BRA: s.push_back('[');
298 s.push_back('!'); break;
299
300 case META_ALT_BRA: s.push_back('{'); break;
301 case META_ALT_KET: s.push_back('}'); break;
302 case META_ALT_OR: s.push_back(','); break;
303
304 // Some of these are only special in certain contexts,
305 // but it does no harm to escape them always.
306 case '[': case ']': case '-': case '!': case '^':
307 case '{': case '}': case ',':
308 case '*': case '?': case '\\':
309 s.push_back('\\');
310 // fall through
311 default:
312 s.push_back(*p);
313 }
314 return s;
315}
316
317string
318globish::operator()() const
319{
320 return decode(compiled_pattern.begin(), compiled_pattern.end());
321}
322
323template <> void dump(globish const & g, string & s)
324{
325 s = g();
326}
327
328std::ostream & operator<<(std::ostream & o, globish const & g)
329{
330 return o << g();
331}
332
333// Matching.
334
335static string::const_iterator
336find_next_subpattern(string::const_iterator p,
337 string::const_iterator pe,
338 bool want_alternatives)
339{
340 L(FL("Finding subpattern in '%s'") % decode(p, pe));
341 unsigned int depth = 1;
342 for (; p != pe; p++)
343 switch (*p)
344 {
345 default:
346 break;
347
348 case META_ALT_BRA:
349 depth++;
350 break;
351
352 case META_ALT_KET:
353 depth--;
354 if (depth == 0)
355 return p+1;
356 break;
357
358 case META_ALT_OR:
359 if (depth == 1 && want_alternatives)
360 return p+1;
361 break;
362 }
363
364 I(false);
365}
366
367
368static bool
369do_match(string::const_iterator sb, string::const_iterator se,
370 string::const_iterator p, string::const_iterator pe)
371{
372 unsigned int sc, pc;
373 string::const_iterator s(sb);
374
375 L(FL("subpattern: '%s' against '%s'") % string(s,se) % decode(p,pe));
376
377 while (p < pe)
378 {
379 pc = widen<unsigned int, char>(*p++);
380 if(s < se) {
381 sc = widen<unsigned int, char>(*s);
382 s++;
383 } else {
384 sc = 0;
385 }
386 switch (pc)
387 {
388 default: // literal
389 if (sc != pc)
390 return false;
391 break;
392
393 case META_QUES: // any single character
394 if (sc == 0)
395 return false;
396 break;
397
398 case META_CC_BRA: // any of these characters
399 {
400 bool matched = false;
401 I(p < pe);
402 I(*p != META_CC_KET);
403 do
404 {
405 if (widen<unsigned int, char>(*p) == sc)
406 matched = true;
407 p++;
408 I(p < pe);
409 }
410 while (*p != META_CC_KET);
411 if (!matched)
412 return false;
413 }
414 p++;
415 break;
416
417 case META_CC_INV_BRA: // any but these characters
418 I(p < pe);
419 I(*p != META_CC_KET);
420 do
421 {
422 if (widen<unsigned int, char>(*p) == sc)
423 return false;
424 p++;
425 I(p < pe);
426 }
427 while (*p != META_CC_KET);
428 p++;
429 break;
430
431 case META_STAR: // zero or more arbitrary characters
432 if (p == pe)
433 return true; // star at end always matches, if we get that far
434
435 pc = widen<unsigned int, char>(*p);
436 // If the next character in p is not magic, we can only match
437 // starting from places in s where that character appears.
438 if (pc >= ' ')
439 {
440 L(FL("after *: looking for '%c' in '%c%s'")
441 % (char)pc % (char)sc % string(s, se));
442 p++;
443 for (;;)
444 {
445 if (sc == pc && do_match(s, se, p, pe))
446 return true;
447 if (s >= se)
448 break;
449 sc = widen<unsigned int, char>(*s++);
450 }
451 }
452 else
453 {
454 L(FL("metacharacter after *: doing it the slow way"));
455 s--;
456 do
457 {
458 if (do_match(s, se, p, pe))
459 return true;
460 s++;
461 }
462 while (s < se);
463 }
464 return false;
465
466 case META_ALT_BRA:
467 {
468 string::const_iterator prest, psub, pnext;
469 string::const_iterator srest;
470
471 prest = find_next_subpattern(p, pe, false);
472 psub = p;
473 if(s > sb) {
474 s--;
475 }
476 do
477 {
478 pnext = find_next_subpattern(psub, pe, true);
479 srest = (prest == pe ? se : s);
480 for (; srest < se; srest++)
481 {
482 if (do_match(s, srest, psub, pnext - 1)
483 && do_match(srest, se, prest, pe))
484 return true;
485 }
486 // try the empty target too
487 if (do_match(s, srest, psub, pnext - 1)
488 && do_match(srest, se, prest, pe))
489 return true;
490
491 psub = pnext;
492 }
493 while (pnext < prest);
494 return false;
495 }
496 }
497 }
498 return s == se;
499}
500
501bool globish::matches(string const & target) const
502{
503 bool result;
504
505 // The empty pattern matches nothing.
506 if (compiled_pattern.empty())
507 result = false;
508 else
509 result = do_match (target.begin(), target.end(),
510 compiled_pattern.begin(), compiled_pattern.end());
511
512 L(FL("matching '%s' against '%s': %s")
513 % target % (*this)() % (result ? "matches" : "does not match"));
514 return result;
515}
516
517#ifdef BUILD_UNIT_TESTS
518#include "unit_tests.hh"
519
520UNIT_TEST(globish, syntax)
521{
522 struct tcase
523 {
524 char const * in;
525 char const * out;
526 };
527 tcase const good[] = {
528 { "a", "a" },
529 { "\\a", "a" },
530 { "[a]", "a" },
531 { "[!a]", "[!a]" },
532 { "[^a]", "[!a]" },
533 { "[\\!a]", "[\\!a]" },
534 { "[\\^a]", "[\\^a]" },
535 { "[ab]", "[ab]" },
536 { "[a-b]", "[ab]" },
537 { "[a-c]", "[abc]" },
538 { "[ac-]", "[\\-ac]" },
539 { "[-ac]", "[\\-ac]" },
540 { "[+-/]", "[+\\,\\-./]" },
541
542 { "\xC2\xA1", "\xC2\xA1" }, // U+00A1 in UTF8
543
544 { "*", "*" },
545 { "\\*", "\\*" },
546 { "[*]", "\\*" },
547 { "?", "?" },
548 { "\\?", "\\?" },
549 { "[?]", "\\?" },
550 { ",", "\\," },
551 { "\\,", "\\," },
552 { "[,]", "\\," },
553 { "\\{", "\\{" },
554 { "[{]", "\\{" },
555 { "[}]", "\\}" },
556 { "\\[", "\\[" },
557 { "\\]", "\\]" },
558 { "\\\\", "\\\\" },
559
560 { "**", "*" },
561 { "*?", "?*" },
562 { "*???*?*", "????*" },
563 { "*a?*?b*", "*a??*b*" },
564
565 { "{a,b,c}d", "{a,b,c}d" },
566 { "foo{a,{b,c},?*}d", "foo{a,{b,c},?*}d" },
567 { "\\a\\b\\|\\{\\*", "ab|\\{\\*" },
568 { ".+$^{}", ".+$\\^{}" },
569 { "\\.\\+\\$\\^\\(\\)", ".+$\\^()" },
570 { 0, 0 }
571 };
572
573 char const * const bad[] = {
574 "[",
575 "[!",
576 "[\\",
577 "[\\]",
578 "[foo",
579 "[!foo",
580 "foo]",
581 "[\003]",
582 "[a-a]",
583 "[f-a]",
584 "[]",
585 "[\xC2\xA1]",
586 "[\xC2\xA1\xC2\xA2]",
587 "[\xC2\xA1-\xC2\xA2]",
588 "[-\xC2\xA1]",
589 "[[]",
590 "[]",
591
592 "\003",
593 "foo\\",
594 "{foo",
595 "{foo,bar{baz,quux}",
596 "foo}",
597 "foo,bar{baz,quux}}",
598 "{{{{{{{{{{a,b},c},d},e},f},g},h},i},j},k}",
599 0
600 };
601 char const dummy[] = "";
602
603 for (tcase const * p = good; p->in; p++)
604 {
605 globish g(p->in);
606 string s;
607 dump(g, s);
608 L(FL("globish syntax: %s -> %s [expect %s]") % p->in % s % p->out);
609 UNIT_TEST_CHECK(s == p->out);
610 }
611
612 for (char const * const * p = bad; *p; p++)
613 {
614 L(FL("globish syntax: invalid %s") % *p);
615 UNIT_TEST_CHECK_THROW(I(globish(*p).matches(dummy)), informative_failure);
616 }
617}
618
619UNIT_TEST(globish, from_vector)
620{
621 vector<arg_type> v;
622 v.push_back(arg_type("a"));
623 v.push_back(arg_type("b"));
624 v.push_back(arg_type("c"));
625 globish combined(v);
626 string s;
627 dump(combined, s);
628 UNIT_TEST_CHECK(s == "{a,b,c}");
629}
630
631UNIT_TEST(globish, simple_matches)
632{
633 UNIT_TEST_CHECK(globish("abc").matches("abc"));
634 UNIT_TEST_CHECK(!globish("abc").matches("aac"));
635
636 UNIT_TEST_CHECK(globish("a[bc]d").matches("abd"));
637 UNIT_TEST_CHECK(globish("a[bc]d").matches("acd"));
638 UNIT_TEST_CHECK(!globish("a[bc]d").matches("and"));
639 UNIT_TEST_CHECK(!globish("a[bc]d").matches("ad"));
640 UNIT_TEST_CHECK(!globish("a[bc]d").matches("abbd"));
641
642 UNIT_TEST_CHECK(globish("a[!bc]d").matches("and"));
643 UNIT_TEST_CHECK(globish("a[!bc]d").matches("a#d"));
644 UNIT_TEST_CHECK(!globish("a[!bc]d").matches("abd"));
645 UNIT_TEST_CHECK(!globish("a[!bc]d").matches("acd"));
646 UNIT_TEST_CHECK(!globish("a[!bc]d").matches("ad"));
647 UNIT_TEST_CHECK(!globish("a[!bc]d").matches("abbd"));
648
649 UNIT_TEST_CHECK(globish("a?c").matches("abc"));
650 UNIT_TEST_CHECK(globish("a?c").matches("aac"));
651 UNIT_TEST_CHECK(globish("a?c").matches("a%c"));
652 UNIT_TEST_CHECK(!globish("a?c").matches("a%d"));
653 UNIT_TEST_CHECK(!globish("a?c").matches("d%d"));
654 UNIT_TEST_CHECK(!globish("a?c").matches("d%c"));
655 UNIT_TEST_CHECK(!globish("a?c").matches("a%%d"));
656
657 UNIT_TEST_CHECK(globish("a*c").matches("ac"));
658 UNIT_TEST_CHECK(globish("a*c").matches("abc"));
659 UNIT_TEST_CHECK(globish("a*c").matches("abac"));
660 UNIT_TEST_CHECK(globish("a*c").matches("abbcc"));
661 UNIT_TEST_CHECK(globish("a*c").matches("abcbbc"));
662 UNIT_TEST_CHECK(!globish("a*c").matches("abcbb"));
663 UNIT_TEST_CHECK(!globish("a*c").matches("abcb"));
664 UNIT_TEST_CHECK(!globish("a*c").matches("aba"));
665 UNIT_TEST_CHECK(!globish("a*c").matches("ab"));
666
667 UNIT_TEST_CHECK(globish("*.bak").matches(".bak"));
668 UNIT_TEST_CHECK(globish("*.bak").matches("a.bak"));
669 UNIT_TEST_CHECK(globish("*.bak").matches("foo.bak"));
670 UNIT_TEST_CHECK(globish("*.bak").matches(".bak.bak"));
671 UNIT_TEST_CHECK(globish("*.bak").matches("fwibble.bak.bak"));
672
673 UNIT_TEST_CHECK(globish("a*b*[cd]").matches("abc"));
674 UNIT_TEST_CHECK(globish("a*b*[cd]").matches("abcd"));
675 UNIT_TEST_CHECK(globish("a*b*[cd]").matches("aabrd"));
676 UNIT_TEST_CHECK(globish("a*b*[cd]").matches("abbbbbbbccd"));
677 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("ab"));
678 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("abde"));
679 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("aaaaaaab"));
680 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("axxxxd"));
681 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("adb"));
682}
683
684UNIT_TEST(globish, complex_matches)
685{ {
686 globish_matcher m(globish("{a,b}?*\\*|"), globish("*c*"));
687 UNIT_TEST_CHECK(m("aq*|"));
688 UNIT_TEST_CHECK(m("bq*|"));
689 UNIT_TEST_CHECK(!m("bc*|"));
690 UNIT_TEST_CHECK(!m("bq|"));
691 UNIT_TEST_CHECK(!m("b*|"));
692 UNIT_TEST_CHECK(!m(""));
693 }
694 {
695 globish_matcher m(globish("{a,\\\\,b*}"), globish("*c*"));
696 UNIT_TEST_CHECK(m("a"));
697 UNIT_TEST_CHECK(!m("ab"));
698 UNIT_TEST_CHECK(m("\\"));
699 UNIT_TEST_CHECK(!m("\\\\"));
700 UNIT_TEST_CHECK(m("b"));
701 UNIT_TEST_CHECK(m("bfoobar"));
702 UNIT_TEST_CHECK(!m("bfoobarcfoobar"));
703 }
704 {
705 globish_matcher m(globish("*"), globish(""));
706 UNIT_TEST_CHECK(m("foo"));
707 UNIT_TEST_CHECK(m(""));
708 }
709 {
710 globish_matcher m(globish("{foo}"), globish(""));
711 UNIT_TEST_CHECK(m("foo"));
712 UNIT_TEST_CHECK(!m("bar"));
713 }
714}
715
716UNIT_TEST(globish, nested_matches)
717{
718 globish g("a.{i.{x,y},j}");
719 UNIT_TEST_CHECK(g.matches("a.i.x"));
720 UNIT_TEST_CHECK(g.matches("a.i.y"));
721 UNIT_TEST_CHECK(g.matches("a.j"));
722 UNIT_TEST_CHECK(!g.matches("q"));
723 UNIT_TEST_CHECK(!g.matches("a.q"));
724 UNIT_TEST_CHECK(!g.matches("a.j.q"));
725 UNIT_TEST_CHECK(!g.matches("a.i.q"));
726 UNIT_TEST_CHECK(!g.matches("a.i.x.q"));
727}
728
729#endif // BUILD_UNIT_TESTS
730
731// Local Variables:
732// mode: C++
733// fill-column: 76
734// c-file-style: "gnu"
735// indent-tabs-mode: nil
736// End:
737// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status