monotone

monotone Mtn Source Tree

Root/globish.cc

1// Copyright (C) 2005 Nathaniel Smith <njs@pobox.com>
2// Copyright (C) 2007 Zack Weinberg <zackw@panix.com>
3//
4// This program is made available under the GNU GPL version 2.0 or
5// greater. See the accompanying file COPYING for details.
6//
7// This program is distributed WITHOUT ANY WARRANTY; without even the
8// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
9// PURPOSE.
10
11#include "base.hh"
12#include "sanity.hh"
13#include "globish.hh"
14#include "option.hh" // for arg_type
15#include "numeric_vocab.hh"
16
17#include <iterator>
18#include <ostream>
19
20using std::string;
21using std::vector;
22using std::back_inserter;
23using std::back_insert_iterator;
24
25// The algorithm here is originally from pdksh 5. That implementation uses
26// the high bit of unsigned chars as a quotation flag. We can't do that,
27// because we need to be utf8 clean. Instead, we copy the string and
28// replace "live" metacharacters with single bytes from the
29// control-character range. This is why bytes <= 0x1f are not allowed in the
30// pattern.
31
32enum metachar {
33 META_STAR = 1, // *
34 META_QUES, // ?
35 META_CC_BRA, // [
36 META_CC_INV_BRA, // [^ or [!
37 META_CC_KET, // ] (matches either of the above two)
38 META_ALT_BRA, // {
39 META_ALT_OR, // , (when found inside unquoted { ... })
40 META_ALT_KET, // }
41};
42
43// Compile a character class.
44
45static string::const_iterator
46compile_charclass(string const & pat, string::const_iterator p,
47 back_insert_iterator<string> & to)
48{
49 string in_class;
50 char bra = (char)META_CC_BRA;
51
52 p++;
53 N(p != pat.end(),
54 F("invalid pattern '%s': unmatched '['") % pat);
55
56 if (*p == '!' || *p == '^')
57 {
58 bra = (char)META_CC_INV_BRA;
59 p++;
60 N(p != pat.end(),
61 F("invalid pattern '%s': unmatched '['") % pat);
62 }
63
64 while (p != pat.end() && *p != ']')
65 {
66 if (*p == '\\')
67 {
68 p++;
69 if (p == pat.end())
70 break;
71 }
72 // A dash at the beginning or end of the pattern is literal.
73 else if (*p == '-'
74 && in_class.size() != 0
75 && p+1 != pat.end()
76 && p[1] != ']')
77 {
78 p++;
79 if (*p == '\\')
80 p++;
81 if (p == pat.end())
82 break;
83
84 // the cast is needed because boost::format will not obey the %x
85 // if given a 'char'.
86 N((widen<unsigned int, char>(*p)) >= ' ',
87 F("invalid pattern '%s': control character 0x%02x is not allowed")
88 % pat % (widen<unsigned int, char>(*p)));
89
90 unsigned int start = widen<unsigned int, char>(in_class.end()[-1]);
91 unsigned int stop = widen<unsigned int, char>(*p);
92
93 N(start != stop,
94 F("invalid pattern '%s': "
95 "one-element character ranges are not allowed") % pat);
96 N(start < stop,
97 F("invalid pattern '%s': "
98 "endpoints of a character range must be in "
99 "ascending numeric order") % pat);
100 N(start < 0x80 && stop < 0x80,
101 F("invalid pattern '%s': cannot use non-ASCII characters "
102 "in classes") % pat);
103
104 L(FL("expanding range from %X (%c) to %X (%c)")
105 % (start+1) % (char)(start+1) % stop % (char)stop);
106
107 for (unsigned int r = start + 1; r < stop; r++)
108 in_class.push_back((char)r);
109 }
110 else
111 N(*p != '[', F("syntax error in '%s': "
112 "character classes may not be nested") % pat);
113
114 N((widen<unsigned int, char>(*p)) >= ' ',
115 F("invalid pattern '%s': control character 0x%02x is not allowed")
116 % pat % (widen<unsigned int, char>(*p)));
117
118 N((widen<unsigned int, char>(*p)) < 0x80,
119 F("invalid pattern '%s': cannot use non-ASCII characters in classes")
120 % pat);
121
122 in_class.push_back(*p);
123 p++;
124 }
125
126 N(p != pat.end(),
127 F("invalid pattern '%s': unmatched '['") % pat);
128
129 N(in_class.size() != 0,
130 F("invalid pattern '%s': empty character class") % pat);
131
132 // minor optimization: one-element non-inverted character class becomes
133 // the character.
134 if (bra == (char)META_CC_BRA && in_class.size() == 1)
135 *to++ = in_class[0];
136 else
137 {
138 *to++ = bra;
139 std::sort(in_class.begin(), in_class.end());
140 std::copy(in_class.begin(), in_class.end(), to);
141 *to++ = (char)META_CC_KET;
142 }
143 return p;
144}
145
146// Compile one fragment of a glob pattern.
147
148static void
149compile_frag(string const & pat, back_insert_iterator<string> & to)
150{
151 unsigned int brace_depth = 0;
152
153 for (string::const_iterator p = pat.begin(); p != pat.end(); p++)
154 switch (*p)
155 {
156 default:
157 N((widen<unsigned int, char>(*p)) >= ' ',
158 F("invalid pattern '%s': control character 0x%02x is not allowed")
159 % pat % (widen<unsigned int, char>(*p)));
160
161 *to++ = *p;
162 break;
163
164 case '*':
165 // optimization: * followed by any sequence of ?s and *s is
166 // equivalent to the number of ?s that appeared in the sequence,
167 // followed by a single star. the latter can be matched without
168 // nearly as much backtracking.
169
170 for (p++; p != pat.end(); p++)
171 {
172 if (*p == '?')
173 *to++ = META_QUES;
174 else if (*p != '*')
175 break;
176 }
177
178 p--;
179 *to++ = META_STAR;
180 break;
181
182 case '?':
183 *to++ = META_QUES;
184 break;
185
186 case '\\':
187 p++;
188 N(p != pat.end(),
189 F("invalid pattern '%s': un-escaped \\ at end") % pat);
190
191 N((widen<unsigned int, char>(*p)) >= ' ',
192 F("invalid pattern '%s': control character 0x%02x is not allowed")
193 % pat % (widen<unsigned int, char>(*p)));
194
195 *to++ = *p;
196 break;
197
198 case '[':
199 p = compile_charclass(pat, p, to);
200 break;
201
202 case ']':
203 N(false, F("invalid pattern '%s': unmatched ']'") % pat);
204
205 case '{':
206 // There's quite a bit of optimization we could be doing on
207 // alternatives, but it's hairy, especially if you get into
208 // nested alternatives; so we're not doing any of it now.
209 // (Look at emacs's regexp-opt.el for inspiration.)
210 brace_depth++;
211 N(brace_depth < 6,
212 F("invalid pattern '%s': braces nested too deeply") % pat);
213 *to++ = META_ALT_BRA;
214 break;
215
216 case ',':
217 if (brace_depth > 0)
218 *to++ = META_ALT_OR;
219 else
220 *to++ = ',';
221 break;
222
223 case '}':
224 N(brace_depth > 0,
225 F("invalid pattern '%s': unmatched '}'") % pat);
226 brace_depth--;
227 *to++ = META_ALT_KET;
228 break;
229 }
230
231 N(brace_depth == 0,
232 F("invalid pattern '%s': unmatched '{'") % pat);
233}
234
235// common code used by the constructors.
236
237static inline string
238compile(string const & pat)
239{
240 string s;
241 back_insert_iterator<string> to = back_inserter(s);
242 compile_frag(pat, to);
243 return s;
244}
245
246static inline string
247compile(vector<arg_type>::const_iterator const & beg,
248 vector<arg_type>::const_iterator const & end)
249{
250 if (end - beg == 0)
251 return "";
252 if (end - beg == 1)
253 return compile((*beg)());
254
255 string s;
256 back_insert_iterator<string> to = back_inserter(s);
257
258 *to++ = META_ALT_BRA;
259 vector<arg_type>::const_iterator i = beg;
260 for (;;)
261 {
262 compile_frag((*i)(), to);
263 i++;
264 if (i == end)
265 break;
266 *to++ = META_ALT_OR;
267 }
268 *to++ = META_ALT_KET;
269 return s;
270}
271
272globish::globish(string const & p) : compiled_pattern(compile(p)) {}
273globish::globish(char const * p) : compiled_pattern(compile(p)) {}
274
275globish::globish(vector<arg_type> const & p)
276 : compiled_pattern(compile(p.begin(), p.end())) {}
277globish::globish(vector<arg_type>::const_iterator const & beg,
278 vector<arg_type>::const_iterator const & end)
279 : compiled_pattern(compile(beg, end)) {}
280
281// Debugging.
282
283static string
284decode(string::const_iterator p, string::const_iterator end)
285{
286 string s;
287 for (; p != end; p++)
288 switch (*p)
289 {
290 case META_STAR: s.push_back('*'); break;
291 case META_QUES: s.push_back('?'); break;
292 case META_CC_BRA: s.push_back('['); break;
293 case META_CC_KET: s.push_back(']'); break;
294 case META_CC_INV_BRA: s.push_back('[');
295 s.push_back('!'); break;
296
297 case META_ALT_BRA: s.push_back('{'); break;
298 case META_ALT_KET: s.push_back('}'); break;
299 case META_ALT_OR: s.push_back(','); break;
300
301 // Some of these are only special in certain contexts,
302 // but it does no harm to escape them always.
303 case '[': case ']': case '-': case '!': case '^':
304 case '{': case '}': case ',':
305 case '*': case '?': case '\\':
306 s.push_back('\\');
307 // fall through
308 default:
309 s.push_back(*p);
310 }
311 return s;
312}
313
314string
315globish::operator()() const
316{
317 return decode(compiled_pattern.begin(), compiled_pattern.end());
318}
319
320template <> void dump(globish const & g, string & s)
321{
322 s = g();
323}
324
325std::ostream & operator<<(std::ostream & o, globish const & g)
326{
327 return o << g();
328}
329
330// Matching.
331
332static string::const_iterator
333find_next_subpattern(string::const_iterator p,
334 string::const_iterator pe,
335 bool want_alternatives)
336{
337 unsigned int depth = 1;
338 for (; p != pe; p++)
339 switch (*p)
340 {
341 default: break;
342
343 case META_ALT_BRA:
344 depth++; break;
345
346 case META_ALT_KET:
347 depth--;
348 if (depth == 0)
349 return p+1;
350
351 case META_ALT_OR:
352 if (depth == 1 && want_alternatives)
353 return p+1;
354 }
355
356 I(false);
357}
358
359
360static bool
361do_match(string::const_iterator sb, string::const_iterator se,
362 string::const_iterator p, string::const_iterator pe)
363{
364 unsigned int sc, pc;
365 string::const_iterator s(sb);
366
367 L(FL("subpattern: '%s' against '%s'") % string(s,se) % decode(p,pe));
368
369 while (p < pe)
370 {
371 pc = widen<unsigned int, char>(*p++);
372 if(s < se) {
373 sc = widen<unsigned int, char>(*s);
374 s++;
375 } else {
376 sc = 0;
377 }
378 switch (pc)
379 {
380 default: // literal
381 if (sc != pc)
382 return false;
383 break;
384
385 case META_QUES: // any single character
386 if (sc == 0)
387 return false;
388 break;
389
390 case META_CC_BRA: // any of these characters
391 {
392 bool matched = false;
393 I(p < pe);
394 I(*p != META_CC_KET);
395 do
396 {
397 if (widen<unsigned int, char>(*p) == sc)
398 matched = true;
399 p++;
400 I(p < pe);
401 }
402 while (*p != META_CC_KET);
403 if (!matched)
404 return false;
405 }
406 p++;
407 break;
408
409 case META_CC_INV_BRA: // any but these characters
410 I(p < pe);
411 I(*p != META_CC_KET);
412 do
413 {
414 if (widen<unsigned int, char>(*p) == sc)
415 return false;
416 p++;
417 I(p < pe);
418 }
419 while (*p != META_CC_KET);
420 p++;
421 break;
422
423 case META_STAR: // zero or more arbitrary characters
424 if (p == pe)
425 return true; // star at end always matches, if we get that far
426
427 pc = widen<unsigned int, char>(*p);
428 // If the next character in p is not magic, we can only match
429 // starting from places in s where that character appears.
430 if (pc >= ' ')
431 {
432 L(FL("after *: looking for '%c' in '%c%s'")
433 % (char)pc % (char)sc % string(s, se));
434 p++;
435 for (;;)
436 {
437 if (sc == pc && do_match(s, se, p, pe))
438 return true;
439 if (s >= se)
440 break;
441 sc = widen<unsigned int, char>(*s++);
442 }
443 }
444 else
445 {
446 L(FL("metacharacter after *: doing it the slow way"));
447 s--;
448 do
449 {
450 if (do_match(s, se, p, pe))
451 return true;
452 s++;
453 }
454 while (s < se);
455 }
456 return false;
457
458 case META_ALT_BRA:
459 {
460 string::const_iterator prest, psub, pnext;
461 string::const_iterator srest;
462
463 prest = find_next_subpattern(p, pe, false);
464 psub = p;
465 if(s > sb) {
466 s--;
467 }
468 do
469 {
470 pnext = find_next_subpattern(psub, pe, true);
471 srest = (prest == pe ? se : s);
472 for (; srest < se; srest++)
473 {
474 if (do_match(s, srest, psub, pnext - 1)
475 && do_match(srest, se, prest, pe))
476 return true;
477 }
478 // try the empty target too
479 if (do_match(s, srest, psub, pnext - 1)
480 && do_match(srest, se, prest, pe))
481 return true;
482
483 psub = pnext;
484 }
485 while (pnext < prest);
486 return false;
487 }
488 }
489 }
490 return s == se;
491}
492
493bool globish::matches(string const & target) const
494{
495 bool result;
496
497 // The empty pattern matches nothing.
498 if (compiled_pattern.empty())
499 result = false;
500 else
501 result = do_match (target.begin(), target.end(),
502 compiled_pattern.begin(), compiled_pattern.end());
503
504 L(FL("matching '%s' against '%s': %s")
505 % target % (*this)() % (result ? "matches" : "does not match"));
506 return result;
507}
508
509#ifdef BUILD_UNIT_TESTS
510#include "unit_tests.hh"
511
512UNIT_TEST(globish, syntax)
513{
514 struct tcase
515 {
516 char const * in;
517 char const * out;
518 };
519 tcase const good[] = {
520 { "a", "a" },
521 { "\\a", "a" },
522 { "[a]", "a" },
523 { "[!a]", "[!a]" },
524 { "[^a]", "[!a]" },
525 { "[\\!a]", "[\\!a]" },
526 { "[\\^a]", "[\\^a]" },
527 { "[ab]", "[ab]" },
528 { "[a-b]", "[ab]" },
529 { "[a-c]", "[abc]" },
530 { "[ac-]", "[\\-ac]" },
531 { "[-ac]", "[\\-ac]" },
532 { "[+-/]", "[+\\,\\-./]" },
533
534 { "\xC2\xA1", "\xC2\xA1" }, // U+00A1 in UTF8
535
536 { "*", "*" },
537 { "\\*", "\\*" },
538 { "[*]", "\\*" },
539 { "?", "?" },
540 { "\\?", "\\?" },
541 { "[?]", "\\?" },
542 { ",", "\\," },
543 { "\\,", "\\," },
544 { "[,]", "\\," },
545 { "\\{", "\\{" },
546 { "[{]", "\\{" },
547 { "[}]", "\\}" },
548 { "\\[", "\\[" },
549 { "\\]", "\\]" },
550 { "\\\\", "\\\\" },
551
552 { "**", "*" },
553 { "*?", "?*" },
554 { "*???*?*", "????*" },
555 { "*a?*?b*", "*a??*b*" },
556
557 { "{a,b,c}d", "{a,b,c}d" },
558 { "foo{a,{b,c},?*}d", "foo{a,{b,c},?*}d" },
559 { "\\a\\b\\|\\{\\*", "ab|\\{\\*" },
560 { ".+$^{}", ".+$\\^{}" },
561 { "\\.\\+\\$\\^\\(\\)", ".+$\\^()" },
562 { 0, 0 }
563 };
564
565 char const * const bad[] = {
566 "[",
567 "[!",
568 "[\\",
569 "[\\]",
570 "[foo",
571 "[!foo",
572 "foo]",
573 "[\003]",
574 "[a-a]",
575 "[f-a]",
576 "[]",
577 "[\xC2\xA1]",
578 "[\xC2\xA1\xC2\xA2]",
579 "[\xC2\xA1-\xC2\xA2]",
580 "[-\xC2\xA1]",
581 "[[]",
582 "[]",
583
584 "\003",
585 "foo\\",
586 "{foo",
587 "{foo,bar{baz,quux}",
588 "foo}",
589 "foo,bar{baz,quux}}",
590 "{{{{{{{{{{a,b},c},d},e},f},g},h},i},j},k}",
591 0
592 };
593 char const dummy[] = "";
594
595 for (tcase const * p = good; p->in; p++)
596 {
597 globish g(p->in);
598 string s;
599 dump(g, s);
600 L(FL("globish syntax: %s -> %s [expect %s]") % p->in % s % p->out);
601 UNIT_TEST_CHECK(s == p->out);
602 }
603
604 for (char const * const * p = bad; *p; p++)
605 {
606 L(FL("globish syntax: invalid %s") % *p);
607 UNIT_TEST_CHECK_THROW(I(globish(*p).matches(dummy)), informative_failure);
608 }
609}
610
611UNIT_TEST(globish, from_vector)
612{
613 vector<arg_type> v;
614 v.push_back(arg_type("a"));
615 v.push_back(arg_type("b"));
616 v.push_back(arg_type("c"));
617 globish combined(v);
618 string s;
619 dump(combined, s);
620 UNIT_TEST_CHECK(s == "{a,b,c}");
621}
622
623UNIT_TEST(globish, simple_matches)
624{
625 UNIT_TEST_CHECK(globish("abc").matches("abc"));
626 UNIT_TEST_CHECK(!globish("abc").matches("aac"));
627
628 UNIT_TEST_CHECK(globish("a[bc]d").matches("abd"));
629 UNIT_TEST_CHECK(globish("a[bc]d").matches("acd"));
630 UNIT_TEST_CHECK(!globish("a[bc]d").matches("and"));
631 UNIT_TEST_CHECK(!globish("a[bc]d").matches("ad"));
632 UNIT_TEST_CHECK(!globish("a[bc]d").matches("abbd"));
633
634 UNIT_TEST_CHECK(globish("a[!bc]d").matches("and"));
635 UNIT_TEST_CHECK(globish("a[!bc]d").matches("a#d"));
636 UNIT_TEST_CHECK(!globish("a[!bc]d").matches("abd"));
637 UNIT_TEST_CHECK(!globish("a[!bc]d").matches("acd"));
638 UNIT_TEST_CHECK(!globish("a[!bc]d").matches("ad"));
639 UNIT_TEST_CHECK(!globish("a[!bc]d").matches("abbd"));
640
641 UNIT_TEST_CHECK(globish("a?c").matches("abc"));
642 UNIT_TEST_CHECK(globish("a?c").matches("aac"));
643 UNIT_TEST_CHECK(globish("a?c").matches("a%c"));
644 UNIT_TEST_CHECK(!globish("a?c").matches("a%d"));
645 UNIT_TEST_CHECK(!globish("a?c").matches("d%d"));
646 UNIT_TEST_CHECK(!globish("a?c").matches("d%c"));
647 UNIT_TEST_CHECK(!globish("a?c").matches("a%%d"));
648
649 UNIT_TEST_CHECK(globish("a*c").matches("ac"));
650 UNIT_TEST_CHECK(globish("a*c").matches("abc"));
651 UNIT_TEST_CHECK(globish("a*c").matches("abac"));
652 UNIT_TEST_CHECK(globish("a*c").matches("abbcc"));
653 UNIT_TEST_CHECK(globish("a*c").matches("abcbbc"));
654 UNIT_TEST_CHECK(!globish("a*c").matches("abcbb"));
655 UNIT_TEST_CHECK(!globish("a*c").matches("abcb"));
656 UNIT_TEST_CHECK(!globish("a*c").matches("aba"));
657 UNIT_TEST_CHECK(!globish("a*c").matches("ab"));
658
659 UNIT_TEST_CHECK(globish("*.bak").matches(".bak"));
660 UNIT_TEST_CHECK(globish("*.bak").matches("a.bak"));
661 UNIT_TEST_CHECK(globish("*.bak").matches("foo.bak"));
662 UNIT_TEST_CHECK(globish("*.bak").matches(".bak.bak"));
663 UNIT_TEST_CHECK(globish("*.bak").matches("fwibble.bak.bak"));
664
665 UNIT_TEST_CHECK(globish("a*b*[cd]").matches("abc"));
666 UNIT_TEST_CHECK(globish("a*b*[cd]").matches("abcd"));
667 UNIT_TEST_CHECK(globish("a*b*[cd]").matches("aabrd"));
668 UNIT_TEST_CHECK(globish("a*b*[cd]").matches("abbbbbbbccd"));
669 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("ab"));
670 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("abde"));
671 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("aaaaaaab"));
672 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("axxxxd"));
673 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("adb"));
674}
675
676UNIT_TEST(globish, complex_matches)
677{ {
678 globish_matcher m(globish("{a,b}?*\\*|"), globish("*c*"));
679 UNIT_TEST_CHECK(m("aq*|"));
680 UNIT_TEST_CHECK(m("bq*|"));
681 UNIT_TEST_CHECK(!m("bc*|"));
682 UNIT_TEST_CHECK(!m("bq|"));
683 UNIT_TEST_CHECK(!m("b*|"));
684 UNIT_TEST_CHECK(!m(""));
685 }
686 {
687 globish_matcher m(globish("{a,\\\\,b*}"), globish("*c*"));
688 UNIT_TEST_CHECK(m("a"));
689 UNIT_TEST_CHECK(!m("ab"));
690 UNIT_TEST_CHECK(m("\\"));
691 UNIT_TEST_CHECK(!m("\\\\"));
692 UNIT_TEST_CHECK(m("b"));
693 UNIT_TEST_CHECK(m("bfoobar"));
694 UNIT_TEST_CHECK(!m("bfoobarcfoobar"));
695 }
696 {
697 globish_matcher m(globish("*"), globish(""));
698 UNIT_TEST_CHECK(m("foo"));
699 UNIT_TEST_CHECK(m(""));
700 }
701 {
702 globish_matcher m(globish("{foo}"), globish(""));
703 UNIT_TEST_CHECK(m("foo"));
704 UNIT_TEST_CHECK(!m("bar"));
705 }
706}
707
708#endif // BUILD_UNIT_TESTS
709
710// Local Variables:
711// mode: C++
712// fill-column: 76
713// c-file-style: "gnu"
714// indent-tabs-mode: nil
715// End:
716// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status