monotone

monotone Mtn Source Tree

Root/globish.cc

1// Copyright (C) 2005 Nathaniel Smith <njs@pobox.com>
2// Copyright (C) 2007 Zack Weinberg <zackw@panix.com>
3//
4// This program is made available under the GNU GPL version 2.0 or
5// greater. See the accompanying file COPYING for details.
6//
7// This program is distributed WITHOUT ANY WARRANTY; without even the
8// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
9// PURPOSE.
10
11#include "base.hh"
12#include "sanity.hh"
13#include "globish.hh"
14#include "option.hh" // for arg_type
15#include "numeric_vocab.hh"
16
17#include <iterator>
18#include <ostream>
19
20using std::string;
21using std::vector;
22using std::back_inserter;
23using std::back_insert_iterator;
24
25// The algorithm here is originally from pdksh 5. That implementation uses
26// the high bit of unsigned chars as a quotation flag. We can't do that,
27// because we need to be utf8 clean. Instead, we copy the string and
28// replace "live" metacharacters with single bytes from the
29// control-character range. This is why bytes <= 0x1f are not allowed in the
30// pattern.
31
32enum metachar {
33 META_STAR = 1, // *
34 META_QUES, // ?
35 META_CC_BRA, // [
36 META_CC_INV_BRA, // [^ or [!
37 META_CC_KET, // ] (matches either of the above two)
38 META_ALT_BRA, // {
39 META_ALT_OR, // , (when found inside unquoted { ... })
40 META_ALT_KET, // }
41};
42
43// Compile a character class.
44
45static string::const_iterator
46compile_charclass(string const & pat, string::const_iterator p,
47 back_insert_iterator<string> & to)
48{
49 string in_class;
50 char bra = (char)META_CC_BRA;
51
52 p++;
53 N(p != pat.end(),
54 F("invalid pattern '%s': unmatched '['") % pat);
55
56 if (*p == '!' || *p == '^')
57 {
58 bra = (char)META_CC_INV_BRA;
59 p++;
60 N(p != pat.end(),
61 F("invalid pattern '%s': unmatched '['") % pat);
62 }
63
64 while (p != pat.end() && *p != ']')
65 {
66 if (*p == '\\')
67 {
68 p++;
69 if (p == pat.end())
70 break;
71 }
72 // A dash at the beginning or end of the pattern is literal.
73 else if (*p == '-'
74 && in_class.size() != 0
75 && p+1 != pat.end()
76 && p[1] != ']')
77 {
78 p++;
79 if (*p == '\\')
80 p++;
81 if (p == pat.end())
82 break;
83
84 // the cast is needed because boost::format will not obey the %x
85 // if given a 'char'.
86 N((widen<unsigned int, char>(*p)) >= ' ',
87 F("invalid pattern '%s': control character 0x%02x is not allowed")
88 % pat % (widen<unsigned int, char>(*p)));
89
90 unsigned int start = widen<unsigned int, char>(in_class.end()[-1]);
91 unsigned int stop = widen<unsigned int, char>(*p);
92
93 N(start != stop,
94 F("invalid pattern '%s': "
95 "one-element character ranges are not allowed") % pat);
96 N(start < stop,
97 F("invalid pattern '%s': "
98 "endpoints of a character range must be in "
99 "ascending numeric order") % pat);
100 N(start < 0x80 && stop < 0x80,
101 F("invalid pattern '%s': cannot use non-ASCII characters "
102 "in classes") % pat);
103
104 L(FL("expanding range from %X (%c) to %X (%c)")
105 % (start+1) % (char)(start+1) % stop % (char)stop);
106
107 for (unsigned int r = start + 1; r < stop; r++)
108 in_class.push_back((char)r);
109 }
110 else
111 N(*p != '[', F("syntax error in '%s': "
112 "character classes may not be nested") % pat);
113
114 N((widen<unsigned int, char>(*p)) >= ' ',
115 F("invalid pattern '%s': control character 0x%02x is not allowed")
116 % pat % (widen<unsigned int, char>(*p)));
117
118 N((widen<unsigned int, char>(*p)) < 0x80,
119 F("invalid pattern '%s': cannot use non-ASCII characters in classes")
120 % pat);
121
122 in_class.push_back(*p);
123 p++;
124 }
125
126 N(p != pat.end(),
127 F("invalid pattern '%s': unmatched '['") % pat);
128
129 N(in_class.size() != 0,
130 F("invalid pattern '%s': empty character class") % pat);
131
132 // minor optimization: one-element non-inverted character class becomes
133 // the character.
134 if (bra == (char)META_CC_BRA && in_class.size() == 1)
135 *to++ = in_class[0];
136 else
137 {
138 *to++ = bra;
139 std::sort(in_class.begin(), in_class.end());
140 std::copy(in_class.begin(), in_class.end(), to);
141 *to++ = (char)META_CC_KET;
142 }
143 return p;
144}
145
146// Compile one fragment of a glob pattern.
147
148static void
149compile_frag(string const & pat, back_insert_iterator<string> & to)
150{
151 unsigned int brace_depth = 0;
152
153 for (string::const_iterator p = pat.begin(); p != pat.end(); p++)
154 switch (*p)
155 {
156 default:
157 N((widen<unsigned int, char>(*p)) >= ' ',
158 F("invalid pattern '%s': control character 0x%02x is not allowed")
159 % pat % (widen<unsigned int, char>(*p)));
160
161 *to++ = *p;
162 break;
163
164 case '*':
165 // optimization: * followed by any sequence of ?s and *s is
166 // equivalent to the number of ?s that appeared in the sequence,
167 // followed by a single star. the latter can be matched without
168 // nearly as much backtracking.
169
170 for (p++; p != pat.end(); p++)
171 {
172 if (*p == '?')
173 *to++ = META_QUES;
174 else if (*p != '*')
175 break;
176 }
177
178 p--;
179 *to++ = META_STAR;
180 break;
181
182 case '?':
183 *to++ = META_QUES;
184 break;
185
186 case '\\':
187 p++;
188 N(p != pat.end(),
189 F("invalid pattern '%s': un-escaped \\ at end") % pat);
190
191 N((widen<unsigned int, char>(*p)) >= ' ',
192 F("invalid pattern '%s': control character 0x%02x is not allowed")
193 % pat % (widen<unsigned int, char>(*p)));
194
195 *to++ = *p;
196 break;
197
198 case '[':
199 p = compile_charclass(pat, p, to);
200 break;
201
202 case ']':
203 N(false, F("invalid pattern '%s': unmatched ']'") % pat);
204
205 case '{':
206 // There's quite a bit of optimization we could be doing on
207 // alternatives, but it's hairy, especially if you get into
208 // nested alternatives; so we're not doing any of it now.
209 // (Look at emacs's regexp-opt.el for inspiration.)
210 brace_depth++;
211 N(brace_depth < 6,
212 F("invalid pattern '%s': braces nested too deeply") % pat);
213 *to++ = META_ALT_BRA;
214 break;
215
216 case ',':
217 if (brace_depth > 0)
218 *to++ = META_ALT_OR;
219 else
220 *to++ = ',';
221 break;
222
223 case '}':
224 N(brace_depth > 0,
225 F("invalid pattern '%s': unmatched '}'") % pat);
226 brace_depth--;
227 *to++ = META_ALT_KET;
228 break;
229 }
230
231 N(brace_depth == 0,
232 F("invalid pattern '%s': unmatched '{'") % pat);
233}
234
235// common code used by the constructors.
236
237static inline string
238compile(string const & pat)
239{
240 string s;
241 back_insert_iterator<string> to = back_inserter(s);
242 compile_frag(pat, to);
243 return s;
244}
245
246static inline string
247compile(vector<arg_type>::const_iterator const & beg,
248 vector<arg_type>::const_iterator const & end)
249{
250 if (end - beg == 0)
251 return "";
252 if (end - beg == 1)
253 return compile((*beg)());
254
255 string s;
256 back_insert_iterator<string> to = back_inserter(s);
257
258 *to++ = META_ALT_BRA;
259 vector<arg_type>::const_iterator i = beg;
260 for (;;)
261 {
262 compile_frag((*i)(), to);
263 i++;
264 if (i == end)
265 break;
266 *to++ = META_ALT_OR;
267 }
268 *to++ = META_ALT_KET;
269 return s;
270}
271
272globish::globish(string const & p) : compiled_pattern(compile(p)) {}
273globish::globish(char const * p) : compiled_pattern(compile(p)) {}
274
275globish::globish(vector<arg_type> const & p)
276 : compiled_pattern(compile(p.begin(), p.end())) {}
277globish::globish(vector<arg_type>::const_iterator const & beg,
278 vector<arg_type>::const_iterator const & end)
279 : compiled_pattern(compile(beg, end)) {}
280
281// Debugging.
282
283static string
284decode(string::const_iterator p, string::const_iterator end)
285{
286 string s;
287 for (; p != end; p++)
288 switch (*p)
289 {
290 case META_STAR: s.push_back('*'); break;
291 case META_QUES: s.push_back('?'); break;
292 case META_CC_BRA: s.push_back('['); break;
293 case META_CC_KET: s.push_back(']'); break;
294 case META_CC_INV_BRA: s.push_back('[');
295 s.push_back('!'); break;
296
297 case META_ALT_BRA: s.push_back('{'); break;
298 case META_ALT_KET: s.push_back('}'); break;
299 case META_ALT_OR: s.push_back(','); break;
300
301 // Some of these are only special in certain contexts,
302 // but it does no harm to escape them always.
303 case '[': case ']': case '-': case '!': case '^':
304 case '{': case '}': case ',':
305 case '*': case '?': case '\\':
306 s.push_back('\\');
307 // fall through
308 default:
309 s.push_back(*p);
310 }
311 return s;
312}
313
314string
315globish::operator()() const
316{
317 return decode(compiled_pattern.begin(), compiled_pattern.end());
318}
319
320template <> void dump(globish const & g, string & s)
321{
322 s = g();
323}
324
325std::ostream & operator<<(std::ostream & o, globish const & g)
326{
327 return o << g();
328}
329
330// Matching.
331
332static string::const_iterator
333find_next_subpattern(string::const_iterator p,
334 string::const_iterator pe,
335 bool want_alternatives)
336{
337 unsigned int depth = 1;
338 for (; p != pe; p++)
339 switch (*p)
340 {
341 default: break;
342
343 case META_ALT_BRA:
344 depth++; break;
345
346 case META_ALT_KET:
347 depth--;
348 if (depth == 0)
349 return p+1;
350
351 case META_ALT_OR:
352 if (depth == 1 && want_alternatives)
353 return p+1;
354 }
355
356 I(false);
357}
358
359
360static bool
361do_match(string::const_iterator s, string::const_iterator se,
362 string::const_iterator p, string::const_iterator pe)
363{
364 unsigned int sc, pc;
365
366 if (global_sanity.debug_p()) // decode() is expensive
367 L(FL("subpattern: '%s' against '%s'") % string(s,se) % decode(p,pe));
368
369 while (p < pe)
370 {
371 pc = widen<unsigned int, char>(*p++);
372 sc = s < se ? widen<unsigned int, char>(*s) : 0;
373 s++;
374 switch (pc)
375 {
376 default: // literal
377 if (sc != pc)
378 return false;
379 break;
380
381 case META_QUES: // any single character
382 if (sc == 0)
383 return false;
384 break;
385
386 case META_CC_BRA: // any of these characters
387 {
388 bool matched = false;
389 I(p < pe);
390 I(*p != META_CC_KET);
391 do
392 {
393 if (widen<unsigned int, char>(*p) == sc)
394 matched = true;
395 p++;
396 I(p < pe);
397 }
398 while (*p != META_CC_KET);
399 if (!matched)
400 return false;
401 }
402 p++;
403 break;
404
405 case META_CC_INV_BRA: // any but these characters
406 I(p < pe);
407 I(*p != META_CC_KET);
408 do
409 {
410 if (widen<unsigned int, char>(*p) == sc)
411 return false;
412 p++;
413 I(p < pe);
414 }
415 while (*p != META_CC_KET);
416 p++;
417 break;
418
419 case META_STAR: // zero or more arbitrary characters
420 if (p == pe)
421 return true; // star at end always matches, if we get that far
422
423 pc = widen<unsigned int, char>(*p);
424 // If the next character in p is not magic, we can only match
425 // starting from places in s where that character appears.
426 if (pc >= ' ')
427 {
428 if (global_sanity.debug_p())
429 L(FL("after *: looking for '%c' in '%c%s'")
430 % (char)pc % (char)sc % string(s, se));
431 p++;
432 for (;;)
433 {
434 if (sc == pc && do_match(s, se, p, pe))
435 return true;
436 if (s >= se)
437 break;
438 sc = widen<unsigned int, char>(*s++);
439 }
440 }
441 else
442 {
443 if (global_sanity.debug_p())
444 L(FL("metacharacter after *: doing it the slow way"));
445 s--;
446 do
447 {
448 if (do_match(s, se, p, pe))
449 return true;
450 s++;
451 }
452 while (s < se);
453 }
454 return false;
455
456 case META_ALT_BRA:
457 {
458 string::const_iterator prest, psub, pnext;
459 string::const_iterator srest;
460
461 prest = find_next_subpattern(p, pe, false);
462 psub = p;
463 s--;
464 do
465 {
466 pnext = find_next_subpattern(psub, pe, true);
467 srest = (prest == pe ? se : s);
468 for (; srest < se; srest++)
469 {
470 if (do_match(s, srest, psub, pnext - 1)
471 && do_match(srest, se, prest, pe))
472 return true;
473 }
474 // try the empty target too
475 if (do_match(s, srest, psub, pnext - 1)
476 && do_match(srest, se, prest, pe))
477 return true;
478
479 psub = pnext;
480 }
481 while (pnext < prest);
482 return false;
483 }
484 }
485 }
486 return s == se;
487}
488
489bool globish::matches(string const & target) const
490{
491 bool result;
492
493 // The empty pattern matches nothing.
494 if (compiled_pattern.empty())
495 result = false;
496 else
497 result = do_match (target.begin(), target.end(),
498 compiled_pattern.begin(), compiled_pattern.end());
499
500 L(FL("matching '%s' against '%s': %s")
501 % target % (*this)() % (result ? "matches" : "does not match"));
502 return result;
503}
504
505#ifdef BUILD_UNIT_TESTS
506#include "unit_tests.hh"
507
508UNIT_TEST(globish, syntax)
509{
510 struct tcase
511 {
512 char const * in;
513 char const * out;
514 };
515 tcase const good[] = {
516 { "a", "a" },
517 { "\\a", "a" },
518 { "[a]", "a" },
519 { "[!a]", "[!a]" },
520 { "[^a]", "[!a]" },
521 { "[\\!a]", "[\\!a]" },
522 { "[\\^a]", "[\\^a]" },
523 { "[ab]", "[ab]" },
524 { "[a-b]", "[ab]" },
525 { "[a-c]", "[abc]" },
526 { "[ac-]", "[\\-ac]" },
527 { "[-ac]", "[\\-ac]" },
528 { "[+-/]", "[+\\,\\-./]" },
529
530 { "\xC2\xA1", "\xC2\xA1" }, // U+00A1 in UTF8
531
532 { "*", "*" },
533 { "\\*", "\\*" },
534 { "[*]", "\\*" },
535 { "?", "?" },
536 { "\\?", "\\?" },
537 { "[?]", "\\?" },
538 { ",", "\\," },
539 { "\\,", "\\," },
540 { "[,]", "\\," },
541 { "\\{", "\\{" },
542 { "[{]", "\\{" },
543 { "[}]", "\\}" },
544 { "\\[", "\\[" },
545 { "\\]", "\\]" },
546 { "\\\\", "\\\\" },
547
548 { "**", "*" },
549 { "*?", "?*" },
550 { "*???*?*", "????*" },
551 { "*a?*?b*", "*a??*b*" },
552
553 { "{a,b,c}d", "{a,b,c}d" },
554 { "foo{a,{b,c},?*}d", "foo{a,{b,c},?*}d" },
555 { "\\a\\b\\|\\{\\*", "ab|\\{\\*" },
556 { ".+$^{}", ".+$\\^{}" },
557 { "\\.\\+\\$\\^\\(\\)", ".+$\\^()" },
558 { 0, 0 }
559 };
560
561 char const * const bad[] = {
562 "[",
563 "[!",
564 "[\\",
565 "[\\]",
566 "[foo",
567 "[!foo",
568 "foo]",
569 "[\003]",
570 "[a-a]",
571 "[f-a]",
572 "[]",
573 "[\xC2\xA1]",
574 "[\xC2\xA1\xC2\xA2]",
575 "[\xC2\xA1-\xC2\xA2]",
576 "[-\xC2\xA1]",
577 "[[]",
578 "[]",
579
580 "\003",
581 "foo\\",
582 "{foo",
583 "{foo,bar{baz,quux}",
584 "foo}",
585 "foo,bar{baz,quux}}",
586 "{{{{{{{{{{a,b},c},d},e},f},g},h},i},j},k}",
587 0
588 };
589 char const dummy[] = "";
590
591 for (tcase const * p = good; p->in; p++)
592 {
593 globish g(p->in);
594 string s;
595 dump(g, s);
596 L(FL("globish syntax: %s -> %s [expect %s]") % p->in % s % p->out);
597 UNIT_TEST_CHECK(s == p->out);
598 }
599
600 for (char const * const * p = bad; *p; p++)
601 {
602 L(FL("globish syntax: invalid %s") % *p);
603 UNIT_TEST_CHECK_THROW(I(globish(*p).matches(dummy)), informative_failure);
604 }
605}
606
607UNIT_TEST(globish, from_vector)
608{
609 vector<arg_type> v;
610 v.push_back(arg_type("a"));
611 v.push_back(arg_type("b"));
612 v.push_back(arg_type("c"));
613 globish combined(v);
614 string s;
615 dump(combined, s);
616 UNIT_TEST_CHECK(s == "{a,b,c}");
617}
618
619UNIT_TEST(globish, simple_matches)
620{
621 UNIT_TEST_CHECK(globish("abc").matches("abc"));
622 UNIT_TEST_CHECK(!globish("abc").matches("aac"));
623
624 UNIT_TEST_CHECK(globish("a[bc]d").matches("abd"));
625 UNIT_TEST_CHECK(globish("a[bc]d").matches("acd"));
626 UNIT_TEST_CHECK(!globish("a[bc]d").matches("and"));
627 UNIT_TEST_CHECK(!globish("a[bc]d").matches("ad"));
628 UNIT_TEST_CHECK(!globish("a[bc]d").matches("abbd"));
629
630 UNIT_TEST_CHECK(globish("a[!bc]d").matches("and"));
631 UNIT_TEST_CHECK(globish("a[!bc]d").matches("a#d"));
632 UNIT_TEST_CHECK(!globish("a[!bc]d").matches("abd"));
633 UNIT_TEST_CHECK(!globish("a[!bc]d").matches("acd"));
634 UNIT_TEST_CHECK(!globish("a[!bc]d").matches("ad"));
635 UNIT_TEST_CHECK(!globish("a[!bc]d").matches("abbd"));
636
637 UNIT_TEST_CHECK(globish("a?c").matches("abc"));
638 UNIT_TEST_CHECK(globish("a?c").matches("aac"));
639 UNIT_TEST_CHECK(globish("a?c").matches("a%c"));
640 UNIT_TEST_CHECK(!globish("a?c").matches("a%d"));
641 UNIT_TEST_CHECK(!globish("a?c").matches("d%d"));
642 UNIT_TEST_CHECK(!globish("a?c").matches("d%c"));
643 UNIT_TEST_CHECK(!globish("a?c").matches("a%%d"));
644
645 UNIT_TEST_CHECK(globish("a*c").matches("ac"));
646 UNIT_TEST_CHECK(globish("a*c").matches("abc"));
647 UNIT_TEST_CHECK(globish("a*c").matches("abac"));
648 UNIT_TEST_CHECK(globish("a*c").matches("abbcc"));
649 UNIT_TEST_CHECK(globish("a*c").matches("abcbbc"));
650 UNIT_TEST_CHECK(!globish("a*c").matches("abcbb"));
651 UNIT_TEST_CHECK(!globish("a*c").matches("abcb"));
652 UNIT_TEST_CHECK(!globish("a*c").matches("aba"));
653 UNIT_TEST_CHECK(!globish("a*c").matches("ab"));
654
655 UNIT_TEST_CHECK(globish("*.bak").matches(".bak"));
656 UNIT_TEST_CHECK(globish("*.bak").matches("a.bak"));
657 UNIT_TEST_CHECK(globish("*.bak").matches("foo.bak"));
658 UNIT_TEST_CHECK(globish("*.bak").matches(".bak.bak"));
659 UNIT_TEST_CHECK(globish("*.bak").matches("fwibble.bak.bak"));
660
661 UNIT_TEST_CHECK(globish("a*b*[cd]").matches("abc"));
662 UNIT_TEST_CHECK(globish("a*b*[cd]").matches("abcd"));
663 UNIT_TEST_CHECK(globish("a*b*[cd]").matches("aabrd"));
664 UNIT_TEST_CHECK(globish("a*b*[cd]").matches("abbbbbbbccd"));
665 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("ab"));
666 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("abde"));
667 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("aaaaaaab"));
668 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("axxxxd"));
669 UNIT_TEST_CHECK(!globish("a*b*[cd]").matches("adb"));
670}
671
672UNIT_TEST(globish, complex_matches)
673{ {
674 globish_matcher m(globish("{a,b}?*\\*|"), globish("*c*"));
675 UNIT_TEST_CHECK(m("aq*|"));
676 UNIT_TEST_CHECK(m("bq*|"));
677 UNIT_TEST_CHECK(!m("bc*|"));
678 UNIT_TEST_CHECK(!m("bq|"));
679 UNIT_TEST_CHECK(!m("b*|"));
680 UNIT_TEST_CHECK(!m(""));
681 }
682 {
683 globish_matcher m(globish("{a,\\\\,b*}"), globish("*c*"));
684 UNIT_TEST_CHECK(m("a"));
685 UNIT_TEST_CHECK(!m("ab"));
686 UNIT_TEST_CHECK(m("\\"));
687 UNIT_TEST_CHECK(!m("\\\\"));
688 UNIT_TEST_CHECK(m("b"));
689 UNIT_TEST_CHECK(m("bfoobar"));
690 UNIT_TEST_CHECK(!m("bfoobarcfoobar"));
691 }
692 {
693 globish_matcher m(globish("*"), globish(""));
694 UNIT_TEST_CHECK(m("foo"));
695 UNIT_TEST_CHECK(m(""));
696 }
697 {
698 globish_matcher m(globish("{foo}"), globish(""));
699 UNIT_TEST_CHECK(m("foo"));
700 UNIT_TEST_CHECK(!m("bar"));
701 }
702}
703
704#endif // BUILD_UNIT_TESTS
705
706// Local Variables:
707// mode: C++
708// fill-column: 76
709// c-file-style: "gnu"
710// indent-tabs-mode: nil
711// End:
712// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status