monotone

monotone Mtn Source Tree

Root/src/globish.cc

1// Copyright (C) 2005 Nathaniel Smith <njs@pobox.com>
2// 2007 Zack Weinberg <zackw@panix.com>
3//
4// This program is made available under the GNU GPL version 2.0 or
5// greater. See the accompanying file COPYING for details.
6//
7// This program is distributed WITHOUT ANY WARRANTY; without even the
8// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
9// PURPOSE.
10
11#include "base.hh"
12#include "sanity.hh"
13#include "globish.hh"
14#include "option.hh" // for arg_type
15#include "numeric_vocab.hh"
16
17#include <iterator>
18#include <ostream>
19
20using std::string;
21using std::vector;
22using std::back_inserter;
23using std::back_insert_iterator;
24
25// The algorithm here is originally from pdksh 5. That implementation uses
26// the high bit of unsigned chars as a quotation flag. We can't do that,
27// because we need to be utf8 clean. Instead, we copy the string and
28// replace "live" metacharacters with single bytes from the
29// control-character range. This is why bytes <= 0x1f are not allowed in the
30// pattern.
31
32enum metachar {
33 META_STAR = 1, // *
34 META_QUES, // ?
35 META_CC_BRA, // [
36 META_CC_INV_BRA, // [^ or [!
37 META_CC_KET, // ] (matches either of the above two)
38 META_ALT_BRA, // {
39 META_ALT_OR, // , (when found inside unquoted { ... })
40 META_ALT_KET, // }
41};
42
43// Compile a character class.
44
45static string::const_iterator
46compile_charclass(string const & pat, string::const_iterator p,
47 back_insert_iterator<string> & to,
48 origin::type made_from)
49{
50 string in_class;
51 char bra = (char)META_CC_BRA;
52
53 p++;
54 E(p != pat.end(), made_from,
55 F("invalid pattern '%s': unmatched '['") % pat);
56
57 if (*p == '!' || *p == '^')
58 {
59 bra = (char)META_CC_INV_BRA;
60 p++;
61 E(p != pat.end(), made_from,
62 F("invalid pattern '%s': unmatched '['") % pat);
63 }
64
65 while (p != pat.end() && *p != ']')
66 {
67 if (*p == '\\')
68 {
69 p++;
70 if (p == pat.end())
71 break;
72 }
73 // A dash at the beginning or end of the pattern is literal.
74 else if (*p == '-'
75 && !in_class.empty()
76 && p+1 != pat.end()
77 && p[1] != ']')
78 {
79 p++;
80 if (*p == '\\')
81 p++;
82 if (p == pat.end())
83 break;
84
85 // the cast is needed because boost::format will not obey the %x
86 // if given a 'char'.
87 E((widen<unsigned int, char>(*p)) >= ' ', made_from,
88 F("invalid pattern '%s': control character 0x%02x is not allowed")
89 % pat % (widen<unsigned int, char>(*p)));
90
91 unsigned int start = widen<unsigned int, char>(in_class.end()[-1]);
92 unsigned int stop = widen<unsigned int, char>(*p);
93
94 E(start != stop, made_from,
95 F("invalid pattern '%s': "
96 "one-element character ranges are not allowed") % pat);
97 E(start < stop, made_from,
98 F("invalid pattern '%s': "
99 "endpoints of a character range must be in "
100 "ascending numeric order") % pat);
101 E(start < 0x80 && stop < 0x80, made_from,
102 F("invalid pattern '%s': cannot use non-ASCII characters "
103 "in classes") % pat);
104
105 L(FL("expanding range from %X (%c) to %X (%c)")
106 % (start+1) % (char)(start+1) % stop % (char)stop);
107
108 for (unsigned int r = start + 1; r < stop; r++)
109 in_class.push_back((char)r);
110 }
111 else
112 E(*p != '[', made_from,
113 F("syntax error in '%s': "
114 "character classes may not be nested") % pat);
115
116 E((widen<unsigned int, char>(*p)) >= ' ', made_from,
117 F("invalid pattern '%s': control character 0x%02x is not allowed")
118 % pat % (widen<unsigned int, char>(*p)));
119
120 E((widen<unsigned int, char>(*p)) < 0x80, made_from,
121 F("invalid pattern '%s': cannot use non-ASCII characters in classes")
122 % pat);
123
124 in_class.push_back(*p);
125 p++;
126 }
127
128 E(p != pat.end(), made_from,
129 F("invalid pattern '%s': unmatched '['") % pat);
130
131 E(!in_class.empty(), made_from,
132 F("invalid pattern '%s': empty character class") % pat);
133
134 // minor optimization: one-element non-inverted character class becomes
135 // the character.
136 if (bra == (char)META_CC_BRA && in_class.size() == 1)
137 *to++ = in_class[0];
138 else
139 {
140 *to++ = bra;
141 std::sort(in_class.begin(), in_class.end());
142 std::copy(in_class.begin(), in_class.end(), to);
143 *to++ = (char)META_CC_KET;
144 }
145 return p;
146}
147
148// Compile one fragment of a glob pattern.
149
150static void
151compile_frag(string const & pat, back_insert_iterator<string> & to,
152 origin::type made_from)
153{
154 unsigned int brace_depth = 0;
155
156 for (string::const_iterator p = pat.begin(); p != pat.end(); p++)
157 switch (*p)
158 {
159 default:
160 E((widen<unsigned int, char>(*p)) >= ' ', made_from,
161 F("invalid pattern '%s': control character 0x%02x is not allowed")
162 % pat % (widen<unsigned int, char>(*p)));
163
164 *to++ = *p;
165 break;
166
167 case '*':
168 // optimization: * followed by any sequence of ?s and *s is
169 // equivalent to the number of ?s that appeared in the sequence,
170 // followed by a single star. the latter can be matched without
171 // nearly as much backtracking.
172
173 for (p++; p != pat.end(); p++)
174 {
175 if (*p == '?')
176 *to++ = META_QUES;
177 else if (*p != '*')
178 break;
179 }
180
181 p--;
182 *to++ = META_STAR;
183 break;
184
185 case '?':
186 *to++ = META_QUES;
187 break;
188
189 case '\\':
190 p++;
191 E(p != pat.end(), made_from,
192 F("invalid pattern '%s': un-escaped \\ at end") % pat);
193
194 E((widen<unsigned int, char>(*p)) >= ' ', made_from,
195 F("invalid pattern '%s': control character 0x%02x is not allowed")
196 % pat % (widen<unsigned int, char>(*p)));
197
198 *to++ = *p;
199 break;
200
201 case '[':
202 p = compile_charclass(pat, p, to, made_from);
203 break;
204
205 case ']':
206 E(false, made_from, F("invalid pattern '%s': unmatched ']'") % pat);
207
208 case '{':
209 // There's quite a bit of optimization we could be doing on
210 // alternatives, but it's hairy, especially if you get into
211 // nested alternatives; so we're not doing any of it now.
212 // (Look at emacs's regexp-opt.el for inspiration.)
213 brace_depth++;
214 E(brace_depth < 6, made_from,
215 F("invalid pattern '%s': braces nested too deeply") % pat);
216 *to++ = META_ALT_BRA;
217 break;
218
219 case ',':
220 if (brace_depth > 0)
221 *to++ = META_ALT_OR;
222 else
223 *to++ = ',';
224 break;
225
226 case '}':
227 E(brace_depth > 0, made_from,
228 F("invalid pattern '%s': unmatched '}'") % pat);
229 brace_depth--;
230 *to++ = META_ALT_KET;
231 break;
232 }
233
234 E(brace_depth == 0, made_from,
235 F("invalid pattern '%s': unmatched '{'") % pat);
236}
237
238// common code used by the constructors.
239
240static inline string
241compile(string const & pat, origin::type made_from)
242{
243 string s;
244 back_insert_iterator<string> to = back_inserter(s);
245 compile_frag(pat, to, made_from);
246 return s;
247}
248
249static inline string
250compile(vector<arg_type>::const_iterator const & beg,
251 vector<arg_type>::const_iterator const & end)
252{
253 if (end - beg == 0)
254 return "";
255 if (end - beg == 1)
256 return compile((*beg)(), origin::user);
257
258 string s;
259 back_insert_iterator<string> to = back_inserter(s);
260
261 *to++ = META_ALT_BRA;
262 vector<arg_type>::const_iterator i = beg;
263 for (;;)
264 {
265 compile_frag((*i)(), to, origin::user);
266 i++;
267 if (i == end)
268 break;
269 *to++ = META_ALT_OR;
270 }
271 *to++ = META_ALT_KET;
272 return s;
273}
274
275globish::globish(string const & p, origin::type made_from)
276 : origin_aware(made_from),
277 compiled_pattern(compile(p, made_from)) {}
278globish::globish(char const * p, origin::type made_from)
279 : origin_aware(made_from),
280 compiled_pattern(compile(p, made_from)) {}
281
282globish::globish(vector<arg_type> const & p)
283 : origin_aware(origin::user),
284 compiled_pattern(compile(p.begin(), p.end())) {}
285globish::globish(vector<arg_type>::const_iterator const & beg,
286 vector<arg_type>::const_iterator const & end)
287 : origin_aware(origin::user),
288 compiled_pattern(compile(beg, end)) {}
289
290// Debugging.
291
292static string
293decode(string::const_iterator p, string::const_iterator end, bool escaped = true)
294{
295 string s;
296 for (; p != end; p++)
297 switch (*p)
298 {
299 case META_STAR: s.push_back('*'); break;
300 case META_QUES: s.push_back('?'); break;
301 case META_CC_BRA: s.push_back('['); break;
302 case META_CC_KET: s.push_back(']'); break;
303 case META_CC_INV_BRA: s.push_back('[');
304 s.push_back('!'); break;
305
306 case META_ALT_BRA: s.push_back('{'); break;
307 case META_ALT_KET: s.push_back('}'); break;
308 case META_ALT_OR: s.push_back(','); break;
309
310 // Some of these are only special in certain contexts,
311 // so for these contexts we don't want to escape them
312 case '[': case ']': case '-': case '!': case '^':
313 case '{': case '}': case ',':
314 case '*': case '?': case '\\':
315 if (escaped)
316 s.push_back('\\');
317 // fall through
318 default:
319 s.push_back(*p);
320 }
321 return s;
322}
323
324string
325globish::operator()() const
326{
327 return decode(compiled_pattern.begin(), compiled_pattern.end());
328}
329
330string
331globish::unescaped() const
332{
333 return decode(compiled_pattern.begin(), compiled_pattern.end(), false);
334}
335
336bool
337globish::contains_meta_chars() const
338{
339 string::const_iterator p = compiled_pattern.begin();
340 for (; p != compiled_pattern.end(); p++)
341 switch (*p)
342 {
343 case META_STAR:
344 case META_QUES:
345 case META_CC_BRA:
346 case META_CC_KET:
347 case META_CC_INV_BRA:
348 case META_ALT_BRA:
349 case META_ALT_KET:
350 case META_ALT_OR:
351 return true;
352 }
353 return false;
354}
355
356template <> void dump(globish const & g, string & s)
357{
358 s = g();
359}
360
361std::ostream & operator<<(std::ostream & o, globish const & g)
362{
363 return o << g();
364}
365
366// Matching.
367
368static string::const_iterator
369find_next_subpattern(string::const_iterator p,
370 string::const_iterator pe,
371 bool want_alternatives)
372{
373 L(FL("Finding subpattern in '%s'") % decode(p, pe));
374 unsigned int depth = 1;
375 for (; p != pe; p++)
376 switch (*p)
377 {
378 default:
379 break;
380
381 case META_ALT_BRA:
382 depth++;
383 break;
384
385 case META_ALT_KET:
386 depth--;
387 if (depth == 0)
388 return p+1;
389 break;
390
391 case META_ALT_OR:
392 if (depth == 1 && want_alternatives)
393 return p+1;
394 break;
395 }
396
397 I(false);
398}
399
400
401static bool
402do_match(string::const_iterator sb, string::const_iterator se,
403 string::const_iterator p, string::const_iterator pe)
404{
405 unsigned int sc, pc;
406 string::const_iterator s(sb);
407
408 L(FL("subpattern: '%s' against '%s'") % string(s,se) % decode(p,pe));
409
410 while (p < pe)
411 {
412 // pc will be the current pattern character
413 // p will point after pc
414 pc = widen<unsigned int, char>(*p++);
415 // sc will be the current string character
416 // s will point to sc
417 if(s < se) {
418 sc = widen<unsigned int, char>(*s);
419 } else {
420 sc = 0;
421 }
422 switch (pc)
423 {
424 default: // literal
425 if (sc != pc)
426 return false;
427 break;
428
429 case META_QUES: // any single character
430 if (sc == 0)
431 return false;
432 break;
433
434 case META_CC_BRA: // any of these characters
435 {
436 bool matched = false;
437 I(p < pe);
438 I(*p != META_CC_KET);
439 do
440 {
441 if (widen<unsigned int, char>(*p) == sc)
442 matched = true;
443 p++;
444 I(p < pe);
445 }
446 while (*p != META_CC_KET);
447 if (!matched)
448 return false;
449 }
450 p++;
451 break;
452
453 case META_CC_INV_BRA: // any but these characters
454 I(p < pe);
455 I(*p != META_CC_KET);
456 do
457 {
458 if (widen<unsigned int, char>(*p) == sc)
459 return false;
460 p++;
461 I(p < pe);
462 }
463 while (*p != META_CC_KET);
464 p++;
465 break;
466
467 case META_STAR: // zero or more arbitrary characters
468 if (p == pe)
469 return true; // star at end always matches, if we get that far
470
471 pc = widen<unsigned int, char>(*p);
472 // If the next character in p is not magic, we can only match
473 // starting from places in s where that character appears.
474 if (pc >= ' ')
475 {
476 L(FL("after *: looking for '%c' in '%s'")
477 % (char)pc % string(s, se));
478 p++;
479 for (;;)
480 {
481 ++s;
482 if (sc == pc && do_match(s, se, p, pe))
483 return true;
484 if (s >= se)
485 break;
486 sc = widen<unsigned int, char>(*s);
487 }
488 }
489 else
490 {
491 L(FL("metacharacter after *: doing it the slow way"));
492 do
493 {
494 if (do_match(s, se, p, pe))
495 return true;
496 s++;
497 }
498 while (s < se);
499 }
500 return false;
501
502 case META_ALT_BRA:
503 {
504 string::const_iterator prest, psub, pnext;
505 string::const_iterator srest;
506
507 prest = find_next_subpattern(p, pe, false);
508 psub = p;
509 // [ psub ... prest ) is the current bracket pair
510 // (including the *closing* braket, but not the opening braket)
511 do
512 {
513 pnext = find_next_subpattern(psub, pe, true);
514 // pnext points just after a comma or the closing braket
515 // [ psub ... pnext ) is one branch with trailing delimiter
516 srest = (prest == pe ? se : s);
517 for (; srest < se; srest++)
518 {
519 if (do_match(s, srest, psub, pnext - 1)
520 && do_match(srest, se, prest, pe))
521 return true;
522 }
523 // try the empty target too
524 if (do_match(s, srest, psub, pnext - 1)
525 && do_match(srest, se, prest, pe))
526 return true;
527
528 psub = pnext;
529 }
530 while (pnext < prest);
531 return false;
532 }
533 }
534 if (s < se)
535 {
536 ++s;
537 }
538 }
539 return s == se;
540}
541
542bool globish::matches(string const & target) const
543{
544 bool result;
545
546 // The empty pattern matches nothing.
547 if (compiled_pattern.empty())
548 result = false;
549 else
550 result = do_match (target.begin(), target.end(),
551 compiled_pattern.begin(), compiled_pattern.end());
552
553 L(FL("matching '%s' against '%s': %s")
554 % target % (*this)() % (result ? "matches" : "does not match"));
555 return result;
556}
557
558
559// Local Variables:
560// mode: C++
561// fill-column: 76
562// c-file-style: "gnu"
563// indent-tabs-mode: nil
564// End:
565// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status