monotone

monotone Mtn Source Tree

Root/src/pcrewrap.cc

1// Copyright (C) 2007 Zack Weinberg <zackw@panix.com>
2//
3// This program is made available under the GNU GPL version 2.0 or
4// greater. See the accompanying file COPYING for details.
5//
6// This program is distributed WITHOUT ANY WARRANTY; without even the
7// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
8// PURPOSE.
9
10#include "base.hh"
11#include "pcrewrap.hh"
12#include "sanity.hh"
13#include <cstring>
14#include <map>
15#include <vector>
16
17// This dirty trick is necessary to prevent the 'pcre' typedef defined by
18// pcre.h from colliding with namespace pcre.
19#define pcre pcre_t
20#include "pcre.h"
21#undef pcre
22
23using std::make_pair;
24using std::map;
25using std::pair;
26using std::string;
27using std::vector;
28
29static NORETURN(void pcre_compile_error(int errcode, char const * err,
30 int erroff, char const * pattern,
31 origin::type caused_by));
32static NORETURN(void pcre_study_error(char const * err, char const * pattern,
33 origin::type caused_by));
34static NORETURN(void pcre_exec_error(int errcode,
35 origin::type regex_from,
36 origin::type subject_from));
37
38inline unsigned int
39flags_to_internal(pcre::flags f)
40{
41 using namespace pcre;
42#define C(f_, x) (((f_) & (x)) ? PCRE_##x : 0)
43 unsigned int i = 0;
44 i |= C(f, NEWLINE_CR);
45 i |= C(f, NEWLINE_LF);
46 // NEWLINE_CRLF == NEWLINE_CR|NEWLINE_LF and so is handled above
47 i |= C(f, ANCHORED);
48 i |= C(f, NOTBOL);
49 i |= C(f, NOTEOL);
50 i |= C(f, NOTEMPTY);
51 i |= C(f, CASELESS);
52 i |= C(f, DOLLAR_ENDONLY);
53 i |= C(f, DOTALL);
54 i |= C(f, DUPNAMES);
55 i |= C(f, EXTENDED);
56 i |= C(f, FIRSTLINE);
57 i |= C(f, MULTILINE);
58 i |= C(f, UNGREEDY);
59#undef C
60 return i;
61}
62
63inline unsigned int
64get_capturecount(void const * bd)
65{
66 unsigned int cc;
67 int err = pcre_fullinfo(static_cast<pcre_t const *>(bd), 0,
68 PCRE_INFO_CAPTURECOUNT,
69 static_cast<void *>(&cc));
70 I(err == 0);
71 return cc;
72}
73
74namespace pcre
75{
76 typedef map<char const *,
77 pair<struct real_pcre const *, struct pcre_extra const *> >
78 regex_cache;
79
80 class regex_cache_manager
81 {
82public:
83 regex_cache::const_iterator find(char const * pattern)
84 {
85 return cache.find(pattern);
86 }
87
88 void store(char const * pattern,
89 pair<struct real_pcre const *, struct pcre_extra const *>
90 data)
91 {
92 cache[pattern] = data;
93 }
94
95 regex_cache::const_iterator end()
96 {
97 return cache.end();
98 }
99
100 ~regex_cache_manager()
101 {
102 for (regex_cache::iterator iter = cache.begin();
103 iter != cache.end();
104 ++iter)
105 {
106 if (iter->second.first)
107 pcre_free(const_cast<pcre_t *>(iter->second.first));
108
109 if (iter->second.second)
110 pcre_free(const_cast<pcre_extra *>(iter->second.second));
111 }
112 }
113private:
114 regex_cache cache;
115 };
116
117 regex_cache_manager compiled;
118
119 void regex::init(char const * pattern, flags options)
120 {
121 int errcode;
122 int erroff;
123 char const * err;
124 // use the cached data if we have it
125 regex_cache::const_iterator iter = compiled.find(pattern);
126 if (iter != compiled.end())
127 {
128 basedat = iter->second.first;
129 extradat = iter->second.second;
130 return;
131 }
132 // not in cache - compile them then store in cache
133 basedat = pcre_compile2(pattern, flags_to_internal(options),
134 &errcode, &err, &erroff, 0);
135 if (!basedat)
136 pcre_compile_error(errcode, err, erroff, pattern, made_from);
137
138 pcre_extra *ed = pcre_study(basedat, 0, &err);
139 if (err)
140 pcre_study_error(err, pattern, made_from);
141 if (!ed)
142 {
143 // I resent that C++ requires this cast.
144 ed = (pcre_extra *)pcre_malloc(sizeof(pcre_extra));
145 std::memset(ed, 0, sizeof(pcre_extra));
146 }
147
148 // We set a fairly low recursion depth to avoid stack overflow.
149 // Per pcrestack(3), one should assume 500 bytes per recursion;
150 // it should be safe to let pcre have a megabyte of stack, so
151 // that's a depth of 2000, give or take. (For reference, the
152 // default stack limit on Linux is 8MB.)
153 ed->flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
154 ed->match_limit_recursion = 2000;
155 extradat = ed;
156 // store in cache
157 compiled.store(pattern, make_pair(basedat, extradat));
158 }
159
160 regex::regex(char const * pattern, origin::type whence, flags options)
161 : made_from(whence)
162 {
163 this->init(pattern, options);
164 }
165
166 regex::regex(string const & pattern, origin::type whence, flags options)
167 : made_from(whence)
168 {
169 this->init(pattern.c_str(), options);
170 }
171
172 regex::~regex()
173 {
174 }
175
176 bool
177 regex::match(string const & subject, origin::type subject_origin,
178 flags options) const
179 {
180 int rc = pcre_exec(basedat, extradat,
181 subject.data(), subject.size(),
182 0, flags_to_internal(options), 0, 0);
183 if (rc == 0)
184 return true;
185 else if (rc == PCRE_ERROR_NOMATCH)
186 return false;
187 else
188 pcre_exec_error(rc, made_from, subject_origin);
189 }
190
191 bool
192 regex::match(string const & subject, origin::type subject_origin,
193 vector<string> & matches, flags options) const
194 {
195 matches.clear();
196
197 // retrieve the capture count of the pattern from pcre_fullinfo,
198 // because pcre_exec might not signal trailing unmatched subpatterns
199 // i.e. if "abc" matches "(abc)(de)?", the match count is two, not
200 // the expected three
201 int cap_count = 0;
202 int rc = pcre_fullinfo(basedat, extradat, PCRE_INFO_CAPTURECOUNT, &cap_count);
203 I(rc == 0);
204
205 // the complete regex is captured as well
206 cap_count += 1;
207
208 int worksize = cap_count * 3;
209
210 // "int ovector[worksize]" is C99 only (not valid C++, but allowed by gcc/clang)
211 // boost::shared_array is I think not plannned to be part of C++0x
212 class xyzzy {
213 int *data;
214 public:
215 xyzzy(int len) : data(new int[len]) {}
216 ~xyzzy() { delete[] data; }
217 operator int*() { return data; }
218 } ovector(worksize);
219
220 rc = pcre_exec(basedat, extradat,
221 subject.data(), subject.size(),
222 0, flags_to_internal(options), ovector, worksize);
223
224 // since we dynamically set the work size, we should
225 // always get either a negative (error) or >= 1 match count
226 I(rc != 0);
227
228 if (rc == PCRE_ERROR_NOMATCH)
229 return false;
230 else if (rc < 0)
231 pcre_exec_error(rc, made_from, subject_origin); // throws
232
233 for (int i=0; i < cap_count; ++i)
234 {
235 string match;
236 // not an empty match
237 if (ovector[2*i] != -1 && ovector[2*i+1] != -1)
238 match.assign(subject, ovector[2*i], ovector[2*i+1] - ovector[2*i]);
239 matches.push_back(match);
240 }
241
242 return true;
243 }
244} // namespace pcre
245
246// When the library returns an error, these functions discriminate between
247// bugs in monotone and user errors in regexp writing.
248static void
249pcre_compile_error(int errcode, char const * err,
250 int erroff, char const * pattern,
251 origin::type caused_by)
252{
253 // One of the more entertaining things about the PCRE API is that
254 // while the numeric error codes are documented, they do not get
255 // symbolic names.
256
257 switch (errcode)
258 {
259 case 21: // failed to get memory
260 throw std::bad_alloc();
261
262 case 10: // [code allegedly not in use]
263 case 11: // internal error: unexpected repeat
264 case 16: // erroffset passed as NULL
265 case 17: // unknown option bit(s) set
266 case 19: // [code allegedly not in use]
267 case 23: // internal error: code overflow
268 case 33: // [code allegedly not in use]
269 case 50: // [code allegedly not in use]
270 case 52: // internal error: overran compiling workspace
271 case 53: // internal error: previously-checked referenced subpattern
272 // not found
273 throw oops((F("while compiling regex '%s': %s") % pattern % err)
274 .str().c_str());
275
276 default:
277 // PCRE fails to distinguish between errors at no position and errors at
278 // character offset 0 in the pattern, so in practice we give the
279 // position-ful variant for all errors, but I'm leaving the == -1 check
280 // here in case PCRE gets fixed.
281 E(false, caused_by, (erroff == -1
282 ? (F("error in regex '%s': %s")
283 % pattern % err)
284 : (F("error near char %d of regex '%s': %s")
285 % (erroff + 1) % pattern % err)
286 ));
287 }
288}
289
290static void
291pcre_study_error(char const * err, char const * pattern,
292 origin::type caused_by)
293{
294 // This interface doesn't even *have* error codes.
295 // If the error is not out-of-memory, it's a bug.
296 if (!std::strcmp(err, "failed to get memory"))
297 throw std::bad_alloc();
298 else
299 throw oops((F("while studying regex '%s': %s") % pattern % err)
300 .str().c_str());
301}
302
303static void
304pcre_exec_error(int errcode, origin::type regex_from, origin::type subject_from)
305{
306 // This interface provides error codes with symbolic constants for them!
307 // But it doesn't provide string versions of them. As most of them
308 // indicate bugs in monotone, it's not worth defining our own strings.
309
310 switch(errcode)
311 {
312 case PCRE_ERROR_NOMEMORY:
313 throw std::bad_alloc();
314
315 case PCRE_ERROR_MATCHLIMIT:
316 E(false, subject_from,
317 F("backtrack limit exceeded in regular expression matching"));
318
319 case PCRE_ERROR_RECURSIONLIMIT:
320 E(false, subject_from,
321 F("recursion limit exceeded in regular expression matching"));
322
323 case PCRE_ERROR_BADUTF8:
324 case PCRE_ERROR_BADUTF8_OFFSET:
325 E(false, subject_from,
326 F("invalid UTF-8 sequence found during regular expression matching"));
327
328 default:
329 throw oops((F("pcre_exec returned %d") % errcode)
330 .str().c_str());
331 }
332}
333
334// Local Variables:
335// mode: C++
336// fill-column: 76
337// c-file-style: "gnu"
338// indent-tabs-mode: nil
339// End:
340// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status