monotone

monotone Mtn Source Tree

Root/url.cc

// this is a collection of small grammars and functions to assemble /
// disassemble the string and structured forms of URLs and of the terms
// (protocol, user, host, port, path, group) they contain.
3
4#include <string>
5
6#ifdef BUILD_UNIT_TESTS
7// #define BOOST_SPIRIT_DEBUG
8#endif
9
10#include <boost/spirit.hpp>
11#include <boost/spirit/attribute.hpp>
12#include <boost/spirit/utility/regex.hpp>
13#include <boost/spirit/phoenix/binders.hpp>
14
15#include <sanity.hh>
16#include <vocab.hh>
17
18using namespace std;
19using namespace boost::spirit;
20using namespace phoenix;
21
22template < typename ResultT >
23struct result_closure : boost::spirit::closure<result_closure <ResultT>, ResultT> {
24 typedef boost::spirit::closure<result_closure<ResultT>, ResultT> base_t;
25 typename base_t::member1 val;
26};
27
28struct IDNA_LABEL : public grammar<IDNA_LABEL, result_closure<string>::context_t>
29{
30 template <typename ScannerT> struct definition
31 {
32 // vaguely derived from guidelines in RFC3490,
33 // "internationalized domain names for applications"
34 rule<ScannerT> main;
35 rule<ScannerT> const & start() const { return main; }
36 definition(IDNA_LABEL const & self)
37 {
38 main =
39regex_p("([xX][nN]--)?[a-zA-Z]([a-zA-Z0-9-]*[a-zA-Z0-9])?")
40[ self.val = construct_<string>(arg1, arg2) ];
41 }
42 };
43} idna_label_g;
44
45
46struct HOST : public grammar<HOST, result_closure<string>::context_t>
47{
48 template <typename ScannerT> struct definition
49 {
50 // vaguely derived from guidelines in STD3
51 rule<ScannerT> main;
52 rule<ScannerT> const & start() const { return main; }
53 definition(HOST const & self)
54 {
55 subrule<0> submain;
56 subrule<1> ipv4_address;
57 main =
58(
59 submain =
60 (
61 (ipv4_address | list_p(idna_label_g, ch_p('.')))
62 [ self.val = construct_<string>(arg1, arg2) ]
63 ),
64 ipv4_address = regex_p("[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}")
65 ) ;
66 }
67 };
68} host_g;
69
70
71struct MAIL_LOCAL_NAME : public grammar<MAIL_LOCAL_NAME, result_closure<string>::context_t>
72{
73 template <typename ScannerT> struct definition
74 {
75 // subset of email "local names", vaguely derived from RFC 821 "simple
76 // mail transfer protocol"
77 rule<ScannerT> main;
78 rule<ScannerT> const & start() const { return main; }
79 definition(MAIL_LOCAL_NAME const & self)
80 {
81 subrule<0> submain;
82 subrule<1> special;
83 subrule<2> control;
84 subrule<3> bad;
85 subrule<4> c;
86 main =
87(
88 submain = (list_p(+c, ch_p('.')))
89 [ self.val = construct_<string>(arg1, arg2) ],
90
91 special = chset_p("<>()[]\\.,;:@\""),
92 control = (range_p(0x0, 0x1f) | chset_p("\xf7")),
93 bad = (special | control | ch_p(' ')),
94 c = (range_p(0x0, 0xff) - bad)
95 );
96 }
97 };
98} mailname_g;
99
100
// Plain record holding the pieces of a parsed URL. All string fields start
// out empty and `port` starts at 0; the parse functions in this file treat
// a port of 0 as "not given" and substitute the protocol's default.
struct uri
{
  std::string proto;
  std::string user;
  std::string host;
  std::string path;
  std::string group;
  unsigned long port;

  uri() : port(0) {}
};

// Writes a one-line, human-readable dump of `u` to `ost` in the form
//   [uri: 'proto' 'user' 'host' 'path' 'group' 'port']
// followed by a newline, and returns `ost`.
//
// The line is terminated with '\n' rather than endl: the emitted text is
// the same, but the inserter no longer forces a flush of the stream on
// every insertion. Callers that need the output flushed can do so
// themselves.
std::ostream & operator<<(std::ostream & ost, uri const & u)
{
  ost << "[uri: '" << u.proto << "' '" << u.user << "' '" << u.host << "' '"
      << u.path << "' '" << u.group << "' '" << u.port << "']" << '\n';
  return ost;
}
118
119struct URI : public grammar<URI, result_closure<uri>::context_t>
120{
121 template <typename ScannerT> struct definition
122 {
123 // vaguely derived from guidelines in RFC2396,
124 // "uniform resource identifiers"
125 typedef rule<ScannerT> rule_t;
126 rule_t main;
127 rule_t const & start() const { return main; }
128 definition(URI const & self)
129 {
130 subrule<0> submain;
131 subrule<2> unreserved;
132 subrule<3> mark;
133 subrule<4> escaped;
134
135 subrule<5> path_char;
136 subrule<6> path_segment;
137 subrule<7> path_segments;
138 subrule<8> path;
139
140 subrule<9> http;
141 subrule<10> nntp;
142 subrule<11> mailto;
143
144 subrule<12> mailname;
145 subrule<13> hostport;
146 subrule<14> host;
147 subrule<15> port;
148 subrule<16> group;
149
150 main =
151(
152 submain =
153 (
154 ( http >> str_p("://") >> hostport >> path)
155 | ( nntp >> str_p("://") >> hostport >> ch_p('/') >> group)
156 | ( mailto >> str_p(":") >> mailname >> ch_p('@') >> hostport)
157 ),
158
159 http = str_p("http") [ bind(&uri::proto)(self.val) = construct_<string>(arg1,arg2) ],
160 nntp = str_p("nntp") [ bind(&uri::proto)(self.val) = construct_<string>(arg1,arg2) ],
161 mailto = str_p("mailto") [ bind(&uri::proto)(self.val) = construct_<string>(arg1,arg2) ],
162
163 hostport = (host >> !(ch_p(':') >> port)),
164 host = host_g [ bind(&uri::host)(self.val) = arg1 ],
165 port = uint_p [ bind(&uri::port)(self.val) = arg1 ],
166
167 mark = chset_p("-_.!~*'()"),
168 unreserved = (chset_p("a-zA-Z0-9") | mark),
169 escaped = regex_p("%[[:xdigit:]]{2}"),
170 path_char = unreserved | escaped | chset_p(":@&=+$,"),
171 path_segment = (+path_char),
172 path_segments = (list_p(path_segment, ch_p('/'))),
173 path = (ch_p('/') >> path_segments)
174 [ bind(&uri::path)(self.val) = construct_<string>(arg1,arg2) ],
175
176 group = (list_p(idna_label_g, ch_p('.')))
177 [ bind(&uri::group)(self.val) = construct_<string>(arg1,arg2) ],
178
179 mailname = mailname_g [bind(&uri::user)(self.val) = arg1]
180 );
181 }
182 };
183} uri_g;
184
185
186struct UTF_URI : public grammar<UTF_URI, result_closure<uri>::context_t>
187{
188 template <typename ScannerT> struct definition
189 {
190 // UTF_URI differs from URI in that UTF_URI is applied to raw (un-encoded)
191 // UTF-8 strings. these are *not* valid RFC2396-et-al URIs; rather they are
192 // strings users types in, post-charset-conversion. parsing them is coarser
193 // and more forgiving; we will do a precise parse after internalizing.
194
195 typedef rule<ScannerT> rule_t;
196 rule_t main;
197 rule_t const & start() const { return main; }
198 definition(UTF_URI const & self)
199 {
200 subrule<0> submain;
201
202 subrule<1> path;
203
204 subrule<2> http;
205 subrule<3> nntp;
206 subrule<4> mailto;
207
208 subrule<5> mailname;
209 subrule<6> hostport;
210 subrule<7> host;
211 subrule<8> port;
212 subrule<9> group;
213
214 main =
215(
216 submain =
217 (
218 ( http >> str_p("://") >> hostport >> path)
219 | ( nntp >> str_p("://") >> hostport >> ch_p('/') >> group)
220 | ( mailto >> str_p(":") >> mailname >> ch_p('@') >> hostport)
221 ),
222
223 http = str_p("http") [ bind(&uri::proto)(self.val) = construct_<string>(arg1,arg2) ],
224 nntp = str_p("nntp") [ bind(&uri::proto)(self.val) = construct_<string>(arg1,arg2) ],
225 mailto = str_p("mailto") [ bind(&uri::proto)(self.val) = construct_<string>(arg1,arg2) ],
226
227 hostport = (host >> !(ch_p(':') >> port)),
228 host = (+(anychar_p - chset_p("/:"))) [ bind(&uri::host)(self.val) = construct_<string>(arg1,arg2) ],
229 port = uint_p [ bind(&uri::port)(self.val) = arg1 ],
230
231 path = (ch_p('/') >> (+anychar_p))
232 [ bind(&uri::path)(self.val) = construct_<string>(arg1,arg2) ],
233
234 group = (+anychar_p)
235 [ bind(&uri::group)(self.val) = construct_<string>(arg1,arg2) ],
236
237 mailname = (+(~chset_p("@")))
238 [bind(&uri::user)(self.val) = arg1]
239 );
240 }
241 };
242} utf_uri_g;
243
244
245bool parse_utf8_url(utf8 const & utf,
246 utf8 & proto,
247 utf8 & user,
248 utf8 & host,
249 utf8 & path,
250 utf8 & group,
251 unsigned long & port)
252{
253 uri ustruct;
254 bool parsed_ok = parse(utf().c_str(), utf_uri_g[var(ustruct) = arg1]).full;
255
256 if (parsed_ok)
257 {
258 proto = ustruct.proto;
259 user = ustruct.user;
260 host = ustruct.host;
261 path = ustruct.path;
262 group = ustruct.group;
263 port = ustruct.port;
264
265 if (ustruct.proto == "http")
266{
267 string::size_type gpos = ustruct.path.rfind('/');
268 if (gpos == string::npos || gpos == ustruct.path.size() - 1 || gpos == 0)
269 return false;
270 group = ustruct.path.substr(gpos+1);
271 path = ustruct.path.substr(0,gpos);
272}
273
274 if (ustruct.proto == "http" && ustruct.port == 0)
275port = 80;
276 else if (ustruct.proto == "nntp" && ustruct.port == 0)
277port = 119;
278 else if (ustruct.proto == "mailto" && ustruct.port == 0)
279port= 25;
280 }
281
282 L(F("parsed UTF-8 URL\n"));
283 return parsed_ok;
284}
285
286
287
288bool parse_url(url const & u,
289 std::string & proto,
290 ace & user,
291 ace & host,
292 urlenc & path,
293 ace & group,
294 unsigned long & port)
295{
296 // http://host:port/path.cgi/group
297 // nntp://host:port/group
298 // mailto:user@host:port
299
300 uri ustruct;
301 bool parsed_ok = parse(u().c_str(), uri_g[var(ustruct) = arg1]).full;
302
303 if (parsed_ok)
304 {
305 proto = ustruct.proto;
306 user = ustruct.user;
307 host = ustruct.host;
308 path = ustruct.path;
309 group = ustruct.group;
310 port = ustruct.port;
311
312 if (proto == "http")
313{
314 string::size_type gpos = ustruct.path.rfind('/');
315 if (gpos == string::npos || gpos == ustruct.path.size() - 1 || gpos == 0)
316 return false;
317 group = ustruct.path.substr(gpos+1);
318 path = ustruct.path.substr(0,gpos);
319}
320
321 if (ustruct.proto == "http" && ustruct.port == 0)
322port = 80;
323 else if (ustruct.proto == "nntp" && ustruct.port == 0)
324port = 119;
325 else if (ustruct.proto == "mailto" && ustruct.port == 0)
326port= 25;
327 }
328
329 L(F("parsed URL: proto '%s', user '%s', host '%s', port '%d', path '%s', group '%s'\n")
330 % proto % user % host % port % path % group);
331
332 return parsed_ok;
333}
334
335
336#ifdef BUILD_UNIT_TESTS
337#include "unit_tests.hh"
338
339static bool url_parses(string u,
340 string xproto,
341 string xuser,
342 string xhost,
343 string xpath,
344 string xgroup,
345 unsigned long xport)
346{
347 url uu(u);
348 ace user, host, group;
349 urlenc path;
350 string proto;
351 unsigned long port = 0;
352
353 L(F("trying to parse %s\n") % u);
354
355 parse_url(uu, proto, user, host, path, group, port);
356
357#define CHECK(z,c) if (c != (x ## z)) { \
358 cerr << "parsed url '" << u << "' wrong: " \
359 "got " << (#z) << " = '" << z << "', expected '" << (x ## z) << "'" \
360 << endl; return false; \
361}
362 CHECK(proto, proto);
363 CHECK(user, user());
364 CHECK(host, host());
365 CHECK(path, path());
366 CHECK(group, group());
367 CHECK(port, port);
368#undef CHECK
369
370 return true;
371}
372
373static void test_legal_urls()
374{
375 // positive tests
376 BOOST_CHECK(url_parses("http://www.gurgle.com/depot.cgi/foo.foo",
377 "http", "", "www.gurgle.com", "/depot.cgi", "foo.foo", 80));
378
379 BOOST_CHECK(url_parses("nntp://news.isp.com/my.group.is.good",
380 "nntp", "", "news.isp.com", "", "my.group.is.good", 119));
381
382 BOOST_CHECK(url_parses("mailto:super-list@mail.yoohoo.com",
383 "mailto", "super-list", "mail.yoohoo.com", "", "", 25));
384
385 BOOST_CHECK(url_parses("http://www.gurgle.com:1234/~someone/depot.cgi/foo.bleh",
386 "http", "", "www.gurgle.com", "/~someone/depot.cgi", "foo.bleh", 1234));
387
388 BOOST_CHECK(url_parses("nntp://news.isp.com:1221/my.group.is.good",
389 "nntp", "", "news.isp.com", "", "my.group.is.good", 1221));
390
391 BOOST_CHECK(url_parses("mailto:super-list@mail.yoohoo.com:3345",
392 "mailto", "super-list", "mail.yoohoo.com", "", "", 3345));
393
394}
395
// Negative tests: none are written yet, so this test case checks nothing.
static void test_illegal_urls()
{
}
399
400void add_url_tests(test_suite * suite)
401{
402 I(suite);
403 suite->add(BOOST_TEST_CASE(&test_legal_urls));
404 suite->add(BOOST_TEST_CASE(&test_illegal_urls));
405}
406
407
408#endif // BUILD_UNIT_TESTS

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status