monotone

monotone Mtn Source Tree

Root/transforms.cc

1// copyright (C) 2002, 2003 graydon hoare <graydon@pobox.com>
2// all rights reserved.
3// licensed to the public under the terms of the GNU GPL (>= 2)
4// see the file COPYING for details
5
6#include <algorithm>
7#include <cctype>
8#include <functional>
9#include <iterator>
10#include <sstream>
11#include <string>
12#include <vector>
13
14#include <boost/filesystem/path.hpp>
15#include <boost/tokenizer.hpp>
16
17#include "cryptopp/filters.h"
18#include "cryptopp/files.h"
19#include "cryptopp/sha.h"
20#include "cryptopp/hex.h"
21#include "cryptopp/base64.h"
22#include "cryptopp/gzip.h"
23
24#include "idna/idna.h"
25#include "idna/stringprep.h"
26
27#include "cleanup.hh"
28#include "constants.hh"
29#include "vocab.hh"
30#include "transforms.hh"
31#include "sanity.hh"
32#include "url.hh"
33#include "xdelta.hh"
34
35using namespace std;
36
37// this file contains various sorts of string transformations. each
38// transformation should be self-explanatory from its type signature. see
39// transforms.hh for the summary.
40
41// NB this file uses a very "value-centric" functional approach; even though
42// many of the underlying transformations are "stream-centric" and the
43// underlying libraries (eg. crypto++) are stream oriented. this will
44// probably strike some people as contemptibly inefficient, since it means
45// that occasionally 1, 2, or even 3 copies of an entire file will wind up
46// in memory at once. I am taking this approach for 3 reasons: first, I
47// want the type system to help me and value types are much easier to work
48// with than stream types. second, it is *much* easier to debug a program
49// that operates on values than streams, and correctness takes precedence
50// over all other features of this program. third, this is a peer-to-peer
51// sort of program for small-ish source-code text files, not a fileserver,
52// and is memory-limited anyways (for example, storing things in sqlite
53// requires they be able to fit in memory). you're hopefully not going to
54// be dealing with hundreds of users hammering on locks and memory
55// concurrently.
56//
57// if future analysis proves these assumptions wrong, feel free to revisit
58// the matter, but bring strong evidence along with you that the stream
59// paradigm "must" be used. this program is intended for source code
60// control and I make no bones about it.
61
62using namespace std;
63
64// the generic function
65template<typename XFM> string xform(string const & in)
66{
67 string out;
68 out.reserve(in.size() * 2);
69 CryptoPP::StringSource
70 str(in, true,
71new XFM(new CryptoPP::StringSink(out)));
72 return out;
73}
74
75// specialize it
76template string xform<CryptoPP::Base64Encoder>(string const &);
77template string xform<CryptoPP::Base64Decoder>(string const &);
78template string xform<CryptoPP::HexEncoder>(string const &);
79template string xform<CryptoPP::HexDecoder>(string const &);
80template string xform<CryptoPP::Gzip>(string const &);
81template string xform<CryptoPP::Gunzip>(string const &);
82
83// for use in hexenc encoding
84
// function object mapping a single char to its lowercase form, suitable
// for use with std::transform.
struct lowerize
{
  char operator()(char const & c) const
  {
    // ::tolower is only defined for arguments representable as unsigned
    // char (or EOF); plain char may be signed, so go through unsigned
    // char rather than handing it a negative int.
    return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
  }
};
92
93string lowercase(string const & in)
94{
95 string n(in);
96 transform(n.begin(), n.end(), n.begin(), lowerize());
97 return n;
98}
99
// function object mapping a single char to its uppercase form, suitable
// for use with std::transform.
struct upperize
{
  char operator()(char const & c) const
  {
    // ::toupper is only defined for arguments representable as unsigned
    // char (or EOF); plain char may be signed, so go through unsigned
    // char rather than handing it a negative int.
    return static_cast<char>(::toupper(static_cast<unsigned char>(c)));
  }
};
107
108string uppercase(string const & in)
109{
110 string n(in);
111 transform(n.begin(), n.end(), n.begin(), upperize());
112 return n;
113}
114
115
116// diffing and patching
117
118
119void diff(data const & olddata,
120 data const & newdata,
121 base64< gzip<delta> > & del)
122{
123 string unpacked;
124 compute_delta(olddata(), newdata(), unpacked);
125 pack(delta(unpacked), del);
126}
127
128void patch(data const & olddata,
129 base64< gzip<delta> > const & del,
130 data & newdata)
131{
132 delta unpacked;
133 unpack(del, unpacked);
134 string result;
135 apply_delta(olddata(), unpacked(), result);
136 newdata = result;
137}
138
139void diff(manifest_map const & oldman,
140 manifest_map const & newman,
141 base64< gzip<delta> > & del)
142{
143 string xd;
144 compute_delta(oldman, newman, xd);
145 pack(delta(xd), del);
146}
147
148void diff(base64< gzip<data> > const & olddata,
149 base64< gzip<data> > const & newdata,
150 base64< gzip<delta> > & del)
151{
152 gzip<data> olddata_decoded;
153 gzip<data> newdata_decoded;
154
155 decode_base64(olddata, olddata_decoded);
156 decode_base64(newdata, newdata_decoded);
157
158 data olddata_decompressed;
159 data newdata_decompressed;
160
161 decode_gzip(olddata_decoded, olddata_decompressed);
162 decode_gzip(newdata_decoded, newdata_decompressed);
163
164 diff(olddata_decompressed,
165 newdata_decompressed,
166 del);
167}
168
169void patch(base64< gzip<data> > const & olddata,
170 base64< gzip<delta> > const & del,
171 base64< gzip<data> > & newdata)
172{
173 data olddata_unpacked, newdata_unpacked;
174 unpack(olddata, olddata_unpacked);
175 patch(olddata_unpacked, del, newdata_unpacked);
176 pack(newdata_unpacked, newdata);
177}
178
179
180// identifier (a.k.a. sha1 signature) calculation
181
182void calculate_ident(data const & dat,
183 hexenc<id> & ident)
184{
185 CryptoPP::SHA hash;
186 hash.Update(reinterpret_cast<byte const *>(dat().c_str()),
187 static_cast<unsigned int>(dat().size()));
188 char digest[CryptoPP::SHA::DIGESTSIZE];
189 hash.Final(reinterpret_cast<byte *>(digest));
190 string out(digest, CryptoPP::SHA::DIGESTSIZE);
191 id ident_decoded(out);
192 encode_hexenc(ident_decoded, ident);
193}
194
195void calculate_ident(base64< gzip<data> > const & dat,
196 hexenc<id> & ident)
197{
198 gzip<data> data_decoded;
199 data data_decompressed;
200 decode_base64(dat, data_decoded);
201 decode_gzip(data_decoded, data_decompressed);
202 calculate_ident(data_decompressed, ident);
203}
204
205void calculate_ident(file_data const & dat,
206 file_id & ident)
207{
208 hexenc<id> tmp;
209 calculate_ident(dat.inner(), tmp);
210 ident = tmp;
211}
212
213void calculate_ident(manifest_map const & m,
214 manifest_id & ident)
215{
216 CryptoPP::SHA hash;
217 size_t sz = 0;
218 static size_t bufsz = 0;
219 static char *buf = NULL;
220
221 for (manifest_map::const_iterator i = m.begin();
222 i != m.end(); ++i)
223 {
224 sz += i->second.inner()().size();
225 sz += i->first().size();
226 sz += 3;
227 }
228
229 if (sz > bufsz)
230 {
231 bufsz = sz;
232 buf = static_cast<char *>(realloc(buf, bufsz));
233 I(buf);
234 }
235
236 // this has to go quite fast, for cvs importing
237 char *c = buf;
238 for (manifest_map::const_iterator i = m.begin();
239 i != m.end(); ++i)
240 {
241 memcpy(c, i->second.inner()().data(), i->second.inner()().size());
242 c += i->second.inner()().size();
243 *c++ = ' ';
244 *c++ = ' ';
245 memcpy(c, i->first().data(), i->first().size());
246 c += i->first().size();
247 *c++ = '\n';
248 }
249
250 hash.Update(reinterpret_cast<byte const *>(buf),
251 static_cast<unsigned int>(sz));
252
253 char digest[CryptoPP::SHA::DIGESTSIZE];
254 hash.Final(reinterpret_cast<byte *>(digest));
255 string out(digest, CryptoPP::SHA::DIGESTSIZE);
256 id ident_decoded(out);
257 hexenc<id> raw_ident;
258 encode_hexenc(ident_decoded, raw_ident);
259 ident = manifest_id(raw_ident);
260}
261
262void calculate_ident(manifest_data const & dat,
263 manifest_id & ident)
264{
265 hexenc<id> tmp;
266 calculate_ident(dat.inner(), tmp);
267 ident = tmp;
268}
269
270// this might reasonably go in file_io.cc too..
271void calculate_ident(file_path const & file,
272 hexenc<id> & ident,
273 lua_hooks & lua)
274{
275 string db_linesep, ext_linesep;
276 string db_charset, ext_charset;
277
278 bool do_lineconv = (lua.hook_get_linesep_conv(file, db_linesep, ext_linesep)
279 && db_linesep != ext_linesep);
280
281 bool do_charconv = (lua.hook_get_charset_conv(file, db_charset, ext_charset)
282 && db_charset != ext_charset);
283
284 if (do_charconv || do_lineconv)
285 {
286 data dat;
287 read_localized_data(file, dat, lua);
288 calculate_ident(dat, ident);
289 }
290 else
291 {
292 // no conversions necessary, use streaming form
293 CryptoPP::SHA hash;
294 unsigned int const sz = 2 * CryptoPP::SHA::DIGESTSIZE;
295 char buffer[sz];
296 CryptoPP::FileSource f(file().c_str(), true, new CryptoPP::HashFilter
297 (hash, new CryptoPP::HexEncoder
298 (new CryptoPP::ArraySink(reinterpret_cast<byte *>(buffer), sz))));
299 ident = lowercase(string(buffer, sz));
300 }
301}
302
303
// splits `in` into lines and stores them, without their terminators, in
// `out` (which is cleared first).
//
// a line ends at "\r\n", a lone '\r' or a lone '\n'. a CRLF pair counts
// as ONE terminator: treating its two characters as independent
// separators would report a spurious empty line after every CRLF-ended
// line. empty lines inside the text are kept; a terminator at the very
// end of the input does not start a further, empty line.
void split_into_lines(std::string const & in,
                      std::vector<std::string> & out)
{
  out.clear();

  std::string::size_type begin = 0;
  while (begin < in.size())
    {
      std::string::size_type const end = in.find_first_of("\r\n", begin);
      if (end == std::string::npos)
        {
          // last line has no terminator
          out.push_back(in.substr(begin));
          break;
        }

      out.push_back(in.substr(begin, end - begin));

      // step over the terminator, swallowing the '\n' of a CRLF pair
      if (in[end] == '\r' && end + 1 < in.size() && in[end + 1] == '\n')
        begin = end + 2;
      else
        begin = end + 1;
    }
}
317
// concatenates the strings of `in`, writing `linesep` after every one
// of them (the last included), and assigns the result to `out`.
void join_lines(std::vector<std::string> const & in,
                std::string & out,
                std::string const & linesep)
{
  // the separator is used as a C string
  char const * const terminator = linesep.c_str();

  std::string joined;
  for (std::vector<std::string>::const_iterator line = in.begin();
       line != in.end(); ++line)
    {
      joined += *line;
      joined += terminator;
    }
  out = joined;
}
326
327void join_lines(vector<string> const & in,
328string & out)
329{
330 join_lines(in, out, "\n");
331}
332
// returns a copy of `s` with every newline, carriage return, tab and
// space removed, wherever in the string it occurs.
std::string remove_ws(std::string const & s)
{
  std::string kept;
  kept.reserve(s.size());
  for (std::string::size_type k = 0; k < s.size(); ++k)
    {
      char const ch = s[k];
      bool const is_ws = (ch == '\n' || ch == '\r'
                          || ch == '\t' || ch == ' ');
      if (!is_ws)
        kept += ch;
    }
  return kept;
}
354
// returns a copy of `s` with leading and trailing newlines, carriage
// returns, tabs and spaces removed. whitespace in the middle is kept.
// a string made up only of such characters trims to the empty string.
std::string trim_ws(std::string const & s)
{
  std::string::size_type const first = s.find_first_not_of("\n\r\t ");
  if (first == std::string::npos)
    // nothing but whitespace (or nothing at all)
    return std::string();

  // at least one non-whitespace character exists, so this cannot be npos
  std::string::size_type const last = s.find_last_not_of("\n\r\t ");
  return s.substr(first, last - first + 1);
}
366
367string canonical_base64(string const & s)
368{
369 return xform<CryptoPP::Base64Encoder>
370 (xform<CryptoPP::Base64Decoder>(s));
371}
372
373
374// general character code conversion routines
375
376static string system_charset()
377{
378 char const * locale_charset_name = stringprep_locale_charset ();
379 I(locale_charset_name != NULL);
380 string sys_charset(locale_charset_name);
381 return sys_charset;
382}
383
384void charset_convert(string const & src_charset,
385 string const & dst_charset,
386 string const & src,
387 string & dst)
388{
389 if (src_charset == dst_charset)
390 dst = src;
391 else
392 {
393 L(F("converting %d bytes from %s to %s\n") % src.size()
394% src_charset % dst_charset);
395 char * converted = stringprep_convert(src.c_str(),
396 dst_charset.c_str(),
397 src_charset.c_str());
398 I(converted != NULL);
399 dst = string(converted);
400 free(converted);
401 }
402}
403
404
405void system_to_utf8(external const & ext, utf8 & utf)
406{
407 string out;
408 charset_convert(system_charset(), "UTF-8", ext(), out);
409 utf = out;
410}
411
412void utf8_to_system(utf8 const & utf, external & ext)
413{
414 string out;
415 charset_convert("UTF-8", system_charset(), utf(), out);
416 ext = out;
417}
418
419static string decode_idna_error(int err)
420{
421 switch (static_cast<Idna_rc>(err))
422 {
423 case IDNA_STRINGPREP_ERROR: return "stringprep error"; break;
424 case IDNA_PUNYCODE_ERROR: return "punycode error"; break;
425 case IDNA_CONTAINS_NON_LDH: return "non-LDH characters"; break;
426 case IDNA_CONTAINS_MINUS: return "leading / trailing hyphen-minus character"; break;
427 case IDNA_INVALID_LENGTH: return "invalid length (output must be between 1 and 63 chars)"; break;
428 case IDNA_NO_ACE_PREFIX: return "no ace prefix"; break;
429 case IDNA_ROUNDTRIP_VERIFY_ERROR: return "roundtrip verify error"; break;
430 case IDNA_CONTAINS_ACE_PREFIX: return "contains ACE prefix (\"xn--\")"; break;
431 case IDNA_ICONV_ERROR: return "iconv error"; break;
432 case IDNA_MALLOC_ERROR: return "malloc error"; break;
433 default: return "unknown error"; break;
434 }
435 return "unknown error";
436}
437
438void ace_to_utf8(ace const & a, utf8 & utf)
439{
440 char *out = NULL;
441 L(F("converting %d bytes from IDNA ACE to UTF-8\n") % a().size());
442 int res = idna_to_unicode_8z8z(a().c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
443 N(res == IDNA_SUCCESS || res == IDNA_NO_ACE_PREFIX,
444 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
445 % a().size()
446 % decode_idna_error(res));
447 utf = string(out);
448 free(out);
449}
450
451void utf8_to_ace(utf8 const & utf, ace & a)
452{
453 char *out = NULL;
454 L(F("converting %d bytes from UTF-8 to IDNA ACE\n") % utf().size());
455 int res = idna_to_ascii_8z(utf().c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
456 N(res == IDNA_SUCCESS,
457 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
458 % utf().size()
459 % decode_idna_error(res));
460 a = string(out);
461 free(out);
462}
463
464void utf8_to_urlenc(utf8 const & utf, urlenc & u)
465{
466 string tmp;
467 string ok_bytes(constants::legal_url_bytes);
468 string ut = utf();
469 for (string::const_iterator i = ut.begin(); i != ut.end(); ++i)
470 {
471 if (ok_bytes.find(*i) == string::npos)
472tmp += (F("%%%2.2x") % (0xff & static_cast<unsigned long>(*i))).str();
473 else
474tmp += *i;
475 }
476 u = tmp;
477}
478
479void urlenc_to_utf8(urlenc const & u, utf8 & utf)
480{
481 istringstream iss(u());
482 string ok_bytes(constants::legal_url_bytes);
483 char c = 0;
484 string tmp;
485
486 while (iss.get(c), iss.gcount() != 0)
487 {
488 if (c == '%')
489{
490 unsigned long val = 0;
491 iss >> std::hex >> val;
492 N(val > 0 && val <= 0xff,
493 F("bad URL-encoding escape value '%%%x'") % val);
494 tmp += static_cast<char>(val);
495}
496 else
497{
498 N(ok_bytes.find(c) != string::npos,
499 F("bad char 0x%x in URL-encoded string") % static_cast<unsigned long>(c));
500 tmp += c;
501}
502 }
503 utf = tmp;
504}
505
506
507// specific internal / external conversions for various vocab terms
508
509void internalize_url(utf8 const & utf, url & u)
510{
511 utf8 proto, user, host, path, group;
512 unsigned long port;
513 N(parse_utf8_url(utf, proto, user, host, path, group, port),
514 F("UTF8-URL parse failed"));
515
516 if (proto() == "mailto")
517 {
518 ace ace_user, ace_host;
519 utf8_to_ace(user, ace_user);
520 utf8_to_ace(host, ace_host);
521 u = (F("mailto:%s@%s:%d") % ace_user % ace_host % port).str();
522 }
523 else if (proto() == "http")
524 {
525 urlenc urlenc_path;
526 ace ace_host, ace_group;
527 utf8_to_urlenc(path, urlenc_path);
528 utf8_to_ace(host, ace_host);
529 utf8_to_ace(group, ace_group);
530 u = (F("http://%s:%d%s/%s") % ace_host % port % urlenc_path % ace_group).str();
531 }
532 else if (proto() == "nntp")
533 {
534 ace ace_host, ace_group;
535 utf8_to_ace(host, ace_host);
536 utf8_to_ace(group, ace_group);
537 u = (F("nntp://%s:%d/%s") % ace_host % port % ace_group).str();
538 }
539 else
540 {
541 throw informative_failure("unknown URL protocol '" + proto() + "'");
542 }
543
544 {
545 ace auser, ahost, agroup;
546 urlenc upath;
547 string sproto;
548 L(F("checking internalized URL '%s'\n") % u);
549 N(parse_url(u, sproto, auser, ahost, upath, agroup, port),
550 F("confirmation parse of internalized URL '%s' failed") % u);
551 }
552}
553
554void internalize_url(external const & ext, url & u)
555{
556 utf8 utf;
557 system_to_utf8(ext, utf);
558 internalize_url(utf, u);
559}
560
561
562void externalize_url(url const & u, utf8 & utf)
563{
564 ace user, host, group;
565 urlenc path;
566 string proto;
567 unsigned long port;
568
569 L(F("externalizing URL '%s'\n") % u);
570 N(parse_url(u(), proto, user, host, path, group, port),
571 F("URL parse failed on '%s'") % u);
572
573 if (proto == "mailto")
574 {
575 utf8 utf_user, utf_host;
576 ace_to_utf8(user, utf_user);
577 ace_to_utf8(host, utf_host);
578 utf = (F("mailto:%s@%s:%d") % utf_user % utf_host % port).str();
579 }
580 else if (proto == "http")
581 {
582 utf8 utf_path, utf_host, utf_group;
583 urlenc_to_utf8(path, utf_path);
584 ace_to_utf8(host, utf_host);
585 ace_to_utf8(group, utf_group);
586 utf = (F("http://%s:%d%s/%s") % utf_host % port % utf_path % utf_group).str();
587 }
588 else if (proto == "nntp")
589 {
590 utf8 utf_path, utf_host, utf_group;
591 ace_to_utf8(host, utf_host);
592 ace_to_utf8(group, utf_group);
593 utf = (F("nntp://%s:%d/%s") % utf_host % port % utf_group).str();
594 }
595 else
596 {
597 throw informative_failure("unknown URL protocol '" + proto + "'");
598 }
599
600 {
601 utf8 uproto, uuser, uhost, upath, ugroup;
602 N(parse_utf8_url(utf, uproto, uuser, uhost, upath, ugroup, port),
603 F("confirmation parse of UTF8-URL failed"));
604 }
605}
606
607void externalize_url(url const & u, external & ext)
608{
609 utf8 utf;
610 externalize_url(u, utf);
611 utf8_to_system(utf, ext);
612}
613
614void internalize_cert_name(utf8 const & utf, cert_name & c)
615{
616 ace a;
617 utf8_to_ace(utf, a);
618 c = a();
619}
620
621void internalize_cert_name(external const & ext, cert_name & c)
622{
623 utf8 utf;
624 system_to_utf8(ext(), utf);
625 internalize_cert_name(utf, c);
626}
627
628void externalize_cert_name(cert_name const & c, utf8 & utf)
629{
630 ace_to_utf8(ace(c()), utf);
631}
632
633void externalize_cert_name(cert_name const & c, external & ext)
634{
635 utf8 utf;
636 externalize_cert_name(c, utf);
637 utf8_to_system(utf, ext);
638}
639
640void internalize_rsa_keypair_id(utf8 const & utf, rsa_keypair_id & key)
641{
642 string tmp;
643 typedef boost::tokenizer<boost::char_separator<char> >
644 tokenizer;
645 boost::char_separator<char> sep("", ".@", boost::keep_empty_tokens);
646 tokenizer tokens(utf(), sep);
647 for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i)
648 {
649 if (*i == "." || *i == "@")
650tmp += *i;
651 else
652{
653 ace a;
654 utf8_to_ace(*i, a);
655 tmp += a();
656}
657 }
658 key = tmp;
659}
660
661void internalize_rsa_keypair_id(external const & ext, rsa_keypair_id & key)
662{
663 utf8 utf;
664 system_to_utf8(ext, utf);
665 internalize_rsa_keypair_id(utf, key);
666}
667
668void externalize_rsa_keypair_id(rsa_keypair_id const & key, utf8 & utf)
669{
670 string tmp;
671 typedef boost::tokenizer<boost::char_separator<char> >
672 tokenizer;
673 boost::char_separator<char> sep("", ".@", boost::keep_empty_tokens);
674 tokenizer tokens(key(), sep);
675 for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i)
676 {
677 if (*i == "." || *i == "@")
678tmp += *i;
679 else
680{
681 ace a(*i);
682 utf8 u;
683 ace_to_utf8(a, u);
684 tmp += u();
685}
686 }
687 utf = tmp;
688}
689
690void externalize_rsa_keypair_id(rsa_keypair_id const & key, external & ext)
691{
692 utf8 utf;
693 externalize_rsa_keypair_id(key, utf);
694 utf8_to_system(utf, ext);
695}
696
697
698void line_end_convert(string const & linesep, string const & src, string & dst)
699{
700 string linesep_str("\n");
701 if (linesep == "CR" || linesep == "\r")
702 linesep_str = "\r";
703 else if (linesep == "CRLF" || linesep == "\r\n")
704 linesep_str = "\r\n";
705 else if (linesep == "LF"|| linesep == "\n")
706 linesep_str = "\n";
707
708 vector<string> tmp;
709 split_into_lines(src, tmp);
710 join_lines(tmp, dst, linesep_str);
711}
712
713#ifdef BUILD_UNIT_TESTS
714#include "unit_tests.hh"
715
716static void enc_test()
717{
718 data d2, d1("the rain in spain");
719 gzip<data> gzd1, gzd2;
720 base64< gzip<data> > bgzd;
721 encode_gzip(d1, gzd1);
722 encode_base64(gzd1, bgzd);
723 decode_base64(bgzd, gzd2);
724 BOOST_CHECK(gzd2 == gzd1);
725 decode_gzip(gzd2, d2);
726 BOOST_CHECK(d2 == d1);
727}
728
729static void rdiff_test()
730{
731 data dat1(string("the first day of spring\nmakes me want to sing\n"));
732 data dat2(string("the first day of summer\nis a major bummer\n"));
733 data dat3;
734 gzip<data> dat1_gz, dat2_gz, dat3_gz;
735 base64< gzip<data> > dat1_bgz, dat2_bgz, dat3_bgz;
736 encode_gzip(dat1, dat1_gz);
737 encode_gzip(dat2, dat2_gz);
738 encode_base64(dat1_gz, dat1_bgz);
739 encode_base64(dat2_gz, dat2_bgz);
740 base64< gzip<delta> > del_bgz;
741 diff(dat1_bgz, dat2_bgz, del_bgz);
742
743 patch(dat1_bgz, del_bgz, dat3_bgz);
744 decode_base64(dat3_bgz, dat3_gz);
745 decode_gzip(dat3_gz, dat3);
746 BOOST_CHECK(dat3 == dat2);
747}
748
749static void calculate_ident_test()
750{
751 data input(string("the only blender which can be turned into the most powerful vaccum cleaner"));
752 hexenc<id> output;
753 string ident("86e03bdb3870e2a207dfd0dcbfd4c4f2e3bc97bd");
754 calculate_ident(input, output);
755 BOOST_CHECK(output() == ident);
756}
757
758static void caseconv_test()
759{
760 BOOST_CHECK(uppercase("hello") == "HELLO");
761 BOOST_CHECK(uppercase("heLlO") == "HELLO");
762 BOOST_CHECK(lowercase("POODLE DAY") == "poodle day");
763 BOOST_CHECK(lowercase("PooDLe DaY") == "poodle day");
764 BOOST_CHECK(uppercase("!@#$%^&*()") == "!@#$%^&*()");
765 BOOST_CHECK(lowercase("!@#$%^&*()") == "!@#$%^&*()");
766}
767
768static void join_lines_test()
769{
770 vector<string> strs;
771 string joined;
772
773 strs.clear();
774 join_lines(strs, joined);
775 BOOST_CHECK(joined == "");
776
777 strs.push_back("hi");
778 join_lines(strs, joined);
779 BOOST_CHECK(joined == "hi\n");
780
781 strs.push_back("there");
782 join_lines(strs, joined);
783 BOOST_CHECK(joined == "hi\nthere\n");
784
785 strs.push_back("user");
786 join_lines(strs, joined);
787 BOOST_CHECK(joined == "hi\nthere\nuser\n");
788}
789
790static void strip_ws_test()
791{
792 BOOST_CHECK(trim_ws("\n leading space") == "leading space");
793 BOOST_CHECK(trim_ws("trailing space \n") == "trailing space");
794 BOOST_CHECK(trim_ws("\t\n both \r \n\r\n") == "both");
795 BOOST_CHECK(remove_ws(" I like going\tfor walks\n ")
796 == "Ilikegoingforwalks");
797}
798
#define IDNA_ACE_PREFIX "xn--"
#define IDNA_SUCCESS 0

// table of IDNA test vectors used by check_idna_encoding: for each
// language sample, `in` holds `inlen` code points and `out` the
// expected ACE string (prefix included). entries that list fewer
// initialisers than there are members leave the trailing members
// zero-initialised, which equals IDNA_SUCCESS as defined above.
//
// `name` and `out` point at string literals, hence char const *.
struct idna
{
  char const *name;
  size_t inlen;
  uint32_t in[100];
  char const *out;
  int allowunassigned;
  int usestd3asciirules;
  int toasciirc;
  int tounicoderc;
} idna_vec[] =
  {
    {
      "Arabic (Egyptian)", 17,
      {
        0x0644, 0x064A, 0x0647, 0x0645, 0x0627, 0x0628, 0x062A, 0x0643,
        0x0644, 0x0645, 0x0648, 0x0634, 0x0639, 0x0631, 0x0628, 0x064A,
        0x061F},
      IDNA_ACE_PREFIX "egbpdaj6bu4bxfgehfvwxn", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Chinese (simplified)", 9,
      {
        0x4ED6, 0x4EEC, 0x4E3A, 0x4EC0, 0x4E48, 0x4E0D, 0x8BF4, 0x4E2D, 0x6587},
      IDNA_ACE_PREFIX "ihqwcrb4cv8a8dqg056pqjye", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Chinese (traditional)", 9,
      {
        0x4ED6, 0x5011, 0x7232, 0x4EC0, 0x9EBD, 0x4E0D, 0x8AAA, 0x4E2D, 0x6587},
      IDNA_ACE_PREFIX "ihqwctvzc91f659drss3x8bo0yb", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Czech", 22,
      {
        0x0050, 0x0072, 0x006F, 0x010D, 0x0070, 0x0072, 0x006F, 0x0073,
        0x0074, 0x011B, 0x006E, 0x0065, 0x006D, 0x006C, 0x0075, 0x0076,
        0x00ED, 0x010D, 0x0065, 0x0073, 0x006B, 0x0079},
      IDNA_ACE_PREFIX "Proprostnemluvesky-uyb24dma41a", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Hebrew", 22,
      {
        0x05DC, 0x05DE, 0x05D4, 0x05D4, 0x05DD, 0x05E4, 0x05E9, 0x05D5,
        0x05D8, 0x05DC, 0x05D0, 0x05DE, 0x05D3, 0x05D1, 0x05E8, 0x05D9,
        0x05DD, 0x05E2, 0x05D1, 0x05E8, 0x05D9, 0x05EA},
      IDNA_ACE_PREFIX "4dbcagdahymbxekheh6e0a7fei0b", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Hindi (Devanagari)", 30,
      {
        0x092F, 0x0939, 0x0932, 0x094B, 0x0917, 0x0939, 0x093F, 0x0928,
        0x094D, 0x0926, 0x0940, 0x0915, 0x094D, 0x092F, 0x094B, 0x0902,
        0x0928, 0x0939, 0x0940, 0x0902, 0x092C, 0x094B, 0x0932, 0x0938,
        0x0915, 0x0924, 0x0947, 0x0939, 0x0948, 0x0902},
      IDNA_ACE_PREFIX "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd", 0, 0,
      IDNA_SUCCESS},
    {
      "Japanese (kanji and hiragana)", 18,
      {
        0x306A, 0x305C, 0x307F, 0x3093, 0x306A, 0x65E5, 0x672C, 0x8A9E,
        0x3092, 0x8A71, 0x3057, 0x3066, 0x304F, 0x308C, 0x306A, 0x3044,
        0x306E, 0x304B},
      IDNA_ACE_PREFIX "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa", 0, 0,
      IDNA_SUCCESS},
    {
      "Russian (Cyrillic)", 28,
      {
        0x043F, 0x043E, 0x0447, 0x0435, 0x043C, 0x0443, 0x0436, 0x0435,
        0x043E, 0x043D, 0x0438, 0x043D, 0x0435, 0x0433, 0x043E, 0x0432,
        0x043E, 0x0440, 0x044F, 0x0442, 0x043F, 0x043E, 0x0440, 0x0443,
        0x0441, 0x0441, 0x043A, 0x0438},
      IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Spanish", 40,
      {
        0x0050, 0x006F, 0x0072, 0x0071, 0x0075, 0x00E9, 0x006E, 0x006F,
        0x0070, 0x0075, 0x0065, 0x0064, 0x0065, 0x006E, 0x0073, 0x0069,
        0x006D, 0x0070, 0x006C, 0x0065, 0x006D, 0x0065, 0x006E, 0x0074,
        0x0065, 0x0068, 0x0061, 0x0062, 0x006C, 0x0061, 0x0072, 0x0065,
        0x006E, 0x0045, 0x0073, 0x0070, 0x0061, 0x00F1, 0x006F, 0x006C},
      IDNA_ACE_PREFIX "PorqunopuedensimplementehablarenEspaol-fmd56a", 0, 0,
      IDNA_SUCCESS},
    {
      "Vietnamese", 31,
      {
        0x0054, 0x1EA1, 0x0069, 0x0073, 0x0061, 0x006F, 0x0068, 0x1ECD,
        0x006B, 0x0068, 0x00F4, 0x006E, 0x0067, 0x0074, 0x0068, 0x1EC3,
        0x0063, 0x0068, 0x1EC9, 0x006E, 0x00F3, 0x0069, 0x0074, 0x0069,
        0x1EBF, 0x006E, 0x0067, 0x0056, 0x0069, 0x1EC7, 0x0074},
      IDNA_ACE_PREFIX "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g", 0, 0,
      IDNA_SUCCESS},
    {
      "Japanese", 8,
      {
        0x0033, 0x5E74, 0x0042, 0x7D44, 0x91D1, 0x516B, 0x5148, 0x751F},
      IDNA_ACE_PREFIX "3B-ww4c5e180e575a65lsy2b", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Japanese", 24,
      {
        0x5B89, 0x5BA4, 0x5948, 0x7F8E, 0x6075, 0x002D, 0x0077, 0x0069,
        0x0074, 0x0068, 0x002D, 0x0053, 0x0055, 0x0050, 0x0045, 0x0052,
        0x002D, 0x004D, 0x004F, 0x004E, 0x004B, 0x0045, 0x0059, 0x0053},
      IDNA_ACE_PREFIX "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n", 0, 0,
      IDNA_SUCCESS},
    {
      "Japanese", 25,
      {
        0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002D, 0x0041, 0x006E,
        0x006F, 0x0074, 0x0068, 0x0065, 0x0072, 0x002D, 0x0057, 0x0061,
        0x0079, 0x002D, 0x305D, 0x308C, 0x305E, 0x308C, 0x306E, 0x5834,
        0x6240},
      IDNA_ACE_PREFIX "Hello-Another-Way--fc4qua05auwb3674vfr0b", 0, 0,
      IDNA_SUCCESS},
    {
      "Japanese", 8,
      {
        0x3072, 0x3068, 0x3064, 0x5C4B, 0x6839, 0x306E, 0x4E0B, 0x0032},
      IDNA_ACE_PREFIX "2-u9tlzr9756bt3uc0v", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Japanese", 13,
      {
        0x004D, 0x0061, 0x006A, 0x0069, 0x3067, 0x004B, 0x006F, 0x0069,
        0x3059, 0x308B, 0x0035, 0x79D2, 0x524D},
      IDNA_ACE_PREFIX "MajiKoi5-783gue6qz075azm5e", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Japanese", 9,
      {
        0x30D1, 0x30D5, 0x30A3, 0x30FC, 0x0064, 0x0065, 0x30EB, 0x30F3, 0x30D0},
      IDNA_ACE_PREFIX "de-jg4avhby1noc0d", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 7,
      {
        0x305D, 0x306E, 0x30B9, 0x30D4, 0x30FC, 0x30C9, 0x3067},
      IDNA_ACE_PREFIX "d9juau41awczczp", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Greek", 8,
      {0x03b5, 0x03bb, 0x03bb, 0x03b7, 0x03bd, 0x03b9, 0x03ba, 0x03ac},
      IDNA_ACE_PREFIX "hxargifdar", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Maltese (Malti)", 10,
      {0x0062, 0x006f, 0x006e, 0x0121, 0x0075, 0x0073, 0x0061, 0x0127,
       0x0127, 0x0061},
      IDNA_ACE_PREFIX "bonusaa-5bb1da", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Russian (Cyrillic)", 28,
      {0x043f, 0x043e, 0x0447, 0x0435, 0x043c, 0x0443, 0x0436, 0x0435,
       0x043e, 0x043d, 0x0438, 0x043d, 0x0435, 0x0433, 0x043e, 0x0432,
       0x043e, 0x0440, 0x044f, 0x0442, 0x043f, 0x043e, 0x0440, 0x0443,
       0x0441, 0x0441, 0x043a, 0x0438},
      IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
  };
959
960static void check_idna_encoding()
961{
962 putenv("CHARSET=UTF-8");
963
964 for (size_t i = 0; i < sizeof(idna_vec) / sizeof(struct idna); ++i)
965 {
966 BOOST_CHECKPOINT("IDNA language: " + string(idna_vec[i].name));
967
968 size_t p, q;
969 char *uc = stringprep_ucs4_to_utf8(idna_vec[i].in,
970 idna_vec[i].inlen,
971 &p, &q);
972 utf8 utf = string(uc);
973 utf8 tutf;
974 free(uc);
975
976 ace a = string(idna_vec[i].out);
977 ace tace;
978 utf8_to_ace(utf, tace);
979 L(F("ACE-encoded %s: '%s'\n") % idna_vec[i].name % tace());
980 BOOST_CHECK(lowercase(a()) == lowercase(tace()));
981 ace_to_utf8(a, tutf);
982 BOOST_CHECK(lowercase(utf()) == lowercase(tutf()));
983
984 external tmp_external;
985 url tmp_url;
986
987 external utf_host_url("http://" + utf() + ":80/depot.cgi/path.to.group");
988 url ace_host_url("http://" + a() + ":80/depot.cgi/path.to.group");
989
990 internalize_url(utf_host_url, tmp_url);
991 L(F("ACE-encoded %s: '%s'\n") % idna_vec[i].name % tmp_url());
992 BOOST_CHECK(lowercase(ace_host_url()) == lowercase(tmp_url()));
993 externalize_url(ace_host_url, tmp_external);
994 BOOST_CHECK(lowercase(tmp_external()) == lowercase(utf_host_url()));
995
996 external utf_group_url("http://www.gurgle.com:80/depot.cgi/" + utf());
997 url ace_group_url("http://www.gurgle.com:80/depot.cgi/" + a());
998
999 internalize_url(utf_group_url, tmp_url);
1000 L(F("ACE-encoded %s: '%s'\n") % idna_vec[i].name % tmp_url());
1001 BOOST_CHECK(lowercase(ace_group_url()) == lowercase(tmp_url()));
1002 externalize_url(ace_group_url, tmp_external);
1003 BOOST_CHECK(lowercase(tmp_external()) == lowercase(utf_group_url()));
1004 }
1005}
1006
1007static void check_url_encoding(string const & dec, string const & enc)
1008{
1009 urlenc tu;
1010 utf8 tutf;
1011 utf8_to_urlenc(dec, tu);
1012 L(F("URL-encoded to '%s'\n") % tu());
1013 BOOST_CHECK(enc == tu());
1014 urlenc_to_utf8(tu, tutf);
1015 BOOST_CHECK(tutf == dec);
1016}
1017
1018static void encode_test()
1019{
1020 check_url_encoding("hello\xF1there", "hello%f1there");
1021 check_url_encoding("hello\xF2there", "hello%f2there");
1022 check_url_encoding("hello\xF3there", "hello%f3there");
1023 check_url_encoding("hello\xF4there", "hello%f4there");
1024 check_url_encoding("hello\xF5there", "hello%f5there");
1025 check_url_encoding("hello\xF6there", "hello%f6there");
1026 check_url_encoding("hello\xE6there", "hello%e6there");
1027 check_url_encoding("hello\xD6there", "hello%d6there");
1028 check_url_encoding("hello\xC6there", "hello%c6there");
1029 check_url_encoding("\xC6there", "%c6there");
1030 check_url_encoding("hello\xC6", "hello%c6");
1031 check_url_encoding("hello\xC6\xA9there", "hello%c6%a9there");
1032 check_url_encoding("hello\xC6\xA9\xD4there", "hello%c6%a9%d4there");
1033 check_idna_encoding();
1034}
1035
1036void add_transform_tests(test_suite * suite)
1037{
1038 I(suite);
1039 suite->add(BOOST_TEST_CASE(&enc_test));
1040 suite->add(BOOST_TEST_CASE(&rdiff_test));
1041 suite->add(BOOST_TEST_CASE(&calculate_ident_test));
1042 suite->add(BOOST_TEST_CASE(&caseconv_test));
1043 suite->add(BOOST_TEST_CASE(&join_lines_test));
1044 suite->add(BOOST_TEST_CASE(&strip_ws_test));
1045 suite->add(BOOST_TEST_CASE(&encode_test));
1046}
1047
1048#endif // BUILD_UNIT_TESTS

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status