monotone

monotone Mtn Source Tree

Root/transforms.cc

1// -*- mode: C++; c-file-style: "gnu"; indent-tabs-mode: nil -*-
2// copyright (C) 2002, 2003 graydon hoare <graydon@pobox.com>
3// all rights reserved.
4// licensed to the public under the terms of the GNU GPL (>= 2)
5// see the file COPYING for details
6
7#include <algorithm>
8#include <cctype>
9#include <functional>
10#include <iterator>
11#include <sstream>
12#include <string>
13#include <vector>
14
15#include <boost/tokenizer.hpp>
16#include <boost/scoped_array.hpp>
17#include <boost/date_time/posix_time/posix_time.hpp>
18
19#include "botan/botan.h"
20#include "botan/gzip.h"
21#include "botan/sha160.h"
22
23#include "idna/idna.h"
24#include "idna/stringprep.h"
25
26#include "cleanup.hh"
27#include "constants.hh"
28#include "sanity.hh"
29#include "transforms.hh"
30#include "vocab.hh"
31#include "work.hh"
32#include "xdelta.hh"
33
34using namespace std;
35
// this file contains various sorts of string transformations. each
// transformation should be self-explanatory from its type signature. see
// transforms.hh for the summary.
39
// NB this file uses very "value-centric" functional approach; even though
// many of the underlying transformations are "stream-centric" and the
// underlying libraries (eg. crypto++) are stream oriented. this will
// probably strike some people as contemptibly inefficient, since it means
// that occasionally 1, 2, or even 3 copies of an entire file will wind up
// in memory at once. I am taking this approach for 3 reasons: first, I
// want the type system to help me and value types are much easier to work
// with than stream types. second, it is *much* easier to debug a program
// that operates on values than streams, and correctness takes precedence
// over all other features of this program. third, this is a peer-to-peer
// sort of program for small-ish source-code text files, not a fileserver,
// and is memory-limited anyways (for example, storing things in sqlite
// requires they be able to fit in memory). you're hopefully not going to
// be dealing with hundreds of users hammering on locks and memory
// concurrently.
55//
56// if future analysis proves these assumptions wrong, feel free to revisit
57// the matter, but bring strong evidence along with you that the stream
58// paradigm "must" be used. this program is intended for source code
59// control and I make no bones about it.
60
61using namespace std;
62
63// the generic function
64template<typename XFM> string xform(string const & in)
65{
66 string out;
67 Botan::Pipe pipe(new XFM());
68 pipe.process_msg(in);
69 out = pipe.read_all_as_string();
70 return out;
71}
72
// explicit instantiations of xform, one per Botan filter type listed here
// (base64, hex and gzip, each in both directions).
template string xform<Botan::Base64_Encoder>(string const &);
template string xform<Botan::Base64_Decoder>(string const &);
template string xform<Botan::Hex_Encoder>(string const &);
template string xform<Botan::Hex_Decoder>(string const &);
template string xform<Botan::Gzip_Compression>(string const &);
template string xform<Botan::Gzip_Decompression>(string const &);
80
// for use in hexenc encoding
//
// turns every byte of `in` into two lowercase hex digits, high nibble
// first; the result is exactly twice as long as the input.
std::string encode_hexenc(std::string const & in)
{
  static char const * const digits = "0123456789abcdef";
  std::string out(in.size() * 2, '\0');
  std::string::size_type o = 0;
  for (std::string::size_type k = 0; k < in.size(); ++k)
    {
      char const byte = in[k];
      out[o++] = digits[(byte >> 4) & 0xf];
      out[o++] = digits[byte & 0xf];
    }
  return out;
}
96
97static inline char decode_hex_char(char c)
98{
99 if (c >= '0' && c <= '9')
100 return c - '0';
101 if (c >= 'a' && c <= 'f')
102 return c - 'a' + 10;
103 I(false);
104}
105
106string decode_hexenc(string const & in)
107{
108 I(in.size() % 2 == 0);
109 boost::scoped_array<char> buf(new char[in.size() / 2]);
110 char *c = buf.get();
111 for (string::const_iterator i = in.begin();
112 i != in.end(); ++i)
113 {
114 char t = decode_hex_char(*i++);
115 t <<= 4;
116 t |= decode_hex_char(*i);
117 *c++ = t;
118 }
119 return string(buf.get(), in.size() / 2);
120}
121
// functor for std::transform: lowercases a single char via ::tolower.
struct
lowerize
{
  char operator()(char const & c) const
  {
    // ::tolower is only defined for values representable as unsigned char
    // (or EOF). a plain char holding a byte >= 0x80 may be negative, so
    // convert through unsigned char before widening to int.
    return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
  }
};

// returns a copy of `in` with every char passed through lowerize.
std::string
lowercase(std::string const & in)
{
  std::string n(in);
  std::transform(n.begin(), n.end(), n.begin(), lowerize());
  return n;
}
138
// functor for std::transform: uppercases a single char via ::toupper.
struct
upperize
{
  char operator()(char const & c) const
  {
    // ::toupper is only defined for values representable as unsigned char
    // (or EOF). a plain char holding a byte >= 0x80 may be negative, so
    // convert through unsigned char before widening to int.
    return static_cast<char>(::toupper(static_cast<unsigned char>(c)));
  }
};

// returns a copy of `in` with every char passed through upperize.
std::string
uppercase(std::string const & in)
{
  std::string n(in);
  std::transform(n.begin(), n.end(), n.begin(), upperize());
  return n;
}
155
156template <typename T>
157void pack(T const & in, base64< gzip<T> > & out)
158{
159 string tmp;
160 tmp.reserve(in().size()); // FIXME: do some benchmarking and make this a constant::
161
162 Botan::Pipe pipe(new Botan::Gzip_Compression(), new Botan::Base64_Encoder);
163 pipe.process_msg(in());
164 tmp = pipe.read_all_as_string();
165 out = tmp;
166}
167
168template <typename T>
169void unpack(base64< gzip<T> > const & in, T & out)
170{
171 string tmp;
172 tmp.reserve(in().size()); // FIXME: do some benchmarking and make this a constant::
173
174 Botan::Pipe pipe(new Botan::Base64_Decoder(), new Botan::Gzip_Decompression());
175 pipe.process_msg(in());
176 tmp = pipe.read_all_as_string();
177
178 out = tmp;
179}
180
// explicit instantiations of pack and unpack for the two payload types
// used here: data and delta.
template void pack<data>(data const &, base64< gzip<data> > &);
template void pack<delta>(delta const &, base64< gzip<delta> > &);
template void unpack<data>(base64< gzip<data> > const &, data &);
template void unpack<delta>(base64< gzip<delta> > const &, delta &);
186
187// diffing and patching
188
189void
190diff(data const & olddata,
191 data const & newdata,
192 delta & del)
193{
194 string unpacked;
195 compute_delta(olddata(), newdata(), unpacked);
196 del = delta(unpacked);
197}
198
199void
200patch(data const & olddata,
201 delta const & del,
202 data & newdata)
203{
204 string result;
205 apply_delta(olddata(), del(), result);
206 newdata = result;
207}
208
209void
210diff(manifest_map const & oldman,
211 manifest_map const & newman,
212 delta & del)
213{
214 string xd;
215 compute_delta(oldman, newman, xd);
216 del = delta(xd);
217}
218
219// identifier (a.k.a. sha1 signature) calculation
220
221void
222calculate_ident(data const & dat,
223 hexenc<id> & ident)
224{
225 Botan::Pipe p(new Botan::Hash_Filter("SHA-1"));
226 p.process_msg(dat());
227
228 id ident_decoded(p.read_all_as_string());
229 encode_hexenc(ident_decoded, ident);
230}
231
232void
233calculate_ident(base64< gzip<data> > const & dat,
234 hexenc<id> & ident)
235{
236 gzip<data> data_decoded;
237 data data_decompressed;
238 decode_base64(dat, data_decoded);
239 decode_gzip(data_decoded, data_decompressed);
240 calculate_ident(data_decompressed, ident);
241}
242
243void
244calculate_ident(file_data const & dat,
245 file_id & ident)
246{
247 hexenc<id> tmp;
248 calculate_ident(dat.inner(), tmp);
249 ident = tmp;
250}
251
252void
253calculate_ident(manifest_map const & m,
254 manifest_id & ident)
255{
256 size_t sz = 0;
257 static size_t bufsz = 0;
258 static char *buf = NULL;
259
260 for (manifest_map::const_iterator i = m.begin();
261 i != m.end(); ++i)
262 {
263 sz += i->second.inner()().size();
264 sz += i->first.as_internal().size();
265 sz += 3;
266 }
267
268 if (sz > bufsz)
269 {
270 bufsz = sz;
271 buf = static_cast<char *>(realloc(buf, bufsz));
272 I(buf);
273 }
274
275 // this has to go quite fast, for cvs importing
276 char *c = buf;
277 for (manifest_map::const_iterator i = m.begin();
278 i != m.end(); ++i)
279 {
280 memcpy(c, i->second.inner()().data(), i->second.inner()().size());
281 c += i->second.inner()().size();
282 *c++ = ' ';
283 *c++ = ' ';
284 memcpy(c, i->first.as_internal().data(), i->first.as_internal().size());
285 c += i->first.as_internal().size();
286 *c++ = '\n';
287 }
288
289 Botan::Pipe p(new Botan::Hash_Filter("SHA-1"));
290 p.process_msg(reinterpret_cast<Botan::byte const*>(buf), sz);
291
292 id ident_decoded(p.read_all_as_string());
293 hexenc<id> raw_ident;
294 encode_hexenc(ident_decoded, raw_ident);
295 ident = manifest_id(raw_ident);
296}
297
298void
299calculate_ident(manifest_data const & dat,
300 manifest_id & ident)
301{
302 hexenc<id> tmp;
303 calculate_ident(dat.inner(), tmp);
304 ident = tmp;
305}
306
307
308void calculate_ident(revision_data const & dat,
309 revision_id & ident)
310{
311 hexenc<id> tmp;
312 calculate_ident(dat.inner(), tmp);
313 ident = tmp;
314}
315
316void calculate_ident(revision_set const & cs,
317 revision_id & ident)
318{
319 data tmp;
320 hexenc<id> tid;
321 write_revision_set(cs, tmp);
322 calculate_ident(tmp, tid);
323 ident = tid;
324}
325
326// this might reasonably go in file_io.cc too...
327void
328calculate_ident(file_path const & file,
329 hexenc<id> & ident,
330 lua_hooks & lua)
331{
332 string db_linesep, ext_linesep;
333 string db_charset, ext_charset;
334
335 bool do_lineconv = (lua.hook_get_linesep_conv(file, db_linesep, ext_linesep)
336 && db_linesep != ext_linesep);
337
338 bool do_charconv = (lua.hook_get_charset_conv(file, db_charset, ext_charset)
339 && db_charset != ext_charset);
340
341 if (do_charconv || do_lineconv)
342 {
343 data dat;
344 read_localized_data(file, dat, lua);
345 calculate_ident(dat, ident);
346 }
347 else
348 {
349 // no conversions necessary, use streaming form
350 // Best to be safe and check it isn't a dir.
351 assert_path_is_file(file);
352 Botan::Pipe p(new Botan::Hash_Filter("SHA-1"), new Botan::Hex_Encoder());
353 Botan::DataSource_Stream infile(file.as_external());
354 p.process_msg(infile);
355
356 ident = lowercase(p.read_all_as_string());
357 }
358}
359
360void split_into_lines(std::string const & in,
361 std::string const & encoding,
362 std::vector<std::string> & out)
363{
364 std::string lc_encoding = lowercase(encoding);
365 out.clear();
366
367 // note: this function does not handle ISO-2022-X, Shift-JIS, and
368 // probably a good deal of other encodings as well. please expand
369 // the logic here if you can work out an easy way of doing line
370 // breaking on these encodings. currently it's just designed to
371 // work with charsets in which 0x0a / 0x0d are *always* \n and \r
372 // respectively.
373 //
374 // as far as I know, this covers the EUC, ISO-8859-X, GB, Big5, KOI,
375 // ASCII, and UTF-8 families of encodings.
376
377 if (lc_encoding == default_encoding
378 || lc_encoding.find("ascii") != std::string::npos
379 || lc_encoding.find("8859") != std::string::npos
380 || lc_encoding.find("euc") != std::string::npos
381 || lc_encoding.find("koi") != std::string::npos
382 || lc_encoding.find("gb") != std::string::npos
383 || lc_encoding == "utf-8"
384 || lc_encoding == "utf_8"
385 || lc_encoding == "utf8")
386 {
387 std::string::size_type begin = 0;
388 std::string::size_type end = in.find_first_of("\r\n", begin);
389
390 while (end != std::string::npos && end >= begin)
391 {
392 out.push_back(in.substr(begin, end-begin));
393 if (in.at(end) == '\r'
394 && in.size() > end+1
395 && in.at(end+1) == '\n')
396 begin = end + 2;
397 else
398 begin = end + 1;
399 if (begin >= in.size())
400 break;
401 end = in.find_first_of("\r\n", begin);
402 }
403 if (begin < in.size())
404 out.push_back(in.substr(begin, in.size() - begin));
405 }
406 else
407 {
408 out.push_back(in);
409 }
410}
411
412
413void
414split_into_lines(string const & in,
415 vector<string> & out)
416{
417 split_into_lines(in, default_encoding, out);
418}
419
// concatenate the elements of `in` into `out`, writing the separator
// after *every* element (so a non-empty result always ends with it).
// the separator is used as a C string, i.e. up to its first NUL byte.
void
join_lines(std::vector<std::string> const & in,
           std::string & out,
           std::string const & linesep)
{
  char const * const sep = linesep.c_str();
  std::string joined;
  for (std::vector<std::string>::const_iterator i = in.begin();
       i != in.end(); ++i)
    {
      joined += *i;
      joined += sep;
    }
  out = joined;
}

// convenience overload: join with "\n" as the separator.
void
join_lines(std::vector<std::string> const & in,
           std::string & out)
{
  join_lines(in, out, "\n");
}
436
437void
438prefix_lines_with(string const & prefix, string const & lines, string & out)
439{
440 std::vector<std::string> msgs;
441 split_into_lines(lines, msgs);
442
443 ostringstream oss;
444 for (std::vector<string>::const_iterator i = msgs.begin();
445 i != msgs.end();)
446 {
447 oss << prefix << *i;
448 i++;
449 if (i != msgs.end())
450 oss << endl;
451 }
452
453 out = oss.str();
454}
455
// returns a copy of `s` with every newline, carriage return, tab and
// space removed, wherever it occurs.
std::string
remove_ws(std::string const & s)
{
  std::string kept;
  kept.reserve(s.size());
  for (std::string::size_type k = 0; k < s.size(); ++k)
    {
      char const ch = s[k];
      bool const is_ws = (ch == '\n' || ch == '\r' || ch == '\t' || ch == ' ');
      if (!is_ws)
        kept += ch;
    }
  return kept;
}
478
// returns a copy of `s` with leading and trailing newlines, carriage
// returns, tabs and spaces removed. interior whitespace is kept.
//
// a string consisting only of whitespace trims to the empty string. the
// old code skipped both trimming steps in that case (each search returned
// npos) and handed the input back unchanged.
std::string
trim_ws(std::string const & s)
{
  static char const * const ws = "\n\r\t ";

  std::string::size_type const first = s.find_first_not_of(ws);
  if (first == std::string::npos)
    return std::string();

  // a non-whitespace char exists, so this search cannot fail
  std::string::size_type const last = s.find_last_not_of(ws);
  return s.substr(first, last - first + 1);
}
491
492string
493canonical_base64(string const & s)
494{
495 return xform<Botan::Base64_Encoder>
496 (xform<Botan::Base64_Decoder>(s));
497}
498
499
500// general character code conversion routines
501
502static string
503system_charset()
504{
505 char const * locale_charset_name = stringprep_locale_charset ();
506 I(locale_charset_name != NULL);
507 string sys_charset(locale_charset_name);
508 return sys_charset;
509}
510
511void
512charset_convert(string const & src_charset,
513 string const & dst_charset,
514 string const & src,
515 string & dst)
516{
517 if (src_charset == dst_charset)
518 dst = src;
519 else
520 {
521 L(F("converting %d bytes from %s to %s\n") % src.size()
522 % src_charset % dst_charset);
523 char * converted = stringprep_convert(src.c_str(),
524 dst_charset.c_str(),
525 src_charset.c_str());
526 E(converted != NULL,
527 F("failed to convert string from %s to %s: '%s'")
528 % src_charset % dst_charset % src);
529 dst = string(converted);
530 free(converted);
531 }
532}
533
534void
535system_to_utf8(external const & ext, utf8 & utf)
536{
537 string out;
538 charset_convert(system_charset(), "UTF-8", ext(), out);
539 utf = out;
540}
541
542// hack: this is an unexposed function in libidna
543extern "C" long g_utf8_strlen(const char * p, size_t max);
544
545size_t
546length(utf8 const & utf)
547{
548 return g_utf8_strlen(utf().c_str(), utf().size());
549}
550
551// Lots of gunk to avoid charset conversion as much as possible. Running
552// iconv over every element of every path in a 30,000 file manifest takes
553// multiple seconds, which then is a minimum bound on pretty much any
554// operation we do...
555static inline bool
556system_charset_is_utf8_impl()
557{
558 std::string lc_encoding = lowercase(system_charset());
559 return (lc_encoding == "utf-8"
560 || lc_encoding == "utf_8"
561 || lc_encoding == "utf8");
562}
563
564static inline bool
565system_charset_is_utf8()
566{
567 static bool it_is = system_charset_is_utf8_impl();
568 return it_is;
569}
570
571static inline bool
572system_charset_is_ascii_extension_impl()
573{
574 if (system_charset_is_utf8())
575 return true;
576 std::string lc_encoding = lowercase(system_charset());
577 // if your character set is identical to ascii in the lower 7 bits, then add
578 // it here for a speed boost.
579 return (lc_encoding.find("ascii") != std::string::npos
580 || lc_encoding.find("8859") != std::string::npos
581 || lc_encoding.find("ansi_x3.4") != std::string::npos
582 // http://www.cs.mcgill.ca/~aelias4/encodings.html -- "EUC (Extended
583 // Unix Code) is a simple and clean encoding, standard on Unix
584 // systems.... It is backwards-compatible with ASCII (i.e. valid
585 // ASCII implies valid EUC)."
586 || lc_encoding.find("euc") != std::string::npos);
587}
588
589static inline bool
590system_charset_is_ascii_extension()
591{
592 static bool it_is = system_charset_is_ascii_extension_impl();
593 return it_is;
594}
595
// true when no byte of `utf` has its top bit (0x80) set; an empty string
// counts as all-ascii.
inline static bool
is_all_ascii(std::string const & utf)
{
  // could speed this up by vectorization -- mask against 0x80808080,
  // process a whole word at a time...
  std::string::size_type const len = utf.size();
  for (std::string::size_type k = 0; k < len; ++k)
    {
      if (utf[k] & 0x80)
        return false;
    }
  return true;
}
606
607// this function must be fast. do not make it slow.
608void
609utf8_to_system(utf8 const & utf, std::string & ext)
610{
611 if (system_charset_is_utf8())
612 ext = utf();
613 else if (system_charset_is_ascii_extension()
614 && is_all_ascii(utf()))
615 ext = utf();
616 else
617 charset_convert("UTF-8", system_charset(), utf(), ext);
618}
619
620void
621utf8_to_system(utf8 const & utf, external & ext)
622{
623 string out;
624 utf8_to_system(utf, out);
625 ext = out;
626}
627
628static string
629decode_idna_error(int err)
630{
631 switch (static_cast<Idna_rc>(err))
632 {
633 case IDNA_STRINGPREP_ERROR: return "stringprep error"; break;
634 case IDNA_PUNYCODE_ERROR: return "punycode error"; break;
635 case IDNA_CONTAINS_NON_LDH: return "non-LDH characters"; break;
636 case IDNA_CONTAINS_MINUS: return "leading / trailing hyphen-minus character"; break;
637 case IDNA_INVALID_LENGTH: return "invalid length (output must be between 1 and 63 chars)"; break;
638 case IDNA_NO_ACE_PREFIX: return "no ace prefix"; break;
639 case IDNA_ROUNDTRIP_VERIFY_ERROR: return "roundtrip verify error"; break;
640 case IDNA_CONTAINS_ACE_PREFIX: return "contains ACE prefix (\"xn--\")"; break;
641 case IDNA_ICONV_ERROR: return "iconv error"; break;
642 case IDNA_MALLOC_ERROR: return "malloc error"; break;
643 default: return "unknown error"; break;
644 }
645 return "unknown error";
646}
647
648void
649ace_to_utf8(ace const & a, utf8 & utf)
650{
651 char *out = NULL;
652 L(F("converting %d bytes from IDNA ACE to UTF-8\n") % a().size());
653 int res = idna_to_unicode_8z8z(a().c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
654 N(res == IDNA_SUCCESS || res == IDNA_NO_ACE_PREFIX,
655 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
656 % a().size()
657 % decode_idna_error(res));
658 utf = string(out);
659 free(out);
660}
661
662void
663utf8_to_ace(utf8 const & utf, ace & a)
664{
665 char *out = NULL;
666 L(F("converting %d bytes from UTF-8 to IDNA ACE\n") % utf().size());
667 int res = idna_to_ascii_8z(utf().c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
668 N(res == IDNA_SUCCESS,
669 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
670 % utf().size()
671 % decode_idna_error(res));
672 a = string(out);
673 free(out);
674}
675
676void
677internalize_cert_name(utf8 const & utf, cert_name & c)
678{
679 ace a;
680 utf8_to_ace(utf, a);
681 c = a();
682}
683
684void
685internalize_cert_name(external const & ext, cert_name & c)
686{
687 utf8 utf;
688 system_to_utf8(ext(), utf);
689 internalize_cert_name(utf, c);
690}
691
692void
693externalize_cert_name(cert_name const & c, utf8 & utf)
694{
695 ace_to_utf8(ace(c()), utf);
696}
697
698void
699externalize_cert_name(cert_name const & c, external & ext)
700{
701 utf8 utf;
702 externalize_cert_name(c, utf);
703 utf8_to_system(utf, ext);
704}
705
706void
707internalize_rsa_keypair_id(utf8 const & utf, rsa_keypair_id & key)
708{
709 string tmp;
710 typedef boost::tokenizer<boost::char_separator<char> >
711 tokenizer;
712 boost::char_separator<char> sep("", ".@", boost::keep_empty_tokens);
713 tokenizer tokens(utf(), sep);
714 bool in_domain = false;
715 for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i)
716 {
717 if (!in_domain || *i == "." || *i == "@")
718 tmp += *i;
719 else
720 {
721 ace a;
722 utf8_to_ace(*i, a);
723 tmp += a();
724 }
725 if (*i == "@")
726 in_domain = true;
727 }
728 key = tmp;
729}
730
731void
732internalize_rsa_keypair_id(external const & ext, rsa_keypair_id & key)
733{
734 utf8 utf;
735 system_to_utf8(ext, utf);
736 internalize_rsa_keypair_id(utf, key);
737}
738
739void
740externalize_rsa_keypair_id(rsa_keypair_id const & key, utf8 & utf)
741{
742 string tmp;
743 typedef boost::tokenizer<boost::char_separator<char> >
744 tokenizer;
745 boost::char_separator<char> sep("", ".@", boost::keep_empty_tokens);
746 tokenizer tokens(key(), sep);
747 bool in_domain = false;
748 for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i)
749 {
750 if (!in_domain || *i == "." || *i == "@")
751 tmp += *i;
752 else
753 {
754 ace a(*i);
755 utf8 u;
756 ace_to_utf8(a, u);
757 tmp += u();
758 }
759 if (*i == "@")
760 in_domain = true;
761 }
762 utf = tmp;
763}
764
765void
766externalize_rsa_keypair_id(rsa_keypair_id const & key, external & ext)
767{
768 utf8 utf;
769 externalize_rsa_keypair_id(key, utf);
770 utf8_to_system(utf, ext);
771}
772
773void
774internalize_var_domain(utf8 const & utf, var_domain & d)
775{
776 ace a;
777 utf8_to_ace(utf, a);
778 d = a();
779}
780
781void
782internalize_var_domain(external const & ext, var_domain & d)
783{
784 utf8 utf;
785 system_to_utf8(ext(), utf);
786 internalize_var_domain(utf, d);
787}
788
789void
790externalize_var_domain(var_domain const & d, utf8 & utf)
791{
792 ace_to_utf8(ace(d()), utf);
793}
794
795void
796externalize_var_domain(var_domain const & d, external & ext)
797{
798 utf8 utf;
799 externalize_var_domain(d, utf);
800 utf8_to_system(utf, ext);
801}
802
803void
804line_end_convert(string const & linesep, string const & src, string & dst)
805{
806 string linesep_str("\n");
807 if (linesep == "CR" || linesep == "\r")
808 linesep_str = "\r";
809 else if (linesep == "CRLF" || linesep == "\r\n")
810 linesep_str = "\r\n";
811 else if (linesep == "LF"|| linesep == "\n")
812 linesep_str = "\n";
813
814 L(F("doing linesep conversion to %s\n") % linesep);
815 vector<string> tmp;
816 split_into_lines(src, tmp);
817 join_lines(tmp, dst, linesep_str);
818 if (src.size() >= linesep.size() &&
819 (src.compare(src.size() - linesep.size(), linesep.size(), linesep) == 0))
820 dst += linesep_str;
821}
822
823
824boost::posix_time::ptime
825string_to_datetime(std::string const & s)
826{
827 try
828 {
829 // boost::posix_time is lame: it can parse "basic" ISO times, of the
830 // form 20000101T120000, but not "extended" ISO times, of the form
831 // 2000-01-01T12:00:00. So do something stupid to convert one to the
832 // other.
833 std::string tmp = s;
834 std::string::size_type pos = 0;
835 while ((pos = tmp.find_first_of("-:")) != string::npos)
836 tmp.erase(pos, 1);
837 return boost::posix_time::from_iso_string(tmp);
838 }
839 catch (std::out_of_range &e)
840 {
841 N(false, F("failed to parse date string '%s': %s") % s % e.what());
842 }
843 catch (std::exception &)
844 {
845 N(false, F("failed to parse date string '%s'") % s);
846 }
847 I(false);
848}
849
850
851#ifdef BUILD_UNIT_TESTS
852#include "unit_tests.hh"
853#include <stdlib.h>
854
855static void
856enc_test()
857{
858 data d2, d1("the rain in spain");
859 gzip<data> gzd1, gzd2;
860 base64< gzip<data> > bgzd;
861 encode_gzip(d1, gzd1);
862 encode_base64(gzd1, bgzd);
863 decode_base64(bgzd, gzd2);
864 BOOST_CHECK(gzd2 == gzd1);
865 decode_gzip(gzd2, d2);
866 BOOST_CHECK(d2 == d1);
867}
868
869static void
870rdiff_test()
871{
872 data dat1(string("the first day of spring\nmakes me want to sing\n"));
873 data dat2(string("the first day of summer\nis a major bummer\n"));
874 delta del;
875 diff(dat1, dat2, del);
876
877 data dat3;
878 patch(dat1, del, dat3);
879 BOOST_CHECK(dat3 == dat2);
880}
881
882static void
883calculate_ident_test()
884{
885 data input(string("the only blender which can be turned into the most powerful vaccum cleaner"));
886 hexenc<id> output;
887 string ident("86e03bdb3870e2a207dfd0dcbfd4c4f2e3bc97bd");
888 calculate_ident(input, output);
889 BOOST_CHECK(output() == ident);
890}
891
892static void
893caseconv_test()
894{
895 BOOST_CHECK(uppercase("hello") == "HELLO");
896 BOOST_CHECK(uppercase("heLlO") == "HELLO");
897 BOOST_CHECK(lowercase("POODLE DAY") == "poodle day");
898 BOOST_CHECK(lowercase("PooDLe DaY") == "poodle day");
899 BOOST_CHECK(uppercase("!@#$%^&*()") == "!@#$%^&*()");
900 BOOST_CHECK(lowercase("!@#$%^&*()") == "!@#$%^&*()");
901}
902
903static void
904join_lines_test()
905{
906 vector<string> strs;
907 string joined;
908
909 strs.clear();
910 join_lines(strs, joined);
911 BOOST_CHECK(joined == "");
912
913 strs.push_back("hi");
914 join_lines(strs, joined);
915 BOOST_CHECK(joined == "hi\n");
916
917 strs.push_back("there");
918 join_lines(strs, joined);
919 BOOST_CHECK(joined == "hi\nthere\n");
920
921 strs.push_back("user");
922 join_lines(strs, joined);
923 BOOST_CHECK(joined == "hi\nthere\nuser\n");
924}
925
926static void
927strip_ws_test()
928{
929 BOOST_CHECK(trim_ws("\n leading space") == "leading space");
930 BOOST_CHECK(trim_ws("trailing space \n") == "trailing space");
931 BOOST_CHECK(trim_ws("\t\n both \r \n\r\n") == "both");
932 BOOST_CHECK(remove_ws(" I like going\tfor walks\n ")
933 == "Ilikegoingforwalks");
934}
935
#define IDNA_ACE_PREFIX "xn--"
#define IDNA_SUCCESS 0

// table of IDNA test vectors used by check_idna_encoding.
//
// each entry gives a label (`name`), the input as `inlen` UCS-4 code
// points in `in`, and the expected ACE string in `out`. of the remaining
// fields, only the layout is visible here; check_idna_encoding reads
// `name`, `in`, `inlen` and `out`.
//
// `name` and `out` point at string literals and are therefore declared
// char const *. entries that leave out the final field(s) get them
// zero-initialised, which equals IDNA_SUCCESS as defined above.
struct
idna
{
  char const *name;
  size_t inlen;
  uint32_t in[100];
  char const *out;
  int allowunassigned;
  int usestd3asciirules;
  int toasciirc;
  int tounicoderc;
} idna_vec[] =
  {
    {
      "Arabic (Egyptian)", 17,
      {
        0x0644, 0x064A, 0x0647, 0x0645, 0x0627, 0x0628, 0x062A, 0x0643,
        0x0644, 0x0645, 0x0648, 0x0634, 0x0639, 0x0631, 0x0628, 0x064A,
        0x061F},
      IDNA_ACE_PREFIX "egbpdaj6bu4bxfgehfvwxn", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Chinese (simplified)", 9,
      {
        0x4ED6, 0x4EEC, 0x4E3A, 0x4EC0, 0x4E48, 0x4E0D, 0x8BF4, 0x4E2D, 0x6587},
      IDNA_ACE_PREFIX "ihqwcrb4cv8a8dqg056pqjye", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Chinese (traditional)", 9,
      {
        0x4ED6, 0x5011, 0x7232, 0x4EC0, 0x9EBD, 0x4E0D, 0x8AAA, 0x4E2D, 0x6587},
      IDNA_ACE_PREFIX "ihqwctvzc91f659drss3x8bo0yb", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Czech", 22,
      {
        0x0050, 0x0072, 0x006F, 0x010D, 0x0070, 0x0072, 0x006F, 0x0073,
        0x0074, 0x011B, 0x006E, 0x0065, 0x006D, 0x006C, 0x0075, 0x0076,
        0x00ED, 0x010D, 0x0065, 0x0073, 0x006B, 0x0079},
      IDNA_ACE_PREFIX "Proprostnemluvesky-uyb24dma41a", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Hebrew", 22,
      {
        0x05DC, 0x05DE, 0x05D4, 0x05D4, 0x05DD, 0x05E4, 0x05E9, 0x05D5,
        0x05D8, 0x05DC, 0x05D0, 0x05DE, 0x05D3, 0x05D1, 0x05E8, 0x05D9,
        0x05DD, 0x05E2, 0x05D1, 0x05E8, 0x05D9, 0x05EA},
      IDNA_ACE_PREFIX "4dbcagdahymbxekheh6e0a7fei0b", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Hindi (Devanagari)", 30,
      {
        0x092F, 0x0939, 0x0932, 0x094B, 0x0917, 0x0939, 0x093F, 0x0928,
        0x094D, 0x0926, 0x0940, 0x0915, 0x094D, 0x092F, 0x094B, 0x0902,
        0x0928, 0x0939, 0x0940, 0x0902, 0x092C, 0x094B, 0x0932, 0x0938,
        0x0915, 0x0924, 0x0947, 0x0939, 0x0948, 0x0902},
      IDNA_ACE_PREFIX "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd", 0, 0,
      IDNA_SUCCESS},
    {
      "Japanese (kanji and hiragana)", 18,
      {
        0x306A, 0x305C, 0x307F, 0x3093, 0x306A, 0x65E5, 0x672C, 0x8A9E,
        0x3092, 0x8A71, 0x3057, 0x3066, 0x304F, 0x308C, 0x306A, 0x3044,
        0x306E, 0x304B},
      IDNA_ACE_PREFIX "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa", 0, 0,
      IDNA_SUCCESS},
    {
      "Russian (Cyrillic)", 28,
      {
        0x043F, 0x043E, 0x0447, 0x0435, 0x043C, 0x0443, 0x0436, 0x0435,
        0x043E, 0x043D, 0x0438, 0x043D, 0x0435, 0x0433, 0x043E, 0x0432,
        0x043E, 0x0440, 0x044F, 0x0442, 0x043F, 0x043E, 0x0440, 0x0443,
        0x0441, 0x0441, 0x043A, 0x0438},
      IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Spanish", 40,
      {
        0x0050, 0x006F, 0x0072, 0x0071, 0x0075, 0x00E9, 0x006E, 0x006F,
        0x0070, 0x0075, 0x0065, 0x0064, 0x0065, 0x006E, 0x0073, 0x0069,
        0x006D, 0x0070, 0x006C, 0x0065, 0x006D, 0x0065, 0x006E, 0x0074,
        0x0065, 0x0068, 0x0061, 0x0062, 0x006C, 0x0061, 0x0072, 0x0065,
        0x006E, 0x0045, 0x0073, 0x0070, 0x0061, 0x00F1, 0x006F, 0x006C},
      IDNA_ACE_PREFIX "PorqunopuedensimplementehablarenEspaol-fmd56a", 0, 0,
      IDNA_SUCCESS},
    {
      "Vietnamese", 31,
      {
        0x0054, 0x1EA1, 0x0069, 0x0073, 0x0061, 0x006F, 0x0068, 0x1ECD,
        0x006B, 0x0068, 0x00F4, 0x006E, 0x0067, 0x0074, 0x0068, 0x1EC3,
        0x0063, 0x0068, 0x1EC9, 0x006E, 0x00F3, 0x0069, 0x0074, 0x0069,
        0x1EBF, 0x006E, 0x0067, 0x0056, 0x0069, 0x1EC7, 0x0074},
      IDNA_ACE_PREFIX "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g", 0, 0,
      IDNA_SUCCESS},
    {
      "Japanese", 8,
      {
        0x0033, 0x5E74, 0x0042, 0x7D44, 0x91D1, 0x516B, 0x5148, 0x751F},
      IDNA_ACE_PREFIX "3B-ww4c5e180e575a65lsy2b", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Japanese", 24,
      {
        0x5B89, 0x5BA4, 0x5948, 0x7F8E, 0x6075, 0x002D, 0x0077, 0x0069,
        0x0074, 0x0068, 0x002D, 0x0053, 0x0055, 0x0050, 0x0045, 0x0052,
        0x002D, 0x004D, 0x004F, 0x004E, 0x004B, 0x0045, 0x0059, 0x0053},
      IDNA_ACE_PREFIX "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n", 0, 0,
      IDNA_SUCCESS},
    {
      "Japanese", 25,
      {
        0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002D, 0x0041, 0x006E,
        0x006F, 0x0074, 0x0068, 0x0065, 0x0072, 0x002D, 0x0057, 0x0061,
        0x0079, 0x002D, 0x305D, 0x308C, 0x305E, 0x308C, 0x306E, 0x5834,
        0x6240},
      IDNA_ACE_PREFIX "Hello-Another-Way--fc4qua05auwb3674vfr0b", 0, 0,
      IDNA_SUCCESS},
    {
      "Japanese", 8,
      {
        0x3072, 0x3068, 0x3064, 0x5C4B, 0x6839, 0x306E, 0x4E0B, 0x0032},
      IDNA_ACE_PREFIX "2-u9tlzr9756bt3uc0v", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Japanese", 13,
      {
        0x004D, 0x0061, 0x006A, 0x0069, 0x3067, 0x004B, 0x006F, 0x0069,
        0x3059, 0x308B, 0x0035, 0x79D2, 0x524D},
      IDNA_ACE_PREFIX "MajiKoi5-783gue6qz075azm5e", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Japanese", 9,
      {
        0x30D1, 0x30D5, 0x30A3, 0x30FC, 0x0064, 0x0065, 0x30EB, 0x30F3, 0x30D0},
      IDNA_ACE_PREFIX "de-jg4avhby1noc0d", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 7,
      {
        0x305D, 0x306E, 0x30B9, 0x30D4, 0x30FC, 0x30C9, 0x3067},
      IDNA_ACE_PREFIX "d9juau41awczczp", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Greek", 8,
      {0x03b5, 0x03bb, 0x03bb, 0x03b7, 0x03bd, 0x03b9, 0x03ba, 0x03ac},
      IDNA_ACE_PREFIX "hxargifdar", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Maltese (Malti)", 10,
      {0x0062, 0x006f, 0x006e, 0x0121, 0x0075, 0x0073, 0x0061, 0x0127,
       0x0127, 0x0061},
      IDNA_ACE_PREFIX "bonusaa-5bb1da", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Russian (Cyrillic)", 28,
      {0x043f, 0x043e, 0x0447, 0x0435, 0x043c, 0x0443, 0x0436, 0x0435,
       0x043e, 0x043d, 0x0438, 0x043d, 0x0435, 0x0433, 0x043e, 0x0432,
       0x043e, 0x0440, 0x044f, 0x0442, 0x043f, 0x043e, 0x0440, 0x0443,
       0x0441, 0x0441, 0x043a, 0x0438},
      IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
  };
1097
1098static void
1099check_idna_encoding()
1100{
1101 putenv("CHARSET=UTF-8");
1102
1103 for (size_t i = 0; i < sizeof(idna_vec) / sizeof(struct idna); ++i)
1104 {
1105 BOOST_CHECKPOINT("IDNA language: " + string(idna_vec[i].name));
1106
1107 size_t p, q;
1108 char *uc = stringprep_ucs4_to_utf8(idna_vec[i].in,
1109 idna_vec[i].inlen,
1110 &p, &q);
1111 utf8 utf = string(uc);
1112 utf8 tutf;
1113 free(uc);
1114
1115 ace a = string(idna_vec[i].out);
1116 ace tace;
1117 utf8_to_ace(utf, tace);
1118 L(boost::format("ACE-encoded %s: '%s'\n") % idna_vec[i].name % tace());
1119 BOOST_CHECK(lowercase(a()) == lowercase(tace()));
1120 ace_to_utf8(a, tutf);
1121 BOOST_CHECK(lowercase(utf()) == lowercase(tutf()));
1122 }
1123}
1124
1125static void encode_test()
1126{
1127 check_idna_encoding();
1128}
1129
// register every unit test defined in this file with `suite`. the suite
// pointer must not be NULL (checked with I()).
void
add_transform_tests(test_suite * suite)
{
  I(suite);
  // each test is passed to BOOST_TEST_CASE individually, in the order
  // the tests are defined above.
  suite->add(BOOST_TEST_CASE(&enc_test));
  suite->add(BOOST_TEST_CASE(&rdiff_test));
  suite->add(BOOST_TEST_CASE(&calculate_ident_test));
  suite->add(BOOST_TEST_CASE(&caseconv_test));
  suite->add(BOOST_TEST_CASE(&join_lines_test));
  suite->add(BOOST_TEST_CASE(&strip_ws_test));
  suite->add(BOOST_TEST_CASE(&encode_test));
}
1142
1143#endif // BUILD_UNIT_TESTS

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status