monotone

monotone Mtn Source Tree

Root/transforms.cc

1// -*- mode: C++; c-file-style: "gnu"; indent-tabs-mode: nil -*-
2// copyright (C) 2002, 2003 graydon hoare <graydon@pobox.com>
3// all rights reserved.
4// licensed to the public under the terms of the GNU GPL (>= 2)
5// see the file COPYING for details
6
7#include <algorithm>
8#include <cctype>
9#include <functional>
10#include <iterator>
11#include <sstream>
12#include <string>
13#include <vector>
14#include <wchar.h>
15
16#include <boost/tokenizer.hpp>
17#include <boost/scoped_array.hpp>
18
19#include "botan/botan.h"
20#include "botan/gzip.h"
21#include "botan/sha160.h"
22
23#include "idna/idna.h"
24#include "idna/stringprep.h"
25
26#include "cleanup.hh"
27#include "constants.hh"
28#include "sanity.hh"
29#include "transforms.hh"
30#include "vocab.hh"
31#include "work.hh"
32#include "xdelta.hh"
33
34using namespace std;
35
36// this file contains various sorts of string transformations. each
37// transformation should be self-explanatory from its type signature. see
38// transforms.hh for the summary.
39
40// NB this file uses very "value-centric" functional approach; even though
41// many of the underlying transformations are "stream-centric" and the
42// underlying libraries (eg. crypto++) are stream oriented. this will
43// probably strike some people as contemptibly inefficient, since it means
44// that occasionally 1, 2, or even 3 copies of an entire file will wind up
45// in memory at once. I am taking this approach for 3 reasons: first, I
46// want the type system to help me and value types are much easier to work
47// with than stream types. second, it is *much* easier to debug a program
48// that operates on values than streams, and correctness takes precedence
49// over all other features of this program. third, this is a peer-to-peer
50// sort of program for small-ish source-code text files, not a fileserver,
51// and is memory-limited anyways (for example, storing things in sqlite
52// requires they be able to fit in memory). you're hopefully not going to
53// be dealing with hundreds of users hammering on locks and memory
54// concurrently.
55//
56// if future analysis proves these assumptions wrong, feel free to revisit
57// the matter, but bring strong evidence along with you that the stream
58// paradigm "must" be used. this program is intended for source code
59// control and I make no bones about it.
60
61using namespace std;
62
63// the generic function
64template<typename XFM> string xform(string const & in)
65{
66 string out;
67 Botan::Pipe pipe(new XFM());
68 pipe.process_msg(in);
69 out = pipe.read_all_as_string();
70 return out;
71}
72
73// specialize it
74template string xform<Botan::Base64_Encoder>(string const &);
75template string xform<Botan::Base64_Decoder>(string const &);
76template string xform<Botan::Hex_Encoder>(string const &);
77template string xform<Botan::Hex_Decoder>(string const &);
78template string xform<Botan::Gzip_Compression>(string const &);
79template string xform<Botan::Gzip_Decompression>(string const &);
80
81// for use in hexenc encoding
82
// Encode every byte of `in` as two lowercase hexadecimal digits (high
// nibble first). The result is exactly twice as long as the input.
std::string encode_hexenc(std::string const & in)
{
  static char const hex_digits[] = "0123456789abcdef";

  // build the result in place rather than filling a scratch array and
  // copying it into a string afterwards
  std::string out;
  out.reserve(in.size() * 2);
  for (std::string::const_iterator i = in.begin(); i != in.end(); ++i)
    {
      // go through unsigned char so bytes >= 0x80 are never sign-extended
      unsigned char const byte = static_cast<unsigned char>(*i);
      out += hex_digits[byte >> 4];
      out += hex_digits[byte & 0xf];
    }
  return out;
}
96
97static inline char decode_hex_char(char c)
98{
99 if (c >= '0' && c <= '9')
100 return c - '0';
101 if (c >= 'a' && c <= 'f')
102 return c - 'a' + 10;
103 I(false);
104}
105
106string decode_hexenc(string const & in)
107{
108 I(in.size() % 2 == 0);
109 boost::scoped_array<char> buf(new char[in.size() / 2]);
110 char *c = buf.get();
111 for (string::const_iterator i = in.begin();
112 i != in.end(); ++i)
113 {
114 char t = decode_hex_char(*i++);
115 t <<= 4;
116 t |= decode_hex_char(*i);
117 *c++ = t;
118 }
119 return string(buf.get(), in.size() / 2);
120}
121
// Function object converting a single char to lowercase via ::tolower.
struct
lowerize
{
  char operator()(char const & c) const
  {
    // the <cctype> functions are only defined for EOF and values
    // representable as unsigned char; a plain (possibly negative) char
    // must be converted first.
    return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
  }
};

// Return a copy of `in` with every char passed through lowerize.
std::string
lowercase(std::string const & in)
{
  std::string n(in);
  std::transform(n.begin(), n.end(), n.begin(), lowerize());
  return n;
}
138
// Function object converting a single char to uppercase via ::toupper.
struct
upperize
{
  char operator()(char const & c) const
  {
    // the <cctype> functions are only defined for EOF and values
    // representable as unsigned char; a plain (possibly negative) char
    // must be converted first.
    return static_cast<char>(::toupper(static_cast<unsigned char>(c)));
  }
};

// Return a copy of `in` with every char passed through upperize.
std::string
uppercase(std::string const & in)
{
  std::string n(in);
  std::transform(n.begin(), n.end(), n.begin(), upperize());
  return n;
}
155
156template <typename T>
157void pack(T const & in, base64< gzip<T> > & out)
158{
159 string tmp;
160 tmp.reserve(in().size()); // FIXME: do some benchmarking and make this a constant::
161
162 Botan::Pipe pipe(new Botan::Gzip_Compression(), new Botan::Base64_Encoder);
163 pipe.process_msg(in());
164 tmp = pipe.read_all_as_string();
165 out = tmp;
166}
167
168template <typename T>
169void unpack(base64< gzip<T> > const & in, T & out)
170{
171 string tmp;
172 tmp.reserve(in().size()); // FIXME: do some benchmarking and make this a constant::
173
174 Botan::Pipe pipe(new Botan::Base64_Decoder(), new Botan::Gzip_Decompression());
175 pipe.process_msg(in());
176 tmp = pipe.read_all_as_string();
177
178 out = tmp;
179}
180
181// specialise them
182template void pack<data>(data const &, base64< gzip<data> > &);
183template void pack<delta>(delta const &, base64< gzip<delta> > &);
184template void unpack<data>(base64< gzip<data> > const &, data &);
185template void unpack<delta>(base64< gzip<delta> > const &, delta &);
186
187// diffing and patching
188
189void
190diff(data const & olddata,
191 data const & newdata,
192 delta & del)
193{
194 string unpacked;
195 compute_delta(olddata(), newdata(), unpacked);
196 del = delta(unpacked);
197}
198
199void
200patch(data const & olddata,
201 delta const & del,
202 data & newdata)
203{
204 string result;
205 apply_delta(olddata(), del(), result);
206 newdata = result;
207}
208
209void
210diff(manifest_map const & oldman,
211 manifest_map const & newman,
212 delta & del)
213{
214 string xd;
215 compute_delta(oldman, newman, xd);
216 del = delta(xd);
217}
218
219// identifier (a.k.a. sha1 signature) calculation
220
221void
222calculate_ident(data const & dat,
223 hexenc<id> & ident)
224{
225 Botan::Pipe p(new Botan::Hash_Filter("SHA-1"));
226 p.process_msg(dat());
227
228 id ident_decoded(p.read_all_as_string());
229 encode_hexenc(ident_decoded, ident);
230}
231
232void
233calculate_ident(base64< gzip<data> > const & dat,
234 hexenc<id> & ident)
235{
236 gzip<data> data_decoded;
237 data data_decompressed;
238 decode_base64(dat, data_decoded);
239 decode_gzip(data_decoded, data_decompressed);
240 calculate_ident(data_decompressed, ident);
241}
242
243void
244calculate_ident(file_data const & dat,
245 file_id & ident)
246{
247 hexenc<id> tmp;
248 calculate_ident(dat.inner(), tmp);
249 ident = tmp;
250}
251
252void
253calculate_ident(manifest_map const & m,
254 manifest_id & ident)
255{
256 size_t sz = 0;
257 static size_t bufsz = 0;
258 static char *buf = NULL;
259
260 for (manifest_map::const_iterator i = m.begin();
261 i != m.end(); ++i)
262 {
263 sz += i->second.inner()().size();
264 sz += i->first.as_internal().size();
265 sz += 3;
266 }
267
268 if (sz > bufsz)
269 {
270 bufsz = sz;
271 buf = static_cast<char *>(realloc(buf, bufsz));
272 I(buf);
273 }
274
275 // this has to go quite fast, for cvs importing
276 char *c = buf;
277 for (manifest_map::const_iterator i = m.begin();
278 i != m.end(); ++i)
279 {
280 memcpy(c, i->second.inner()().data(), i->second.inner()().size());
281 c += i->second.inner()().size();
282 *c++ = ' ';
283 *c++ = ' ';
284 memcpy(c, i->first.as_internal().data(), i->first.as_internal().size());
285 c += i->first.as_internal().size();
286 *c++ = '\n';
287 }
288
289 Botan::Pipe p(new Botan::Hash_Filter("SHA-1"));
290 p.process_msg(reinterpret_cast<Botan::byte const*>(buf), sz);
291
292 id ident_decoded(p.read_all_as_string());
293 hexenc<id> raw_ident;
294 encode_hexenc(ident_decoded, raw_ident);
295 ident = manifest_id(raw_ident);
296}
297
298void
299calculate_ident(manifest_data const & dat,
300 manifest_id & ident)
301{
302 hexenc<id> tmp;
303 calculate_ident(dat.inner(), tmp);
304 ident = tmp;
305}
306
307
308void calculate_ident(revision_data const & dat,
309 revision_id & ident)
310{
311 hexenc<id> tmp;
312 calculate_ident(dat.inner(), tmp);
313 ident = tmp;
314}
315
316void calculate_ident(revision_set const & cs,
317 revision_id & ident)
318{
319 data tmp;
320 hexenc<id> tid;
321 write_revision_set(cs, tmp);
322 calculate_ident(tmp, tid);
323 ident = tid;
324}
325
326// this might reasonably go in file_io.cc too...
327void
328calculate_ident(file_path const & file,
329 hexenc<id> & ident,
330 lua_hooks & lua)
331{
332 string db_linesep, ext_linesep;
333 string db_charset, ext_charset;
334
335 bool do_lineconv = (lua.hook_get_linesep_conv(file, db_linesep, ext_linesep)
336 && db_linesep != ext_linesep);
337
338 bool do_charconv = (lua.hook_get_charset_conv(file, db_charset, ext_charset)
339 && db_charset != ext_charset);
340
341 if (do_charconv || do_lineconv)
342 {
343 data dat;
344 read_localized_data(file, dat, lua);
345 calculate_ident(dat, ident);
346 }
347 else
348 {
349 // no conversions necessary, use streaming form
350 // Best to be safe and check it isn't a dir.
351 assert_path_is_file(file);
352 Botan::Pipe p(new Botan::Hash_Filter("SHA-1"), new Botan::Hex_Encoder());
353 Botan::DataSource_Stream infile(file.as_external());
354 p.process_msg(infile);
355
356 ident = lowercase(p.read_all_as_string());
357 }
358}
359
360void split_into_lines(std::string const & in,
361 std::string const & encoding,
362 std::vector<std::string> & out)
363{
364 std::string lc_encoding = lowercase(encoding);
365 out.clear();
366
367 // note: this function does not handle ISO-2022-X, Shift-JIS, and
368 // probably a good deal of other encodings as well. please expand
369 // the logic here if you can work out an easy way of doing line
370 // breaking on these encodings. currently it's just designed to
371 // work with charsets in which 0x0a / 0x0d are *always* \n and \r
372 // respectively.
373 //
374 // as far as I know, this covers the EUC, ISO-8859-X, GB, Big5, KOI,
375 // ASCII, and UTF-8 families of encodings.
376
377 if (lc_encoding == default_encoding
378 || lc_encoding.find("ascii") != std::string::npos
379 || lc_encoding.find("8859") != std::string::npos
380 || lc_encoding.find("euc") != std::string::npos
381 || lc_encoding.find("koi") != std::string::npos
382 || lc_encoding.find("gb") != std::string::npos
383 || lc_encoding == "utf-8"
384 || lc_encoding == "utf_8"
385 || lc_encoding == "utf8")
386 {
387 std::string::size_type begin = 0;
388 std::string::size_type end = in.find_first_of("\r\n", begin);
389
390 while (end != std::string::npos && end >= begin)
391 {
392 out.push_back(in.substr(begin, end-begin));
393 if (in.at(end) == '\r'
394 && in.size() > end+1
395 && in.at(end+1) == '\n')
396 begin = end + 2;
397 else
398 begin = end + 1;
399 if (begin >= in.size())
400 break;
401 end = in.find_first_of("\r\n", begin);
402 }
403 if (begin < in.size())
404 out.push_back(in.substr(begin, in.size() - begin));
405 }
406 else
407 {
408 out.push_back(in);
409 }
410}
411
412
413void
414split_into_lines(string const & in,
415 vector<string> & out)
416{
417 split_into_lines(in, default_encoding, out);
418}
419
// Concatenate the elements of `in` into `out`, writing `linesep` after
// every element (including the last one). The separator is taken through
// c_str(), i.e. up to its first NUL.
void
join_lines(std::vector<std::string> const & in,
           std::string & out,
           std::string const & linesep)
{
  char const * const sep = linesep.c_str();

  std::ostringstream joined;
  for (std::vector<std::string>::const_iterator line = in.begin();
       line != in.end(); ++line)
    joined << *line << sep;

  out = joined.str();
}
429
430void
431join_lines(vector<string> const & in,
432 string & out)
433{
434 join_lines(in, out, "\n");
435}
436
437void
438prefix_lines_with(string const & prefix, string const & lines, string & out)
439{
440 std::vector<std::string> msgs;
441 split_into_lines(lines, msgs);
442
443 ostringstream oss;
444 for (std::vector<string>::const_iterator i = msgs.begin();
445 i != msgs.end();)
446 {
447 oss << prefix << *i;
448 i++;
449 if (i != msgs.end())
450 oss << endl;
451 }
452
453 out = oss.str();
454}
455
// Return a copy of `s` with every '\n', '\r', '\t' and ' ' removed,
// wherever it occurs.
std::string
remove_ws(std::string const & s)
{
  std::string kept;
  kept.reserve(s.size());
  for (std::string::size_type k = 0; k < s.size(); ++k)
    {
      char const ch = s[k];
      bool const is_ws = (ch == '\n' || ch == '\r' || ch == '\t' || ch == ' ');
      if (!is_ws)
        kept += ch;
    }
  return kept;
}
478
// Return `s` with leading and trailing '\n', '\r', '\t' and ' ' characters
// removed. Interior whitespace is kept. A string made up only of those
// characters (or an empty string) yields the empty string.
std::string
trim_ws(std::string const & s)
{
  static char const ws[] = "\n\r\t ";

  std::string::size_type const first = s.find_first_not_of(ws);
  if (first == std::string::npos)
    return std::string(); // nothing but whitespace

  // a non-whitespace char exists, so find_last_not_of cannot return npos
  std::string::size_type const last = s.find_last_not_of(ws);
  return s.substr(first, last - first + 1);
}
491
492string
493canonical_base64(string const & s)
494{
495 return xform<Botan::Base64_Encoder>
496 (xform<Botan::Base64_Decoder>(s));
497}
498
499
500// general character code conversion routines
501
502static string
503system_charset()
504{
505 char const * locale_charset_name = stringprep_locale_charset ();
506 I(locale_charset_name != NULL);
507 string sys_charset(locale_charset_name);
508 return sys_charset;
509}
510
511void
512charset_convert(string const & src_charset,
513 string const & dst_charset,
514 string const & src,
515 string & dst)
516{
517 if (src_charset == dst_charset)
518 dst = src;
519 else
520 {
521 L(F("converting %d bytes from %s to %s\n") % src.size()
522 % src_charset % dst_charset);
523 char * converted = stringprep_convert(src.c_str(),
524 dst_charset.c_str(),
525 src_charset.c_str());
526 E(converted != NULL,
527 F("failed to convert string from %s to %s: '%s'")
528 % src_charset % dst_charset % src);
529 dst = string(converted);
530 free(converted);
531 }
532}
533
534void
535system_to_utf8(external const & ext, utf8 & utf)
536{
537 string out;
538 charset_convert(system_charset(), "UTF-8", ext(), out);
539 utf = out;
540}
541
542size_t
543display_width(utf8 const & utf)
544{
545 // this function is called many thousands of times by the tickers, so we
546 // try and avoid performing heap allocations by starting with a reasonable
547 // size buffer, and only ever growing the buffer if needed.
548 static size_t widebuf_sz = 128;
549 static boost::scoped_array<wchar_t> widebuf(new wchar_t[widebuf_sz]);
550
551 size_t len = mbstowcs(0, utf().c_str(), 0) + 1;
552
553 if (len == static_cast<size_t>(-1))
554 return utf().length(); // conversion failed; punt and return original length
555
556 if (len > widebuf_sz) {
557 widebuf.reset(new wchar_t[len]);
558 widebuf_sz = len;
559 }
560
561 mbstowcs(widebuf.get(), utf().c_str(), widebuf_sz);
562
563 return wcswidth(widebuf.get(), widebuf_sz);
564}
565
566// Lots of gunk to avoid charset conversion as much as possible. Running
567// iconv over every element of every path in a 30,000 file manifest takes
568// multiple seconds, which then is a minimum bound on pretty much any
569// operation we do...
570static inline bool
571system_charset_is_utf8_impl()
572{
573 std::string lc_encoding = lowercase(system_charset());
574 return (lc_encoding == "utf-8"
575 || lc_encoding == "utf_8"
576 || lc_encoding == "utf8");
577}
578
579static inline bool
580system_charset_is_utf8()
581{
582 static bool it_is = system_charset_is_utf8_impl();
583 return it_is;
584}
585
586static inline bool
587system_charset_is_ascii_extension_impl()
588{
589 if (system_charset_is_utf8())
590 return true;
591 std::string lc_encoding = lowercase(system_charset());
592 // if your character set is identical to ascii in the lower 7 bits, then add
593 // it here for a speed boost.
594 return (lc_encoding.find("ascii") != std::string::npos
595 || lc_encoding.find("8859") != std::string::npos
596 || lc_encoding.find("ansi_x3.4") != std::string::npos
597 // http://www.cs.mcgill.ca/~aelias4/encodings.html -- "EUC (Extended
598 // Unix Code) is a simple and clean encoding, standard on Unix
599 // systems.... It is backwards-compatible with ASCII (i.e. valid
600 // ASCII implies valid EUC)."
601 || lc_encoding.find("euc") != std::string::npos);
602}
603
604static inline bool
605system_charset_is_ascii_extension()
606{
607 static bool it_is = system_charset_is_ascii_extension_impl();
608 return it_is;
609}
610
// True when no byte of `utf` has its high bit (0x80) set. An empty string
// counts as all-ASCII.
inline static bool
is_all_ascii(std::string const & utf)
{
  // could speed this up by vectorization -- mask against 0x80808080,
  // process a whole word at a time...
  std::string::size_type const n = utf.size();
  for (std::string::size_type k = 0; k < n; ++k)
    {
      if (utf[k] & 0x80)
        return false;
    }
  return true;
}
621
622// this function must be fast. do not make it slow.
623void
624utf8_to_system(utf8 const & utf, std::string & ext)
625{
626 if (system_charset_is_utf8())
627 ext = utf();
628 else if (system_charset_is_ascii_extension()
629 && is_all_ascii(utf()))
630 ext = utf();
631 else
632 charset_convert("UTF-8", system_charset(), utf(), ext);
633}
634
635void
636utf8_to_system(utf8 const & utf, external & ext)
637{
638 string out;
639 utf8_to_system(utf, out);
640 ext = out;
641}
642
643static string
644decode_idna_error(int err)
645{
646 switch (static_cast<Idna_rc>(err))
647 {
648 case IDNA_STRINGPREP_ERROR: return "stringprep error"; break;
649 case IDNA_PUNYCODE_ERROR: return "punycode error"; break;
650 case IDNA_CONTAINS_NON_LDH: return "non-LDH characters"; break;
651 case IDNA_CONTAINS_MINUS: return "leading / trailing hyphen-minus character"; break;
652 case IDNA_INVALID_LENGTH: return "invalid length (output must be between 1 and 63 chars)"; break;
653 case IDNA_NO_ACE_PREFIX: return "no ace prefix"; break;
654 case IDNA_ROUNDTRIP_VERIFY_ERROR: return "roundtrip verify error"; break;
655 case IDNA_CONTAINS_ACE_PREFIX: return "contains ACE prefix (\"xn--\")"; break;
656 case IDNA_ICONV_ERROR: return "iconv error"; break;
657 case IDNA_MALLOC_ERROR: return "malloc error"; break;
658 default: return "unknown error"; break;
659 }
660 return "unknown error";
661}
662
663void
664ace_to_utf8(ace const & a, utf8 & utf)
665{
666 char *out = NULL;
667 L(F("converting %d bytes from IDNA ACE to UTF-8\n") % a().size());
668 int res = idna_to_unicode_8z8z(a().c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
669 N(res == IDNA_SUCCESS || res == IDNA_NO_ACE_PREFIX,
670 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
671 % a().size()
672 % decode_idna_error(res));
673 utf = string(out);
674 free(out);
675}
676
677void
678utf8_to_ace(utf8 const & utf, ace & a)
679{
680 char *out = NULL;
681 L(F("converting %d bytes from UTF-8 to IDNA ACE\n") % utf().size());
682 int res = idna_to_ascii_8z(utf().c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
683 N(res == IDNA_SUCCESS,
684 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
685 % utf().size()
686 % decode_idna_error(res));
687 a = string(out);
688 free(out);
689}
690
691void
692internalize_cert_name(utf8 const & utf, cert_name & c)
693{
694 ace a;
695 utf8_to_ace(utf, a);
696 c = a();
697}
698
699void
700internalize_cert_name(external const & ext, cert_name & c)
701{
702 utf8 utf;
703 system_to_utf8(ext(), utf);
704 internalize_cert_name(utf, c);
705}
706
707void
708externalize_cert_name(cert_name const & c, utf8 & utf)
709{
710 ace_to_utf8(ace(c()), utf);
711}
712
713void
714externalize_cert_name(cert_name const & c, external & ext)
715{
716 utf8 utf;
717 externalize_cert_name(c, utf);
718 utf8_to_system(utf, ext);
719}
720
721void
722internalize_rsa_keypair_id(utf8 const & utf, rsa_keypair_id & key)
723{
724 string tmp;
725 typedef boost::tokenizer<boost::char_separator<char> >
726 tokenizer;
727 boost::char_separator<char> sep("", ".@", boost::keep_empty_tokens);
728 tokenizer tokens(utf(), sep);
729 bool in_domain = false;
730 for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i)
731 {
732 if (!in_domain || *i == "." || *i == "@")
733 tmp += *i;
734 else
735 {
736 ace a;
737 utf8_to_ace(*i, a);
738 tmp += a();
739 }
740 if (*i == "@")
741 in_domain = true;
742 }
743 key = tmp;
744}
745
746void
747internalize_rsa_keypair_id(external const & ext, rsa_keypair_id & key)
748{
749 utf8 utf;
750 system_to_utf8(ext, utf);
751 internalize_rsa_keypair_id(utf, key);
752}
753
754void
755externalize_rsa_keypair_id(rsa_keypair_id const & key, utf8 & utf)
756{
757 string tmp;
758 typedef boost::tokenizer<boost::char_separator<char> >
759 tokenizer;
760 boost::char_separator<char> sep("", ".@", boost::keep_empty_tokens);
761 tokenizer tokens(key(), sep);
762 bool in_domain = false;
763 for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i)
764 {
765 if (!in_domain || *i == "." || *i == "@")
766 tmp += *i;
767 else
768 {
769 ace a(*i);
770 utf8 u;
771 ace_to_utf8(a, u);
772 tmp += u();
773 }
774 if (*i == "@")
775 in_domain = true;
776 }
777 utf = tmp;
778}
779
780void
781externalize_rsa_keypair_id(rsa_keypair_id const & key, external & ext)
782{
783 utf8 utf;
784 externalize_rsa_keypair_id(key, utf);
785 utf8_to_system(utf, ext);
786}
787
788void
789internalize_var_domain(utf8 const & utf, var_domain & d)
790{
791 ace a;
792 utf8_to_ace(utf, a);
793 d = a();
794}
795
796void
797internalize_var_domain(external const & ext, var_domain & d)
798{
799 utf8 utf;
800 system_to_utf8(ext(), utf);
801 internalize_var_domain(utf, d);
802}
803
804void
805externalize_var_domain(var_domain const & d, utf8 & utf)
806{
807 ace_to_utf8(ace(d()), utf);
808}
809
810void
811externalize_var_domain(var_domain const & d, external & ext)
812{
813 utf8 utf;
814 externalize_var_domain(d, utf);
815 utf8_to_system(utf, ext);
816}
817
818void
819line_end_convert(string const & linesep, string const & src, string & dst)
820{
821 string linesep_str("\n");
822 if (linesep == "CR" || linesep == "\r")
823 linesep_str = "\r";
824 else if (linesep == "CRLF" || linesep == "\r\n")
825 linesep_str = "\r\n";
826 else if (linesep == "LF"|| linesep == "\n")
827 linesep_str = "\n";
828
829 L(F("doing linesep conversion to %s\n") % linesep);
830 vector<string> tmp;
831 split_into_lines(src, tmp);
832 join_lines(tmp, dst, linesep_str);
833 if (src.size() >= linesep.size() &&
834 (src.compare(src.size() - linesep.size(), linesep.size(), linesep) == 0))
835 dst += linesep_str;
836}
837
838
839#ifdef BUILD_UNIT_TESTS
840#include "unit_tests.hh"
841#include <stdlib.h>
842
843static void
844enc_test()
845{
846 data d2, d1("the rain in spain");
847 gzip<data> gzd1, gzd2;
848 base64< gzip<data> > bgzd;
849 encode_gzip(d1, gzd1);
850 encode_base64(gzd1, bgzd);
851 decode_base64(bgzd, gzd2);
852 BOOST_CHECK(gzd2 == gzd1);
853 decode_gzip(gzd2, d2);
854 BOOST_CHECK(d2 == d1);
855}
856
857static void
858rdiff_test()
859{
860 data dat1(string("the first day of spring\nmakes me want to sing\n"));
861 data dat2(string("the first day of summer\nis a major bummer\n"));
862 delta del;
863 diff(dat1, dat2, del);
864
865 data dat3;
866 patch(dat1, del, dat3);
867 BOOST_CHECK(dat3 == dat2);
868}
869
870static void
871calculate_ident_test()
872{
873 data input(string("the only blender which can be turned into the most powerful vaccum cleaner"));
874 hexenc<id> output;
875 string ident("86e03bdb3870e2a207dfd0dcbfd4c4f2e3bc97bd");
876 calculate_ident(input, output);
877 BOOST_CHECK(output() == ident);
878}
879
880static void
881caseconv_test()
882{
883 BOOST_CHECK(uppercase("hello") == "HELLO");
884 BOOST_CHECK(uppercase("heLlO") == "HELLO");
885 BOOST_CHECK(lowercase("POODLE DAY") == "poodle day");
886 BOOST_CHECK(lowercase("PooDLe DaY") == "poodle day");
887 BOOST_CHECK(uppercase("!@#$%^&*()") == "!@#$%^&*()");
888 BOOST_CHECK(lowercase("!@#$%^&*()") == "!@#$%^&*()");
889}
890
891static void
892join_lines_test()
893{
894 vector<string> strs;
895 string joined;
896
897 strs.clear();
898 join_lines(strs, joined);
899 BOOST_CHECK(joined == "");
900
901 strs.push_back("hi");
902 join_lines(strs, joined);
903 BOOST_CHECK(joined == "hi\n");
904
905 strs.push_back("there");
906 join_lines(strs, joined);
907 BOOST_CHECK(joined == "hi\nthere\n");
908
909 strs.push_back("user");
910 join_lines(strs, joined);
911 BOOST_CHECK(joined == "hi\nthere\nuser\n");
912}
913
914static void
915strip_ws_test()
916{
917 BOOST_CHECK(trim_ws("\n leading space") == "leading space");
918 BOOST_CHECK(trim_ws("trailing space \n") == "trailing space");
919 BOOST_CHECK(trim_ws("\t\n both \r \n\r\n") == "both");
920 BOOST_CHECK(remove_ws(" I like going\tfor walks\n ")
921 == "Ilikegoingforwalks");
922}
923
#define IDNA_ACE_PREFIX "xn--"
#define IDNA_SUCCESS 0

// Test vectors for the IDNA round-trip check: a label, the input as UCS-4
// code points (`inlen` of them stored in `in`), and the expected ACE
// string in `out`. Entries that list fewer trailing initializers leave the
// remaining int fields zero-initialized.
//
// `name` and `out` point at string literals, so they are declared
// `char const *`; binding a literal to a plain `char *` is not valid in
// current C++.
struct
idna
{
  char const *name;
  size_t inlen;
  uint32_t in[100];
  char const *out;
  int allowunassigned;
  int usestd3asciirules;
  int toasciirc;
  int tounicoderc;
} idna_vec[] =
  {
    {
      "Arabic (Egyptian)", 17,
      {
        0x0644, 0x064A, 0x0647, 0x0645, 0x0627, 0x0628, 0x062A, 0x0643,
        0x0644, 0x0645, 0x0648, 0x0634, 0x0639, 0x0631, 0x0628, 0x064A,
        0x061F},
      IDNA_ACE_PREFIX "egbpdaj6bu4bxfgehfvwxn", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Chinese (simplified)", 9,
      {
        0x4ED6, 0x4EEC, 0x4E3A, 0x4EC0, 0x4E48, 0x4E0D, 0x8BF4, 0x4E2D, 0x6587},
      IDNA_ACE_PREFIX "ihqwcrb4cv8a8dqg056pqjye", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Chinese (traditional)", 9,
      {
        0x4ED6, 0x5011, 0x7232, 0x4EC0, 0x9EBD, 0x4E0D, 0x8AAA, 0x4E2D, 0x6587},
      IDNA_ACE_PREFIX "ihqwctvzc91f659drss3x8bo0yb", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Czech", 22,
      {
        0x0050, 0x0072, 0x006F, 0x010D, 0x0070, 0x0072, 0x006F, 0x0073,
        0x0074, 0x011B, 0x006E, 0x0065, 0x006D, 0x006C, 0x0075, 0x0076,
        0x00ED, 0x010D, 0x0065, 0x0073, 0x006B, 0x0079},
      IDNA_ACE_PREFIX "Proprostnemluvesky-uyb24dma41a", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Hebrew", 22,
      {
        0x05DC, 0x05DE, 0x05D4, 0x05D4, 0x05DD, 0x05E4, 0x05E9, 0x05D5,
        0x05D8, 0x05DC, 0x05D0, 0x05DE, 0x05D3, 0x05D1, 0x05E8, 0x05D9,
        0x05DD, 0x05E2, 0x05D1, 0x05E8, 0x05D9, 0x05EA},
      IDNA_ACE_PREFIX "4dbcagdahymbxekheh6e0a7fei0b", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Hindi (Devanagari)", 30,
      {
        0x092F, 0x0939, 0x0932, 0x094B, 0x0917, 0x0939, 0x093F, 0x0928,
        0x094D, 0x0926, 0x0940, 0x0915, 0x094D, 0x092F, 0x094B, 0x0902,
        0x0928, 0x0939, 0x0940, 0x0902, 0x092C, 0x094B, 0x0932, 0x0938,
        0x0915, 0x0924, 0x0947, 0x0939, 0x0948, 0x0902},
      IDNA_ACE_PREFIX "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd", 0, 0,
      IDNA_SUCCESS},
    {
      "Japanese (kanji and hiragana)", 18,
      {
        0x306A, 0x305C, 0x307F, 0x3093, 0x306A, 0x65E5, 0x672C, 0x8A9E,
        0x3092, 0x8A71, 0x3057, 0x3066, 0x304F, 0x308C, 0x306A, 0x3044,
        0x306E, 0x304B},
      IDNA_ACE_PREFIX "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa", 0, 0,
      IDNA_SUCCESS},
    {
      "Russian (Cyrillic)", 28,
      {
        0x043F, 0x043E, 0x0447, 0x0435, 0x043C, 0x0443, 0x0436, 0x0435,
        0x043E, 0x043D, 0x0438, 0x043D, 0x0435, 0x0433, 0x043E, 0x0432,
        0x043E, 0x0440, 0x044F, 0x0442, 0x043F, 0x043E, 0x0440, 0x0443,
        0x0441, 0x0441, 0x043A, 0x0438},
      IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Spanish", 40,
      {
        0x0050, 0x006F, 0x0072, 0x0071, 0x0075, 0x00E9, 0x006E, 0x006F,
        0x0070, 0x0075, 0x0065, 0x0064, 0x0065, 0x006E, 0x0073, 0x0069,
        0x006D, 0x0070, 0x006C, 0x0065, 0x006D, 0x0065, 0x006E, 0x0074,
        0x0065, 0x0068, 0x0061, 0x0062, 0x006C, 0x0061, 0x0072, 0x0065,
        0x006E, 0x0045, 0x0073, 0x0070, 0x0061, 0x00F1, 0x006F, 0x006C},
      IDNA_ACE_PREFIX "PorqunopuedensimplementehablarenEspaol-fmd56a", 0, 0,
      IDNA_SUCCESS},
    {
      "Vietnamese", 31,
      {
        0x0054, 0x1EA1, 0x0069, 0x0073, 0x0061, 0x006F, 0x0068, 0x1ECD,
        0x006B, 0x0068, 0x00F4, 0x006E, 0x0067, 0x0074, 0x0068, 0x1EC3,
        0x0063, 0x0068, 0x1EC9, 0x006E, 0x00F3, 0x0069, 0x0074, 0x0069,
        0x1EBF, 0x006E, 0x0067, 0x0056, 0x0069, 0x1EC7, 0x0074},
      IDNA_ACE_PREFIX "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g", 0, 0,
      IDNA_SUCCESS},
    {
      "Japanese", 8,
      {
        0x0033, 0x5E74, 0x0042, 0x7D44, 0x91D1, 0x516B, 0x5148, 0x751F},
      IDNA_ACE_PREFIX "3B-ww4c5e180e575a65lsy2b", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Japanese", 24,
      {
        0x5B89, 0x5BA4, 0x5948, 0x7F8E, 0x6075, 0x002D, 0x0077, 0x0069,
        0x0074, 0x0068, 0x002D, 0x0053, 0x0055, 0x0050, 0x0045, 0x0052,
        0x002D, 0x004D, 0x004F, 0x004E, 0x004B, 0x0045, 0x0059, 0x0053},
      IDNA_ACE_PREFIX "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n", 0, 0,
      IDNA_SUCCESS},
    {
      "Japanese", 25,
      {
        0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002D, 0x0041, 0x006E,
        0x006F, 0x0074, 0x0068, 0x0065, 0x0072, 0x002D, 0x0057, 0x0061,
        0x0079, 0x002D, 0x305D, 0x308C, 0x305E, 0x308C, 0x306E, 0x5834,
        0x6240},
      IDNA_ACE_PREFIX "Hello-Another-Way--fc4qua05auwb3674vfr0b", 0, 0,
      IDNA_SUCCESS},
    {
      "Japanese", 8,
      {
        0x3072, 0x3068, 0x3064, 0x5C4B, 0x6839, 0x306E, 0x4E0B, 0x0032},
      IDNA_ACE_PREFIX "2-u9tlzr9756bt3uc0v", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Japanese", 13,
      {
        0x004D, 0x0061, 0x006A, 0x0069, 0x3067, 0x004B, 0x006F, 0x0069,
        0x3059, 0x308B, 0x0035, 0x79D2, 0x524D},
      IDNA_ACE_PREFIX "MajiKoi5-783gue6qz075azm5e", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Japanese", 9,
      {
        0x30D1, 0x30D5, 0x30A3, 0x30FC, 0x0064, 0x0065, 0x30EB, 0x30F3, 0x30D0},
      IDNA_ACE_PREFIX "de-jg4avhby1noc0d", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 7,
      {
        0x305D, 0x306E, 0x30B9, 0x30D4, 0x30FC, 0x30C9, 0x3067},
      IDNA_ACE_PREFIX "d9juau41awczczp", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Greek", 8,
      {0x03b5, 0x03bb, 0x03bb, 0x03b7, 0x03bd, 0x03b9, 0x03ba, 0x03ac},
      IDNA_ACE_PREFIX "hxargifdar", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Maltese (Malti)", 10,
      {0x0062, 0x006f, 0x006e, 0x0121, 0x0075, 0x0073, 0x0061, 0x0127,
       0x0127, 0x0061},
      IDNA_ACE_PREFIX "bonusaa-5bb1da", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Russian (Cyrillic)", 28,
      {0x043f, 0x043e, 0x0447, 0x0435, 0x043c, 0x0443, 0x0436, 0x0435,
       0x043e, 0x043d, 0x0438, 0x043d, 0x0435, 0x0433, 0x043e, 0x0432,
       0x043e, 0x0440, 0x044f, 0x0442, 0x043f, 0x043e, 0x0440, 0x0443,
       0x0441, 0x0441, 0x043a, 0x0438},
      IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
  };
1085
1086static void
1087check_idna_encoding()
1088{
1089 putenv("CHARSET=UTF-8");
1090
1091 for (size_t i = 0; i < sizeof(idna_vec) / sizeof(struct idna); ++i)
1092 {
1093 BOOST_CHECKPOINT("IDNA language: " + string(idna_vec[i].name));
1094
1095 size_t p, q;
1096 char *uc = stringprep_ucs4_to_utf8(idna_vec[i].in,
1097 idna_vec[i].inlen,
1098 &p, &q);
1099 utf8 utf = string(uc);
1100 utf8 tutf;
1101 free(uc);
1102
1103 ace a = string(idna_vec[i].out);
1104 ace tace;
1105 utf8_to_ace(utf, tace);
1106 L(boost::format("ACE-encoded %s: '%s'\n") % idna_vec[i].name % tace());
1107 BOOST_CHECK(lowercase(a()) == lowercase(tace()));
1108 ace_to_utf8(a, tutf);
1109 BOOST_CHECK(lowercase(utf()) == lowercase(tutf()));
1110 }
1111}
1112
1113static void encode_test()
1114{
1115 check_idna_encoding();
1116}
1117
1118void
1119add_transform_tests(test_suite * suite)
1120{
1121 I(suite);
1122 suite->add(BOOST_TEST_CASE(&enc_test));
1123 suite->add(BOOST_TEST_CASE(&rdiff_test));
1124 suite->add(BOOST_TEST_CASE(&calculate_ident_test));
1125 suite->add(BOOST_TEST_CASE(&caseconv_test));
1126 suite->add(BOOST_TEST_CASE(&join_lines_test));
1127 suite->add(BOOST_TEST_CASE(&strip_ws_test));
1128 suite->add(BOOST_TEST_CASE(&encode_test));
1129}
1130
1131#endif // BUILD_UNIT_TESTS

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status