monotone

monotone Mtn Source Tree

Root/transforms.cc

1// copyright (C) 2002, 2003 graydon hoare <graydon@pobox.com>
2// all rights reserved.
3// licensed to the public under the terms of the GNU GPL (>= 2)
4// see the file COPYING for details
5
6#include <algorithm>
7#include <cctype>
8#include <functional>
9#include <iterator>
10#include <sstream>
11#include <string>
12#include <vector>
13
14#include <boost/filesystem/path.hpp>
15#include <boost/tokenizer.hpp>
16
17#include "cryptopp/filters.h"
18#include "cryptopp/files.h"
19#include "cryptopp/sha.h"
20#include "cryptopp/hex.h"
21#include "cryptopp/base64.h"
22#include "cryptopp/gzip.h"
23
24#include "idna/idna.h"
25#include "idna/stringprep.h"
26
27#include "cleanup.hh"
28#include "constants.hh"
29#include "sanity.hh"
30#include "transforms.hh"
31#include "vocab.hh"
32#include "work.hh"
33#include "xdelta.hh"
34
35using namespace std;
36
// this file contains various sorts of string transformations. each
// transformation should be self-explanatory from its type signature. see
// transforms.hh for the summary.
40
41// NB this file uses very "value-centric" functional approach; even though
42// many of the underlying transformations are "stream-centric" and the
43// underlying libraries (eg. crypto++) are stream oriented. this will
// probably strike some people as contemptibly inefficient, since it means
45// that occasionally 1, 2, or even 3 copies of an entire file will wind up
46// in memory at once. I am taking this approach for 3 reasons: first, I
47// want the type system to help me and value types are much easier to work
48// with than stream types. second, it is *much* easier to debug a program
49// that operates on values than streams, and correctness takes precedence
50// over all other features of this program. third, this is a peer-to-peer
51// sort of program for small-ish source-code text files, not a fileserver,
52// and is memory-limited anyways (for example, storing things in sqlite
53// requires they be able to fit in memory). you're hopefully not going to
54// be dealing with hundreds of users hammering on locks and memory
55// concurrently.
56//
57// if future analysis proves these assumptions wrong, feel free to revisit
58// the matter, but bring strong evidence along with you that the stream
59// paradigm "must" be used. this program is intended for source code
60// control and I make no bones about it.
61
62using namespace std;
63
64// the generic function
65template<typename XFM> string xform(string const & in)
66{
67 string out;
68 out.reserve(in.size() * 2);
69 CryptoPP::StringSource
70 str(in, true,
71 new XFM(new CryptoPP::StringSink(out)));
72 return out;
73}
74
// explicitly instantiate it for each transformation we use
76template string xform<CryptoPP::Base64Encoder>(string const &);
77template string xform<CryptoPP::Base64Decoder>(string const &);
78template string xform<CryptoPP::HexEncoder>(string const &);
79template string xform<CryptoPP::HexDecoder>(string const &);
80template string xform<CryptoPP::Gzip>(string const &);
81template string xform<CryptoPP::Gunzip>(string const &);
82
83// for use in hexenc encoding
84
// functor mapping a single char to its lower-case form, for use with
// std::transform.
struct
lowerize
{
  char operator()(char const & c) const
  {
    // the <cctype> functions are only defined for values representable as
    // unsigned char (or EOF). plain char may be signed, so a byte >= 0x80
    // must be widened through unsigned char rather than straight to int.
    return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
  }
};

// returns a copy of "in" with every char passed through lowerize.
std::string
lowercase(std::string const & in)
{
  std::string n(in);
  std::transform(n.begin(), n.end(), n.begin(), lowerize());
  return n;
}
101
// functor mapping a single char to its upper-case form, for use with
// std::transform.
struct
upperize
{
  char operator()(char const & c) const
  {
    // the <cctype> functions are only defined for values representable as
    // unsigned char (or EOF). plain char may be signed, so a byte >= 0x80
    // must be widened through unsigned char rather than straight to int.
    return static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
  }
};

// returns a copy of "in" with every char passed through upperize.
std::string
uppercase(std::string const & in)
{
  std::string n(in);
  std::transform(n.begin(), n.end(), n.begin(), upperize());
  return n;
}
118
119
120// diffing and patching
121
122
123void
124diff(data const & olddata,
125 data const & newdata,
126 base64< gzip<delta> > & del)
127{
128 string unpacked;
129 compute_delta(olddata(), newdata(), unpacked);
130 pack(delta(unpacked), del);
131}
132
133void
134patch(data const & olddata,
135 base64< gzip<delta> > const & del,
136 data & newdata)
137{
138 delta unpacked;
139 unpack(del, unpacked);
140 string result;
141 apply_delta(olddata(), unpacked(), result);
142 newdata = result;
143}
144
145void
146diff(manifest_map const & oldman,
147 manifest_map const & newman,
148 base64< gzip<delta> > & del)
149{
150 string xd;
151 compute_delta(oldman, newman, xd);
152 pack(delta(xd), del);
153}
154
155void
156diff(base64< gzip<data> > const & olddata,
157 base64< gzip<data> > const & newdata,
158 base64< gzip<delta> > & del)
159{
160 gzip<data> olddata_decoded;
161 gzip<data> newdata_decoded;
162
163 decode_base64(olddata, olddata_decoded);
164 decode_base64(newdata, newdata_decoded);
165
166 data olddata_decompressed;
167 data newdata_decompressed;
168
169 decode_gzip(olddata_decoded, olddata_decompressed);
170 decode_gzip(newdata_decoded, newdata_decompressed);
171
172 diff(olddata_decompressed,
173 newdata_decompressed,
174 del);
175}
176
177void
178patch(base64< gzip<data> > const & olddata,
179 base64< gzip<delta> > const & del,
180 base64< gzip<data> > & newdata)
181{
182 data olddata_unpacked, newdata_unpacked;
183 unpack(olddata, olddata_unpacked);
184 patch(olddata_unpacked, del, newdata_unpacked);
185 pack(newdata_unpacked, newdata);
186}
187
188
189// identifier (a.k.a. sha1 signature) calculation
190
191void
192calculate_ident(data const & dat,
193 hexenc<id> & ident)
194{
195 CryptoPP::SHA hash;
196 hash.Update(reinterpret_cast<byte const *>(dat().c_str()),
197 static_cast<unsigned int>(dat().size()));
198 char digest[CryptoPP::SHA::DIGESTSIZE];
199 hash.Final(reinterpret_cast<byte *>(digest));
200 string out(digest, CryptoPP::SHA::DIGESTSIZE);
201 id ident_decoded(out);
202 encode_hexenc(ident_decoded, ident);
203}
204
205void
206calculate_ident(base64< gzip<data> > const & dat,
207 hexenc<id> & ident)
208{
209 gzip<data> data_decoded;
210 data data_decompressed;
211 decode_base64(dat, data_decoded);
212 decode_gzip(data_decoded, data_decompressed);
213 calculate_ident(data_decompressed, ident);
214}
215
216void
217calculate_ident(file_data const & dat,
218 file_id & ident)
219{
220 hexenc<id> tmp;
221 calculate_ident(dat.inner(), tmp);
222 ident = tmp;
223}
224
225void
226calculate_ident(manifest_map const & m,
227 manifest_id & ident)
228{
229 CryptoPP::SHA hash;
230 size_t sz = 0;
231 static size_t bufsz = 0;
232 static char *buf = NULL;
233
234 for (manifest_map::const_iterator i = m.begin();
235 i != m.end(); ++i)
236 {
237 sz += i->second.inner()().size();
238 sz += i->first().size();
239 sz += 3;
240 }
241
242 if (sz > bufsz)
243 {
244 bufsz = sz;
245 buf = static_cast<char *>(realloc(buf, bufsz));
246 I(buf);
247 }
248
249 // this has to go quite fast, for cvs importing
250 char *c = buf;
251 for (manifest_map::const_iterator i = m.begin();
252 i != m.end(); ++i)
253 {
254 memcpy(c, i->second.inner()().data(), i->second.inner()().size());
255 c += i->second.inner()().size();
256 *c++ = ' ';
257 *c++ = ' ';
258 memcpy(c, i->first().data(), i->first().size());
259 c += i->first().size();
260 *c++ = '\n';
261 }
262
263 hash.Update(reinterpret_cast<byte const *>(buf),
264 static_cast<unsigned int>(sz));
265
266 char digest[CryptoPP::SHA::DIGESTSIZE];
267 hash.Final(reinterpret_cast<byte *>(digest));
268 string out(digest, CryptoPP::SHA::DIGESTSIZE);
269 id ident_decoded(out);
270 hexenc<id> raw_ident;
271 encode_hexenc(ident_decoded, raw_ident);
272 ident = manifest_id(raw_ident);
273}
274
275void
276calculate_ident(manifest_data const & dat,
277 manifest_id & ident)
278{
279 hexenc<id> tmp;
280 calculate_ident(dat.inner(), tmp);
281 ident = tmp;
282}
283
284
285void calculate_ident(revision_data const & dat,
286 revision_id & ident)
287{
288 hexenc<id> tmp;
289 data unpacked;
290 unpack(dat.inner(), unpacked);
291 calculate_ident(unpacked, tmp);
292 ident = tmp;
293}
294
295void calculate_ident(revision_set const & cs,
296 revision_id & ident)
297{
298 data tmp;
299 hexenc<id> tid;
300 write_revision_set(cs, tmp);
301 calculate_ident(tmp, tid);
302 ident = tid;
303}
304
305// this might reasonably go in file_io.cc too..
306void
307calculate_ident(file_path const & file,
308 hexenc<id> & ident,
309 lua_hooks & lua)
310{
311 string db_linesep, ext_linesep;
312 string db_charset, ext_charset;
313
314 bool do_lineconv = (lua.hook_get_linesep_conv(file, db_linesep, ext_linesep)
315 && db_linesep != ext_linesep);
316
317 bool do_charconv = (lua.hook_get_charset_conv(file, db_charset, ext_charset)
318 && db_charset != ext_charset);
319
320 if (do_charconv || do_lineconv)
321 {
322 data dat;
323 read_localized_data(file, dat, lua);
324 calculate_ident(dat, ident);
325 }
326 else
327 {
328 // no conversions necessary, use streaming form
329 CryptoPP::SHA hash;
330 unsigned int const sz = 2 * CryptoPP::SHA::DIGESTSIZE;
331 char buffer[sz];
332 CryptoPP::FileSource f(file().c_str(), true, new CryptoPP::HashFilter
333 (hash, new CryptoPP::HexEncoder
334 (new CryptoPP::ArraySink(reinterpret_cast<byte *>(buffer), sz))));
335 ident = lowercase(string(buffer, sz));
336 }
337}
338
339void split_into_lines(std::string const & in,
340 std::string const & encoding,
341 std::vector<std::string> & out)
342{
343 std::string lc_encoding = lowercase(encoding);
344 out.clear();
345
346 // note: this function does not handle ISO-2022-X, Shift-JIS, and
347 // probably a good deal of other encodings as well. please expand
348 // the logic here if you can work out an easy way of doing line
349 // breaking on these encodings. currently it's just designed to
350 // work with charsets in which 0x0a / 0x0d are *always* \n and \r
351 // respectively.
352 //
353 // as far as I know, this covers the EUC, ISO-8859-X, GB, Big5, KOI,
354 // ASCII, and UTF-8 families of encodings.
355
356 if (lc_encoding == default_encoding
357 || lc_encoding.find("ascii") != std::string::npos
358 || lc_encoding.find("8859") != std::string::npos
359 || lc_encoding.find("euc") != std::string::npos
360 || lc_encoding.find("koi") != std::string::npos
361 || lc_encoding.find("gb") != std::string::npos
362 || lc_encoding == "utf-8"
363 || lc_encoding == "utf_8"
364 || lc_encoding == "utf8")
365 {
366 std::string::size_type begin = 0;
367 std::string::size_type end = in.find_first_of("\r\n", begin);
368
369 while (end != std::string::npos && end >= begin)
370 {
371 out.push_back(in.substr(begin, end-begin));
372 if (in.at(end) == '\r'
373 && in.size() > end+1
374 && in.at(end+1) == '\n')
375 begin = end + 2;
376 else
377 begin = end + 1;
378 if (begin >= in.size())
379 break;
380 end = in.find_first_of("\r\n", begin);
381 }
382 if (begin < in.size())
383 out.push_back(in.substr(begin, in.size() - begin));
384 }
385 else
386 {
387 out.push_back(in);
388 }
389 if (out.size() == 0)
390 out.push_back("");
391}
392
393
394void
395split_into_lines(string const & in,
396 vector<string> & out)
397{
398 split_into_lines(in, default_encoding, out);
399}
400
// concatenates the strings of "in" into "out", writing linesep after
// *every* element, the last one included. an empty "in" yields an empty
// "out".
void
join_lines(std::vector<std::string> const & in,
           std::string & out,
           std::string const & linesep)
{
  // the separator is used as a C string, i.e. up to its first NUL
  char const * const sep = linesep.c_str();

  std::string joined;
  for (std::vector<std::string>::const_iterator line = in.begin();
       line != in.end(); ++line)
    {
      joined += *line;
      joined += sep;
    }
  out = joined;
}
410
411void
412join_lines(vector<string> const & in,
413 string & out)
414{
415 join_lines(in, out, "\n");
416}
417
418void
419prefix_lines_with(string const & prefix, string const & lines, string & out)
420{
421 std::vector<std::string> msgs;
422 split_into_lines(lines, msgs);
423
424 ostringstream oss;
425 for (std::vector<string>::const_iterator i = msgs.begin();
426 i != msgs.end();)
427 {
428 oss << prefix << *i;
429 i++;
430 if (i != msgs.end())
431 oss << endl;
432 }
433
434 out = oss.str();
435}
436
// returns a copy of s with every newline, carriage return, tab and space
// removed, wherever in the string it occurs.
std::string
remove_ws(std::string const & s)
{
  std::string kept;
  kept.reserve(s.size());

  for (std::string::size_type k = 0; k < s.size(); ++k)
    {
      char const ch = s[k];
      bool const is_ws = (ch == '\n' || ch == '\r' || ch == '\t' || ch == ' ');
      if (!is_ws)
        kept.push_back(ch);
    }
  return kept;
}
459
// returns a copy of s without leading and trailing whitespace (newline,
// carriage return, tab, space). whitespace inside the string is kept. a
// string consisting only of whitespace trims down to the empty string.
std::string
trim_ws(std::string const & s)
{
  static char const ws[] = "\n\r\t ";

  std::string::size_type const first = s.find_first_not_of(ws);
  // nothing but whitespace (or nothing at all): there is no first/last
  // character to anchor on, so the result must be empty rather than the
  // untouched input.
  if (first == std::string::npos)
    return std::string();

  // cannot be npos here: at least the character at "first" qualifies
  std::string::size_type const last = s.find_last_not_of(ws);
  return s.substr(first, last - first + 1);
}
472
473string
474canonical_base64(string const & s)
475{
476 return xform<CryptoPP::Base64Encoder>
477 (xform<CryptoPP::Base64Decoder>(s));
478}
479
480
481// general character code conversion routines
482
483static string
484system_charset()
485{
486 char const * locale_charset_name = stringprep_locale_charset ();
487 I(locale_charset_name != NULL);
488 string sys_charset(locale_charset_name);
489 return sys_charset;
490}
491
492void
493charset_convert(string const & src_charset,
494 string const & dst_charset,
495 string const & src,
496 string & dst)
497{
498 if (src_charset == dst_charset)
499 dst = src;
500 else
501 {
502 L(F("converting %d bytes from %s to %s\n") % src.size()
503 % src_charset % dst_charset);
504 char * converted = stringprep_convert(src.c_str(),
505 dst_charset.c_str(),
506 src_charset.c_str());
507 I(converted != NULL);
508 dst = string(converted);
509 free(converted);
510 }
511}
512
513
514void
515system_to_utf8(external const & ext, utf8 & utf)
516{
517 string out;
518 charset_convert(system_charset(), "UTF-8", ext(), out);
519 utf = out;
520}
521
522void
523utf8_to_system(utf8 const & utf, external & ext)
524{
525 string out;
526 charset_convert("UTF-8", system_charset(), utf(), out);
527 ext = out;
528}
529
530static string
531decode_idna_error(int err)
532{
533 switch (static_cast<Idna_rc>(err))
534 {
535 case IDNA_STRINGPREP_ERROR: return "stringprep error"; break;
536 case IDNA_PUNYCODE_ERROR: return "punycode error"; break;
537 case IDNA_CONTAINS_NON_LDH: return "non-LDH characters"; break;
538 case IDNA_CONTAINS_MINUS: return "leading / trailing hyphen-minus character"; break;
539 case IDNA_INVALID_LENGTH: return "invalid length (output must be between 1 and 63 chars)"; break;
540 case IDNA_NO_ACE_PREFIX: return "no ace prefix"; break;
541 case IDNA_ROUNDTRIP_VERIFY_ERROR: return "roundtrip verify error"; break;
542 case IDNA_CONTAINS_ACE_PREFIX: return "contains ACE prefix (\"xn--\")"; break;
543 case IDNA_ICONV_ERROR: return "iconv error"; break;
544 case IDNA_MALLOC_ERROR: return "malloc error"; break;
545 default: return "unknown error"; break;
546 }
547 return "unknown error";
548}
549
550void
551ace_to_utf8(ace const & a, utf8 & utf)
552{
553 char *out = NULL;
554 L(F("converting %d bytes from IDNA ACE to UTF-8\n") % a().size());
555 int res = idna_to_unicode_8z8z(a().c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
556 N(res == IDNA_SUCCESS || res == IDNA_NO_ACE_PREFIX,
557 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
558 % a().size()
559 % decode_idna_error(res));
560 utf = string(out);
561 free(out);
562}
563
564void
565utf8_to_ace(utf8 const & utf, ace & a)
566{
567 char *out = NULL;
568 L(F("converting %d bytes from UTF-8 to IDNA ACE\n") % utf().size());
569 int res = idna_to_ascii_8z(utf().c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
570 N(res == IDNA_SUCCESS,
571 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
572 % utf().size()
573 % decode_idna_error(res));
574 a = string(out);
575 free(out);
576}
577
578
579void
580internalize_cert_name(utf8 const & utf, cert_name & c)
581{
582 ace a;
583 utf8_to_ace(utf, a);
584 c = a();
585}
586
587void
588internalize_cert_name(external const & ext, cert_name & c)
589{
590 utf8 utf;
591 system_to_utf8(ext(), utf);
592 internalize_cert_name(utf, c);
593}
594
595void
596externalize_cert_name(cert_name const & c, utf8 & utf)
597{
598 ace_to_utf8(ace(c()), utf);
599}
600
601void
602externalize_cert_name(cert_name const & c, external & ext)
603{
604 utf8 utf;
605 externalize_cert_name(c, utf);
606 utf8_to_system(utf, ext);
607}
608
609void
610internalize_rsa_keypair_id(utf8 const & utf, rsa_keypair_id & key)
611{
612 string tmp;
613 typedef boost::tokenizer<boost::char_separator<char> >
614 tokenizer;
615 boost::char_separator<char> sep("", ".@", boost::keep_empty_tokens);
616 tokenizer tokens(utf(), sep);
617 for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i)
618 {
619 if (*i == "." || *i == "@")
620 tmp += *i;
621 else
622 {
623 ace a;
624 utf8_to_ace(*i, a);
625 tmp += a();
626 }
627 }
628 key = tmp;
629}
630
631void
632internalize_rsa_keypair_id(external const & ext, rsa_keypair_id & key)
633{
634 utf8 utf;
635 system_to_utf8(ext, utf);
636 internalize_rsa_keypair_id(utf, key);
637}
638
639void
640externalize_rsa_keypair_id(rsa_keypair_id const & key, utf8 & utf)
641{
642 string tmp;
643 typedef boost::tokenizer<boost::char_separator<char> >
644 tokenizer;
645 boost::char_separator<char> sep("", ".@", boost::keep_empty_tokens);
646 tokenizer tokens(key(), sep);
647 for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i)
648 {
649 if (*i == "." || *i == "@")
650 tmp += *i;
651 else
652 {
653 ace a(*i);
654 utf8 u;
655 ace_to_utf8(a, u);
656 tmp += u();
657 }
658 }
659 utf = tmp;
660}
661
662void
663externalize_rsa_keypair_id(rsa_keypair_id const & key, external & ext)
664{
665 utf8 utf;
666 externalize_rsa_keypair_id(key, utf);
667 utf8_to_system(utf, ext);
668}
669
670void
671internalize_var_domain(utf8 const & utf, var_domain & d)
672{
673 ace a;
674 utf8_to_ace(utf, a);
675 d = a();
676}
677
678void
679internalize_var_domain(external const & ext, var_domain & d)
680{
681 utf8 utf;
682 system_to_utf8(ext(), utf);
683 internalize_var_domain(utf, d);
684}
685
686void
687externalize_var_domain(var_domain const & d, utf8 & utf)
688{
689 ace_to_utf8(ace(d()), utf);
690}
691
692void
693externalize_var_domain(var_domain const & d, external & ext)
694{
695 utf8 utf;
696 externalize_var_domain(d, utf);
697 utf8_to_system(utf, ext);
698}
699
700void
701line_end_convert(string const & linesep, string const & src, string & dst)
702{
703 string linesep_str("\n");
704 if (linesep == "CR" || linesep == "\r")
705 linesep_str = "\r";
706 else if (linesep == "CRLF" || linesep == "\r\n")
707 linesep_str = "\r\n";
708 else if (linesep == "LF"|| linesep == "\n")
709 linesep_str = "\n";
710
711 L(F("doing linesep conversion to %s\n") % linesep);
712 vector<string> tmp;
713 split_into_lines(src, tmp);
714 join_lines(tmp, dst, linesep_str);
715 if (src.size() >= 1 &&
716 (src[src.size() - 1] == '\r' ||
717 src[src.size() - 1] == '\n'))
718 dst += linesep_str;
719}
720
721#ifdef BUILD_UNIT_TESTS
722#include "unit_tests.hh"
723
724static void
725enc_test()
726{
727 data d2, d1("the rain in spain");
728 gzip<data> gzd1, gzd2;
729 base64< gzip<data> > bgzd;
730 encode_gzip(d1, gzd1);
731 encode_base64(gzd1, bgzd);
732 decode_base64(bgzd, gzd2);
733 BOOST_CHECK(gzd2 == gzd1);
734 decode_gzip(gzd2, d2);
735 BOOST_CHECK(d2 == d1);
736}
737
738static void
739rdiff_test()
740{
741 data dat1(string("the first day of spring\nmakes me want to sing\n"));
742 data dat2(string("the first day of summer\nis a major bummer\n"));
743 data dat3;
744 gzip<data> dat1_gz, dat2_gz, dat3_gz;
745 base64< gzip<data> > dat1_bgz, dat2_bgz, dat3_bgz;
746 encode_gzip(dat1, dat1_gz);
747 encode_gzip(dat2, dat2_gz);
748 encode_base64(dat1_gz, dat1_bgz);
749 encode_base64(dat2_gz, dat2_bgz);
750 base64< gzip<delta> > del_bgz;
751 diff(dat1_bgz, dat2_bgz, del_bgz);
752
753 patch(dat1_bgz, del_bgz, dat3_bgz);
754 decode_base64(dat3_bgz, dat3_gz);
755 decode_gzip(dat3_gz, dat3);
756 BOOST_CHECK(dat3 == dat2);
757}
758
759static void
760calculate_ident_test()
761{
762 data input(string("the only blender which can be turned into the most powerful vaccum cleaner"));
763 hexenc<id> output;
764 string ident("86e03bdb3870e2a207dfd0dcbfd4c4f2e3bc97bd");
765 calculate_ident(input, output);
766 BOOST_CHECK(output() == ident);
767}
768
769static void
770caseconv_test()
771{
772 BOOST_CHECK(uppercase("hello") == "HELLO");
773 BOOST_CHECK(uppercase("heLlO") == "HELLO");
774 BOOST_CHECK(lowercase("POODLE DAY") == "poodle day");
775 BOOST_CHECK(lowercase("PooDLe DaY") == "poodle day");
776 BOOST_CHECK(uppercase("!@#$%^&*()") == "!@#$%^&*()");
777 BOOST_CHECK(lowercase("!@#$%^&*()") == "!@#$%^&*()");
778}
779
780static void
781join_lines_test()
782{
783 vector<string> strs;
784 string joined;
785
786 strs.clear();
787 join_lines(strs, joined);
788 BOOST_CHECK(joined == "");
789
790 strs.push_back("hi");
791 join_lines(strs, joined);
792 BOOST_CHECK(joined == "hi\n");
793
794 strs.push_back("there");
795 join_lines(strs, joined);
796 BOOST_CHECK(joined == "hi\nthere\n");
797
798 strs.push_back("user");
799 join_lines(strs, joined);
800 BOOST_CHECK(joined == "hi\nthere\nuser\n");
801}
802
803static void
804strip_ws_test()
805{
806 BOOST_CHECK(trim_ws("\n leading space") == "leading space");
807 BOOST_CHECK(trim_ws("trailing space \n") == "trailing space");
808 BOOST_CHECK(trim_ws("\t\n both \r \n\r\n") == "both");
809 BOOST_CHECK(remove_ws(" I like going\tfor walks\n ")
810 == "Ilikegoingforwalks");
811}
812
#define IDNA_ACE_PREFIX "xn--"
#define IDNA_SUCCESS 0

// test vectors for check_idna_encoding: a label as UCS-4 code points plus
// the ACE string it is expected to encode to.
struct
idna
{
  // name and out point at string literals, hence pointer-to-const
  char const *name;        // label for the vector, used in test messages
  size_t inlen;            // number of code points used in "in"
  uint32_t in[100];        // input as UCS-4 code points
  char const *out;         // expected ACE form
  int allowunassigned;
  int usestd3asciirules;
  int toasciirc;
  int tounicoderc;
} idna_vec[] =
  {
    {
      "Arabic (Egyptian)", 17,
      {
        0x0644, 0x064A, 0x0647, 0x0645, 0x0627, 0x0628, 0x062A, 0x0643,
        0x0644, 0x0645, 0x0648, 0x0634, 0x0639, 0x0631, 0x0628, 0x064A,
        0x061F},
      IDNA_ACE_PREFIX "egbpdaj6bu4bxfgehfvwxn", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Chinese (simplified)", 9,
      {
        0x4ED6, 0x4EEC, 0x4E3A, 0x4EC0, 0x4E48, 0x4E0D, 0x8BF4, 0x4E2D, 0x6587},
      IDNA_ACE_PREFIX "ihqwcrb4cv8a8dqg056pqjye", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Chinese (traditional)", 9,
      {
        0x4ED6, 0x5011, 0x7232, 0x4EC0, 0x9EBD, 0x4E0D, 0x8AAA, 0x4E2D, 0x6587},
      IDNA_ACE_PREFIX "ihqwctvzc91f659drss3x8bo0yb", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Czech", 22,
      {
        0x0050, 0x0072, 0x006F, 0x010D, 0x0070, 0x0072, 0x006F, 0x0073,
        0x0074, 0x011B, 0x006E, 0x0065, 0x006D, 0x006C, 0x0075, 0x0076,
        0x00ED, 0x010D, 0x0065, 0x0073, 0x006B, 0x0079},
      IDNA_ACE_PREFIX "Proprostnemluvesky-uyb24dma41a", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Hebrew", 22,
      {
        0x05DC, 0x05DE, 0x05D4, 0x05D4, 0x05DD, 0x05E4, 0x05E9, 0x05D5,
        0x05D8, 0x05DC, 0x05D0, 0x05DE, 0x05D3, 0x05D1, 0x05E8, 0x05D9,
        0x05DD, 0x05E2, 0x05D1, 0x05E8, 0x05D9, 0x05EA},
      IDNA_ACE_PREFIX "4dbcagdahymbxekheh6e0a7fei0b", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Hindi (Devanagari)", 30,
      {
        0x092F, 0x0939, 0x0932, 0x094B, 0x0917, 0x0939, 0x093F, 0x0928,
        0x094D, 0x0926, 0x0940, 0x0915, 0x094D, 0x092F, 0x094B, 0x0902,
        0x0928, 0x0939, 0x0940, 0x0902, 0x092C, 0x094B, 0x0932, 0x0938,
        0x0915, 0x0924, 0x0947, 0x0939, 0x0948, 0x0902},
      IDNA_ACE_PREFIX "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese (kanji and hiragana)", 18,
      {
        0x306A, 0x305C, 0x307F, 0x3093, 0x306A, 0x65E5, 0x672C, 0x8A9E,
        0x3092, 0x8A71, 0x3057, 0x3066, 0x304F, 0x308C, 0x306A, 0x3044,
        0x306E, 0x304B},
      IDNA_ACE_PREFIX "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Russian (Cyrillic)", 28,
      {
        0x043F, 0x043E, 0x0447, 0x0435, 0x043C, 0x0443, 0x0436, 0x0435,
        0x043E, 0x043D, 0x0438, 0x043D, 0x0435, 0x0433, 0x043E, 0x0432,
        0x043E, 0x0440, 0x044F, 0x0442, 0x043F, 0x043E, 0x0440, 0x0443,
        0x0441, 0x0441, 0x043A, 0x0438},
      IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Spanish", 40,
      {
        0x0050, 0x006F, 0x0072, 0x0071, 0x0075, 0x00E9, 0x006E, 0x006F,
        0x0070, 0x0075, 0x0065, 0x0064, 0x0065, 0x006E, 0x0073, 0x0069,
        0x006D, 0x0070, 0x006C, 0x0065, 0x006D, 0x0065, 0x006E, 0x0074,
        0x0065, 0x0068, 0x0061, 0x0062, 0x006C, 0x0061, 0x0072, 0x0065,
        0x006E, 0x0045, 0x0073, 0x0070, 0x0061, 0x00F1, 0x006F, 0x006C},
      IDNA_ACE_PREFIX "PorqunopuedensimplementehablarenEspaol-fmd56a", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Vietnamese", 31,
      {
        0x0054, 0x1EA1, 0x0069, 0x0073, 0x0061, 0x006F, 0x0068, 0x1ECD,
        0x006B, 0x0068, 0x00F4, 0x006E, 0x0067, 0x0074, 0x0068, 0x1EC3,
        0x0063, 0x0068, 0x1EC9, 0x006E, 0x00F3, 0x0069, 0x0074, 0x0069,
        0x1EBF, 0x006E, 0x0067, 0x0056, 0x0069, 0x1EC7, 0x0074},
      IDNA_ACE_PREFIX "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 8,
      {
        0x0033, 0x5E74, 0x0042, 0x7D44, 0x91D1, 0x516B, 0x5148, 0x751F},
      IDNA_ACE_PREFIX "3B-ww4c5e180e575a65lsy2b", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Japanese", 24,
      {
        0x5B89, 0x5BA4, 0x5948, 0x7F8E, 0x6075, 0x002D, 0x0077, 0x0069,
        0x0074, 0x0068, 0x002D, 0x0053, 0x0055, 0x0050, 0x0045, 0x0052,
        0x002D, 0x004D, 0x004F, 0x004E, 0x004B, 0x0045, 0x0059, 0x0053},
      IDNA_ACE_PREFIX "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 25,
      {
        0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002D, 0x0041, 0x006E,
        0x006F, 0x0074, 0x0068, 0x0065, 0x0072, 0x002D, 0x0057, 0x0061,
        0x0079, 0x002D, 0x305D, 0x308C, 0x305E, 0x308C, 0x306E, 0x5834,
        0x6240},
      IDNA_ACE_PREFIX "Hello-Another-Way--fc4qua05auwb3674vfr0b", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 8,
      {
        0x3072, 0x3068, 0x3064, 0x5C4B, 0x6839, 0x306E, 0x4E0B, 0x0032},
      IDNA_ACE_PREFIX "2-u9tlzr9756bt3uc0v", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Japanese", 13,
      {
        0x004D, 0x0061, 0x006A, 0x0069, 0x3067, 0x004B, 0x006F, 0x0069,
        0x3059, 0x308B, 0x0035, 0x79D2, 0x524D},
      IDNA_ACE_PREFIX "MajiKoi5-783gue6qz075azm5e", 0, 0, IDNA_SUCCESS,
      IDNA_SUCCESS},
    {
      "Japanese", 9,
      {
        0x30D1, 0x30D5, 0x30A3, 0x30FC, 0x0064, 0x0065, 0x30EB, 0x30F3, 0x30D0},
      IDNA_ACE_PREFIX "de-jg4avhby1noc0d", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 7,
      {
        0x305D, 0x306E, 0x30B9, 0x30D4, 0x30FC, 0x30C9, 0x3067},
      IDNA_ACE_PREFIX "d9juau41awczczp", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Greek", 8,
      {0x03b5, 0x03bb, 0x03bb, 0x03b7, 0x03bd, 0x03b9, 0x03ba, 0x03ac},
      IDNA_ACE_PREFIX "hxargifdar", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Maltese (Malti)", 10,
      {0x0062, 0x006f, 0x006e, 0x0121, 0x0075, 0x0073, 0x0061, 0x0127,
       0x0127, 0x0061},
      IDNA_ACE_PREFIX "bonusaa-5bb1da", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Russian (Cyrillic)", 28,
      {0x043f, 0x043e, 0x0447, 0x0435, 0x043c, 0x0443, 0x0436, 0x0435,
       0x043e, 0x043d, 0x0438, 0x043d, 0x0435, 0x0433, 0x043e, 0x0432,
       0x043e, 0x0440, 0x044f, 0x0442, 0x043f, 0x043e, 0x0440, 0x0443,
       0x0441, 0x0441, 0x043a, 0x0438},
      IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
  };
974
975static void
976check_idna_encoding()
977{
978 putenv("CHARSET=UTF-8");
979
980 for (size_t i = 0; i < sizeof(idna_vec) / sizeof(struct idna); ++i)
981 {
982 BOOST_CHECKPOINT("IDNA language: " + string(idna_vec[i].name));
983
984 size_t p, q;
985 char *uc = stringprep_ucs4_to_utf8(idna_vec[i].in,
986 idna_vec[i].inlen,
987 &p, &q);
988 utf8 utf = string(uc);
989 utf8 tutf;
990 free(uc);
991
992 ace a = string(idna_vec[i].out);
993 ace tace;
994 utf8_to_ace(utf, tace);
995 L(F("ACE-encoded %s: '%s'\n") % idna_vec[i].name % tace());
996 BOOST_CHECK(lowercase(a()) == lowercase(tace()));
997 ace_to_utf8(a, tutf);
998 BOOST_CHECK(lowercase(utf()) == lowercase(tutf()));
999 }
1000}
1001
1002static void encode_test()
1003{
1004 check_idna_encoding();
1005}
1006
1007void
1008add_transform_tests(test_suite * suite)
1009{
1010 I(suite);
1011 suite->add(BOOST_TEST_CASE(&enc_test));
1012 suite->add(BOOST_TEST_CASE(&rdiff_test));
1013 suite->add(BOOST_TEST_CASE(&calculate_ident_test));
1014 suite->add(BOOST_TEST_CASE(&caseconv_test));
1015 suite->add(BOOST_TEST_CASE(&join_lines_test));
1016 suite->add(BOOST_TEST_CASE(&strip_ws_test));
1017 suite->add(BOOST_TEST_CASE(&encode_test));
1018}
1019
1020#endif // BUILD_UNIT_TESTS

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status