monotone

monotone Mtn Source Tree

Root/charset.cc

1// Copyright (C) 2002 Graydon Hoare <graydon@pobox.com>
2//
3// This program is made available under the GNU GPL version 2.0 or
4// greater. See the accompanying file COPYING for details.
5//
6// This program is distributed WITHOUT ANY WARRANTY; without even the
7// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
8// PURPOSE.
9
10#include <vector>
11
12#include <boost/tokenizer.hpp>
13
14#include "idna/idna.h"
15#include "idna/stringprep.h"
16
17#include "charset.hh"
18#include "numeric_vocab.hh"
19#include "sanity.hh"
20#include "simplestring_xform.hh"
21
22using std::string;
23using std::vector;
24using std::free;
25
26using boost::char_separator;
27
28// General character code conversion routines.
29
30static string
31system_charset()
32{
33 char const * locale_charset_name = stringprep_locale_charset ();
34 I(locale_charset_name != NULL);
35 string sys_charset(locale_charset_name);
36 return sys_charset;
37}
38
39void
40charset_convert(string const & src_charset,
41 string const & dst_charset,
42 string const & src,
43 string & dst,
44 bool best_effort)
45{
46 if (src_charset == dst_charset)
47 dst = src;
48 else
49 {
50 char * converted = stringprep_convert(src.c_str(),
51 dst_charset.c_str(),
52 src_charset.c_str(),
53 best_effort);
54 E(converted != NULL,
55 F("failed to convert string from %s to %s: '%s'")
56 % src_charset % dst_charset % src);
57 dst = string(converted);
58 free(converted);
59 }
60}
61
62size_t
63display_width(utf8 const & utf)
64{
65 string const & u = utf();
66 size_t sz = 0;
67 string::const_iterator i = u.begin();
68 while (i != u.end())
69 {
70 if (UNLIKELY(static_cast<u8>(*i) & static_cast<u8>(0x80)))
71 {
72 // A UTF-8 escape: consume the full escape.
73 ++i;
74 ++sz;
75 while (i != u.end()
76 && (static_cast<u8>(*i) & static_cast<u8>(0x80))
77 && (!(static_cast<u8>(*i) & static_cast<u8>(0x40))))
78 ++i;
79 }
80 else
81 {
82 // An ASCII-like character in the range 0..0x7F.
83 ++i;
84 ++sz;
85 }
86 }
87 return sz;
88}
89
90// Lots of gunk to avoid charset conversion as much as possible. Running
91// iconv over every element of every path in a 30,000 file manifest takes
92// multiple seconds, which then is a minimum bound on pretty much any
93// operation we do...
94static inline bool
95system_charset_is_utf8_impl()
96{
97 string lc_encoding = lowercase(system_charset());
98 return (lc_encoding == "utf-8"
99 || lc_encoding == "utf_8"
100 || lc_encoding == "utf8");
101}
102
103static inline bool
104system_charset_is_utf8()
105{
106 static bool it_is = system_charset_is_utf8_impl();
107 return it_is;
108}
109
110static inline bool
111system_charset_is_ascii_extension_impl()
112{
113 if (system_charset_is_utf8())
114 return true;
115 string lc_encoding = lowercase(system_charset());
116 // if your character set is identical to ascii in the lower 7 bits, then add
117 // it here for a speed boost.
118 return (lc_encoding.find("ascii") != string::npos
119 || lc_encoding.find("8859") != string::npos
120 || lc_encoding.find("ansi_x3.4") != string::npos
121 || lc_encoding == "646" // another name for ascii
122 // http://www.cs.mcgill.ca/~aelias4/encodings.html -- "EUC (Extended
123 // Unix Code) is a simple and clean encoding, standard on Unix
124 // systems.... It is backwards-compatible with ASCII (i.e. valid
125 // ASCII implies valid EUC)."
126 || lc_encoding.find("euc") != string::npos);
127}
128
129static inline bool
130system_charset_is_ascii_extension()
131{
132 static bool it_is = system_charset_is_ascii_extension_impl();
133 return it_is;
134}
135
// Return true when no byte of UTF has its high bit (0x80) set; an empty
// string therefore counts as all-ASCII.
inline static bool
is_all_ascii(string const & utf)
{
  // could speed this up by vectorization -- mask against 0x80808080,
  // process a whole word at a time...
  string::const_iterator pos = utf.begin();
  string::const_iterator const stop = utf.end();
  while (pos != stop)
    {
      if (*pos & 0x80)
        return false;
      ++pos;
    }
  return true;
}
146
147// this function must be fast. do not make it slow.
148void
149utf8_to_system_strict(utf8 const & utf, string & ext)
150{
151 if (system_charset_is_utf8())
152 ext = utf();
153 else if (system_charset_is_ascii_extension()
154 && is_all_ascii(utf()))
155 ext = utf();
156 else
157 charset_convert("UTF-8", system_charset(), utf(), ext, false);
158}
159
160// this function must be fast. do not make it slow.
161void
162utf8_to_system_best_effort(utf8 const & utf, string & ext)
163{
164 if (system_charset_is_utf8())
165 ext = utf();
166 else if (system_charset_is_ascii_extension()
167 && is_all_ascii(utf()))
168 ext = utf();
169 else
170 charset_convert("UTF-8", system_charset(), utf(), ext, true);
171}
172
173void
174utf8_to_system_strict(utf8 const & utf, external & ext)
175{
176 string out;
177 utf8_to_system_strict(utf, out);
178 ext = external(out);
179}
180
181void
182utf8_to_system_best_effort(utf8 const & utf, external & ext)
183{
184 string out;
185 utf8_to_system_best_effort(utf, out);
186 ext = external(out);
187}
188
189void
190system_to_utf8(external const & ext, utf8 & utf)
191{
192 if (system_charset_is_utf8())
193 utf = utf8(ext());
194 else if (system_charset_is_ascii_extension()
195 && is_all_ascii(ext()))
196 utf = utf8(ext());
197 else
198 {
199 string out;
200 charset_convert(system_charset(), "UTF-8", ext(), out, false);
201 utf = utf8(out);
202 I(utf8_validate(utf));
203 }
204}
205
// utf8_validate and the helper functions is_valid_unicode_char and
// utf8_consume_continuation_char are based on g_utf8_validate and its
// supporting functions from the file gutf8.c of the GLib library.
209
210static bool
211is_valid_unicode_char(u32 c)
212{
213 return (c < 0x110000 &&
214 ((c & 0xfffff800) != 0xd800) &&
215 (c < 0xfdd0 || c > 0xfdef) &&
216 (c & 0xfffe) != 0xfffe);
217}
218
219static bool
220utf8_consume_continuation_char(u8 c, u32 & val)
221{
222 if ((c & 0xc0) != 0x80)
223 return false;
224 val <<= 6;
225 val |= c & 0x3f;
226 return true;
227}
228
229bool
230utf8_validate(utf8 const & utf)
231{
232 string::size_type left = utf().size();
233 u32 min, val;
234
235 for (string::const_iterator i = utf().begin();
236 i != utf().end(); ++i, --left)
237 {
238 u8 c = *i;
239 if (c < 128)
240 continue;
241 if ((c & 0xe0) == 0xc0)
242 {
243 if (left < 2)
244 return false;
245 if ((c & 0x1e) == 0)
246 return false;
247 ++i; --left; c = *i;
248 if ((c & 0xc0) != 0x80)
249 return false;
250 }
251 else
252 {
253 if ((c & 0xf0) == 0xe0)
254 {
255 if (left < 3)
256 return false;
257 min = 1 << 11;
258 val = c & 0x0f;
259 goto two_remaining;
260 }
261 else if ((c & 0xf8) == 0xf0)
262 {
263 if (left < 4)
264 return false;
265 min = 1 << 16;
266 val = c & 0x07;
267 }
268 else
269 return false;
270 ++i; --left; c = *i;
271 if (!utf8_consume_continuation_char(c, val))
272 return false;
273two_remaining:
274 ++i; --left; c = *i;
275 if (!utf8_consume_continuation_char(c, val))
276 return false;
277 ++i; --left; c = *i;
278 if (!utf8_consume_continuation_char(c, val))
279 return false;
280 if (val < min)
281 return false;
282 if (!is_valid_unicode_char(val))
283 return false;
284 }
285 }
286 return true;
287}
288
289static string
290decode_idna_error(int err)
291{
292 switch (static_cast<Idna_rc>(err))
293 {
294 case IDNA_STRINGPREP_ERROR: return "stringprep error"; break;
295 case IDNA_PUNYCODE_ERROR: return "punycode error"; break;
296 case IDNA_CONTAINS_NON_LDH: return "non-LDH characters"; break;
297 case IDNA_CONTAINS_MINUS: return "leading / trailing hyphen-minus character"; break;
298 case IDNA_INVALID_LENGTH: return "invalid length (output must be between 1 and 63 chars)"; break;
299 case IDNA_NO_ACE_PREFIX: return "no ace prefix"; break;
300 case IDNA_ROUNDTRIP_VERIFY_ERROR: return "roundtrip verify error"; break;
301 case IDNA_CONTAINS_ACE_PREFIX: return "contains ACE prefix (\"xn--\")"; break;
302 case IDNA_ICONV_ERROR: return "iconv error"; break;
303 case IDNA_MALLOC_ERROR: return "malloc error"; break;
304 default: return "unknown error"; break;
305 }
306 return "unknown error";
307}
308
309void
310ace_to_utf8(ace const & a, utf8 & utf)
311{
312 char *out = NULL;
313 L(FL("converting %d bytes from IDNA ACE to UTF-8") % a().size());
314 int res = idna_to_unicode_8z8z(a().c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
315 N(res == IDNA_SUCCESS || res == IDNA_NO_ACE_PREFIX,
316 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
317 % a().size()
318 % decode_idna_error(res));
319 utf = utf8(string(out));
320 free(out);
321}
322
323void
324utf8_to_ace(utf8 const & utf, ace & a)
325{
326 char *out = NULL;
327 L(FL("converting %d bytes from UTF-8 to IDNA ACE") % utf().size());
328 int res = idna_to_ascii_8z(utf().c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
329 N(res == IDNA_SUCCESS,
330 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
331 % utf().size()
332 % decode_idna_error(res));
333 a = ace(string(out));
334 free(out);
335}
336
337void
338internalize_cert_name(utf8 const & utf, cert_name & c)
339{
340 ace a;
341 utf8_to_ace(utf, a);
342 c = cert_name(a());
343}
344
345void
346internalize_cert_name(external const & ext, cert_name & c)
347{
348 utf8 utf;
349 system_to_utf8(ext, utf);
350 internalize_cert_name(utf, c);
351}
352
353void
354internalize_rsa_keypair_id(utf8 const & utf, rsa_keypair_id & key)
355{
356 string tmp;
357 typedef boost::tokenizer<char_separator<char> >
358 tokenizer;
359 char_separator<char> sep("", ".@", boost::keep_empty_tokens);
360 tokenizer tokens(utf(), sep);
361 bool in_domain = false;
362 for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i)
363 {
364 if (!in_domain || *i == "." || *i == "@")
365 tmp += *i;
366 else
367 {
368 ace a;
369 utf8_to_ace(utf8(*i), a);
370 tmp += a();
371 }
372 if (*i == "@")
373 in_domain = true;
374 }
375 key = rsa_keypair_id(tmp);
376}
377
378void
379internalize_rsa_keypair_id(external const & ext, rsa_keypair_id & key)
380{
381 utf8 utf;
382 system_to_utf8(ext, utf);
383 internalize_rsa_keypair_id(utf, key);
384}
385
386void
387externalize_rsa_keypair_id(rsa_keypair_id const & key, utf8 & utf)
388{
389 string tmp;
390 typedef boost::tokenizer<char_separator<char> >
391 tokenizer;
392 char_separator<char> sep("", ".@", boost::keep_empty_tokens);
393 tokenizer tokens(key(), sep);
394 bool in_domain = false;
395 for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i)
396 {
397 if (!in_domain || *i == "." || *i == "@")
398 tmp += *i;
399 else
400 {
401 ace a(*i);
402 utf8 u;
403 ace_to_utf8(a, u);
404 tmp += u();
405 }
406 if (*i == "@")
407 in_domain = true;
408 }
409 utf = utf8(tmp);
410}
411
412void
413externalize_rsa_keypair_id(rsa_keypair_id const & key, external & ext)
414{
415 utf8 utf;
416 externalize_rsa_keypair_id(key, utf);
417 utf8_to_system_strict(utf, ext);
418}
419
420void
421internalize_var_domain(utf8 const & utf, var_domain & d)
422{
423 ace a;
424 utf8_to_ace(utf, a);
425 d = var_domain(a());
426}
427
428void
429internalize_var_domain(external const & ext, var_domain & d)
430{
431 utf8 utf;
432 system_to_utf8(ext, utf);
433 internalize_var_domain(utf, d);
434}
435
436void
437externalize_var_domain(var_domain const & d, utf8 & utf)
438{
439 ace_to_utf8(ace(d()), utf);
440}
441
442void
443externalize_var_domain(var_domain const & d, external & ext)
444{
445 utf8 utf;
446 externalize_var_domain(d, utf);
447 utf8_to_system_strict(utf, ext);
448}
449
450
451#ifdef BUILD_UNIT_TESTS
452#include "unit_tests.hh"
453#include <stdlib.h>
454
// Prefix pasted in front of every expected ACE string in the table below.
#define IDNA_ACE_PREFIX "xn--"
// Value stored in the toasciirc / tounicoderc columns of the table below.
#define IDNA_SUCCESS 0

// One IDNA test vector.
//
// `name` and `out` point at string literals, so they are declared as
// pointers to const char; initialising a plain `char *` from a string
// literal is not valid C++ since C++11.
struct
idna
{
  char const *name;        // label for the vector, used in test messages
  size_t inlen;            // number of entries of `in` that are in use
  uint32_t in[100];        // input as UCS-4 values
  char const *out;         // expected ACE form, including the prefix
  int allowunassigned;     // 0 in every vector
  int usestd3asciirules;   // 0 in every vector
  int toasciirc;           // IDNA_SUCCESS in every vector
  int tounicoderc;         // IDNA_SUCCESS in every vector
} idna_vec[] =
  {
    {
      "Arabic (Egyptian)", 17,
      {
        0x0644, 0x064A, 0x0647, 0x0645, 0x0627, 0x0628, 0x062A, 0x0643,
        0x0644, 0x0645, 0x0648, 0x0634, 0x0639, 0x0631, 0x0628, 0x064A,
        0x061F},
      IDNA_ACE_PREFIX "egbpdaj6bu4bxfgehfvwxn", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Chinese (simplified)", 9,
      {
        0x4ED6, 0x4EEC, 0x4E3A, 0x4EC0, 0x4E48, 0x4E0D, 0x8BF4, 0x4E2D, 0x6587},
      IDNA_ACE_PREFIX "ihqwcrb4cv8a8dqg056pqjye", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Chinese (traditional)", 9,
      {
        0x4ED6, 0x5011, 0x7232, 0x4EC0, 0x9EBD, 0x4E0D, 0x8AAA, 0x4E2D, 0x6587},
      IDNA_ACE_PREFIX "ihqwctvzc91f659drss3x8bo0yb", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Czech", 22,
      {
        0x0050, 0x0072, 0x006F, 0x010D, 0x0070, 0x0072, 0x006F, 0x0073,
        0x0074, 0x011B, 0x006E, 0x0065, 0x006D, 0x006C, 0x0075, 0x0076,
        0x00ED, 0x010D, 0x0065, 0x0073, 0x006B, 0x0079},
      IDNA_ACE_PREFIX "Proprostnemluvesky-uyb24dma41a", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Hebrew", 22,
      {
        0x05DC, 0x05DE, 0x05D4, 0x05D4, 0x05DD, 0x05E4, 0x05E9, 0x05D5,
        0x05D8, 0x05DC, 0x05D0, 0x05DE, 0x05D3, 0x05D1, 0x05E8, 0x05D9,
        0x05DD, 0x05E2, 0x05D1, 0x05E8, 0x05D9, 0x05EA},
      IDNA_ACE_PREFIX "4dbcagdahymbxekheh6e0a7fei0b", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Hindi (Devanagari)", 30,
      {
        0x092F, 0x0939, 0x0932, 0x094B, 0x0917, 0x0939, 0x093F, 0x0928,
        0x094D, 0x0926, 0x0940, 0x0915, 0x094D, 0x092F, 0x094B, 0x0902,
        0x0928, 0x0939, 0x0940, 0x0902, 0x092C, 0x094B, 0x0932, 0x0938,
        0x0915, 0x0924, 0x0947, 0x0939, 0x0948, 0x0902},
      IDNA_ACE_PREFIX "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese (kanji and hiragana)", 18,
      {
        0x306A, 0x305C, 0x307F, 0x3093, 0x306A, 0x65E5, 0x672C, 0x8A9E,
        0x3092, 0x8A71, 0x3057, 0x3066, 0x304F, 0x308C, 0x306A, 0x3044,
        0x306E, 0x304B},
      IDNA_ACE_PREFIX "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Russian (Cyrillic)", 28,
      {
        0x043F, 0x043E, 0x0447, 0x0435, 0x043C, 0x0443, 0x0436, 0x0435,
        0x043E, 0x043D, 0x0438, 0x043D, 0x0435, 0x0433, 0x043E, 0x0432,
        0x043E, 0x0440, 0x044F, 0x0442, 0x043F, 0x043E, 0x0440, 0x0443,
        0x0441, 0x0441, 0x043A, 0x0438},
      IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Spanish", 40,
      {
        0x0050, 0x006F, 0x0072, 0x0071, 0x0075, 0x00E9, 0x006E, 0x006F,
        0x0070, 0x0075, 0x0065, 0x0064, 0x0065, 0x006E, 0x0073, 0x0069,
        0x006D, 0x0070, 0x006C, 0x0065, 0x006D, 0x0065, 0x006E, 0x0074,
        0x0065, 0x0068, 0x0061, 0x0062, 0x006C, 0x0061, 0x0072, 0x0065,
        0x006E, 0x0045, 0x0073, 0x0070, 0x0061, 0x00F1, 0x006F, 0x006C},
      IDNA_ACE_PREFIX "PorqunopuedensimplementehablarenEspaol-fmd56a", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Vietnamese", 31,
      {
        0x0054, 0x1EA1, 0x0069, 0x0073, 0x0061, 0x006F, 0x0068, 0x1ECD,
        0x006B, 0x0068, 0x00F4, 0x006E, 0x0067, 0x0074, 0x0068, 0x1EC3,
        0x0063, 0x0068, 0x1EC9, 0x006E, 0x00F3, 0x0069, 0x0074, 0x0069,
        0x1EBF, 0x006E, 0x0067, 0x0056, 0x0069, 0x1EC7, 0x0074},
      IDNA_ACE_PREFIX "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 8,
      {
        0x0033, 0x5E74, 0x0042, 0x7D44, 0x91D1, 0x516B, 0x5148, 0x751F},
      IDNA_ACE_PREFIX "3B-ww4c5e180e575a65lsy2b", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 24,
      {
        0x5B89, 0x5BA4, 0x5948, 0x7F8E, 0x6075, 0x002D, 0x0077, 0x0069,
        0x0074, 0x0068, 0x002D, 0x0053, 0x0055, 0x0050, 0x0045, 0x0052,
        0x002D, 0x004D, 0x004F, 0x004E, 0x004B, 0x0045, 0x0059, 0x0053},
      IDNA_ACE_PREFIX "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 25,
      {
        0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002D, 0x0041, 0x006E,
        0x006F, 0x0074, 0x0068, 0x0065, 0x0072, 0x002D, 0x0057, 0x0061,
        0x0079, 0x002D, 0x305D, 0x308C, 0x305E, 0x308C, 0x306E, 0x5834,
        0x6240},
      IDNA_ACE_PREFIX "Hello-Another-Way--fc4qua05auwb3674vfr0b", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 8,
      {
        0x3072, 0x3068, 0x3064, 0x5C4B, 0x6839, 0x306E, 0x4E0B, 0x0032},
      IDNA_ACE_PREFIX "2-u9tlzr9756bt3uc0v", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 13,
      {
        0x004D, 0x0061, 0x006A, 0x0069, 0x3067, 0x004B, 0x006F, 0x0069,
        0x3059, 0x308B, 0x0035, 0x79D2, 0x524D},
      IDNA_ACE_PREFIX "MajiKoi5-783gue6qz075azm5e", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 9,
      {
        0x30D1, 0x30D5, 0x30A3, 0x30FC, 0x0064, 0x0065, 0x30EB, 0x30F3, 0x30D0},
      IDNA_ACE_PREFIX "de-jg4avhby1noc0d", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 7,
      {
        0x305D, 0x306E, 0x30B9, 0x30D4, 0x30FC, 0x30C9, 0x3067},
      IDNA_ACE_PREFIX "d9juau41awczczp", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Greek", 8,
      {0x03b5, 0x03bb, 0x03bb, 0x03b7, 0x03bd, 0x03b9, 0x03ba, 0x03ac},
      IDNA_ACE_PREFIX "hxargifdar", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Maltese (Malti)", 10,
      {0x0062, 0x006f, 0x006e, 0x0121, 0x0075, 0x0073, 0x0061, 0x0127,
       0x0127, 0x0061},
      IDNA_ACE_PREFIX "bonusaa-5bb1da", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Russian (Cyrillic)", 28,
      {0x043f, 0x043e, 0x0447, 0x0435, 0x043c, 0x0443, 0x0436, 0x0435,
       0x043e, 0x043d, 0x0438, 0x043d, 0x0435, 0x0433, 0x043e, 0x0432,
       0x043e, 0x0440, 0x044f, 0x0442, 0x043f, 0x043e, 0x0440, 0x0443,
       0x0441, 0x0441, 0x043a, 0x0438},
      IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
  };
616
617UNIT_TEST(charset, idna_encoding)
618{
619 putenv("CHARSET=UTF-8");
620
621 for (size_t i = 0; i < sizeof(idna_vec) / sizeof(struct idna); ++i)
622 {
623 BOOST_CHECKPOINT("IDNA language: " + string(idna_vec[i].name));
624
625 size_t p, q;
626 char *uc = stringprep_ucs4_to_utf8(idna_vec[i].in,
627 idna_vec[i].inlen,
628 &p, &q);
629 utf8 utf = utf8(uc);
630 utf8 tutf;
631 free(uc);
632
633 ace a = ace(idna_vec[i].out);
634 ace tace;
635 utf8_to_ace(utf, tace);
636 L(FL("ACE-encoded %s: '%s'") % idna_vec[i].name % tace());
637 BOOST_CHECK(lowercase(a()) == lowercase(tace()));
638 ace_to_utf8(a, tutf);
639 BOOST_CHECK(lowercase(utf()) == lowercase(tutf()));
640 }
641}
642
643UNIT_TEST(charset, utf8_validation)
644{
645 // these tests are based on the tests from the file utf8-validate.c of the
646 // GLib library, and also include sequences from Markus Kuhn's UTF-8
647 // example files.
648 const char* good_strings[] = {
649 "this is a valid but boring ASCII string",
650 "\x28\x28\x56\xe2\x8d\xb3\x56\x29\x3d\xe2\x8d\xb3\xe2\x8d\xb4\x56\x29\x2f\x56\xe2\x86\x90\x2c\x56\x20\x20\x20\x20\xe2\x8c\xb7\xe2\x86\x90\xe2\x8d\xb3\xe2\x86\x92\xe2\x8d\xb4\xe2\x88\x86\xe2\x88\x87\xe2\x8a\x83\xe2\x80\xbe\xe2\x8d\x8e\xe2\x8d\x95\xe2\x8c\x88",
651 "\xe2\x80\x98\x73\x69\x6e\x67\x6c\x65\xe2\x80\x99\x20\x61\x6e\x64\x20\xe2\x80\x9c\x64\x6f\x75\x62\x6c\x65\xe2\x80\x9d\x20\x71\x75\x6f\x74\x65\x73",
652 "\xe2\x80\xa2\x20\x43\x75\x72\x6c\x79\x20\x61\x70\x6f\x73\x74\x72\x6f\x70\x68\x65\x73\x3a\x20\xe2\x80\x9c\x57\x65\xe2\x80\x99\x76\x65\x20\x62\x65\x65\x6e\x20\x68\x65\x72\x65\xe2\x80\x9d",
653 "\xe2\x80\x9a\x64\x65\x75\x74\x73\x63\x68\x65\xe2\x80\x98\x20\xe2\x80\x9e\x41\x6e\x66\xc3\xbc\x68\x72\x75\x6e\x67\x73\x7a\x65\x69\x63\x68\x65\x6e\xe2\x80\x9c",
654 "\xe2\x80\xa0\x2c\x20\xe2\x80\xa1\x2c\x20\xe2\x80\xb0\x2c\x20\xe2\x80\xa2\x2c\x20\x33\xe2\x80\x93\x34\x2c\x20\xe2\x80\x94\x2c\x20\xe2\x88\x92\x35\x2f\x2b\x35\x2c\x20\xe2\x84\xa2\x2c\x20\xe2\x80\xa6",
655 "\xc2\xa9\xc2\xa9\xc2\xa9",
656 "\xe2\x89\xa0\xe2\x89\xa0",
657 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5",
658 "\x00",
659 "\xc2\x80",
660 "\xe0\xa0\x80",
661 "\xf0\x90\x80\x80",
662 "\x7f",
663 "\xdf\xbf",
664 "\xed\x9f\xbf",
665 "\xee\x80\x80",
666 "\xef\xbf\xbd",
667 0
668 };
669 const char* bad_strings[] = {
670 "\xf8\x88\x80\x80\x80",
671 "\xfc\x84\x80\x80\x80\x80",
672 "\xef\xbf\xbf",
673 "\xf7\xbf\xbf\xbf",
674 "\xfb\xbf\xbf\xbf\xbf",
675 "\xfd\xbf\xbf\xbf\xbf\xbf",
676 "\xf4\x8f\xbf\xbf",
677 "\xf4\x90\x80\x80",
678 "\x80",
679 "\xbf",
680 "\x80\xbf",
681 "\x80\xbf\x80",
682 "\x80\xbf\x80\xbf",
683 "\x80\xbf\x80\xbf\x80",
684 "\x80\xbf\x80\xbf\x80\xbf",
685 "\x80\xbf\x80\xbf\x80\xbf\x80",
686 "\x80",
687 "\x81",
688 "\x82",
689 "\x83",
690 "\x84",
691 "\x85",
692 "\x86",
693 "\x87",
694 "\x88",
695 "\x89",
696 "\x8a",
697 "\x8b",
698 "\x8c",
699 "\x8d",
700 "\x8e",
701 "\x8f",
702 "\x90",
703 "\x91",
704 "\x92",
705 "\x93",
706 "\x94",
707 "\x95",
708 "\x96",
709 "\x97",
710 "\x98",
711 "\x99",
712 "\x9a",
713 "\x9b",
714 "\x9c",
715 "\x9d",
716 "\x9e",
717 "\x9f",
718 "\xa0",
719 "\xa1",
720 "\xa2",
721 "\xa3",
722 "\xa4",
723 "\xa5",
724 "\xa6",
725 "\xa7",
726 "\xa8",
727 "\xa9",
728 "\xaa",
729 "\xab",
730 "\xac",
731 "\xad",
732 "\xae",
733 "\xaf",
734 "\xb0",
735 "\xb1",
736 "\xb2",
737 "\xb3",
738 "\xb4",
739 "\xb5",
740 "\xb6",
741 "\xb7",
742 "\xb8",
743 "\xb9",
744 "\xba",
745 "\xbb",
746 "\xbc",
747 "\xbd",
748 "\xbe",
749 "\xbf",
750 "\xc0\x20",
751 "\xc1\x20",
752 "\xc2\x20",
753 "\xc3\x20",
754 "\xc4\x20",
755 "\xc5\x20",
756 "\xc6\x20",
757 "\xc7\x20",
758 "\xc8\x20",
759 "\xc9\x20",
760 "\xca\x20",
761 "\xcb\x20",
762 "\xcc\x20",
763 "\xcd\x20",
764 "\xce\x20",
765 "\xcf\x20",
766 "\xd0\x20",
767 "\xd1\x20",
768 "\xd2\x20",
769 "\xd3\x20",
770 "\xd4\x20",
771 "\xd5\x20",
772 "\xd6\x20",
773 "\xd7\x20",
774 "\xd8\x20",
775 "\xd9\x20",
776 "\xda\x20",
777 "\xdb\x20",
778 "\xdc\x20",
779 "\xdd\x20",
780 "\xde\x20",
781 "\xdf\x20",
782 "\xe0\x20",
783 "\xe1\x20",
784 "\xe2\x20",
785 "\xe3\x20",
786 "\xe4\x20",
787 "\xe5\x20",
788 "\xe6\x20",
789 "\xe7\x20",
790 "\xe8\x20",
791 "\xe9\x20",
792 "\xea\x20",
793 "\xeb\x20",
794 "\xec\x20",
795 "\xed\x20",
796 "\xee\x20",
797 "\xef\x20",
798 "\xf0\x20",
799 "\xf1\x20",
800 "\xf2\x20",
801 "\xf3\x20",
802 "\xf4\x20",
803 "\xf5\x20",
804 "\xf6\x20",
805 "\xf7\x20",
806 "\xf8\x20",
807 "\xf9\x20",
808 "\xfa\x20",
809 "\xfb\x20",
810 "\xfc\x20",
811 "\xfd\x20",
812 "\x20\xc0",
813 "\x20\xe0\x80",
814 "\x20\xf0\x80\x80",
815 "\x20\xf8\x80\x80\x80",
816 "\x20\xfc\x80\x80\x80\x80",
817 "\x20\xdf",
818 "\x20\xef\xbf",
819 "\x20\xf7\xbf\xbf",
820 "\x20\xfb\xbf\xbf\xbf",
821 "\x20\xfd\xbf\xbf\xbf\xbf",
822 "\x20\xfe\x20",
823 "\x20\xff\x20",
824 "\x20\xc0\xaf\x20",
825 "\x20\xe0\x80\xaf\x20",
826 "\x20\xf0\x80\x80\xaf\x20",
827 "\x20\xf8\x80\x80\x80\xaf\x20",
828 "\x20\xfc\x80\x80\x80\x80\xaf\x20",
829 "\x20\xc1\xbf\x20",
830 "\x20\xe0\x9f\xbf\x20",
831 "\x20\xf0\x8f\xbf\xbf\x20",
832 "\x20\xf8\x87\xbf\xbf\xbf\x20",
833 "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20",
834 "\x20\xc0\x80\x20",
835 "\x20\xe0\x80\x80\x20",
836 "\x20\xf0\x80\x80\x80\x20",
837 "\x20\xf8\x80\x80\x80\x80\x20",
838 "\x20\xfc\x80\x80\x80\x80\x80\x20",
839 "\x20\xed\xa0\x80\x20",
840 "\x20\xed\xad\xbf\x20",
841 "\x20\xed\xae\x80\x20",
842 "\x20\xed\xaf\xbf\x20",
843 "\x20\xed\xb0\x80\x20",
844 "\x20\xed\xbe\x80\x20",
845 "\x20\xed\xbf\xbf\x20",
846 "\x20\xed\xa0\x80\xed\xb0\x80\x20",
847 "\x20\xed\xa0\x80\xed\xbf\xbf\x20",
848 "\x20\xed\xad\xbf\xed\xb0\x80\x20",
849 "\x20\xed\xad\xbf\xed\xbf\xbf\x20",
850 "\x20\xed\xae\x80\xed\xb0\x80\x20",
851 "\x20\xed\xae\x80\xed\xbf\xbf\x20",
852 "\x20\xed\xaf\xbf\xed\xb0\x80\x20",
853 "\x20\xed\xaf\xbf\xed\xbf\xbf\x20",
854 "\x20\xef\xbf\xbe\x20",
855 "\x20\xef\xbf\xbf\x20",
856 0
857 };
858
859 for (int i = 0; good_strings[i]; ++i)
860 BOOST_CHECK(utf8_validate(utf8(good_strings[i])) == true);
861
862 for (int i = 0; bad_strings[i]; ++i)
863 BOOST_CHECK(utf8_validate(utf8(bad_strings[i])) == false);
864}
865
866#endif // BUILD_UNIT_TESTS
867
868// Local Variables:
869// mode: C++
870// fill-column: 76
871// c-file-style: "gnu"
872// indent-tabs-mode: nil
873// End:
874// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status