monotone

monotone Mtn Source Tree

Root/charset.cc

1// Copyright (C) 2002 Graydon Hoare <graydon@pobox.com>
2//
3// This program is made available under the GNU GPL version 2.0 or
4// greater. See the accompanying file COPYING for details.
5//
6// This program is distributed WITHOUT ANY WARRANTY; without even the
7// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
8// PURPOSE.
9
10#include "base.hh"
11#include <vector>
12
13#include <boost/tokenizer.hpp>
14
15#include "idna/idna.h"
16#include "idna/stringprep.h"
17
18#include "charset.hh"
19#include "numeric_vocab.hh"
20#include "sanity.hh"
21#include "simplestring_xform.hh"
22
23using std::string;
24using std::vector;
25using std::free;
26
27using boost::char_separator;
28
29// General character code conversion routines.
30
31static string
32system_charset()
33{
34 char const * locale_charset_name = stringprep_locale_charset ();
35 I(locale_charset_name != NULL);
36 string sys_charset(locale_charset_name);
37 return sys_charset;
38}
39
40void
41charset_convert(string const & src_charset,
42 string const & dst_charset,
43 string const & src,
44 string & dst,
45 bool best_effort)
46{
47 if (src_charset == dst_charset)
48 dst = src;
49 else
50 {
51 char * converted = stringprep_convert(src.c_str(),
52 dst_charset.c_str(),
53 src_charset.c_str(),
54 best_effort);
55 E(converted != NULL,
56 F("failed to convert string from %s to %s: '%s'")
57 % src_charset % dst_charset % src);
58 dst = string(converted);
59 free(converted);
60 }
61}
62
63size_t
64display_width(utf8 const & utf)
65{
66 string const & u = utf();
67 size_t sz = 0;
68 string::const_iterator i = u.begin();
69 while (i != u.end())
70 {
71 if (UNLIKELY(static_cast<u8>(*i) & static_cast<u8>(0x80)))
72 {
73 // A UTF-8 escape: consume the full escape.
74 ++i;
75 ++sz;
76 while (i != u.end()
77 && (static_cast<u8>(*i) & static_cast<u8>(0x80))
78 && (!(static_cast<u8>(*i) & static_cast<u8>(0x40))))
79 ++i;
80 }
81 else
82 {
83 // An ASCII-like character in the range 0..0x7F.
84 ++i;
85 ++sz;
86 }
87 }
88 return sz;
89}
90
91// Lots of gunk to avoid charset conversion as much as possible. Running
92// iconv over every element of every path in a 30,000 file manifest takes
93// multiple seconds, which then is a minimum bound on pretty much any
94// operation we do...
95static inline bool
96system_charset_is_utf8_impl()
97{
98 string lc_encoding = lowercase(system_charset());
99 return (lc_encoding == "utf-8"
100 || lc_encoding == "utf_8"
101 || lc_encoding == "utf8");
102}
103
104static inline bool
105system_charset_is_utf8()
106{
107 static bool it_is = system_charset_is_utf8_impl();
108 return it_is;
109}
110
111static inline bool
112system_charset_is_ascii_extension_impl()
113{
114 if (system_charset_is_utf8())
115 return true;
116 string lc_encoding = lowercase(system_charset());
117 // if your character set is identical to ascii in the lower 7 bits, then add
118 // it here for a speed boost.
119 return (lc_encoding.find("ascii") != string::npos
120 || lc_encoding.find("8859") != string::npos
121 || lc_encoding.find("ansi_x3.4") != string::npos
122 || lc_encoding == "646" // another name for ascii
123 // http://www.cs.mcgill.ca/~aelias4/encodings.html -- "EUC (Extended
124 // Unix Code) is a simple and clean encoding, standard on Unix
125 // systems.... It is backwards-compatible with ASCII (i.e. valid
126 // ASCII implies valid EUC)."
127 || lc_encoding.find("euc") != string::npos);
128}
129
130static inline bool
131system_charset_is_ascii_extension()
132{
133 static bool it_is = system_charset_is_ascii_extension_impl();
134 return it_is;
135}
136
// Return true when no byte of UTF has its high bit (0x80) set; an empty
// string therefore counts as all-ASCII.
inline static bool
is_all_ascii(string const & utf)
{
  // could speed this up by vectorization -- mask against 0x80808080,
  // process a whole word at a time...
  string::const_iterator p = utf.begin();
  string::const_iterator const stop = utf.end();
  while (p != stop)
    {
      if ((*p & 0x80) != 0)
        return false;
      ++p;
    }
  return true;
}
147
148// this function must be fast. do not make it slow.
149void
150utf8_to_system_strict(utf8 const & utf, string & ext)
151{
152 if (system_charset_is_utf8())
153 ext = utf();
154 else if (system_charset_is_ascii_extension()
155 && is_all_ascii(utf()))
156 ext = utf();
157 else
158 charset_convert("UTF-8", system_charset(), utf(), ext, false);
159}
160
161// this function must be fast. do not make it slow.
162void
163utf8_to_system_best_effort(utf8 const & utf, string & ext)
164{
165 if (system_charset_is_utf8())
166 ext = utf();
167 else if (system_charset_is_ascii_extension()
168 && is_all_ascii(utf()))
169 ext = utf();
170 else
171 charset_convert("UTF-8", system_charset(), utf(), ext, true);
172}
173
174void
175utf8_to_system_strict(utf8 const & utf, external & ext)
176{
177 string out;
178 utf8_to_system_strict(utf, out);
179 ext = external(out);
180}
181
182void
183utf8_to_system_best_effort(utf8 const & utf, external & ext)
184{
185 string out;
186 utf8_to_system_best_effort(utf, out);
187 ext = external(out);
188}
189
190void
191system_to_utf8(external const & ext, utf8 & utf)
192{
193 if (system_charset_is_utf8())
194 utf = utf8(ext());
195 else if (system_charset_is_ascii_extension()
196 && is_all_ascii(ext()))
197 utf = utf8(ext());
198 else
199 {
200 string out;
201 charset_convert(system_charset(), "UTF-8", ext(), out, false);
202 utf = utf8(out);
203 I(utf8_validate(utf));
204 }
205}
206
// utf8_validate and the helper functions is_valid_unicode_char and
// utf8_consume_continuation_char are based on g_utf8_validate and its
// supporting functions from the file gutf8.c of the GLib library.
210
// Return true when C is accepted as a Unicode character: it must be below
// 0x110000, outside the 0xD800..0xDFFF block, outside 0xFDD0..0xFDEF, and
// its low 16 bits must not be 0xFFFE or 0xFFFF.
static bool
is_valid_unicode_char(u32 c)
{
  if (c >= 0x110000)
    return false;
  if ((c & 0xfffff800) == 0xd800)
    return false;
  if (c >= 0xfdd0 && c <= 0xfdef)
    return false;
  if ((c & 0xfffe) == 0xfffe)
    return false;
  return true;
}
219
// If C is a UTF-8 continuation byte (top two bits 10), shift VAL left by
// six bits, merge in the low six bits of C and return true.  Otherwise
// leave VAL untouched and return false.
static bool
utf8_consume_continuation_char(u8 c, u32 & val)
{
  bool const is_continuation = ((c & 0xc0) == 0x80);
  if (is_continuation)
    {
      val <<= 6;
      val |= c & 0x3f;
    }
  return is_continuation;
}
229
230bool
231utf8_validate(utf8 const & utf)
232{
233 string::size_type left = utf().size();
234 u32 min, val;
235
236 for (string::const_iterator i = utf().begin();
237 i != utf().end(); ++i, --left)
238 {
239 u8 c = *i;
240 if (c < 128)
241 continue;
242 if ((c & 0xe0) == 0xc0)
243 {
244 if (left < 2)
245 return false;
246 if ((c & 0x1e) == 0)
247 return false;
248 ++i; --left; c = *i;
249 if ((c & 0xc0) != 0x80)
250 return false;
251 }
252 else
253 {
254 if ((c & 0xf0) == 0xe0)
255 {
256 if (left < 3)
257 return false;
258 min = 1 << 11;
259 val = c & 0x0f;
260 goto two_remaining;
261 }
262 else if ((c & 0xf8) == 0xf0)
263 {
264 if (left < 4)
265 return false;
266 min = 1 << 16;
267 val = c & 0x07;
268 }
269 else
270 return false;
271 ++i; --left; c = *i;
272 if (!utf8_consume_continuation_char(c, val))
273 return false;
274two_remaining:
275 ++i; --left; c = *i;
276 if (!utf8_consume_continuation_char(c, val))
277 return false;
278 ++i; --left; c = *i;
279 if (!utf8_consume_continuation_char(c, val))
280 return false;
281 if (val < min)
282 return false;
283 if (!is_valid_unicode_char(val))
284 return false;
285 }
286 }
287 return true;
288}
289
290static string
291decode_idna_error(int err)
292{
293 switch (static_cast<Idna_rc>(err))
294 {
295 case IDNA_STRINGPREP_ERROR: return "stringprep error"; break;
296 case IDNA_PUNYCODE_ERROR: return "punycode error"; break;
297 case IDNA_CONTAINS_NON_LDH: return "non-LDH characters"; break;
298 case IDNA_CONTAINS_MINUS: return "leading / trailing hyphen-minus character"; break;
299 case IDNA_INVALID_LENGTH: return "invalid length (output must be between 1 and 63 chars)"; break;
300 case IDNA_NO_ACE_PREFIX: return "no ace prefix"; break;
301 case IDNA_ROUNDTRIP_VERIFY_ERROR: return "roundtrip verify error"; break;
302 case IDNA_CONTAINS_ACE_PREFIX: return "contains ACE prefix (\"xn--\")"; break;
303 case IDNA_ICONV_ERROR: return "iconv error"; break;
304 case IDNA_MALLOC_ERROR: return "malloc error"; break;
305 default: return "unknown error"; break;
306 }
307 return "unknown error";
308}
309
310void
311ace_to_utf8(ace const & a, utf8 & utf)
312{
313 char *out = NULL;
314 L(FL("converting %d bytes from IDNA ACE to UTF-8") % a().size());
315 int res = idna_to_unicode_8z8z(a().c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
316 N(res == IDNA_SUCCESS || res == IDNA_NO_ACE_PREFIX,
317 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
318 % a().size()
319 % decode_idna_error(res));
320 utf = utf8(string(out));
321 free(out);
322}
323
324void
325utf8_to_ace(utf8 const & utf, ace & a)
326{
327 char *out = NULL;
328 L(FL("converting %d bytes from UTF-8 to IDNA ACE") % utf().size());
329 int res = idna_to_ascii_8z(utf().c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
330 N(res == IDNA_SUCCESS,
331 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
332 % utf().size()
333 % decode_idna_error(res));
334 a = ace(string(out));
335 free(out);
336}
337
338void
339internalize_cert_name(utf8 const & utf, cert_name & c)
340{
341 ace a;
342 utf8_to_ace(utf, a);
343 c = cert_name(a());
344}
345
346void
347internalize_cert_name(external const & ext, cert_name & c)
348{
349 utf8 utf;
350 system_to_utf8(ext, utf);
351 internalize_cert_name(utf, c);
352}
353
354void
355internalize_rsa_keypair_id(utf8 const & utf, rsa_keypair_id & key)
356{
357 string tmp;
358 typedef boost::tokenizer<char_separator<char> >
359 tokenizer;
360 char_separator<char> sep("", ".@", boost::keep_empty_tokens);
361 tokenizer tokens(utf(), sep);
362 bool in_domain = false;
363 for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i)
364 {
365 if (!in_domain || *i == "." || *i == "@")
366 tmp += *i;
367 else
368 {
369 ace a;
370 utf8_to_ace(utf8(*i), a);
371 tmp += a();
372 }
373 if (*i == "@")
374 in_domain = true;
375 }
376 key = rsa_keypair_id(tmp);
377}
378
379void
380internalize_rsa_keypair_id(external const & ext, rsa_keypair_id & key)
381{
382 utf8 utf;
383 system_to_utf8(ext, utf);
384 internalize_rsa_keypair_id(utf, key);
385}
386
387void
388externalize_rsa_keypair_id(rsa_keypair_id const & key, utf8 & utf)
389{
390 string tmp;
391 typedef boost::tokenizer<char_separator<char> >
392 tokenizer;
393 char_separator<char> sep("", ".@", boost::keep_empty_tokens);
394 tokenizer tokens(key(), sep);
395 bool in_domain = false;
396 for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i)
397 {
398 if (!in_domain || *i == "." || *i == "@")
399 tmp += *i;
400 else
401 {
402 ace a(*i);
403 utf8 u;
404 ace_to_utf8(a, u);
405 tmp += u();
406 }
407 if (*i == "@")
408 in_domain = true;
409 }
410 utf = utf8(tmp);
411}
412
413void
414externalize_rsa_keypair_id(rsa_keypair_id const & key, external & ext)
415{
416 utf8 utf;
417 externalize_rsa_keypair_id(key, utf);
418 utf8_to_system_strict(utf, ext);
419}
420
421void
422internalize_var_domain(utf8 const & utf, var_domain & d)
423{
424 ace a;
425 utf8_to_ace(utf, a);
426 d = var_domain(a());
427}
428
429void
430internalize_var_domain(external const & ext, var_domain & d)
431{
432 utf8 utf;
433 system_to_utf8(ext, utf);
434 internalize_var_domain(utf, d);
435}
436
437void
438externalize_var_domain(var_domain const & d, utf8 & utf)
439{
440 ace_to_utf8(ace(d()), utf);
441}
442
443void
444externalize_var_domain(var_domain const & d, external & ext)
445{
446 utf8 utf;
447 externalize_var_domain(d, utf);
448 utf8_to_system_strict(utf, ext);
449}
450
451
452#ifdef BUILD_UNIT_TESTS
453#include "unit_tests.hh"
454#include <stdlib.h>
455
456#define IDNA_ACE_PREFIX "xn--"
457#define IDNA_SUCCESS 0
458
459struct
460idna
461{
462 char *name;
463 size_t inlen;
464 u32 in[100];
465 char *out;
466 int allowunassigned;
467 int usestd3asciirules;
468 int toasciirc;
469 int tounicoderc;
470} idna_vec[] =
471 {
472 {
473 "Arabic (Egyptian)", 17,
474 {
475 0x0644, 0x064A, 0x0647, 0x0645, 0x0627, 0x0628, 0x062A, 0x0643,
476 0x0644, 0x0645, 0x0648, 0x0634, 0x0639, 0x0631, 0x0628, 0x064A,
477 0x061F},
478 IDNA_ACE_PREFIX "egbpdaj6bu4bxfgehfvwxn", 0, 0,
479 IDNA_SUCCESS, IDNA_SUCCESS},
480 {
481 "Chinese (simplified)", 9,
482 {
483 0x4ED6, 0x4EEC, 0x4E3A, 0x4EC0, 0x4E48, 0x4E0D, 0x8BF4, 0x4E2D, 0x6587},
484 IDNA_ACE_PREFIX "ihqwcrb4cv8a8dqg056pqjye", 0, 0,
485 IDNA_SUCCESS, IDNA_SUCCESS},
486 {
487 "Chinese (traditional)", 9,
488 {
489 0x4ED6, 0x5011, 0x7232, 0x4EC0, 0x9EBD, 0x4E0D, 0x8AAA, 0x4E2D, 0x6587},
490 IDNA_ACE_PREFIX "ihqwctvzc91f659drss3x8bo0yb", 0, 0,
491 IDNA_SUCCESS, IDNA_SUCCESS},
492 {
493 "Czech", 22,
494 {
495 0x0050, 0x0072, 0x006F, 0x010D, 0x0070, 0x0072, 0x006F, 0x0073,
496 0x0074, 0x011B, 0x006E, 0x0065, 0x006D, 0x006C, 0x0075, 0x0076,
497 0x00ED, 0x010D, 0x0065, 0x0073, 0x006B, 0x0079},
498 IDNA_ACE_PREFIX "Proprostnemluvesky-uyb24dma41a", 0, 0,
499 IDNA_SUCCESS, IDNA_SUCCESS},
500 {
501 "Hebrew", 22,
502 {
503 0x05DC, 0x05DE, 0x05D4, 0x05D4, 0x05DD, 0x05E4, 0x05E9, 0x05D5,
504 0x05D8, 0x05DC, 0x05D0, 0x05DE, 0x05D3, 0x05D1, 0x05E8, 0x05D9,
505 0x05DD, 0x05E2, 0x05D1, 0x05E8, 0x05D9, 0x05EA},
506 IDNA_ACE_PREFIX "4dbcagdahymbxekheh6e0a7fei0b", 0, 0,
507 IDNA_SUCCESS, IDNA_SUCCESS},
508 {
509 "Hindi (Devanagari)", 30,
510 {
511 0x092F, 0x0939, 0x0932, 0x094B, 0x0917, 0x0939, 0x093F, 0x0928,
512 0x094D, 0x0926, 0x0940, 0x0915, 0x094D, 0x092F, 0x094B, 0x0902,
513 0x0928, 0x0939, 0x0940, 0x0902, 0x092C, 0x094B, 0x0932, 0x0938,
514 0x0915, 0x0924, 0x0947, 0x0939, 0x0948, 0x0902},
515 IDNA_ACE_PREFIX "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd", 0, 0,
516 IDNA_SUCCESS, IDNA_SUCCESS},
517 {
518 "Japanese (kanji and hiragana)", 18,
519 {
520 0x306A, 0x305C, 0x307F, 0x3093, 0x306A, 0x65E5, 0x672C, 0x8A9E,
521 0x3092, 0x8A71, 0x3057, 0x3066, 0x304F, 0x308C, 0x306A, 0x3044,
522 0x306E, 0x304B},
523 IDNA_ACE_PREFIX "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa", 0, 0,
524 IDNA_SUCCESS, IDNA_SUCCESS},
525 {
526 "Russian (Cyrillic)", 28,
527 {
528 0x043F, 0x043E, 0x0447, 0x0435, 0x043C, 0x0443, 0x0436, 0x0435,
529 0x043E, 0x043D, 0x0438, 0x043D, 0x0435, 0x0433, 0x043E, 0x0432,
530 0x043E, 0x0440, 0x044F, 0x0442, 0x043F, 0x043E, 0x0440, 0x0443,
531 0x0441, 0x0441, 0x043A, 0x0438},
532 IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0,
533 IDNA_SUCCESS, IDNA_SUCCESS},
534 {
535 "Spanish", 40,
536 {
537 0x0050, 0x006F, 0x0072, 0x0071, 0x0075, 0x00E9, 0x006E, 0x006F,
538 0x0070, 0x0075, 0x0065, 0x0064, 0x0065, 0x006E, 0x0073, 0x0069,
539 0x006D, 0x0070, 0x006C, 0x0065, 0x006D, 0x0065, 0x006E, 0x0074,
540 0x0065, 0x0068, 0x0061, 0x0062, 0x006C, 0x0061, 0x0072, 0x0065,
541 0x006E, 0x0045, 0x0073, 0x0070, 0x0061, 0x00F1, 0x006F, 0x006C},
542 IDNA_ACE_PREFIX "PorqunopuedensimplementehablarenEspaol-fmd56a", 0, 0,
543 IDNA_SUCCESS, IDNA_SUCCESS},
544 {
545 "Vietnamese", 31,
546 {
547 0x0054, 0x1EA1, 0x0069, 0x0073, 0x0061, 0x006F, 0x0068, 0x1ECD,
548 0x006B, 0x0068, 0x00F4, 0x006E, 0x0067, 0x0074, 0x0068, 0x1EC3,
549 0x0063, 0x0068, 0x1EC9, 0x006E, 0x00F3, 0x0069, 0x0074, 0x0069,
550 0x1EBF, 0x006E, 0x0067, 0x0056, 0x0069, 0x1EC7, 0x0074},
551 IDNA_ACE_PREFIX "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g", 0, 0,
552 IDNA_SUCCESS, IDNA_SUCCESS},
553 {
554 "Japanese", 8,
555 {
556 0x0033, 0x5E74, 0x0042, 0x7D44, 0x91D1, 0x516B, 0x5148, 0x751F},
557 IDNA_ACE_PREFIX "3B-ww4c5e180e575a65lsy2b", 0, 0,
558 IDNA_SUCCESS, IDNA_SUCCESS},
559 {
560 "Japanese", 24,
561 {
562 0x5B89, 0x5BA4, 0x5948, 0x7F8E, 0x6075, 0x002D, 0x0077, 0x0069,
563 0x0074, 0x0068, 0x002D, 0x0053, 0x0055, 0x0050, 0x0045, 0x0052,
564 0x002D, 0x004D, 0x004F, 0x004E, 0x004B, 0x0045, 0x0059, 0x0053},
565 IDNA_ACE_PREFIX "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n", 0, 0,
566 IDNA_SUCCESS, IDNA_SUCCESS},
567 {
568 "Japanese", 25,
569 {
570 0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002D, 0x0041, 0x006E,
571 0x006F, 0x0074, 0x0068, 0x0065, 0x0072, 0x002D, 0x0057, 0x0061,
572 0x0079, 0x002D, 0x305D, 0x308C, 0x305E, 0x308C, 0x306E, 0x5834,
573 0x6240},
574 IDNA_ACE_PREFIX "Hello-Another-Way--fc4qua05auwb3674vfr0b", 0, 0,
575 IDNA_SUCCESS, IDNA_SUCCESS},
576 {
577 "Japanese", 8,
578 {
579 0x3072, 0x3068, 0x3064, 0x5C4B, 0x6839, 0x306E, 0x4E0B, 0x0032},
580 IDNA_ACE_PREFIX "2-u9tlzr9756bt3uc0v", 0, 0,
581 IDNA_SUCCESS, IDNA_SUCCESS},
582 {
583 "Japanese", 13,
584 {
585 0x004D, 0x0061, 0x006A, 0x0069, 0x3067, 0x004B, 0x006F, 0x0069,
586 0x3059, 0x308B, 0x0035, 0x79D2, 0x524D},
587 IDNA_ACE_PREFIX "MajiKoi5-783gue6qz075azm5e", 0, 0,
588 IDNA_SUCCESS, IDNA_SUCCESS},
589 {
590 "Japanese", 9,
591 {
592 0x30D1, 0x30D5, 0x30A3, 0x30FC, 0x0064, 0x0065, 0x30EB, 0x30F3, 0x30D0},
593 IDNA_ACE_PREFIX "de-jg4avhby1noc0d", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
594 {
595 "Japanese", 7,
596 {
597 0x305D, 0x306E, 0x30B9, 0x30D4, 0x30FC, 0x30C9, 0x3067},
598 IDNA_ACE_PREFIX "d9juau41awczczp", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
599 {
600 "Greek", 8,
601 {0x03b5, 0x03bb, 0x03bb, 0x03b7, 0x03bd, 0x03b9, 0x03ba, 0x03ac},
602 IDNA_ACE_PREFIX "hxargifdar", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
603 {
604 "Maltese (Malti)", 10,
605 {0x0062, 0x006f, 0x006e, 0x0121, 0x0075, 0x0073, 0x0061, 0x0127,
606 0x0127, 0x0061},
607 IDNA_ACE_PREFIX "bonusaa-5bb1da", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
608 {
609 "Russian (Cyrillic)", 28,
610 {0x043f, 0x043e, 0x0447, 0x0435, 0x043c, 0x0443, 0x0436, 0x0435,
611 0x043e, 0x043d, 0x0438, 0x043d, 0x0435, 0x0433, 0x043e, 0x0432,
612 0x043e, 0x0440, 0x044f, 0x0442, 0x043f, 0x043e, 0x0440, 0x0443,
613 0x0441, 0x0441, 0x043a, 0x0438},
614 IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0,
615 IDNA_SUCCESS, IDNA_SUCCESS},
616 };
617
618UNIT_TEST(charset, idna_encoding)
619{
620 putenv("CHARSET=UTF-8");
621
622 for (size_t i = 0; i < sizeof(idna_vec) / sizeof(struct idna); ++i)
623 {
624 UNIT_TEST_CHECKPOINT(("IDNA language: " + string(idna_vec[i].name)).c_str());
625
626 size_t p, q;
627 char *uc = stringprep_ucs4_to_utf8(idna_vec[i].in,
628 idna_vec[i].inlen,
629 &p, &q);
630 utf8 utf = utf8(uc);
631 utf8 tutf;
632 free(uc);
633
634 ace a = ace(idna_vec[i].out);
635 ace tace;
636 utf8_to_ace(utf, tace);
637 L(FL("ACE-encoded %s: '%s'") % idna_vec[i].name % tace());
638 UNIT_TEST_CHECK(lowercase(a()) == lowercase(tace()));
639 ace_to_utf8(a, tutf);
640 UNIT_TEST_CHECK(lowercase(utf()) == lowercase(tutf()));
641 }
642}
643
644UNIT_TEST(charset, utf8_validation)
645{
646 // these tests are based on the tests from the file utf8-validate.c of the
647 // GLib library, and also include sequences from Markus Kuhn's UTF-8
648 // example files.
649 const char* good_strings[] = {
650 "this is a valid but boring ASCII string",
651 "\x28\x28\x56\xe2\x8d\xb3\x56\x29\x3d\xe2\x8d\xb3\xe2\x8d\xb4\x56\x29\x2f\x56\xe2\x86\x90\x2c\x56\x20\x20\x20\x20\xe2\x8c\xb7\xe2\x86\x90\xe2\x8d\xb3\xe2\x86\x92\xe2\x8d\xb4\xe2\x88\x86\xe2\x88\x87\xe2\x8a\x83\xe2\x80\xbe\xe2\x8d\x8e\xe2\x8d\x95\xe2\x8c\x88",
652 "\xe2\x80\x98\x73\x69\x6e\x67\x6c\x65\xe2\x80\x99\x20\x61\x6e\x64\x20\xe2\x80\x9c\x64\x6f\x75\x62\x6c\x65\xe2\x80\x9d\x20\x71\x75\x6f\x74\x65\x73",
653 "\xe2\x80\xa2\x20\x43\x75\x72\x6c\x79\x20\x61\x70\x6f\x73\x74\x72\x6f\x70\x68\x65\x73\x3a\x20\xe2\x80\x9c\x57\x65\xe2\x80\x99\x76\x65\x20\x62\x65\x65\x6e\x20\x68\x65\x72\x65\xe2\x80\x9d",
654 "\xe2\x80\x9a\x64\x65\x75\x74\x73\x63\x68\x65\xe2\x80\x98\x20\xe2\x80\x9e\x41\x6e\x66\xc3\xbc\x68\x72\x75\x6e\x67\x73\x7a\x65\x69\x63\x68\x65\x6e\xe2\x80\x9c",
655 "\xe2\x80\xa0\x2c\x20\xe2\x80\xa1\x2c\x20\xe2\x80\xb0\x2c\x20\xe2\x80\xa2\x2c\x20\x33\xe2\x80\x93\x34\x2c\x20\xe2\x80\x94\x2c\x20\xe2\x88\x92\x35\x2f\x2b\x35\x2c\x20\xe2\x84\xa2\x2c\x20\xe2\x80\xa6",
656 "\xc2\xa9\xc2\xa9\xc2\xa9",
657 "\xe2\x89\xa0\xe2\x89\xa0",
658 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5",
659 "\x00",
660 "\xc2\x80",
661 "\xe0\xa0\x80",
662 "\xf0\x90\x80\x80",
663 "\x7f",
664 "\xdf\xbf",
665 "\xed\x9f\xbf",
666 "\xee\x80\x80",
667 "\xef\xbf\xbd",
668 0
669 };
670 const char* bad_strings[] = {
671 "\xf8\x88\x80\x80\x80",
672 "\xfc\x84\x80\x80\x80\x80",
673 "\xef\xbf\xbf",
674 "\xf7\xbf\xbf\xbf",
675 "\xfb\xbf\xbf\xbf\xbf",
676 "\xfd\xbf\xbf\xbf\xbf\xbf",
677 "\xf4\x8f\xbf\xbf",
678 "\xf4\x90\x80\x80",
679 "\x80",
680 "\xbf",
681 "\x80\xbf",
682 "\x80\xbf\x80",
683 "\x80\xbf\x80\xbf",
684 "\x80\xbf\x80\xbf\x80",
685 "\x80\xbf\x80\xbf\x80\xbf",
686 "\x80\xbf\x80\xbf\x80\xbf\x80",
687 "\x80",
688 "\x81",
689 "\x82",
690 "\x83",
691 "\x84",
692 "\x85",
693 "\x86",
694 "\x87",
695 "\x88",
696 "\x89",
697 "\x8a",
698 "\x8b",
699 "\x8c",
700 "\x8d",
701 "\x8e",
702 "\x8f",
703 "\x90",
704 "\x91",
705 "\x92",
706 "\x93",
707 "\x94",
708 "\x95",
709 "\x96",
710 "\x97",
711 "\x98",
712 "\x99",
713 "\x9a",
714 "\x9b",
715 "\x9c",
716 "\x9d",
717 "\x9e",
718 "\x9f",
719 "\xa0",
720 "\xa1",
721 "\xa2",
722 "\xa3",
723 "\xa4",
724 "\xa5",
725 "\xa6",
726 "\xa7",
727 "\xa8",
728 "\xa9",
729 "\xaa",
730 "\xab",
731 "\xac",
732 "\xad",
733 "\xae",
734 "\xaf",
735 "\xb0",
736 "\xb1",
737 "\xb2",
738 "\xb3",
739 "\xb4",
740 "\xb5",
741 "\xb6",
742 "\xb7",
743 "\xb8",
744 "\xb9",
745 "\xba",
746 "\xbb",
747 "\xbc",
748 "\xbd",
749 "\xbe",
750 "\xbf",
751 "\xc0\x20",
752 "\xc1\x20",
753 "\xc2\x20",
754 "\xc3\x20",
755 "\xc4\x20",
756 "\xc5\x20",
757 "\xc6\x20",
758 "\xc7\x20",
759 "\xc8\x20",
760 "\xc9\x20",
761 "\xca\x20",
762 "\xcb\x20",
763 "\xcc\x20",
764 "\xcd\x20",
765 "\xce\x20",
766 "\xcf\x20",
767 "\xd0\x20",
768 "\xd1\x20",
769 "\xd2\x20",
770 "\xd3\x20",
771 "\xd4\x20",
772 "\xd5\x20",
773 "\xd6\x20",
774 "\xd7\x20",
775 "\xd8\x20",
776 "\xd9\x20",
777 "\xda\x20",
778 "\xdb\x20",
779 "\xdc\x20",
780 "\xdd\x20",
781 "\xde\x20",
782 "\xdf\x20",
783 "\xe0\x20",
784 "\xe1\x20",
785 "\xe2\x20",
786 "\xe3\x20",
787 "\xe4\x20",
788 "\xe5\x20",
789 "\xe6\x20",
790 "\xe7\x20",
791 "\xe8\x20",
792 "\xe9\x20",
793 "\xea\x20",
794 "\xeb\x20",
795 "\xec\x20",
796 "\xed\x20",
797 "\xee\x20",
798 "\xef\x20",
799 "\xf0\x20",
800 "\xf1\x20",
801 "\xf2\x20",
802 "\xf3\x20",
803 "\xf4\x20",
804 "\xf5\x20",
805 "\xf6\x20",
806 "\xf7\x20",
807 "\xf8\x20",
808 "\xf9\x20",
809 "\xfa\x20",
810 "\xfb\x20",
811 "\xfc\x20",
812 "\xfd\x20",
813 "\x20\xc0",
814 "\x20\xe0\x80",
815 "\x20\xf0\x80\x80",
816 "\x20\xf8\x80\x80\x80",
817 "\x20\xfc\x80\x80\x80\x80",
818 "\x20\xdf",
819 "\x20\xef\xbf",
820 "\x20\xf7\xbf\xbf",
821 "\x20\xfb\xbf\xbf\xbf",
822 "\x20\xfd\xbf\xbf\xbf\xbf",
823 "\x20\xfe\x20",
824 "\x20\xff\x20",
825 "\x20\xc0\xaf\x20",
826 "\x20\xe0\x80\xaf\x20",
827 "\x20\xf0\x80\x80\xaf\x20",
828 "\x20\xf8\x80\x80\x80\xaf\x20",
829 "\x20\xfc\x80\x80\x80\x80\xaf\x20",
830 "\x20\xc1\xbf\x20",
831 "\x20\xe0\x9f\xbf\x20",
832 "\x20\xf0\x8f\xbf\xbf\x20",
833 "\x20\xf8\x87\xbf\xbf\xbf\x20",
834 "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20",
835 "\x20\xc0\x80\x20",
836 "\x20\xe0\x80\x80\x20",
837 "\x20\xf0\x80\x80\x80\x20",
838 "\x20\xf8\x80\x80\x80\x80\x20",
839 "\x20\xfc\x80\x80\x80\x80\x80\x20",
840 "\x20\xed\xa0\x80\x20",
841 "\x20\xed\xad\xbf\x20",
842 "\x20\xed\xae\x80\x20",
843 "\x20\xed\xaf\xbf\x20",
844 "\x20\xed\xb0\x80\x20",
845 "\x20\xed\xbe\x80\x20",
846 "\x20\xed\xbf\xbf\x20",
847 "\x20\xed\xa0\x80\xed\xb0\x80\x20",
848 "\x20\xed\xa0\x80\xed\xbf\xbf\x20",
849 "\x20\xed\xad\xbf\xed\xb0\x80\x20",
850 "\x20\xed\xad\xbf\xed\xbf\xbf\x20",
851 "\x20\xed\xae\x80\xed\xb0\x80\x20",
852 "\x20\xed\xae\x80\xed\xbf\xbf\x20",
853 "\x20\xed\xaf\xbf\xed\xb0\x80\x20",
854 "\x20\xed\xaf\xbf\xed\xbf\xbf\x20",
855 "\x20\xef\xbf\xbe\x20",
856 "\x20\xef\xbf\xbf\x20",
857 0
858 };
859
860 for (int i = 0; good_strings[i]; ++i)
861 UNIT_TEST_CHECK(utf8_validate(utf8(good_strings[i])) == true);
862
863 for (int i = 0; bad_strings[i]; ++i)
864 UNIT_TEST_CHECK(utf8_validate(utf8(bad_strings[i])) == false);
865}
866
867#endif // BUILD_UNIT_TESTS
868
869// Local Variables:
870// mode: C++
871// fill-column: 76
872// c-file-style: "gnu"
873// indent-tabs-mode: nil
874// End:
875// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status