monotone

monotone Mtn Source Tree

Root/charset.cc

1// Copyright (C) 2002 Graydon Hoare <graydon@pobox.com>
2//
3// This program is made available under the GNU GPL version 2.0 or
4// greater. See the accompanying file COPYING for details.
5//
6// This program is distributed WITHOUT ANY WARRANTY; without even the
7// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
8// PURPOSE.
9
10#include <vector>
11
12#include <boost/tokenizer.hpp>
13
14#include "idna/idna.h"
15#include "idna/stringprep.h"
16
17#include "charset.hh"
18#include "numeric_vocab.hh"
19#include "sanity.hh"
20#include "simplestring_xform.hh"
21
22using std::string;
23using std::vector;
24using std::free;
25
26using boost::char_separator;
27
28// General character code conversion routines.
29
30static string
31system_charset()
32{
33 char const * locale_charset_name = stringprep_locale_charset ();
34 I(locale_charset_name != NULL);
35 string sys_charset(locale_charset_name);
36 return sys_charset;
37}
38
39void
40charset_convert(string const & src_charset,
41 string const & dst_charset,
42 string const & src,
43 string & dst)
44{
45 if (src_charset == dst_charset)
46 dst = src;
47 else
48 {
49 L(FL("converting %d bytes from %s to %s") % src.size()
50 % src_charset % dst_charset);
51 char * converted = stringprep_convert(src.c_str(),
52 dst_charset.c_str(),
53 src_charset.c_str());
54 E(converted != NULL,
55 F("failed to convert string from %s to %s: '%s'")
56 % src_charset % dst_charset % src);
57 dst = string(converted);
58 free(converted);
59 }
60}
61
62void
63system_to_utf8(external const & ext, utf8 & utf)
64{
65 string out;
66 charset_convert(system_charset(), "UTF-8", ext(), out);
67 I(utf8_validate(out));
68 utf = out;
69}
70
71size_t
72display_width(utf8 const & utf)
73{
74 string const & u = utf();
75 size_t sz = 0;
76 string::const_iterator i = u.begin();
77 while (i != u.end())
78 {
79 if (UNLIKELY(static_cast<u8>(*i) & static_cast<u8>(0x80)))
80 {
81 // A UTF-8 escape: consume the full escape.
82 ++i;
83 ++sz;
84 while (i != u.end()
85 && (static_cast<u8>(*i) & static_cast<u8>(0x80))
86 && (!(static_cast<u8>(*i) & static_cast<u8>(0x40))))
87 ++i;
88 }
89 else
90 {
91 // An ASCII-like character in the range 0..0x7F.
92 ++i;
93 ++sz;
94 }
95 }
96 return sz;
97}
98
99// Lots of gunk to avoid charset conversion as much as possible. Running
100// iconv over every element of every path in a 30,000 file manifest takes
101// multiple seconds, which then is a minimum bound on pretty much any
102// operation we do...
103static inline bool
104system_charset_is_utf8_impl()
105{
106 string lc_encoding = lowercase(system_charset());
107 return (lc_encoding == "utf-8"
108 || lc_encoding == "utf_8"
109 || lc_encoding == "utf8");
110}
111
112static inline bool
113system_charset_is_utf8()
114{
115 static bool it_is = system_charset_is_utf8_impl();
116 return it_is;
117}
118
119static inline bool
120system_charset_is_ascii_extension_impl()
121{
122 if (system_charset_is_utf8())
123 return true;
124 string lc_encoding = lowercase(system_charset());
125 // if your character set is identical to ascii in the lower 7 bits, then add
126 // it here for a speed boost.
127 return (lc_encoding.find("ascii") != string::npos
128 || lc_encoding.find("8859") != string::npos
129 || lc_encoding.find("ansi_x3.4") != string::npos
130 // http://www.cs.mcgill.ca/~aelias4/encodings.html -- "EUC (Extended
131 // Unix Code) is a simple and clean encoding, standard on Unix
132 // systems.... It is backwards-compatible with ASCII (i.e. valid
133 // ASCII implies valid EUC)."
134 || lc_encoding.find("euc") != string::npos);
135}
136
137static inline bool
138system_charset_is_ascii_extension()
139{
140 static bool it_is = system_charset_is_ascii_extension_impl();
141 return it_is;
142}
143
// True when no byte of UTF has its high bit (0x80) set; an empty string
// therefore counts as all-ASCII.
inline static bool
is_all_ascii(string const & utf)
{
  // could speed this up by vectorization -- mask against 0x80808080,
  // process a whole word at a time...
  string::const_iterator p = utf.begin();
  string::const_iterator const end = utf.end();
  while (p != end && !(0x80 & *p))
    ++p;
  return p == end;
}
154
155// this function must be fast. do not make it slow.
156void
157utf8_to_system(utf8 const & utf, string & ext)
158{
159 if (system_charset_is_utf8())
160 ext = utf();
161 else if (system_charset_is_ascii_extension()
162 && is_all_ascii(utf()))
163 ext = utf();
164 else
165 charset_convert("UTF-8", system_charset(), utf(), ext);
166}
167
168void
169utf8_to_system(utf8 const & utf, external & ext)
170{
171 string out;
172 utf8_to_system(utf, out);
173 ext = out;
174}
175
// utf8_validate and the helper functions is_valid_unicode_char and
// utf8_consume_continuation_char are based on g_utf8_validate and its
// supporting functions from the file gutf8.c of the GLib library.
179
// Decide whether C is acceptable as a decoded code point.  Rejected are:
// values of 0x110000 and above, the surrogate range 0xd800..0xdfff, the
// range 0xfdd0..0xfdef, and any value whose low 16 bits are 0xfffe or
// 0xffff.
static bool
is_valid_unicode_char(u32 c)
{
  if (c >= 0x110000)
    return false;
  if ((c & 0xfffff800) == 0xd800)
    return false;
  if (c >= 0xfdd0 && c <= 0xfdef)
    return false;
  if ((c & 0xfffe) == 0xfffe)
    return false;
  return true;
}
188
// If C has the form 10xxxxxx, shift VAL left by six bits, merge in the low
// six bits of C and return true.  Otherwise return false and leave VAL
// untouched.
static bool
utf8_consume_continuation_char(u8 c, u32 & val)
{
  bool const is_continuation = (c & 0xc0) == 0x80;
  if (is_continuation)
    val = (val << 6) | (c & 0x3f);
  return is_continuation;
}
198
199bool
200utf8_validate(utf8 const & utf)
201{
202 string::size_type left = utf().size();
203 u32 min, val;
204
205 for (string::const_iterator i = utf().begin();
206 i != utf().end(); ++i, --left)
207 {
208 u8 c = *i;
209 if (c < 128)
210 continue;
211 if ((c & 0xe0) == 0xc0)
212 {
213 if (left < 2)
214 return false;
215 if ((c & 0x1e) == 0)
216 return false;
217 ++i; --left; c = *i;
218 if ((c & 0xc0) != 0x80)
219 return false;
220 }
221 else
222 {
223 if ((c & 0xf0) == 0xe0)
224 {
225 if (left < 3)
226 return false;
227 min = 1 << 11;
228 val = c & 0x0f;
229 goto two_remaining;
230 }
231 else if ((c & 0xf8) == 0xf0)
232 {
233 if (left < 4)
234 return false;
235 min = 1 << 16;
236 val = c & 0x07;
237 }
238 else
239 return false;
240 ++i; --left; c = *i;
241 if (!utf8_consume_continuation_char(c, val))
242 return false;
243two_remaining:
244 ++i; --left; c = *i;
245 if (!utf8_consume_continuation_char(c, val))
246 return false;
247 ++i; --left; c = *i;
248 if (!utf8_consume_continuation_char(c, val))
249 return false;
250 if (val < min)
251 return false;
252 if (!is_valid_unicode_char(val))
253 return false;
254 }
255 }
256 return true;
257}
258
259static string
260decode_idna_error(int err)
261{
262 switch (static_cast<Idna_rc>(err))
263 {
264 case IDNA_STRINGPREP_ERROR: return "stringprep error"; break;
265 case IDNA_PUNYCODE_ERROR: return "punycode error"; break;
266 case IDNA_CONTAINS_NON_LDH: return "non-LDH characters"; break;
267 case IDNA_CONTAINS_MINUS: return "leading / trailing hyphen-minus character"; break;
268 case IDNA_INVALID_LENGTH: return "invalid length (output must be between 1 and 63 chars)"; break;
269 case IDNA_NO_ACE_PREFIX: return "no ace prefix"; break;
270 case IDNA_ROUNDTRIP_VERIFY_ERROR: return "roundtrip verify error"; break;
271 case IDNA_CONTAINS_ACE_PREFIX: return "contains ACE prefix (\"xn--\")"; break;
272 case IDNA_ICONV_ERROR: return "iconv error"; break;
273 case IDNA_MALLOC_ERROR: return "malloc error"; break;
274 default: return "unknown error"; break;
275 }
276 return "unknown error";
277}
278
279void
280ace_to_utf8(ace const & a, utf8 & utf)
281{
282 char *out = NULL;
283 L(FL("converting %d bytes from IDNA ACE to UTF-8") % a().size());
284 int res = idna_to_unicode_8z8z(a().c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
285 N(res == IDNA_SUCCESS || res == IDNA_NO_ACE_PREFIX,
286 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
287 % a().size()
288 % decode_idna_error(res));
289 utf = string(out);
290 free(out);
291}
292
293void
294utf8_to_ace(utf8 const & utf, ace & a)
295{
296 char *out = NULL;
297 L(FL("converting %d bytes from UTF-8 to IDNA ACE") % utf().size());
298 int res = idna_to_ascii_8z(utf().c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
299 N(res == IDNA_SUCCESS,
300 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
301 % utf().size()
302 % decode_idna_error(res));
303 a = string(out);
304 free(out);
305}
306
307void
308internalize_cert_name(utf8 const & utf, cert_name & c)
309{
310 ace a;
311 utf8_to_ace(utf, a);
312 c = a();
313}
314
315void
316internalize_cert_name(external const & ext, cert_name & c)
317{
318 utf8 utf;
319 system_to_utf8(ext(), utf);
320 internalize_cert_name(utf, c);
321}
322
323void
324externalize_cert_name(cert_name const & c, utf8 & utf)
325{
326 ace_to_utf8(ace(c()), utf);
327}
328
329void
330externalize_cert_name(cert_name const & c, external & ext)
331{
332 utf8 utf;
333 externalize_cert_name(c, utf);
334 utf8_to_system(utf, ext);
335}
336
337void
338internalize_rsa_keypair_id(utf8 const & utf, rsa_keypair_id & key)
339{
340 string tmp;
341 typedef boost::tokenizer<char_separator<char> >
342 tokenizer;
343 char_separator<char> sep("", ".@", boost::keep_empty_tokens);
344 tokenizer tokens(utf(), sep);
345 bool in_domain = false;
346 for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i)
347 {
348 if (!in_domain || *i == "." || *i == "@")
349 tmp += *i;
350 else
351 {
352 ace a;
353 utf8_to_ace(*i, a);
354 tmp += a();
355 }
356 if (*i == "@")
357 in_domain = true;
358 }
359 key = tmp;
360}
361
362void
363internalize_rsa_keypair_id(external const & ext, rsa_keypair_id & key)
364{
365 utf8 utf;
366 system_to_utf8(ext, utf);
367 internalize_rsa_keypair_id(utf, key);
368}
369
370void
371externalize_rsa_keypair_id(rsa_keypair_id const & key, utf8 & utf)
372{
373 string tmp;
374 typedef boost::tokenizer<char_separator<char> >
375 tokenizer;
376 char_separator<char> sep("", ".@", boost::keep_empty_tokens);
377 tokenizer tokens(key(), sep);
378 bool in_domain = false;
379 for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i)
380 {
381 if (!in_domain || *i == "." || *i == "@")
382 tmp += *i;
383 else
384 {
385 ace a(*i);
386 utf8 u;
387 ace_to_utf8(a, u);
388 tmp += u();
389 }
390 if (*i == "@")
391 in_domain = true;
392 }
393 utf = tmp;
394}
395
396void
397externalize_rsa_keypair_id(rsa_keypair_id const & key, external & ext)
398{
399 utf8 utf;
400 externalize_rsa_keypair_id(key, utf);
401 utf8_to_system(utf, ext);
402}
403
404void
405internalize_var_domain(utf8 const & utf, var_domain & d)
406{
407 ace a;
408 utf8_to_ace(utf, a);
409 d = a();
410}
411
412void
413internalize_var_domain(external const & ext, var_domain & d)
414{
415 utf8 utf;
416 system_to_utf8(ext(), utf);
417 internalize_var_domain(utf, d);
418}
419
420void
421externalize_var_domain(var_domain const & d, utf8 & utf)
422{
423 ace_to_utf8(ace(d()), utf);
424}
425
426void
427externalize_var_domain(var_domain const & d, external & ext)
428{
429 utf8 utf;
430 externalize_var_domain(d, utf);
431 utf8_to_system(utf, ext);
432}
433
434
435#ifdef BUILD_UNIT_TESTS
436#include "unit_tests.hh"
437#include <stdlib.h>
438
#define IDNA_ACE_PREFIX "xn--"
#define IDNA_SUCCESS 0

// Table of IDNA test vectors used by check_idna_encoding().
//
//   name   - human-readable label, used in checkpoint and log messages
//   inlen  - number of code points stored in `in`
//   in     - the input as UCS-4 code points
//   out    - the expected ACE form of `in`
//
// The four trailing int fields are carried in the table but are not read by
// the test code in this file.
//
// NOTE(review): the vectors look like the Punycode sample strings of
// RFC 3492 section 7.1 -- confirm before citing them as such.
//
// `name` and `out` point at string literals, so they are declared
// `char const *`; a plain `char *` cannot be initialized from a literal in
// C++11 and later.
struct
idna
{
  char const * name;
  size_t inlen;
  uint32_t in[100];
  char const * out;
  int allowunassigned;
  int usestd3asciirules;
  int toasciirc;
  int tounicoderc;
} idna_vec[] =
  {
    {
      "Arabic (Egyptian)", 17,
      {
        0x0644, 0x064A, 0x0647, 0x0645, 0x0627, 0x0628, 0x062A, 0x0643,
        0x0644, 0x0645, 0x0648, 0x0634, 0x0639, 0x0631, 0x0628, 0x064A,
        0x061F},
      IDNA_ACE_PREFIX "egbpdaj6bu4bxfgehfvwxn", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Chinese (simplified)", 9,
      {
        0x4ED6, 0x4EEC, 0x4E3A, 0x4EC0, 0x4E48, 0x4E0D, 0x8BF4, 0x4E2D, 0x6587},
      IDNA_ACE_PREFIX "ihqwcrb4cv8a8dqg056pqjye", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Chinese (traditional)", 9,
      {
        0x4ED6, 0x5011, 0x7232, 0x4EC0, 0x9EBD, 0x4E0D, 0x8AAA, 0x4E2D, 0x6587},
      IDNA_ACE_PREFIX "ihqwctvzc91f659drss3x8bo0yb", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Czech", 22,
      {
        0x0050, 0x0072, 0x006F, 0x010D, 0x0070, 0x0072, 0x006F, 0x0073,
        0x0074, 0x011B, 0x006E, 0x0065, 0x006D, 0x006C, 0x0075, 0x0076,
        0x00ED, 0x010D, 0x0065, 0x0073, 0x006B, 0x0079},
      IDNA_ACE_PREFIX "Proprostnemluvesky-uyb24dma41a", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Hebrew", 22,
      {
        0x05DC, 0x05DE, 0x05D4, 0x05D4, 0x05DD, 0x05E4, 0x05E9, 0x05D5,
        0x05D8, 0x05DC, 0x05D0, 0x05DE, 0x05D3, 0x05D1, 0x05E8, 0x05D9,
        0x05DD, 0x05E2, 0x05D1, 0x05E8, 0x05D9, 0x05EA},
      IDNA_ACE_PREFIX "4dbcagdahymbxekheh6e0a7fei0b", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Hindi (Devanagari)", 30,
      {
        0x092F, 0x0939, 0x0932, 0x094B, 0x0917, 0x0939, 0x093F, 0x0928,
        0x094D, 0x0926, 0x0940, 0x0915, 0x094D, 0x092F, 0x094B, 0x0902,
        0x0928, 0x0939, 0x0940, 0x0902, 0x092C, 0x094B, 0x0932, 0x0938,
        0x0915, 0x0924, 0x0947, 0x0939, 0x0948, 0x0902},
      IDNA_ACE_PREFIX "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese (kanji and hiragana)", 18,
      {
        0x306A, 0x305C, 0x307F, 0x3093, 0x306A, 0x65E5, 0x672C, 0x8A9E,
        0x3092, 0x8A71, 0x3057, 0x3066, 0x304F, 0x308C, 0x306A, 0x3044,
        0x306E, 0x304B},
      IDNA_ACE_PREFIX "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Russian (Cyrillic)", 28,
      {
        0x043F, 0x043E, 0x0447, 0x0435, 0x043C, 0x0443, 0x0436, 0x0435,
        0x043E, 0x043D, 0x0438, 0x043D, 0x0435, 0x0433, 0x043E, 0x0432,
        0x043E, 0x0440, 0x044F, 0x0442, 0x043F, 0x043E, 0x0440, 0x0443,
        0x0441, 0x0441, 0x043A, 0x0438},
      IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Spanish", 40,
      {
        0x0050, 0x006F, 0x0072, 0x0071, 0x0075, 0x00E9, 0x006E, 0x006F,
        0x0070, 0x0075, 0x0065, 0x0064, 0x0065, 0x006E, 0x0073, 0x0069,
        0x006D, 0x0070, 0x006C, 0x0065, 0x006D, 0x0065, 0x006E, 0x0074,
        0x0065, 0x0068, 0x0061, 0x0062, 0x006C, 0x0061, 0x0072, 0x0065,
        0x006E, 0x0045, 0x0073, 0x0070, 0x0061, 0x00F1, 0x006F, 0x006C},
      IDNA_ACE_PREFIX "PorqunopuedensimplementehablarenEspaol-fmd56a", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Vietnamese", 31,
      {
        0x0054, 0x1EA1, 0x0069, 0x0073, 0x0061, 0x006F, 0x0068, 0x1ECD,
        0x006B, 0x0068, 0x00F4, 0x006E, 0x0067, 0x0074, 0x0068, 0x1EC3,
        0x0063, 0x0068, 0x1EC9, 0x006E, 0x00F3, 0x0069, 0x0074, 0x0069,
        0x1EBF, 0x006E, 0x0067, 0x0056, 0x0069, 0x1EC7, 0x0074},
      IDNA_ACE_PREFIX "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 8,
      {
        0x0033, 0x5E74, 0x0042, 0x7D44, 0x91D1, 0x516B, 0x5148, 0x751F},
      IDNA_ACE_PREFIX "3B-ww4c5e180e575a65lsy2b", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 24,
      {
        0x5B89, 0x5BA4, 0x5948, 0x7F8E, 0x6075, 0x002D, 0x0077, 0x0069,
        0x0074, 0x0068, 0x002D, 0x0053, 0x0055, 0x0050, 0x0045, 0x0052,
        0x002D, 0x004D, 0x004F, 0x004E, 0x004B, 0x0045, 0x0059, 0x0053},
      IDNA_ACE_PREFIX "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 25,
      {
        0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002D, 0x0041, 0x006E,
        0x006F, 0x0074, 0x0068, 0x0065, 0x0072, 0x002D, 0x0057, 0x0061,
        0x0079, 0x002D, 0x305D, 0x308C, 0x305E, 0x308C, 0x306E, 0x5834,
        0x6240},
      IDNA_ACE_PREFIX "Hello-Another-Way--fc4qua05auwb3674vfr0b", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 8,
      {
        0x3072, 0x3068, 0x3064, 0x5C4B, 0x6839, 0x306E, 0x4E0B, 0x0032},
      IDNA_ACE_PREFIX "2-u9tlzr9756bt3uc0v", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 13,
      {
        0x004D, 0x0061, 0x006A, 0x0069, 0x3067, 0x004B, 0x006F, 0x0069,
        0x3059, 0x308B, 0x0035, 0x79D2, 0x524D},
      IDNA_ACE_PREFIX "MajiKoi5-783gue6qz075azm5e", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 9,
      {
        0x30D1, 0x30D5, 0x30A3, 0x30FC, 0x0064, 0x0065, 0x30EB, 0x30F3, 0x30D0},
      IDNA_ACE_PREFIX "de-jg4avhby1noc0d", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Japanese", 7,
      {
        0x305D, 0x306E, 0x30B9, 0x30D4, 0x30FC, 0x30C9, 0x3067},
      IDNA_ACE_PREFIX "d9juau41awczczp", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Greek", 8,
      {0x03b5, 0x03bb, 0x03bb, 0x03b7, 0x03bd, 0x03b9, 0x03ba, 0x03ac},
      IDNA_ACE_PREFIX "hxargifdar", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Maltese (Malti)", 10,
      {0x0062, 0x006f, 0x006e, 0x0121, 0x0075, 0x0073, 0x0061, 0x0127,
       0x0127, 0x0061},
      IDNA_ACE_PREFIX "bonusaa-5bb1da", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS},
    {
      "Russian (Cyrillic)", 28,
      {0x043f, 0x043e, 0x0447, 0x0435, 0x043c, 0x0443, 0x0436, 0x0435,
       0x043e, 0x043d, 0x0438, 0x043d, 0x0435, 0x0433, 0x043e, 0x0432,
       0x043e, 0x0440, 0x044f, 0x0442, 0x043f, 0x043e, 0x0440, 0x0443,
       0x0441, 0x0441, 0x043a, 0x0438},
      IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0,
      IDNA_SUCCESS, IDNA_SUCCESS},
  };
600
601static void
602check_idna_encoding()
603{
604 putenv("CHARSET=UTF-8");
605
606 for (size_t i = 0; i < sizeof(idna_vec) / sizeof(struct idna); ++i)
607 {
608 BOOST_CHECKPOINT("IDNA language: " + string(idna_vec[i].name));
609
610 size_t p, q;
611 char *uc = stringprep_ucs4_to_utf8(idna_vec[i].in,
612 idna_vec[i].inlen,
613 &p, &q);
614 utf8 utf = string(uc);
615 utf8 tutf;
616 free(uc);
617
618 ace a = string(idna_vec[i].out);
619 ace tace;
620 utf8_to_ace(utf, tace);
621 L(FL("ACE-encoded %s: '%s'") % idna_vec[i].name % tace());
622 BOOST_CHECK(lowercase(a()) == lowercase(tace()));
623 ace_to_utf8(a, tutf);
624 BOOST_CHECK(lowercase(utf()) == lowercase(tutf()));
625 }
626}
627
628static void encode_test()
629{
630 check_idna_encoding();
631}
632
633static void utf8_validation_test()
634{
635 // these tests are based on the tests from the file utf8-validate.c of the
636 // GLib library, and also include sequences from Markus Kuhn's UTF-8
637 // example files.
638 const char* good_strings[] = {
639 "this is a valid but boring ASCII string",
640 "\x28\x28\x56\xe2\x8d\xb3\x56\x29\x3d\xe2\x8d\xb3\xe2\x8d\xb4\x56\x29\x2f\x56\xe2\x86\x90\x2c\x56\x20\x20\x20\x20\xe2\x8c\xb7\xe2\x86\x90\xe2\x8d\xb3\xe2\x86\x92\xe2\x8d\xb4\xe2\x88\x86\xe2\x88\x87\xe2\x8a\x83\xe2\x80\xbe\xe2\x8d\x8e\xe2\x8d\x95\xe2\x8c\x88",
641 "\xe2\x80\x98\x73\x69\x6e\x67\x6c\x65\xe2\x80\x99\x20\x61\x6e\x64\x20\xe2\x80\x9c\x64\x6f\x75\x62\x6c\x65\xe2\x80\x9d\x20\x71\x75\x6f\x74\x65\x73",
642 "\xe2\x80\xa2\x20\x43\x75\x72\x6c\x79\x20\x61\x70\x6f\x73\x74\x72\x6f\x70\x68\x65\x73\x3a\x20\xe2\x80\x9c\x57\x65\xe2\x80\x99\x76\x65\x20\x62\x65\x65\x6e\x20\x68\x65\x72\x65\xe2\x80\x9d",
643 "\xe2\x80\x9a\x64\x65\x75\x74\x73\x63\x68\x65\xe2\x80\x98\x20\xe2\x80\x9e\x41\x6e\x66\xc3\xbc\x68\x72\x75\x6e\x67\x73\x7a\x65\x69\x63\x68\x65\x6e\xe2\x80\x9c",
644 "\xe2\x80\xa0\x2c\x20\xe2\x80\xa1\x2c\x20\xe2\x80\xb0\x2c\x20\xe2\x80\xa2\x2c\x20\x33\xe2\x80\x93\x34\x2c\x20\xe2\x80\x94\x2c\x20\xe2\x88\x92\x35\x2f\x2b\x35\x2c\x20\xe2\x84\xa2\x2c\x20\xe2\x80\xa6",
645 "\xc2\xa9\xc2\xa9\xc2\xa9",
646 "\xe2\x89\xa0\xe2\x89\xa0",
647 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5",
648 "\x00",
649 "\xc2\x80",
650 "\xe0\xa0\x80",
651 "\xf0\x90\x80\x80",
652 "\x7f",
653 "\xdf\xbf",
654 "\xed\x9f\xbf",
655 "\xee\x80\x80",
656 "\xef\xbf\xbd",
657 0
658 };
659 const char* bad_strings[] = {
660 "\xf8\x88\x80\x80\x80",
661 "\xfc\x84\x80\x80\x80\x80",
662 "\xef\xbf\xbf",
663 "\xf7\xbf\xbf\xbf",
664 "\xfb\xbf\xbf\xbf\xbf",
665 "\xfd\xbf\xbf\xbf\xbf\xbf",
666 "\xf4\x8f\xbf\xbf",
667 "\xf4\x90\x80\x80",
668 "\x80",
669 "\xbf",
670 "\x80\xbf",
671 "\x80\xbf\x80",
672 "\x80\xbf\x80\xbf",
673 "\x80\xbf\x80\xbf\x80",
674 "\x80\xbf\x80\xbf\x80\xbf",
675 "\x80\xbf\x80\xbf\x80\xbf\x80",
676 "\x80",
677 "\x81",
678 "\x82",
679 "\x83",
680 "\x84",
681 "\x85",
682 "\x86",
683 "\x87",
684 "\x88",
685 "\x89",
686 "\x8a",
687 "\x8b",
688 "\x8c",
689 "\x8d",
690 "\x8e",
691 "\x8f",
692 "\x90",
693 "\x91",
694 "\x92",
695 "\x93",
696 "\x94",
697 "\x95",
698 "\x96",
699 "\x97",
700 "\x98",
701 "\x99",
702 "\x9a",
703 "\x9b",
704 "\x9c",
705 "\x9d",
706 "\x9e",
707 "\x9f",
708 "\xa0",
709 "\xa1",
710 "\xa2",
711 "\xa3",
712 "\xa4",
713 "\xa5",
714 "\xa6",
715 "\xa7",
716 "\xa8",
717 "\xa9",
718 "\xaa",
719 "\xab",
720 "\xac",
721 "\xad",
722 "\xae",
723 "\xaf",
724 "\xb0",
725 "\xb1",
726 "\xb2",
727 "\xb3",
728 "\xb4",
729 "\xb5",
730 "\xb6",
731 "\xb7",
732 "\xb8",
733 "\xb9",
734 "\xba",
735 "\xbb",
736 "\xbc",
737 "\xbd",
738 "\xbe",
739 "\xbf",
740 "\xc0\x20",
741 "\xc1\x20",
742 "\xc2\x20",
743 "\xc3\x20",
744 "\xc4\x20",
745 "\xc5\x20",
746 "\xc6\x20",
747 "\xc7\x20",
748 "\xc8\x20",
749 "\xc9\x20",
750 "\xca\x20",
751 "\xcb\x20",
752 "\xcc\x20",
753 "\xcd\x20",
754 "\xce\x20",
755 "\xcf\x20",
756 "\xd0\x20",
757 "\xd1\x20",
758 "\xd2\x20",
759 "\xd3\x20",
760 "\xd4\x20",
761 "\xd5\x20",
762 "\xd6\x20",
763 "\xd7\x20",
764 "\xd8\x20",
765 "\xd9\x20",
766 "\xda\x20",
767 "\xdb\x20",
768 "\xdc\x20",
769 "\xdd\x20",
770 "\xde\x20",
771 "\xdf\x20",
772 "\xe0\x20",
773 "\xe1\x20",
774 "\xe2\x20",
775 "\xe3\x20",
776 "\xe4\x20",
777 "\xe5\x20",
778 "\xe6\x20",
779 "\xe7\x20",
780 "\xe8\x20",
781 "\xe9\x20",
782 "\xea\x20",
783 "\xeb\x20",
784 "\xec\x20",
785 "\xed\x20",
786 "\xee\x20",
787 "\xef\x20",
788 "\xf0\x20",
789 "\xf1\x20",
790 "\xf2\x20",
791 "\xf3\x20",
792 "\xf4\x20",
793 "\xf5\x20",
794 "\xf6\x20",
795 "\xf7\x20",
796 "\xf8\x20",
797 "\xf9\x20",
798 "\xfa\x20",
799 "\xfb\x20",
800 "\xfc\x20",
801 "\xfd\x20",
802 "\x20\xc0",
803 "\x20\xe0\x80",
804 "\x20\xf0\x80\x80",
805 "\x20\xf8\x80\x80\x80",
806 "\x20\xfc\x80\x80\x80\x80",
807 "\x20\xdf",
808 "\x20\xef\xbf",
809 "\x20\xf7\xbf\xbf",
810 "\x20\xfb\xbf\xbf\xbf",
811 "\x20\xfd\xbf\xbf\xbf\xbf",
812 "\x20\xfe\x20",
813 "\x20\xff\x20",
814 "\x20\xc0\xaf\x20",
815 "\x20\xe0\x80\xaf\x20",
816 "\x20\xf0\x80\x80\xaf\x20",
817 "\x20\xf8\x80\x80\x80\xaf\x20",
818 "\x20\xfc\x80\x80\x80\x80\xaf\x20",
819 "\x20\xc1\xbf\x20",
820 "\x20\xe0\x9f\xbf\x20",
821 "\x20\xf0\x8f\xbf\xbf\x20",
822 "\x20\xf8\x87\xbf\xbf\xbf\x20",
823 "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20",
824 "\x20\xc0\x80\x20",
825 "\x20\xe0\x80\x80\x20",
826 "\x20\xf0\x80\x80\x80\x20",
827 "\x20\xf8\x80\x80\x80\x80\x20",
828 "\x20\xfc\x80\x80\x80\x80\x80\x20",
829 "\x20\xed\xa0\x80\x20",
830 "\x20\xed\xad\xbf\x20",
831 "\x20\xed\xae\x80\x20",
832 "\x20\xed\xaf\xbf\x20",
833 "\x20\xed\xb0\x80\x20",
834 "\x20\xed\xbe\x80\x20",
835 "\x20\xed\xbf\xbf\x20",
836 "\x20\xed\xa0\x80\xed\xb0\x80\x20",
837 "\x20\xed\xa0\x80\xed\xbf\xbf\x20",
838 "\x20\xed\xad\xbf\xed\xb0\x80\x20",
839 "\x20\xed\xad\xbf\xed\xbf\xbf\x20",
840 "\x20\xed\xae\x80\xed\xb0\x80\x20",
841 "\x20\xed\xae\x80\xed\xbf\xbf\x20",
842 "\x20\xed\xaf\xbf\xed\xb0\x80\x20",
843 "\x20\xed\xaf\xbf\xed\xbf\xbf\x20",
844 "\x20\xef\xbf\xbe\x20",
845 "\x20\xef\xbf\xbf\x20",
846 0
847 };
848
849 for (int i = 0; good_strings[i]; ++i)
850 BOOST_CHECK(utf8_validate(string(good_strings[i])) == true);
851
852 for (int i = 0; bad_strings[i]; ++i)
853 BOOST_CHECK(utf8_validate(string(bad_strings[i])) == false);
854}
855
856
857void
858add_charset_tests(test_suite * suite)
859{
860 I(suite);
861 suite->add(BOOST_TEST_CASE(&encode_test));
862 suite->add(BOOST_TEST_CASE(&utf8_validation_test));
863}
864
865#endif // BUILD_UNIT_TESTS
866
867// Local Variables:
868// mode: C++
869// fill-column: 76
870// c-file-style: "gnu"
871// indent-tabs-mode: nil
872// End:
873// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status