monotone

monotone Mtn Source Tree

Root/charset.cc

1// Copyright (C) 2002 Graydon Hoare <graydon@pobox.com>
2//
3// This program is made available under the GNU GPL version 2.0 or
4// greater. See the accompanying file COPYING for details.
5//
6// This program is distributed WITHOUT ANY WARRANTY; without even the
7// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
8// PURPOSE.
9
10#include "base.hh"
11#include "vector.hh"
12
13#include <boost/tokenizer.hpp>
14
15#include "idna/idna.h"
16#include "idna/stringprep.h"
17
18#include "charset.hh"
19#include "numeric_vocab.hh"
20#include "sanity.hh"
21#include "simplestring_xform.hh"
22
23using std::string;
24using std::vector;
25using std::free;
26
27using boost::char_separator;
28
29// General character code conversion routines.
30
31static string
32system_charset()
33{
34 char const * locale_charset_name = stringprep_locale_charset ();
35 I(locale_charset_name != NULL);
36 string sys_charset(locale_charset_name);
37 return sys_charset;
38}
39
40void
41charset_convert(string const & src_charset,
42 string const & dst_charset,
43 string const & src,
44 string & dst,
45 bool best_effort)
46{
47 if (src_charset == dst_charset)
48 dst = src;
49 else
50 {
51 char * converted = stringprep_convert(src.c_str(),
52 dst_charset.c_str(),
53 src_charset.c_str(),
54 best_effort);
55 E(converted != NULL,
56 F("failed to convert string from %s to %s: '%s'")
57 % src_charset % dst_charset % src);
58 dst = string(converted);
59 free(converted);
60 }
61}
62
63size_t
64display_width(utf8 const & utf)
65{
66 string const & u = utf();
67 size_t sz = 0;
68 string::const_iterator i = u.begin();
69 while (i != u.end())
70 {
71 if (UNLIKELY(static_cast<u8>(*i) & static_cast<u8>(0x80)))
72 {
73 // A UTF-8 escape: consume the full escape.
74 ++i;
75 ++sz;
76 while (i != u.end()
77 && (static_cast<u8>(*i) & static_cast<u8>(0x80))
78 && (!(static_cast<u8>(*i) & static_cast<u8>(0x40))))
79 ++i;
80 }
81 else
82 {
83 // An ASCII-like character in the range 0..0x7F.
84 ++i;
85 ++sz;
86 }
87 }
88 return sz;
89}
90
91// Lots of gunk to avoid charset conversion as much as possible. Running
92// iconv over every element of every path in a 30,000 file manifest takes
93// multiple seconds, which then is a minimum bound on pretty much any
94// operation we do...
95static inline bool
96system_charset_is_utf8_impl()
97{
98 string lc_encoding = lowercase(system_charset());
99 return (lc_encoding == "utf-8"
100 || lc_encoding == "utf_8"
101 || lc_encoding == "utf8");
102}
103
104static inline bool
105system_charset_is_utf8()
106{
107 static bool it_is = system_charset_is_utf8_impl();
108 return it_is;
109}
110
111static inline bool
112system_charset_is_ascii_extension_impl()
113{
114 if (system_charset_is_utf8())
115 return true;
116 string lc_encoding = lowercase(system_charset());
117 // if your character set is identical to ascii in the lower 7 bits, then add
118 // it here for a speed boost.
119 return (lc_encoding.find("ascii") != string::npos
120 || lc_encoding.find("8859") != string::npos
121 || lc_encoding.find("ansi_x3.4") != string::npos
122 || lc_encoding == "646" // another name for ascii
123 // http://www.cs.mcgill.ca/~aelias4/encodings.html -- "EUC (Extended
124 // Unix Code) is a simple and clean encoding, standard on Unix
125 // systems.... It is backwards-compatible with ASCII (i.e. valid
126 // ASCII implies valid EUC)."
127 || lc_encoding.find("euc") != string::npos);
128}
129
130static inline bool
131system_charset_is_ascii_extension()
132{
133 static bool it_is = system_charset_is_ascii_extension_impl();
134 return it_is;
135}
136
// Return true when no byte of UTF has its high bit (0x80) set; an empty
// string therefore counts as all-ASCII.
inline static bool
is_all_ascii(string const & utf)
{
  // could speed this up by vectorization -- mask against 0x80808080,
  // process a whole word at a time...
  string::const_iterator p = utf.begin();
  string::const_iterator const stop = utf.end();
  while (p != stop)
    {
      if (*p & 0x80)
        return false;
      ++p;
    }
  return true;
}
147
148// this function must be fast. do not make it slow.
149void
150utf8_to_system_strict(utf8 const & utf, string & ext)
151{
152 if (system_charset_is_utf8())
153 ext = utf();
154 else if (system_charset_is_ascii_extension()
155 && is_all_ascii(utf()))
156 ext = utf();
157 else
158 charset_convert("UTF-8", system_charset(), utf(), ext, false);
159}
160
161// this function must be fast. do not make it slow.
162void
163utf8_to_system_best_effort(utf8 const & utf, string & ext)
164{
165 if (system_charset_is_utf8())
166 ext = utf();
167 else if (system_charset_is_ascii_extension()
168 && is_all_ascii(utf()))
169 ext = utf();
170 else
171 charset_convert("UTF-8", system_charset(), utf(), ext, true);
172}
173
174void
175utf8_to_system_strict(utf8 const & utf, external & ext)
176{
177 string out;
178 utf8_to_system_strict(utf, out);
179 ext = external(out);
180}
181
182void
183utf8_to_system_best_effort(utf8 const & utf, external & ext)
184{
185 string out;
186 utf8_to_system_best_effort(utf, out);
187 ext = external(out);
188}
189
190void
191system_to_utf8(external const & ext, utf8 & utf)
192{
193 if (system_charset_is_utf8())
194 utf = utf8(ext());
195 else if (system_charset_is_ascii_extension()
196 && is_all_ascii(ext()))
197 utf = utf8(ext());
198 else
199 {
200 string out;
201 charset_convert(system_charset(), "UTF-8", ext(), out, false);
202 utf = utf8(out);
203 I(utf8_validate(utf));
204 }
205}
206
// utf8_validate and the helper functions is_valid_unicode_char and
// utf8_consume_continuation_char are based on g_utf8_validate and its
// supporting functions from the file gutf8.c of the GLib library.
210
211static bool
212is_valid_unicode_char(u32 c)
213{
214 return (c < 0x110000 &&
215 ((c & 0xfffff800) != 0xd800) &&
216 (c < 0xfdd0 || c > 0xfdef) &&
217 (c & 0xfffe) != 0xfffe);
218}
219
220static bool
221utf8_consume_continuation_char(u8 c, u32 & val)
222{
223 if ((c & 0xc0) != 0x80)
224 return false;
225 val <<= 6;
226 val |= c & 0x3f;
227 return true;
228}
229
230bool
231utf8_validate(utf8 const & utf)
232{
233 string::size_type left = utf().size();
234 u32 min, val;
235
236 for (string::const_iterator i = utf().begin();
237 i != utf().end(); ++i, --left)
238 {
239 u8 c = *i;
240 if (c < 128)
241 continue;
242 if ((c & 0xe0) == 0xc0)
243 {
244 if (left < 2)
245 return false;
246 if ((c & 0x1e) == 0)
247 return false;
248 ++i; --left; c = *i;
249 if ((c & 0xc0) != 0x80)
250 return false;
251 }
252 else
253 {
254 if ((c & 0xf0) == 0xe0)
255 {
256 if (left < 3)
257 return false;
258 min = 1 << 11;
259 val = c & 0x0f;
260 goto two_remaining;
261 }
262 else if ((c & 0xf8) == 0xf0)
263 {
264 if (left < 4)
265 return false;
266 min = 1 << 16;
267 val = c & 0x07;
268 }
269 else
270 return false;
271 ++i; --left; c = *i;
272 if (!utf8_consume_continuation_char(c, val))
273 return false;
274two_remaining:
275 ++i; --left; c = *i;
276 if (!utf8_consume_continuation_char(c, val))
277 return false;
278 ++i; --left; c = *i;
279 if (!utf8_consume_continuation_char(c, val))
280 return false;
281 if (val < min)
282 return false;
283 if (!is_valid_unicode_char(val))
284 return false;
285 }
286 }
287 return true;
288}
289
290static string
291decode_idna_error(int err)
292{
293 switch (static_cast<Idna_rc>(err))
294 {
295 case IDNA_STRINGPREP_ERROR: return "stringprep error"; break;
296 case IDNA_PUNYCODE_ERROR: return "punycode error"; break;
297 case IDNA_CONTAINS_NON_LDH: return "non-LDH characters"; break;
298 case IDNA_CONTAINS_MINUS: return "leading / trailing hyphen-minus character"; break;
299 case IDNA_INVALID_LENGTH: return "invalid length (output must be between 1 and 63 chars)"; break;
300 case IDNA_NO_ACE_PREFIX: return "no ace prefix"; break;
301 case IDNA_ROUNDTRIP_VERIFY_ERROR: return "roundtrip verify error"; break;
302 case IDNA_CONTAINS_ACE_PREFIX: return "contains ACE prefix (\"xn--\")"; break;
303 case IDNA_ICONV_ERROR: return "iconv error"; break;
304 case IDNA_MALLOC_ERROR: return "malloc error"; break;
305 default: return "unknown error"; break;
306 }
307 return "unknown error";
308}
309
310static void
311ace_to_utf8(string const & a, utf8 & utf)
312{
313 char *out = NULL;
314 L(FL("converting %d bytes from IDNA ACE to UTF-8") % a.size());
315 int res = idna_to_unicode_8z8z(a.c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
316 N(res == IDNA_SUCCESS || res == IDNA_NO_ACE_PREFIX,
317 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
318 % a.size()
319 % decode_idna_error(res));
320 utf = utf8(string(out));
321 free(out);
322}
323
324static void
325utf8_to_ace(utf8 const & utf, string & a)
326{
327 char *out = NULL;
328 L(FL("converting %d bytes from UTF-8 to IDNA ACE") % utf().size());
329 int res = idna_to_ascii_8z(utf().c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
330 N(res == IDNA_SUCCESS,
331 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
332 % utf().size()
333 % decode_idna_error(res));
334 a = string(out);
335 free(out);
336}
337
338void
339internalize_cert_name(utf8 const & utf, cert_name & c)
340{
341 string a;
342 utf8_to_ace(utf, a);
343 c = cert_name(a);
344}
345
346void
347internalize_cert_name(external const & ext, cert_name & c)
348{
349 utf8 utf;
350 system_to_utf8(ext, utf);
351 internalize_cert_name(utf, c);
352}
353
354void
355internalize_rsa_keypair_id(utf8 const & utf, rsa_keypair_id & key)
356{
357 string tmp;
358 typedef boost::tokenizer<char_separator<char> >
359 tokenizer;
360 char_separator<char> sep("", ".@", boost::keep_empty_tokens);
361 tokenizer tokens(utf(), sep);
362 bool in_domain = false;
363 for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i)
364 {
365 if (!in_domain || *i == "." || *i == "@")
366 tmp += *i;
367 else
368 {
369 string a;
370 utf8_to_ace(utf8(*i), a);
371 tmp += a;
372 }
373 if (*i == "@")
374 in_domain = true;
375 }
376 key = rsa_keypair_id(tmp);
377}
378
379void
380internalize_rsa_keypair_id(external const & ext, rsa_keypair_id & key)
381{
382 utf8 utf;
383 system_to_utf8(ext, utf);
384 internalize_rsa_keypair_id(utf, key);
385}
386
387void
388externalize_rsa_keypair_id(rsa_keypair_id const & key, utf8 & utf)
389{
390 string tmp;
391 typedef boost::tokenizer<char_separator<char> >
392 tokenizer;
393 char_separator<char> sep("", ".@", boost::keep_empty_tokens);
394 tokenizer tokens(key(), sep);
395 bool in_domain = false;
396 for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i)
397 {
398 if (!in_domain || *i == "." || *i == "@")
399 tmp += *i;
400 else
401 {
402 utf8 u;
403 ace_to_utf8(*i, u);
404 tmp += u();
405 }
406 if (*i == "@")
407 in_domain = true;
408 }
409 utf = utf8(tmp);
410}
411
412void
413externalize_rsa_keypair_id(rsa_keypair_id const & key, external & ext)
414{
415 utf8 utf;
416 externalize_rsa_keypair_id(key, utf);
417 utf8_to_system_strict(utf, ext);
418}
419
420void
421internalize_var_domain(utf8 const & utf, var_domain & d)
422{
423 string a;
424 utf8_to_ace(utf, a);
425 d = var_domain(a);
426}
427
428void
429internalize_var_domain(external const & ext, var_domain & d)
430{
431 utf8 utf;
432 system_to_utf8(ext, utf);
433 internalize_var_domain(utf, d);
434}
435
436void
437externalize_var_domain(var_domain const & d, utf8 & utf)
438{
439 ace_to_utf8(d(), utf);
440}
441
442void
443externalize_var_domain(var_domain const & d, external & ext)
444{
445 utf8 utf;
446 externalize_var_domain(d, utf);
447 utf8_to_system_strict(utf, ext);
448}
449
450
451#ifdef BUILD_UNIT_TESTS
452#include "unit_tests.hh"
453#include <stdlib.h>
454
455#define IDNA_ACE_PREFIX "xn--"
456#define IDNA_SUCCESS 0
457
// Test vectors for the idna_encoding unit test.  Each entry pairs a UTF-8
// string with the ACE string it is expected to encode to.
struct
idna
{
 char const * name;   // label used in checkpoint and log messages
 char const * utf;    // UTF-8 form of the test string
 char const * ace;    // expected ACE form, starting with IDNA_ACE_PREFIX
} const idna_vec[] =
 {
 // In C, \x escapes consume an unbounded number of hexadecimal digits,
 // and if the resulting number is too big for a byte it is a semantic
 // error. However, if a string constant is composed of more than one
 // string literal, they do not extend across a boundary between string
 // literals. Thus, in some places in this array, string literals have
 // been split solely to end \x escapes after two hex digits.
 {
 "Arabic (Egyptian)",
 "\xd9\x84\xd9\x8a\xd9\x87\xd9\x85\xd8\xa7\xd8\xa8\xd8\xaa\xd9\x83\xd9"
 "\x84\xd9\x85\xd9\x88\xd8\xb4\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a\xd8\x9f",
 IDNA_ACE_PREFIX "egbpdaj6bu4bxfgehfvwxn" },
 {
 "Chinese (simplified)",
 "\xe4\xbb\x96\xe4\xbb\xac\xe4\xb8\xba\xe4\xbb\x80\xe4\xb9\x88\xe4\xb8"
 "\x8d\xe8\xaf\xb4\xe4\xb8\xad\xe6\x96\x87",
 IDNA_ACE_PREFIX "ihqwcrb4cv8a8dqg056pqjye" },
 {
 "Chinese (traditional)",
 "\xe4\xbb\x96\xe5\x80\x91\xe7\x88\xb2\xe4\xbb\x80\xe9\xba\xbd\xe4\xb8"
 "\x8d\xe8\xaa\xaa\xe4\xb8\xad\xe6\x96\x87",
 IDNA_ACE_PREFIX "ihqwctvzc91f659drss3x8bo0yb" },
 {
 "Czech",
 "Pro\xc4\x8dprost\xc4\x9bnemluv\xc3\xad\xc4\x8d""esky",
 IDNA_ACE_PREFIX "Proprostnemluvesky-uyb24dma41a"},
 {
 "Hebrew",
 "\xd7\x9c\xd7\x9e\xd7\x94\xd7\x94\xd7\x9d\xd7\xa4\xd7\xa9\xd7\x95\xd7"
 "\x98\xd7\x9c\xd7\x90\xd7\x9e\xd7\x93\xd7\x91\xd7\xa8\xd7\x99\xd7\x9d"
 "\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa",
 IDNA_ACE_PREFIX "4dbcagdahymbxekheh6e0a7fei0b"},
 {
 "Hindi (Devanagari)",
 "\xe0\xa4\xaf\xe0\xa4\xb9\xe0\xa4\xb2\xe0\xa5\x8b\xe0\xa4\x97\xe0\xa4"
 "\xb9\xe0\xa4\xbf\xe0\xa4\xa8\xe0\xa5\x8d\xe0\xa4\xa6\xe0\xa5\x80\xe0"
 "\xa4\x95\xe0\xa5\x8d\xe0\xa4\xaf\xe0\xa5\x8b\xe0\xa4\x82\xe0\xa4\xa8"
 "\xe0\xa4\xb9\xe0\xa5\x80\xe0\xa4\x82\xe0\xa4\xac\xe0\xa5\x8b\xe0\xa4"
 "\xb2\xe0\xa4\xb8\xe0\xa4\x95\xe0\xa4\xa4\xe0\xa5\x87\xe0\xa4\xb9\xe0"
 "\xa5\x88\xe0\xa4\x82",
 IDNA_ACE_PREFIX "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"},
 {
 "Japanese (kanji and hiragana)",
 "\xe3\x81\xaa\xe3\x81\x9c\xe3\x81\xbf\xe3\x82\x93\xe3\x81\xaa\xe6\x97"
 "\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x82\x92\xe8\xa9\xb1\xe3\x81\x97\xe3"
 "\x81\xa6\xe3\x81\x8f\xe3\x82\x8c\xe3\x81\xaa\xe3\x81\x84\xe3\x81\xae"
 "\xe3\x81\x8b",
 IDNA_ACE_PREFIX "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"},
 {
 "Russian (Cyrillic)",
 "\xd0\xbf\xd0\xbe\xd1\x87\xd0\xb5\xd0\xbc\xd1\x83\xd0\xb6\xd0\xb5\xd0"
 "\xbe\xd0\xbd\xd0\xb8\xd0\xbd\xd0\xb5\xd0\xb3\xd0\xbe\xd0\xb2\xd0\xbe"
 "\xd1\x80\xd1\x8f\xd1\x82\xd0\xbf\xd0\xbe\xd1\x80\xd1\x83\xd1\x81\xd1"
 "\x81\xd0\xba\xd0\xb8",
 IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l"},
 {
 "Spanish",
 "Porqu\xc3\xa9nopuedensimplementehablarenEspa\xc3\xb1ol",
 IDNA_ACE_PREFIX "PorqunopuedensimplementehablarenEspaol-fmd56a"},
 {
 "Vietnamese",
 "T\xe1\xba\xa1isaoh\xe1\xbb\x8dkh\xc3\xb4ngth\xe1\xbb\x83""ch\xe1\xbb"
 "\x89n\xc3\xb3iti\xe1\xba\xbfngVi\xe1\xbb\x87t",
 IDNA_ACE_PREFIX "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"},
 {
 "Japanese",
 "3\xe5\xb9\xb4""B\xe7\xb5\x84\xe9\x87\x91\xe5\x85\xab\xe5\x85\x88\xe7"
 "\x94\x9f",
 IDNA_ACE_PREFIX "3B-ww4c5e180e575a65lsy2b"},
 {
 "Japanese",
 "\xe5\xae\x89\xe5\xae\xa4\xe5\xa5\x88\xe7\xbe\x8e\xe6\x81\xb5-with-"
 "SUPER-MONKEYS",
 IDNA_ACE_PREFIX "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"},
 {
 "Japanese",
 "Hello-Another-Way-\xe3\x81\x9d\xe3\x82\x8c\xe3\x81\x9e\xe3\x82\x8c"
 "\xe3\x81\xae\xe5\xa0\xb4\xe6\x89\x80",
 IDNA_ACE_PREFIX "Hello-Another-Way--fc4qua05auwb3674vfr0b"},
 {
 "Japanese",
 "\xe3\x81\xb2\xe3\x81\xa8\xe3\x81\xa4\xe5\xb1\x8b\xe6\xa0\xb9\xe3\x81"
 "\xae\xe4\xb8\x8b""2",
 IDNA_ACE_PREFIX "2-u9tlzr9756bt3uc0v"},
 {
 "Japanese",
 "Maji\xe3\x81\xa7Koi\xe3\x81\x99\xe3\x82\x8b""5\xe7\xa7\x92\xe5\x89\x8d",
 IDNA_ACE_PREFIX "MajiKoi5-783gue6qz075azm5e"},
 {
 "Japanese",
 "\xe3\x83\x91\xe3\x83\x95\xe3\x82\xa3\xe3\x83\xbc""de\xe3\x83\xab\xe3\x83"
 "\xb3\xe3\x83\x90",
 IDNA_ACE_PREFIX "de-jg4avhby1noc0d"},
 {
 "Japanese",
 "\xe3\x81\x9d\xe3\x81\xae\xe3\x82\xb9\xe3\x83\x94\xe3\x83\xbc\xe3\x83"
 "\x89\xe3\x81\xa7",
 IDNA_ACE_PREFIX "d9juau41awczczp"},
 {
 "Greek",
 "\xce\xb5\xce\xbb\xce\xbb\xce\xb7\xce\xbd\xce\xb9\xce\xba\xce\xac",
 IDNA_ACE_PREFIX "hxargifdar"},
 {
 "Maltese (Malti)",
 "bon\xc4\xa1usa\xc4\xa7\xc4\xa7""a",
 IDNA_ACE_PREFIX "bonusaa-5bb1da"},
 // NOTE(review): this entry repeats the "Russian (Cyrillic)" entry above
 // byte for byte.
 {
 "Russian (Cyrillic)",
 "\xd0\xbf\xd0\xbe\xd1\x87\xd0\xb5\xd0\xbc\xd1\x83\xd0\xb6\xd0\xb5\xd0"
 "\xbe\xd0\xbd\xd0\xb8\xd0\xbd\xd0\xb5\xd0\xb3\xd0\xbe\xd0\xb2\xd0\xbe"
 "\xd1\x80\xd1\x8f\xd1\x82\xd0\xbf\xd0\xbe\xd1\x80\xd1\x83\xd1\x81\xd1"
 "\x81\xd0\xba\xd0\xb8",
 IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l"},
 };
579
580UNIT_TEST(charset, idna_encoding)
581{
582 // putenv takes a char*, not a const char*, there is nothing we can do.
583 putenv(const_cast<char *>("CHARSET=UTF-8"));
584
585 for (size_t i = 0; i < sizeof(idna_vec) / sizeof(struct idna); ++i)
586 {
587 UNIT_TEST_CHECKPOINT(("IDNA language: "
588 + string(idna_vec[i].name)).c_str());
589
590 string u = lowercase(idna_vec[i].utf);
591 string a = lowercase(idna_vec[i].ace);
592 string tace;
593 utf8_to_ace(utf8(u), tace);
594 L(FL("ACE-encoded %s: '%s'") % idna_vec[i].name % tace);
595 UNIT_TEST_CHECK(a == lowercase(tace));
596
597 utf8 tutf;
598 ace_to_utf8(a, tutf);
599 L(FL("UTF-encoded %s: '%s'") % idna_vec[i].name % tutf);
600 UNIT_TEST_CHECK(u == lowercase(tutf()));
601 }
602}
603
// Check utf8_validate() against two NULL-terminated tables: every entry of
// good_strings must be accepted and every entry of bad_strings rejected.
UNIT_TEST(charset, utf8_validation)
{
 // these tests are based on the tests from the file utf8-validate.c of the
 // GLib library, and also include sequences from Markus Kuhn's UTF-8
 // example files.
 const char* good_strings[] = {
 "this is a valid but boring ASCII string",

 "\x28\x28\x56\xe2\x8d\xb3\x56\x29\x3d\xe2\x8d\xb3\xe2\x8d\xb4\x56\x29\x2f"
 "\x56\xe2\x86\x90\x2c\x56\x20\x20\x20\x20\xe2\x8c\xb7\xe2\x86\x90\xe2\x8d"
 "\xb3\xe2\x86\x92\xe2\x8d\xb4\xe2\x88\x86\xe2\x88\x87\xe2\x8a\x83\xe2\x80"
 "\xbe\xe2\x8d\x8e\xe2\x8d\x95\xe2\x8c\x88",

 "\xe2\x80\x98\x73\x69\x6e\x67\x6c\x65\xe2\x80\x99\x20\x61\x6e\x64\x20\xe2"
 "\x80\x9c\x64\x6f\x75\x62\x6c\x65\xe2\x80\x9d\x20\x71\x75\x6f\x74\x65\x73",

 "\xe2\x80\xa2\x20\x43\x75\x72\x6c\x79\x20\x61\x70\x6f\x73\x74\x72\x6f\x70"
 "\x68\x65\x73\x3a\x20\xe2\x80\x9c\x57\x65\xe2\x80\x99\x76\x65\x20\x62\x65"
 "\x65\x6e\x20\x68\x65\x72\x65\xe2\x80\x9d",

 "\xe2\x80\x9a\x64\x65\x75\x74\x73\x63\x68\x65\xe2\x80\x98\x20\xe2\x80\x9e"
 "\x41\x6e\x66\xc3\xbc\x68\x72\x75\x6e\x67\x73\x7a\x65\x69\x63\x68\x65\x6e"
 "\xe2\x80\x9c",

 "\xe2\x80\xa0\x2c\x20\xe2\x80\xa1\x2c\x20\xe2\x80\xb0\x2c\x20\xe2\x80\xa2"
 "\x2c\x20\x33\xe2\x80\x93\x34\x2c\x20\xe2\x80\x94\x2c\x20\xe2\x88\x92\x35"
 "\x2f\x2b\x35\x2c\x20\xe2\x84\xa2\x2c\x20\xe2\x80\xa6",

 "\xc2\xa9\xc2\xa9\xc2\xa9",
 "\xe2\x89\xa0\xe2\x89\xa0",
 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5",
 "\x00",
 "\xc2\x80",
 "\xe0\xa0\x80",
 "\xf0\x90\x80\x80",
 "\x7f",
 "\xdf\xbf",
 "\xed\x9f\xbf",
 "\xee\x80\x80",
 "\xef\xbf\xbd",
 0
 };
 const char* bad_strings[] = {
 "\xf8\x88\x80\x80\x80",
 "\xfc\x84\x80\x80\x80\x80",
 "\xef\xbf\xbf",
 "\xf7\xbf\xbf\xbf",
 "\xfb\xbf\xbf\xbf\xbf",
 "\xfd\xbf\xbf\xbf\xbf\xbf",
 "\xf4\x8f\xbf\xbf",
 "\xf4\x90\x80\x80",
 "\x80",
 "\xbf",
 "\x80\xbf",
 "\x80\xbf\x80",
 "\x80\xbf\x80\xbf",
 "\x80\xbf\x80\xbf\x80",
 "\x80\xbf\x80\xbf\x80\xbf",
 "\x80\xbf\x80\xbf\x80\xbf\x80",
 // every single byte from 0x80 through 0xbf on its own
 "\x80",
 "\x81",
 "\x82",
 "\x83",
 "\x84",
 "\x85",
 "\x86",
 "\x87",
 "\x88",
 "\x89",
 "\x8a",
 "\x8b",
 "\x8c",
 "\x8d",
 "\x8e",
 "\x8f",
 "\x90",
 "\x91",
 "\x92",
 "\x93",
 "\x94",
 "\x95",
 "\x96",
 "\x97",
 "\x98",
 "\x99",
 "\x9a",
 "\x9b",
 "\x9c",
 "\x9d",
 "\x9e",
 "\x9f",
 "\xa0",
 "\xa1",
 "\xa2",
 "\xa3",
 "\xa4",
 "\xa5",
 "\xa6",
 "\xa7",
 "\xa8",
 "\xa9",
 "\xaa",
 "\xab",
 "\xac",
 "\xad",
 "\xae",
 "\xaf",
 "\xb0",
 "\xb1",
 "\xb2",
 "\xb3",
 "\xb4",
 "\xb5",
 "\xb6",
 "\xb7",
 "\xb8",
 "\xb9",
 "\xba",
 "\xbb",
 "\xbc",
 "\xbd",
 "\xbe",
 "\xbf",
 // every byte from 0xc0 through 0xfd followed by a space (0x20)
 "\xc0\x20",
 "\xc1\x20",
 "\xc2\x20",
 "\xc3\x20",
 "\xc4\x20",
 "\xc5\x20",
 "\xc6\x20",
 "\xc7\x20",
 "\xc8\x20",
 "\xc9\x20",
 "\xca\x20",
 "\xcb\x20",
 "\xcc\x20",
 "\xcd\x20",
 "\xce\x20",
 "\xcf\x20",
 "\xd0\x20",
 "\xd1\x20",
 "\xd2\x20",
 "\xd3\x20",
 "\xd4\x20",
 "\xd5\x20",
 "\xd6\x20",
 "\xd7\x20",
 "\xd8\x20",
 "\xd9\x20",
 "\xda\x20",
 "\xdb\x20",
 "\xdc\x20",
 "\xdd\x20",
 "\xde\x20",
 "\xdf\x20",
 "\xe0\x20",
 "\xe1\x20",
 "\xe2\x20",
 "\xe3\x20",
 "\xe4\x20",
 "\xe5\x20",
 "\xe6\x20",
 "\xe7\x20",
 "\xe8\x20",
 "\xe9\x20",
 "\xea\x20",
 "\xeb\x20",
 "\xec\x20",
 "\xed\x20",
 "\xee\x20",
 "\xef\x20",
 "\xf0\x20",
 "\xf1\x20",
 "\xf2\x20",
 "\xf3\x20",
 "\xf4\x20",
 "\xf5\x20",
 "\xf6\x20",
 "\xf7\x20",
 "\xf8\x20",
 "\xf9\x20",
 "\xfa\x20",
 "\xfb\x20",
 "\xfc\x20",
 "\xfd\x20",
 "\x20\xc0",
 "\x20\xe0\x80",
 "\x20\xf0\x80\x80",
 "\x20\xf8\x80\x80\x80",
 "\x20\xfc\x80\x80\x80\x80",
 "\x20\xdf",
 "\x20\xef\xbf",
 "\x20\xf7\xbf\xbf",
 "\x20\xfb\xbf\xbf\xbf",
 "\x20\xfd\xbf\xbf\xbf\xbf",
 "\x20\xfe\x20",
 "\x20\xff\x20",
 "\x20\xc0\xaf\x20",
 "\x20\xe0\x80\xaf\x20",
 "\x20\xf0\x80\x80\xaf\x20",
 "\x20\xf8\x80\x80\x80\xaf\x20",
 "\x20\xfc\x80\x80\x80\x80\xaf\x20",
 "\x20\xc1\xbf\x20",
 "\x20\xe0\x9f\xbf\x20",
 "\x20\xf0\x8f\xbf\xbf\x20",
 "\x20\xf8\x87\xbf\xbf\xbf\x20",
 "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20",
 "\x20\xc0\x80\x20",
 "\x20\xe0\x80\x80\x20",
 "\x20\xf0\x80\x80\x80\x20",
 "\x20\xf8\x80\x80\x80\x80\x20",
 "\x20\xfc\x80\x80\x80\x80\x80\x20",
 "\x20\xed\xa0\x80\x20",
 "\x20\xed\xad\xbf\x20",
 "\x20\xed\xae\x80\x20",
 "\x20\xed\xaf\xbf\x20",
 "\x20\xed\xb0\x80\x20",
 "\x20\xed\xbe\x80\x20",
 "\x20\xed\xbf\xbf\x20",
 "\x20\xed\xa0\x80\xed\xb0\x80\x20",
 "\x20\xed\xa0\x80\xed\xbf\xbf\x20",
 "\x20\xed\xad\xbf\xed\xb0\x80\x20",
 "\x20\xed\xad\xbf\xed\xbf\xbf\x20",
 "\x20\xed\xae\x80\xed\xb0\x80\x20",
 "\x20\xed\xae\x80\xed\xbf\xbf\x20",
 "\x20\xed\xaf\xbf\xed\xb0\x80\x20",
 "\x20\xed\xaf\xbf\xed\xbf\xbf\x20",
 "\x20\xef\xbf\xbe\x20",
 "\x20\xef\xbf\xbf\x20",
 0
 };

 // Both tables end with a 0 entry, which stops the loops below.
 for (int i = 0; good_strings[i]; ++i)
 UNIT_TEST_CHECK(utf8_validate(utf8(good_strings[i])) == true);

 for (int i = 0; bad_strings[i]; ++i)
 UNIT_TEST_CHECK(utf8_validate(utf8(bad_strings[i])) == false);
}
842
843#endif // BUILD_UNIT_TESTS
844
845// Local Variables:
846// mode: C++
847// fill-column: 76
848// c-file-style: "gnu"
849// indent-tabs-mode: nil
850// End:
851// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status