monotone

monotone Mtn Source Tree

Root/src/charset.cc

1// Copyright (C) 2002 Graydon Hoare <graydon@pobox.com>
2//
3// This program is made available under the GNU GPL version 2.0 or
4// greater. See the accompanying file COPYING for details.
5//
6// This program is distributed WITHOUT ANY WARRANTY; without even the
7// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
8// PURPOSE.
9
10#include "base.hh"
11#include "vector.hh"
12
13#include <boost/tokenizer.hpp>
14#include <idna.h>
15#include <stringprep.h>
16
17#include "charset.hh"
18#include "numeric_vocab.hh"
19#include "sanity.hh"
20#include "simplestring_xform.hh"
21#include "vocab_cast.hh"
22
23using std::string;
24using std::vector;
25using std::free;
26
27using boost::char_separator;
28
29// General character code conversion routines.
30
31static string
32system_charset()
33{
34 char const * locale_charset_name = stringprep_locale_charset ();
35 I(locale_charset_name != NULL);
36 string sys_charset(locale_charset_name);
37 return sys_charset;
38}
39
40void
41charset_convert(string const & src_charset,
42 string const & dst_charset,
43 string const & src,
44 string & dst,
45 bool best_effort,
46 origin::type whence)
47{
48 if (src_charset == dst_charset)
49 dst = src;
50 else
51 {
52 // Always try converting without special treatment first.
53 char const * converted = stringprep_convert(src.c_str(),
54 dst_charset.c_str(),
55 src_charset.c_str());
56
57 if (best_effort && !converted)
58 {
59 // Not all iconv implementations support this.
60 string tmp_charset(dst_charset);
61 tmp_charset += "//TRANSLIT";
62 converted = stringprep_convert(src.c_str(),
63 tmp_charset.c_str(),
64 src_charset.c_str());
65
66 // If that didn't work just give up.
67 if (!converted)
68 converted = src.c_str();
69 }
70
71 E(converted != NULL, whence,
72 F("failed to convert string from %s to %s: '%s'")
73 % src_charset % dst_charset % src);
74 dst = string(converted);
75 if (converted != src.c_str())
76 free(const_cast<char*>(converted));
77 }
78}
79
80size_t
81display_width(utf8 const & utf)
82{
83 string const & u = utf();
84 size_t sz = 0;
85 string::const_iterator i = u.begin();
86 while (i != u.end())
87 {
88 if (UNLIKELY(static_cast<u8>(*i) & static_cast<u8>(0x80)))
89 {
90 // A UTF-8 escape: consume the full escape.
91 ++i;
92 ++sz;
93 while (i != u.end()
94 && (static_cast<u8>(*i) & static_cast<u8>(0x80))
95 && (!(static_cast<u8>(*i) & static_cast<u8>(0x40))))
96 ++i;
97 }
98 else
99 {
100 // An ASCII-like character in the range 0..0x7F.
101 ++i;
102 ++sz;
103 }
104 }
105 return sz;
106}
107
108// Lots of gunk to avoid charset conversion as much as possible. Running
109// iconv over every element of every path in a 30,000 file manifest takes
110// multiple seconds, which then is a minimum bound on pretty much any
111// operation we do...
112static inline bool
113system_charset_is_utf8_impl()
114{
115 string lc_encoding = lowercase(system_charset());
116 return (lc_encoding == "utf-8"
117 || lc_encoding == "utf_8"
118 || lc_encoding == "utf8");
119}
120
121static inline bool
122system_charset_is_utf8()
123{
124 static bool it_is = system_charset_is_utf8_impl();
125 return it_is;
126}
127
128static inline bool
129system_charset_is_ascii_extension_impl()
130{
131 if (system_charset_is_utf8())
132 return true;
133 string lc_encoding = lowercase(system_charset());
134 // if your character set is identical to ascii in the lower 7 bits, then add
135 // it here for a speed boost.
136 return (lc_encoding.find("ascii") != string::npos
137 || lc_encoding.find("8859") != string::npos
138 || lc_encoding.find("ansi_x3.4") != string::npos
139 || lc_encoding == "646" // another name for ascii
140 // http://www.cs.mcgill.ca/~aelias4/encodings.html -- "EUC (Extended
141 // Unix Code) is a simple and clean encoding, standard on Unix
142 // systems.... It is backwards-compatible with ASCII (i.e. valid
143 // ASCII implies valid EUC)."
144 || lc_encoding.find("euc") != string::npos);
145}
146
147static inline bool
148system_charset_is_ascii_extension()
149{
150 static bool it_is = system_charset_is_ascii_extension_impl();
151 return it_is;
152}
153
// Return true when no byte of UTF has its high bit (0x80) set, i.e. the
// string consists solely of 7-bit characters.  An empty string qualifies.
inline static bool
is_all_ascii(string const & utf)
{
  // could speed this up by vectorization -- mask against 0x80808080,
  // process a whole word at a time...
  string::size_type const len = utf.size();
  for (string::size_type pos = 0; pos < len; ++pos)
    {
      if (static_cast<unsigned char>(utf[pos]) >= 0x80)
        return false;
    }
  return true;
}
164
165// this function must be fast. do not make it slow.
166void
167utf8_to_system_strict(utf8 const & utf, string & ext)
168{
169 if (system_charset_is_utf8())
170 ext = utf();
171 else if (system_charset_is_ascii_extension()
172 && is_all_ascii(utf()))
173 ext = utf();
174 else
175 charset_convert("UTF-8", system_charset(), utf(), ext, false,
176 utf.made_from);
177}
178
179// this function must be fast. do not make it slow.
180void
181utf8_to_system_best_effort(utf8 const & utf, string & ext)
182{
183 if (system_charset_is_utf8())
184 ext = utf();
185 else if (system_charset_is_ascii_extension()
186 && is_all_ascii(utf()))
187 ext = utf();
188 else
189 charset_convert("UTF-8", system_charset(), utf(), ext, true,
190 utf.made_from);
191}
192
193void
194utf8_to_system_strict(utf8 const & utf, external & ext)
195{
196 string out;
197 utf8_to_system_strict(utf, out);
198 ext = external(out, utf.made_from);
199}
200
201void
202utf8_to_system_best_effort(utf8 const & utf, external & ext)
203{
204 string out;
205 utf8_to_system_best_effort(utf, out);
206 ext = external(out, utf.made_from);
207}
208
209void
210system_to_utf8(external const & ext, utf8 & utf)
211{
212 if (system_charset_is_utf8())
213 utf = typecast_vocab<utf8>(ext);
214 else if (system_charset_is_ascii_extension()
215 && is_all_ascii(ext()))
216 utf = typecast_vocab<utf8>(ext);
217 else
218 {
219 string out;
220 charset_convert(system_charset(), "UTF-8", ext(), out, false,
221 ext.made_from);
222 utf = utf8(out, ext.made_from);
223 I(utf8_validate(utf));
224 }
225}
226
// utf8_validate and the helper functions is_valid_unicode_char and
// utf8_consume_continuation_char are derived from g_utf8_validate and its
// supporting functions in the file gutf8.c of the GLib library.
230
// Decide whether the code point C is acceptable.  Rejected are:
//   - anything at or above 0x110000,
//   - the range 0xd800..0xdfff (UTF-16 surrogates),
//   - the range 0xfdd0..0xfdef,
//   - any value whose low 16 bits are 0xfffe or 0xffff.
static bool
is_valid_unicode_char(u32 c)
{
  if (c >= 0x110000)
    return false;
  if ((c & 0xfffff800) == 0xd800)
    return false;
  if (c >= 0xfdd0 && c <= 0xfdef)
    return false;
  if ((c & 0xfffe) == 0xfffe)
    return false;
  return true;
}
239
// If C is a UTF-8 continuation byte (bit pattern 10xxxxxx), shift VAL left
// by six bits, merge in C's low six bits, and return true.  Otherwise leave
// VAL untouched and return false.
static bool
utf8_consume_continuation_char(u8 c, u32 & val)
{
  bool const is_continuation = (c & 0xc0) == 0x80;
  if (is_continuation)
    val = (val << 6) | (c & 0x3f);
  return is_continuation;
}
249
250bool
251utf8_validate(utf8 const & utf)
252{
253 string::size_type left = utf().size();
254 u32 min, val;
255
256 for (string::const_iterator i = utf().begin();
257 i != utf().end(); ++i, --left)
258 {
259 u8 c = *i;
260 if (c < 128)
261 continue;
262 if ((c & 0xe0) == 0xc0)
263 {
264 if (left < 2)
265 return false;
266 if ((c & 0x1e) == 0)
267 return false;
268 ++i; --left; c = *i;
269 if ((c & 0xc0) != 0x80)
270 return false;
271 }
272 else
273 {
274 if ((c & 0xf0) == 0xe0)
275 {
276 if (left < 3)
277 return false;
278 min = 1 << 11;
279 val = c & 0x0f;
280 goto two_remaining;
281 }
282 else if ((c & 0xf8) == 0xf0)
283 {
284 if (left < 4)
285 return false;
286 min = 1 << 16;
287 val = c & 0x07;
288 }
289 else
290 return false;
291 ++i; --left; c = *i;
292 if (!utf8_consume_continuation_char(c, val))
293 return false;
294two_remaining:
295 ++i; --left; c = *i;
296 if (!utf8_consume_continuation_char(c, val))
297 return false;
298 ++i; --left; c = *i;
299 if (!utf8_consume_continuation_char(c, val))
300 return false;
301 if (val < min)
302 return false;
303 if (!is_valid_unicode_char(val))
304 return false;
305 }
306 }
307 return true;
308}
309
310static string
311decode_idna_error(int err)
312{
313 switch (static_cast<Idna_rc>(err))
314 {
315 case IDNA_STRINGPREP_ERROR: return "stringprep error"; break;
316 case IDNA_PUNYCODE_ERROR: return "punycode error"; break;
317 case IDNA_CONTAINS_NON_LDH: return "non-LDH characters"; break;
318 case IDNA_CONTAINS_MINUS: return "leading / trailing hyphen-minus character"; break;
319 case IDNA_INVALID_LENGTH: return "invalid length (output must be between 1 and 63 chars)"; break;
320 case IDNA_NO_ACE_PREFIX: return "no ace prefix"; break;
321 case IDNA_ROUNDTRIP_VERIFY_ERROR: return "roundtrip verify error"; break;
322 case IDNA_CONTAINS_ACE_PREFIX: return "contains ACE prefix (\"xn--\")"; break;
323 case IDNA_ICONV_ERROR: return "iconv error"; break;
324 case IDNA_MALLOC_ERROR: return "malloc error"; break;
325 default: return "unknown error"; break;
326 }
327 return "unknown error";
328}
329
330void
331ace_to_utf8(string const & a, utf8 & utf, origin::type whence)
332{
333 char *out = NULL;
334 L(FL("converting %d bytes from IDNA ACE to UTF-8") % a.size());
335 int res = idna_to_unicode_8z8z(a.c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
336 E(res == IDNA_SUCCESS || res == IDNA_NO_ACE_PREFIX, whence,
337 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
338 % a.size()
339 % decode_idna_error(res));
340 utf = utf8(string(out), whence);
341 free(out);
342}
343
344void
345utf8_to_ace(utf8 const & utf, string & a)
346{
347 char *out = NULL;
348 L(FL("converting %d bytes from UTF-8 to IDNA ACE") % utf().size());
349 int res = idna_to_ascii_8z(utf().c_str(), &out, IDNA_USE_STD3_ASCII_RULES);
350 E(res == IDNA_SUCCESS, utf.made_from,
351 F("error converting %d UTF-8 bytes to IDNA ACE: %s")
352 % utf().size()
353 % decode_idna_error(res));
354 a = string(out);
355 free(out);
356}
357
358// Local Variables:
359// mode: C++
360// fill-column: 76
361// c-file-style: "gnu"
362// indent-tabs-mode: nil
363// End:
364// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status