monotone

monotone Mtn Source Tree

Root/idna/toutf8.c

1/* toutf8.c --- Convert strings from system locale into UTF-8.
2 * Copyright (C) 2002, 2003 Simon Josefsson
3 *
4 * This file is part of GNU Libidn.
5 *
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 */
21
22#if HAVE_CONFIG_H
23# include "config.h"
24#endif
25
26#include <stdio.h>
27#include <stdlib.h>
28#include <string.h>
29
30#include "idna/stringprep.h"
31
32#ifdef _LIBC
33# define HAVE_ICONV 1
34# define LOCALE_WORKS 1
35# define ICONV_CONST
36#endif
37
38#if defined(HAVE_ERRNO_H) || defined(_LIBC)
39# include <errno.h>
40#endif
41
42#ifdef HAVE_ICONV
43# include <iconv.h>
44
45# if LOCALE_WORKS
46# include <langinfo.h>
47# include <locale.h>
48# endif
49
/* Determine the system locale's character set without using the cache.
 *
 * The CHARSET environment variable wins when it is set and non-empty.
 * Otherwise, when LOCALE_WORKS is enabled, LC_CTYPE is temporarily
 * switched to the environment's locale and nl_langinfo (CODESET) is
 * queried; the previous LC_CTYPE locale is restored afterwards.
 *
 * Return value: a non-NULL, non-empty string; "ASCII" is the fallback.
 * When the name comes from nl_langinfo() it is a heap copy that is
 * intentionally never freed.
 */
static const char *
stringprep_locale_charset_slow (void)
{
  const char *charset = getenv ("CHARSET");	/* flawfinder: ignore */

  if (charset && *charset)
    return charset;

# if LOCALE_WORKS
  {
    const char *current = setlocale (LC_CTYPE, NULL);
    char *saved = NULL;
    char *result = NULL;

    /* The string returned by setlocale() may be overwritten by the
       next setlocale() call, so keep a private copy for restoring.  */
    if (current)
      {
	saved = malloc (strlen (current) + 1);
	if (saved)
	  strcpy (saved, current);
      }

    /* Only switch locales when we are sure we can switch back.  */
    if (saved)
      {
	setlocale (LC_CTYPE, "");

	charset = nl_langinfo (CODESET);

	/* nl_langinfo()'s result may be invalidated by a later
	   setlocale() call, so copy it before restoring the locale.  */
	if (charset && *charset)
	  {
	    result = malloc (strlen (charset) + 1);
	    if (result)
	      strcpy (result, charset);
	  }

	setlocale (LC_CTYPE, saved);
	free (saved);
      }

    if (result)
      return result;
  }
# endif

  return "ASCII";
}
76
/* Result of the first charset lookup; NULL until that lookup has run.  */
static const char *stringprep_locale_charset_cache = NULL;

/**
 * stringprep_locale_charset:
 *
 * Find out the system locale charset.
 *
 * Note that this function returns what it believes the SYSTEM is using
 * as a locale, not the locale the program is currently in (as modified,
 * e.g., by a setlocale(LC_CTYPE, "ISO-8859-1")).  The reason is that
 * data read from argv[], stdin etc. comes from the system, and is more
 * likely to be encoded using the system locale than the program
 * locale.
 *
 * The environment variable CHARSET overrides the value returned.
 * Because the result is cached after the first call, CHARSET has to be
 * set before any stringprep function is called (even indirectly),
 * e.g., by setting it when invoking the application.
 *
 * Return value: Returns the character set used by the system locale.
 * It will never return NULL, but uses "ASCII" as a fallback.
 **/
const char *
stringprep_locale_charset (void)
{
  const char *charset = stringprep_locale_charset_cache;

  if (charset != NULL)
    return charset;

  /* First call: do the expensive lookup once and remember the answer.  */
  charset = stringprep_locale_charset_slow ();
  stringprep_locale_charset_cache = charset;

  return charset;
}
108
/* Normally supplied by config.h; fall back to "no qualifier" so the
   iconv() input cast below is always well-formed.  */
#ifndef ICONV_CONST
# define ICONV_CONST
#endif

/* Double the output buffer, keeping the write position and always
   leaving one byte spare for the terminating NUL.  Returns 1 on
   success; on failure returns 0 and leaves *dest valid and unchanged.  */
static int
stringprep_convert_grow (char **dest, char **outp,
			 size_t *outbuf_size, size_t *outbytes_remaining)
{
  size_t used = (size_t) (*outp - *dest);
  size_t new_size;
  char *tmp;

  if (*outbuf_size > ((size_t) -1) / 2)
    return 0;			/* doubling would overflow size_t */

  new_size = *outbuf_size * 2;
  tmp = realloc (*dest, new_size);
  if (tmp == NULL)
    return 0;

  *dest = tmp;
  *outp = tmp + used;
  *outbuf_size = new_size;
  *outbytes_remaining = new_size - used - 1;	/* -1 for nul */
  return 1;
}

/* Number of bytes a UTF-8 sequence starting with LEAD claims to span.
   Bytes that cannot start a sequence count as a single byte.  */
static size_t
stringprep_convert_utf8_seqlen (unsigned char lead)
{
  if ((lead & 0x80) == 0)
    return 1;
  if ((lead & 0x40) == 0)
    return 1;			/* error: not allowed to begin a sequence */
  if ((lead & 0x20) == 0)
    return 2;
  if ((lead & 0x10) == 0)
    return 3;
  if ((lead & 0x08) == 0)
    return 4;
  if ((lead & 0x04) == 0)
    return 5;
  if ((lead & 0x02) == 0)
    return 6;
  return 1;			/* error: 0xFE/0xFF not used by UTF-8 */
}

/**
 * stringprep_convert:
 * @str: input zero-terminated string.
 * @to_codeset: name of destination character set.
 * @from_codeset: name of origin character set, as used by @str.
 * @best_effort: if non-zero, input that cannot be converted is
 *   replaced by a '?' byte instead of making the conversion fail (and,
 *   when ICONV_TRANSLIT is defined, "//TRANSLIT" is appended to
 *   @to_codeset).
 *
 * Convert the string from one character set to another using the
 * system's iconv() function.  When both codeset names are identical
 * the string is simply duplicated.
 *
 * Return value: Returns newly allocated zero-terminated string which
 * is @str transcoded into to_codeset, to be released with free(), or
 * NULL on memory exhaustion, an unsupported conversion, or input that
 * could not be converted completely.
 **/
char *
stringprep_convert (const char *str,
		    const char *to_codeset, const char *from_codeset,
		    int best_effort)
{
  iconv_t cd;
  char *dest = NULL;
  char *outp;
  char *input = NULL;
  char *p;
  size_t len = strlen (str);
  size_t inbytes_remaining;
  size_t outbytes_remaining;
  size_t outbuf_size;
  int have_error = 0;
  int from_utf8;

  if (strcmp (to_codeset, from_codeset) == 0)
    {
      char *copy = malloc (len + 1);
      if (copy)
	memcpy (copy, str, len + 1);
      return copy;
    }

  from_utf8 = (strcmp (from_codeset, "UTF-8") == 0);

#ifdef ICONV_TRANSLIT
  if (best_effort)
    {
      static const char translit[] = "//TRANSLIT";
      size_t to_len = strlen (to_codeset);
      /* sizeof translit already accounts for the terminating NUL.  */
      char *to_c = malloc (to_len + sizeof translit);

      if (to_c == NULL)
	return NULL;
      memcpy (to_c, to_codeset, to_len);
      memcpy (to_c + to_len, translit, sizeof translit);
      cd = iconv_open (to_c, from_codeset);
      free (to_c);
    }
  else
#endif
    cd = iconv_open (to_codeset, from_codeset);

  if (cd == (iconv_t) - 1)
    return NULL;

  /* iconv() wants a non-const input pointer, so work on a copy.  */
  input = malloc (len + 1);
  outbuf_size = len + 1;	/* + 1 for nul */
  dest = malloc (outbuf_size);
  if (input == NULL || dest == NULL)
    {
      have_error = 1;
      goto done;
    }

  memcpy (input, str, len + 1);
  p = input;
  inbytes_remaining = len;
  outp = dest;
  outbytes_remaining = outbuf_size - 1;	/* -1 for nul */

  for (;;)
    {
      size_t rc = iconv (cd, (ICONV_CONST char **) &p, &inbytes_remaining,
			 &outp, &outbytes_remaining);
      int err;

      if (rc != (size_t) - 1)
	break;

      err = errno;		/* save before any allocation call */

      if (err == E2BIG)
	{
	  if (!stringprep_convert_grow (&dest, &outp, &outbuf_size,
					&outbytes_remaining))
	    {
	      have_error = 1;
	      break;
	    }
	  continue;
	}

      if (err == EILSEQ && best_effort)
	{
	  /* Skip the offending character and emit '?' in its place.
	     When the source is not UTF-8, one byte is skipped.  */
	  size_t skip = from_utf8
	    ? stringprep_convert_utf8_seqlen ((unsigned char) *p) : 1;

	  /* A full output buffer is not an error; make room first.  */
	  if (outbytes_remaining == 0
	      && !stringprep_convert_grow (&dest, &outp, &outbuf_size,
					   &outbytes_remaining))
	    {
	      have_error = 1;
	      break;
	    }

	  if (skip > inbytes_remaining)
	    skip = inbytes_remaining;
	  p += skip;
	  inbytes_remaining -= skip;
	  *outp++ = '?';
	  --outbytes_remaining;

	  if (inbytes_remaining > 0)
	    continue;
	  break;
	}

      /* EINVAL (incomplete input) is not flagged here; the
	 consumed-length check below reports it.  Everything else,
	 including EILSEQ without best_effort, is an error.  */
      if (err != EINVAL)
	have_error = 1;
      break;
    }

  *outp = '\0';

  /* Any input left unconsumed means the conversion was incomplete.  */
  if ((size_t) (p - input) != len)
    have_error = 1;

done:
  free (input);

  iconv_close (cd);

  if (have_error)
    {
      free (dest);
      dest = NULL;
    }

  return dest;
}
271
272#else /* HAVE_ICONV */
273
/* Fallback used when iconv is unavailable: no locale probing is done
   and the charset is always reported as "ASCII".  */
const char *
stringprep_locale_charset (void)
{
  return "ASCII";
}
279
/* Fallback used when iconv is unavailable: no conversion is possible,
   so a warning is printed to stderr and a plain copy of STR is
   returned.  BEST_EFFORT is ignored.  Returns NULL if the copy cannot
   be allocated.  */
char *
stringprep_convert (const char *str,
		    const char *to_codeset, const char *from_codeset,
		    int best_effort)
{
  size_t nbytes;
  char *copy;

  (void) best_effort;

  fprintf (stderr, "libidn: warning: libiconv not installed, cannot "
	   "convert data from %s to %s\n", from_codeset, to_codeset);

  nbytes = strlen (str) + 1;	/* include the terminating nul */
  copy = malloc (nbytes);
  if (copy != NULL)
    memcpy (copy, str, nbytes);

  return copy;
}
294
295#endif /* HAVE_ICONV */
296
/**
 * stringprep_locale_to_utf8:
 * @str: input zero terminated string.
 *
 * Convert a string encoded in the locale's character set, as reported
 * by stringprep_locale_charset(), into UTF-8 by passing it to
 * stringprep_convert() with best-effort conversion disabled.
 *
 * Return value: Returns whatever stringprep_convert() returns: a newly
 * allocated zero-terminated string which is @str transcoded into
 * UTF-8.
 **/
char *
stringprep_locale_to_utf8 (const char *str)
{
  const char *locale_charset = stringprep_locale_charset ();

  return stringprep_convert (str, "UTF-8", locale_charset, 0);
}
312
/**
 * stringprep_utf8_to_locale:
 * @str: input zero terminated string.
 *
 * Convert a string encoded in UTF-8 into the locale's character set,
 * as reported by stringprep_locale_charset(), by passing it to
 * stringprep_convert() with best-effort conversion disabled.
 *
 * Return value: Returns whatever stringprep_convert() returns: a newly
 * allocated zero-terminated string which is @str transcoded into the
 * locale's character set.
 **/
char *
stringprep_utf8_to_locale (const char *str)
{
  const char *locale_charset = stringprep_locale_charset ();

  return stringprep_convert (str, locale_charset, "UTF-8", 0);
}

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status