monotone

monotone Mtn Source Tree

Root/idna/nfkc.c

1/* nfkc.c Unicode normalization utilities.
2 * Copyright (C) 2002, 2003 Simon Josefsson
3 *
4 * This file is part of GNU Libidn.
5 *
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 */
21
22#if HAVE_CONFIG_H
23# include "config.h"
24#endif
25
26#include <stdlib.h>
27#include <string.h>
28
29#include "idna/stringprep.h"
30
31/* This file contains functions from GLIB, including gutf8.c and
32 * gunidecomp.c, all licensed under LGPL and copyright held by:
33 *
34 * Copyright (C) 1999, 2000 Tom Tromey
35 * Copyright 2000 Red Hat, Inc.
36 */
37
/* Hacks to make syncing with GLIB code easier. */

/* GLib type names mapped onto plain C / <stdint.h> / POSIX types.  These
 * are macros, not typedefs, so they are replaced textually. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t

/* GLib allocator names mapped directly onto malloc()/free(); callers
 * must check the result for NULL themselves. */
#define g_malloc malloc
#define g_free free

/* Error reporting is compiled out: g_set_error() expands to nothing, so
 * none of its arguments are evaluated. */
#define GError void
#define g_set_error(a,b,c,d) /* */

/* Allocate an uninitialized array of n_structs elements of struct_type.
 * The size multiplication is not checked for overflow. */
#define g_new(struct_type, n_structs) \
 ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))

/* Statement wrappers used by g_return_val_if_fail() below; the form is
 * chosen per compiler. */
# if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
# define G_STMT_START (void)(
# define G_STMT_END )
# else
# if (defined (sun) || defined (__sun__))
# define G_STMT_START if (1)
# define G_STMT_END else (void)0
# else
# define G_STMT_START do
# define G_STMT_END while (0)
# endif
# endif

/* Precondition checks are disabled: this expands to a no-op statement.
 * expr is never evaluated and val is never returned. */
#define g_return_val_if_fail(expr,val) G_STMT_START{ (void)0; }G_STMT_END

/* Number of elements in a true array (not valid for pointers). */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
#define TRUE 1
#define FALSE 0
73
/* Code from GLIB gunicode.h starts here. */

/* Normalization form selector.  Each form has two spellings with the
 * same value: the GLib "DEFAULT/ALL[_COMPOSE]" name and the Unicode
 * "NFD/NFC/NFKD/NFKC" name. */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_NFD = 0,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_NFC = 1,
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_NFKD = 2,
  G_NORMALIZE_ALL_COMPOSE = 3,
  G_NORMALIZE_NFKC = 3
} GNormalizeMode;
88
/* Code from GLIB gutf8.c starts here. */

/*
 * UTF8_COMPUTE:
 * @Char: the first byte of a UTF-8 sequence, as an unsigned value.
 * @Mask: lvalue receiving the mask that extracts the payload bits of
 *        the first byte.
 * @Len: lvalue receiving the sequence length in bytes (1..6), or -1 if
 *       @Char cannot start a sequence (a continuation byte, 0xfe or
 *       0xff).  @Mask is left untouched in that case.
 *
 * Expands to a single statement, so it is safe in an unbraced
 * if/else.  @Char is evaluated several times.
 */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)
124
/* Number of bytes (1..6) needed to encode code point Char: one byte,
 * plus one more for every encoding-length threshold Char reaches.
 * Char is evaluated several times. */
#define UTF8_LENGTH(Char) \
  (1 + ((Char) >= 0x80) \
     + ((Char) >= 0x800) \
     + ((Char) >= 0x10000) \
     + ((Char) >= 0x200000) \
     + ((Char) >= 0x4000000))
131
132
/*
 * UTF8_GET:
 * @Result: lvalue receiving the decoded code point, or -1 if one of the
 *          trailing bytes is not of the form 10xxxxxx.
 * @Chars: pointer to the first byte of the sequence.
 * @Count: scratch integer lvalue used as the byte index.
 * @Mask: payload mask for the first byte (as set by UTF8_COMPUTE).
 * @Len: sequence length in bytes (as set by UTF8_COMPUTE).
 *
 * Expands to a single statement, so it is safe in an unbraced
 * if/else.  Decoding stops at the first bad trailing byte, so no byte
 * after it is read.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)
145
/* Non-zero unless Char is beyond U+10FFFF, a surrogate
 * (U+D800..U+DFFF), in U+FDD0..U+FDEF, or has 0xFFFE/0xFFFF as its low
 * 16 bits.  Char is evaluated several times. */
#define UNICODE_VALID(Char) \
  (!((Char) >= 0x110000 || \
     (((Char) & 0xFFFFF800) == 0xD800) || \
     ((Char) >= 0xFDD0 && (Char) <= 0xFDEF) || \
     ((Char) & 0xFFFE) == 0xFFFE))
151
152
153static const gchar utf8_skip_data[256] = {
154 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
155 1, 1, 1, 1, 1, 1, 1,
156 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
157 1, 1, 1, 1, 1, 1, 1,
158 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
159 1, 1, 1, 1, 1, 1, 1,
160 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
161 1, 1, 1, 1, 1, 1, 1,
162 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
163 1, 1, 1, 1, 1, 1, 1,
164 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
165 1, 1, 1, 1, 1, 1, 1,
166 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
167 2, 2, 2, 2, 2, 2, 2,
168 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
169 5, 5, 5, 6, 6, 1, 1
170};
171
172const gchar *const g_utf8_skip = utf8_skip_data;
173
174#define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
175
176/*
177 * g_utf8_strlen:
178 * @p: pointer to the start of a UTF-8 encoded string.
179 * @max: the maximum number of bytes to examine. If @max
180 * is less than 0, then the string is assumed to be
181 * nul-terminated. If @max is 0, @p will not be examined and
182 * may be %NULL.
183 *
184 * Returns the length of the string in characters.
185 *
186 * Return value: the length of the string in characters
187 **/
188glong
189g_utf8_strlen (const gchar * p, gssize max)
190{
191 glong len = 0;
192 const gchar *start = p;
193 g_return_val_if_fail (p != NULL || max == 0, 0);
194
195 if (max < 0)
196 {
197 while (*p)
198 {
199 p = g_utf8_next_char (p);
200 ++len;
201 }
202 }
203 else
204 {
205 if (max == 0 || !*p)
206 return 0;
207
208 p = g_utf8_next_char (p);
209
210 while (p - start < max && *p)
211 {
212 ++len;
213 p = g_utf8_next_char (p);
214 }
215
216 /* only do the last len increment if we got a complete
217 * char (don't count partial chars)
218 */
219 if (p - start == max)
220 ++len;
221 }
222
223 return len;
224}
225
226/*
227 * g_utf8_get_char:
228 * @p: a pointer to Unicode character encoded as UTF-8
229 *
230 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
231 * If @p does not point to a valid UTF-8 encoded character, results are
232 * undefined. If you are not sure that the bytes are complete
233 * valid Unicode characters, you should use g_utf8_get_char_validated()
234 * instead.
235 *
236 * Return value: the resulting character
237 **/
238static gunichar
239g_utf8_get_char (const gchar * p)
240{
241 int i, mask = 0, len;
242 gunichar result;
243 unsigned char c = (unsigned char) *p;
244
245 UTF8_COMPUTE (c, mask, len);
246 if (len == -1)
247 return (gunichar) - 1;
248 UTF8_GET (result, p, i, mask, len);
249
250 return result;
251}
252
253/*
254 * g_unichar_to_utf8:
255 * @c: a ISO10646 character code
256 * @outbuf: output buffer, must have at least 6 bytes of space.
257 * If %NULL, the length will be computed and returned
258 * and nothing will be written to @outbuf.
259 *
260 * Converts a single character to UTF-8.
261 *
262 * Return value: number of bytes written
263 **/
264static int
265g_unichar_to_utf8 (gunichar c, gchar * outbuf)
266{
267 guint len = 0;
268 int first;
269 int i;
270
271 if (c < 0x80)
272 {
273 first = 0;
274 len = 1;
275 }
276 else if (c < 0x800)
277 {
278 first = 0xc0;
279 len = 2;
280 }
281 else if (c < 0x10000)
282 {
283 first = 0xe0;
284 len = 3;
285 }
286 else if (c < 0x200000)
287 {
288 first = 0xf0;
289 len = 4;
290 }
291 else if (c < 0x4000000)
292 {
293 first = 0xf8;
294 len = 5;
295 }
296 else
297 {
298 first = 0xfc;
299 len = 6;
300 }
301
302 if (outbuf)
303 {
304 for (i = len - 1; i > 0; --i)
305 {
306 outbuf[i] = (c & 0x3f) | 0x80;
307 c >>= 6;
308 }
309 outbuf[0] = c | first;
310 }
311
312 return len;
313}
314
315/*
316 * g_utf8_to_ucs4_fast:
317 * @str: a UTF-8 encoded string
318 * @len: the maximum length of @str to use. If @len < 0, then
319 * the string is nul-terminated.
320 * @items_written: location to store the number of characters in the
321 * result, or %NULL.
322 *
323 * Convert a string from UTF-8 to a 32-bit fixed width
324 * representation as UCS-4, assuming valid UTF-8 input.
325 * This function is roughly twice as fast as g_utf8_to_ucs4()
326 * but does no error checking on the input.
327 *
328 * Return value: a pointer to a newly allocated UCS-4 string.
329 * This value must be freed with g_free().
330 **/
331static gunichar *
332g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
333{
334 gint j, charlen;
335 gunichar *result;
336 gint n_chars, i;
337 const gchar *p;
338
339 g_return_val_if_fail (str != NULL, NULL);
340
341 p = str;
342 n_chars = 0;
343 if (len < 0)
344 {
345 while (*p)
346 {
347 p = g_utf8_next_char (p);
348 ++n_chars;
349 }
350 }
351 else
352 {
353 while (p < str + len && *p)
354 {
355 p = g_utf8_next_char (p);
356 ++n_chars;
357 }
358 }
359
360 result = g_new (gunichar, n_chars + 1);
361 if (!result)
362 return NULL;
363
364 p = str;
365 for (i = 0; i < n_chars; i++)
366 {
367 gunichar wc = ((unsigned char *) p)[0];
368
369 if (wc < 0x80)
370 {
371 result[i] = wc;
372 p++;
373 }
374 else
375 {
376 if (wc < 0xe0)
377 {
378 charlen = 2;
379 wc &= 0x1f;
380 }
381 else if (wc < 0xf0)
382 {
383 charlen = 3;
384 wc &= 0x0f;
385 }
386 else if (wc < 0xf8)
387 {
388 charlen = 4;
389 wc &= 0x07;
390 }
391 else if (wc < 0xfc)
392 {
393 charlen = 5;
394 wc &= 0x03;
395 }
396 else
397 {
398 charlen = 6;
399 wc &= 0x01;
400 }
401
402 for (j = 1; j < charlen; j++)
403 {
404 wc <<= 6;
405 wc |= ((unsigned char *) p)[j] & 0x3f;
406 }
407
408 result[i] = wc;
409 p += charlen;
410 }
411 }
412 result[i] = 0;
413
414 if (items_written)
415 *items_written = i;
416
417 return result;
418}
419
420/*
421 * g_ucs4_to_utf8:
422 * @str: a UCS-4 encoded string
423 * @len: the maximum length of @str to use. If @len < 0, then
424 * the string is terminated with a 0 character.
425 * @items_read: location to store number of characters read read, or %NULL.
426 * @items_written: location to store number of bytes written or %NULL.
427 * The value here stored does not include the trailing 0
428 * byte.
429 * @error: location to store the error occuring, or %NULL to ignore
430 * errors. Any of the errors in #GConvertError other than
431 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
432 *
433 * Convert a string from a 32-bit fixed width representation as UCS-4.
434 * to UTF-8. The result will be terminated with a 0 byte.
435 *
436 * Return value: a pointer to a newly allocated UTF-8 string.
437 * This value must be freed with g_free(). If an
438 * error occurs, %NULL will be returned and
439 * @error set.
440 **/
441static gchar *
442g_ucs4_to_utf8 (const gunichar * str,
443 glong len,
444 glong * items_read, glong * items_written, GError ** error)
445{
446 gint result_length;
447 gchar *result = NULL;
448 gchar *p;
449 gint i;
450
451 result_length = 0;
452 for (i = 0; len < 0 || i < len; i++)
453 {
454 if (!str[i])
455 break;
456
457 if (str[i] >= 0x80000000)
458 {
459 if (items_read)
460 *items_read = i;
461
462 g_set_error (error, G_CONVERT_ERROR,
463 G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
464 _("Character out of range for UTF-8"));
465 goto err_out;
466 }
467
468 result_length += UTF8_LENGTH (str[i]);
469 }
470
471 result = g_malloc (result_length + 1);
472 if (!result)
473 return NULL;
474 p = result;
475
476 i = 0;
477 while (p < result + result_length)
478 p += g_unichar_to_utf8 (str[i++], p);
479
480 *p = '\0';
481
482 if (items_written)
483 *items_written = p - result;
484
485err_out:
486 if (items_read)
487 *items_read = i;
488
489 return result;
490}
491
492/* Code from GLIB gunidecomp.c starts here. */
493
494#include "idna/gunidecomp.h"
495#include "idna/gunicomp.h"
496
/* Combining class of the character at offset Char within 256-entry page
 * Page of the first table.  A page entry >= G_UNICODE_MAX_TABLE_INDEX
 * stands for a whole page sharing one class (the entry minus
 * G_UNICODE_MAX_TABLE_INDEX); any other entry is a row index into
 * cclass_data. */
#define CC_PART1(Page, Char) \
 ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
 ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
 : (cclass_data[combining_class_table_part1[Page]][Char]))

/* Same lookup as CC_PART1, using the second page table. */
#define CC_PART2(Page, Char) \
 ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
 ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
 : (cclass_data[combining_class_table_part2[Page]][Char]))

/* Canonical combining class of code point Char.  Code points up to
 * G_UNICODE_LAST_CHAR_PART1 use the first table; code points from
 * 0xe0000 up to G_UNICODE_LAST_CHAR use the second table, with pages
 * counted from 0xe0000; everything else yields 0.  Char is evaluated
 * several times. */
#define COMBINING_CLASS(Char) \
 (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
 ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
 : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
 ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
 : 0))
513
/* constants for hangul syllable [de]composition */
/* Base code points: SBase for precomposed syllables, LBase for leading
 * consonants, VBase for vowels.  TBase is the code point just below the
 * first trailing consonant, so a trailing index of 0 means "no trailing
 * consonant" (see its use in the [de]composition functions). */
#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7
/* Number of leading, vowel and trailing indices; TCount includes the
 * "no trailing consonant" index 0. */
#define LCount 19
#define VCount 21
#define TCount 28
/* Syllables per leading consonant (588) and in total (11172). */
#define NCount (VCount * TCount)
#define SCount (LCount * NCount)
524
525/*
526 * g_unicode_canonical_ordering:
527 * @string: a UCS-4 encoded string.
528 * @len: the maximum length of @string to use.
529 *
530 * Computes the canonical ordering of a string in-place.
531 * This rearranges decomposed characters in the string
532 * according to their combining classes. See the Unicode
533 * manual for more information.
534 **/
535static void
536g_unicode_canonical_ordering (gunichar * string, gsize len)
537{
538 gsize i;
539 int swap = 1;
540
541 while (swap)
542 {
543 int last;
544 swap = 0;
545 last = COMBINING_CLASS (string[0]);
546 for (i = 0; i < len - 1; ++i)
547 {
548 int next = COMBINING_CLASS (string[i + 1]);
549 if (next != 0 && last > next)
550 {
551 gsize j;
552 /* Percolate item leftward through string. */
553 for (j = i + 1; j > 0; --j)
554 {
555 gunichar t;
556 if (COMBINING_CLASS (string[j - 1]) <= next)
557 break;
558 t = string[j];
559 string[j] = string[j - 1];
560 string[j - 1] = t;
561 swap = 1;
562 }
563 /* We're re-entering the loop looking at the old
564 character again. */
565 next = last;
566 }
567 last = next;
568 }
569 }
570}
571
572/* http://www.unicode.org/unicode/reports/tr15/#Hangul
573 * r should be null or have sufficient space. Calling with r == NULL will
574 * only calculate the result_len; however, a buffer with space for three
575 * characters will always be big enough. */
576static void
577decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
578{
579 gint SIndex = s - SBase;
580
581 /* not a hangul syllable */
582 if (SIndex < 0 || SIndex >= SCount)
583 {
584 if (r)
585 r[0] = s;
586 *result_len = 1;
587 }
588 else
589 {
590 gunichar L = LBase + SIndex / NCount;
591 gunichar V = VBase + (SIndex % NCount) / TCount;
592 gunichar T = TBase + SIndex % TCount;
593
594 if (r)
595 {
596 r[0] = L;
597 r[1] = V;
598 }
599
600 if (T != TBase)
601 {
602 if (r)
603 r[2] = T;
604 *result_len = 3;
605 }
606 else
607 *result_len = 2;
608 }
609}
610
611/* returns a pointer to a null-terminated UTF-8 string */
612static const gchar *
613find_decomposition (gunichar ch, gboolean compat)
614{
615 int start = 0;
616 int end = G_N_ELEMENTS (decomp_table);
617
618 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
619 {
620 while (TRUE)
621 {
622 int half = (start + end) / 2;
623 if (ch == decomp_table[half].ch)
624 {
625 int offset;
626
627 if (compat)
628 {
629 offset = decomp_table[half].compat_offset;
630 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
631 offset = decomp_table[half].canon_offset;
632 }
633 else
634 {
635 offset = decomp_table[half].canon_offset;
636 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
637 return NULL;
638 }
639
640 return &(decomp_expansion_string[offset]);
641 }
642 else if (half == start)
643 break;
644 else if (ch > decomp_table[half].ch)
645 start = half;
646 else
647 end = half;
648 }
649 }
650
651 return NULL;
652}
653
654/* L,V => LV and LV,T => LVT */
655static gboolean
656combine_hangul (gunichar a, gunichar b, gunichar * result)
657{
658 gint LIndex = a - LBase;
659 gint SIndex = a - SBase;
660
661 gint VIndex = b - VBase;
662 gint TIndex = b - TBase;
663
664 if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
665 {
666 *result = SBase + (LIndex * VCount + VIndex) * TCount;
667 return TRUE;
668 }
669 else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
670 && 0 <= TIndex && TIndex <= TCount)
671 {
672 *result = a + TIndex;
673 return TRUE;
674 }
675
676 return FALSE;
677}
678
/* Composition index of the character at offset Char within 256-entry
 * page Page.  A page entry >= G_UNICODE_MAX_TABLE_INDEX stands for a
 * whole page sharing one index (the entry minus
 * G_UNICODE_MAX_TABLE_INDEX); any other entry is a row index into
 * compose_data. */
#define CI(Page, Char) \
 ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
 ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
 : (compose_data[compose_table[Page]][Char]))

/* Composition index of code point Char, or 0 when its page lies beyond
 * COMPOSE_TABLE_LAST.  Char is evaluated several times. */
#define COMPOSE_INDEX(Char) \
 ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
686
687static gboolean
688combine (gunichar a, gunichar b, gunichar * result)
689{
690 gushort index_a, index_b;
691
692 if (combine_hangul (a, b, result))
693 return TRUE;
694
695 index_a = COMPOSE_INDEX (a);
696
697 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
698 {
699 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
700 {
701 *result =
702 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
703 return TRUE;
704 }
705 else
706 return FALSE;
707 }
708
709 index_b = COMPOSE_INDEX (b);
710
711 if (index_b >= COMPOSE_SECOND_SINGLE_START)
712 {
713 if (a ==
714 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
715 {
716 *result =
717 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
718 return TRUE;
719 }
720 else
721 return FALSE;
722 }
723
724 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
725 && index_b >= COMPOSE_SECOND_START
726 && index_b < COMPOSE_SECOND_SINGLE_START)
727 {
728 gunichar res =
729 compose_array[index_a - COMPOSE_FIRST_START][index_b -
730 COMPOSE_SECOND_START];
731
732 if (res)
733 {
734 *result = res;
735 return TRUE;
736 }
737 }
738
739 return FALSE;
740}
741
742static gunichar *
743_g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
744{
745 gsize n_wc;
746 gunichar *wc_buffer;
747 const char *p;
748 gsize last_start;
749 gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
750 gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
751
752 n_wc = 0;
753 p = str;
754 while ((max_len < 0 || p < str + max_len) && *p)
755 {
756 const gchar *decomp;
757 gunichar wc = g_utf8_get_char (p);
758
759 if (wc >= 0xac00 && wc <= 0xd7af)
760 {
761 gsize result_len;
762 decompose_hangul (wc, NULL, &result_len);
763 n_wc += result_len;
764 }
765 else
766 {
767 decomp = find_decomposition (wc, do_compat);
768
769 if (decomp)
770 n_wc += g_utf8_strlen (decomp, -1);
771 else
772 n_wc++;
773 }
774
775 p = g_utf8_next_char (p);
776 }
777
778 wc_buffer = g_new (gunichar, n_wc + 1);
779 if (!wc_buffer)
780 return NULL;
781
782 last_start = 0;
783 n_wc = 0;
784 p = str;
785 while ((max_len < 0 || p < str + max_len) && *p)
786 {
787 gunichar wc = g_utf8_get_char (p);
788 const gchar *decomp;
789 int cc;
790 gsize old_n_wc = n_wc;
791
792 if (wc >= 0xac00 && wc <= 0xd7af)
793 {
794 gsize result_len;
795 decompose_hangul (wc, wc_buffer + n_wc, &result_len);
796 n_wc += result_len;
797 }
798 else
799 {
800 decomp = find_decomposition (wc, do_compat);
801
802 if (decomp)
803 {
804 const char *pd;
805 for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
806 wc_buffer[n_wc++] = g_utf8_get_char (pd);
807 }
808 else
809 wc_buffer[n_wc++] = wc;
810 }
811
812 if (n_wc > 0)
813 {
814 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
815
816 if (cc == 0)
817 {
818 g_unicode_canonical_ordering (wc_buffer + last_start,
819 n_wc - last_start);
820 last_start = old_n_wc;
821 }
822 }
823
824 p = g_utf8_next_char (p);
825 }
826
827 if (n_wc > 0)
828 {
829 g_unicode_canonical_ordering (wc_buffer + last_start,
830 n_wc - last_start);
831 last_start = n_wc;
832 }
833
834 wc_buffer[n_wc] = 0;
835
836 /* All decomposed and reordered */
837
838 if (do_compose && n_wc > 0)
839 {
840 gsize i, j;
841 int last_cc = 0;
842 last_start = 0;
843
844 for (i = 0; i < n_wc; i++)
845 {
846 int cc = COMBINING_CLASS (wc_buffer[i]);
847
848 if (i > 0 &&
849 (last_cc == 0 || last_cc != cc) &&
850 combine (wc_buffer[last_start], wc_buffer[i],
851 &wc_buffer[last_start]))
852 {
853 for (j = i + 1; j < n_wc; j++)
854 wc_buffer[j - 1] = wc_buffer[j];
855 n_wc--;
856 i--;
857
858 if (i == last_start)
859 last_cc = 0;
860 else
861 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
862
863 continue;
864 }
865
866 if (cc == 0)
867 last_start = i;
868
869 last_cc = cc;
870 }
871 }
872
873 wc_buffer[n_wc] = 0;
874
875 return wc_buffer;
876}
877
878/*
879 * g_utf8_normalize:
880 * @str: a UTF-8 encoded string.
881 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
882 * @mode: the type of normalization to perform.
883 *
884 * Converts a string into canonical form, standardizing
885 * such issues as whether a character with an accent
886 * is represented as a base character and combining
887 * accent or as a single precomposed character. You
888 * should generally call g_utf8_normalize() before
889 * comparing two Unicode strings.
890 *
891 * The normalization mode %G_NORMALIZE_DEFAULT only
892 * standardizes differences that do not affect the
893 * text content, such as the above-mentioned accent
894 * representation. %G_NORMALIZE_ALL also standardizes
895 * the "compatibility" characters in Unicode, such
896 * as SUPERSCRIPT THREE to the standard forms
897 * (in this case DIGIT THREE). Formatting information
898 * may be lost but for most text operations such
899 * characters should be considered the same.
900 * For example, g_utf8_collate() normalizes
901 * with %G_NORMALIZE_ALL as its first step.
902 *
903 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
904 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
905 * but returned a result with composed forms rather
906 * than a maximally decomposed form. This is often
907 * useful if you intend to convert the string to
908 * a legacy encoding or pass it to a system with
909 * less capable Unicode handling.
910 *
911 * Return value: a newly allocated string, that is the
912 * normalized form of @str.
913 **/
914static gchar *
915g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
916{
917 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
918 gchar *result;
919
920 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
921 g_free (result_wc);
922
923 return result;
924}
925
926/* Public Libidn API starts here. */
927
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Decodes the UTF-8 byte sequence starting at @p into a single Unicode
 * character.  If @p does not point to a valid UTF-8 encoded character,
 * results are undefined.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  const uint32_t ucs4 = g_utf8_get_char (p);

  return ucs4;
}
943
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *          If %NULL, only the length is computed and nothing is
 *          written to @outbuf.
 *
 * Encodes a single character as UTF-8.
 *
 * Return value: number of bytes written.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  const int nbytes = g_unichar_to_utf8 (c, outbuf);

  return nbytes;
}
960
961/**
962 * stringprep_utf8_to_ucs4:
963 * @str: a UTF-8 encoded string
964 * @len: the maximum length of @str to use. If @len < 0, then
965 * the string is nul-terminated.
966 * @items_written: location to store the number of characters in the
967 * result, or %NULL.
968 *
969 * Convert a string from UTF-8 to a 32-bit fixed width
970 * representation as UCS-4, assuming valid UTF-8 input.
971 * This function does no error checking on the input.
972 *
973 * Return value: a pointer to a newly allocated UCS-4 string.
974 * This value must be freed with free().
975 **/
976uint32_t *
977stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
978{
979 return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
980}
981
982/**
983 * stringprep_ucs4_to_utf8:
984 * @str: a UCS-4 encoded string
985 * @len: the maximum length of @str to use. If @len < 0, then
986 * the string is terminated with a 0 character.
987 * @items_read: location to store number of characters read read, or %NULL.
988 * @items_written: location to store number of bytes written or %NULL.
989 * The value here stored does not include the trailing 0
990 * byte.
991 *
992 * Convert a string from a 32-bit fixed width representation as UCS-4.
993 * to UTF-8. The result will be terminated with a 0 byte.
994 *
995 * Return value: a pointer to a newly allocated UTF-8 string.
996 * This value must be freed with free(). If an
997 * error occurs, %NULL will be returned and
998 * @error set.
999 **/
1000char *
1001stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1002 size_t * items_read, size_t * items_written)
1003{
1004 return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1005 (glong *) items_written, NULL);
1006}
1007
1008/**
1009 * stringprep_utf8_nfkc_normalize:
1010 * @str: a UTF-8 encoded string.
1011 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1012 *
1013 * Converts a string into canonical form, standardizing
1014 * such issues as whether a character with an accent
1015 * is represented as a base character and combining
1016 * accent or as a single precomposed character.
1017 *
1018 * The normalization mode is NFKC (ALL COMPOSE). It standardizes
1019 * differences that do not affect the text content, such as the
1020 * above-mentioned accent representation. It standardizes the
1021 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1022 * the standard forms (in this case DIGIT THREE). Formatting
1023 * information may be lost but for most text operations such
1024 * characters should be considered the same. It returns a result with
1025 * composed forms rather than a maximally decomposed form.
1026 *
1027 * Return value: a newly allocated string, that is the
1028 * NFKC normalized form of @str.
1029 **/
1030char *
1031stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1032{
1033 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1034}
1035
1036/**
1037 * stringprep_ucs4_nfkc_normalize:
1038 * @str: a Unicode string.
1039 * @len: length of @str array, or -1 if @str is nul-terminated.
1040 *
1041 * Converts UCS4 string into UTF-8 and runs
1042 * stringprep_utf8_nfkc_normalize().
1043 *
1044 * Return value: a newly allocated Unicode string, that is the NFKC
1045 * normalized form of @str.
1046 **/
1047uint32_t *
1048stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len)
1049{
1050 char *p;
1051 uint32_t *result_wc;
1052
1053 p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1054 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1055 free (p);
1056
1057 return result_wc;
1058}

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status