21 #include <unicode/utf.h>
22 #include <unicode/uchar.h>
23 #include <unicode/unorm.h>
24 #include <unicode/ustring.h>
26 #define MAX_UNICODE 0x110000
27 #define BUF_SIZE 0x100
/*
 * ucs2utf: stores a UTF-8-style multi-byte encoding of the value `i` into
 * `buf`, advancing the write cursor `p` one byte per store.
 *
 * NOTE(review): this listing shows only part of the body. The return type,
 * most of the range tests that choose a lead byte, the closing braces and
 * the return statement are not visible here, so nothing is claimed about
 * them (including whether the output is NUL-terminated).
 */
30 ucs2utf(
unsigned int i,
unsigned char *buf)
32 unsigned char *p = buf;
/*
 * Lead bytes: the OR-ed constant sets the high marker bits
 * (0xc0 = 110xxxxx, 0xe0 = 1110xxxx, 0xf0 = 11110xxx, 0xf8 = 111110xx)
 * and the right shift moves the top payload bits of `i` into the low bits.
 * Which range of `i` selects each store is not visible in this excerpt.
 */
37 *p++ = (i >> 6) | 0xc0;
40 *p++ = (i >> 12) | 0xe0;
43 *p++ = (i >> 18) | 0xf0;
46 *p++ = (i >> 24) | 0xf8;
47 }
/*
 * Values below 2^31 that did not match the preceding (not shown) test get
 * the 0xfc lead byte (1111110x).
 * NOTE(review): 0xf8 and 0xfc lead bytes belong to the 5- and 6-byte forms
 * of the original UTF-8 definition, which RFC 3629 no longer allows;
 * confirm whether callers can ever pass values that large.
 */
else if (i < 0x80000000) {
48 *p++ = (i >> 30) | 0xfc;
/*
 * Continuation bytes: each store takes six bits of `i` (mask 0x3f) and
 * tags them with 0x80 (10xxxxxx), from the highest group down to the
 * lowest six bits.
 */
49 *p++ = ((i >> 24) & 0x3f) | 0x80;
51 *p++ = ((i >> 18) & 0x3f) | 0x80;
53 *p++ = ((i >> 12) & 0x3f) | 0x80;
55 *p++ = ((i >> 6) & 0x3f) | 0x80;
57 *p++ = (0x3f &
i) | 0x80;
/*
 * Fragment of a routine whose header is not visible in this listing.
 * For a value `ch` it looks up the Unicode block with ublock_getCode() and
 * prints `ch` in hexadecimal followed by the block code in decimal.
 *
 * NOTE(review): the `continue` implies an enclosing loop over `ch` that is
 * not shown; how `src` is filled and how `lc` is used are also not visible.
 */
/* Byte buffer of 7 elements plus a cursor used to walk it below. */
67 unsigned char *p, src[7];
/* `lc` starts at -1; presumably it remembers the previous block code -
 * TODO confirm against the missing lines. */
68 UBlockCode code, lc = -1;
/* Skip values that ICU's U_IS_UNICODE_CHAR macro rejects. */
70 if (!U_IS_UNICODE_CHAR(ch)) {
continue; }
71 code = ublock_getCode(ch);
/* Walks `src` byte by byte until a zero byte; the loop body is not shown. */
74 for (p = src; *p; p++) {
/* Output columns: tab, code point (at least 4 hex digits), tab, block code. */
77 printf(
"\t%04x\t%d\n", ch, code);
/*
 * normalize: applies the ICU normalization form `mode` to the UTF-8 string
 * `str` and writes the UTF-8 result to `res`.
 *
 * Visible steps: UTF-8 -> UTF-16 (u_strFromUTF8 into `ubuf`), normalization
 * (unorm_normalize into `nbuf`), UTF-16 -> UTF-8 (u_strToUTF8 into `res`).
 * Every call is given BUF_SIZE as the destination capacity, so `res` must
 * point to at least BUF_SIZE bytes.
 *
 * NOTE(review): the return type, the local declarations and the bodies of
 * the three error branches are not visible in this listing.
 * NOTE(review): `rc != U_ZERO_ERROR` is also true for ICU warning codes
 * (for example U_STRING_NOT_TERMINATED_WARNING); ICU's usual failure test
 * is U_FAILURE(rc) - confirm that treating warnings as errors is intended.
 * NOTE(review): unorm_normalize() is deprecated in current ICU releases in
 * favour of the unorm2.h API - confirm the minimum ICU version targeted.
 */
84 normalize(
const char *str,
char *res, UNormalizationMode mode)
/* -1 tells ICU that `str` is NUL-terminated; the UTF-16 length is
 * returned through `ulen`. */
90 u_strFromUTF8(ubuf,
BUF_SIZE, &ulen, str, -1, &rc);
91 if (rc != U_ZERO_ERROR ) {
/* The fourth argument (options) is 0; `nlen` receives the result length. */
95 nlen = unorm_normalize(ubuf, ulen, mode, 0, nbuf,
BUF_SIZE, &rc);
96 if (rc != U_ZERO_ERROR ) {
/* NULL as the third argument: the UTF-8 output length is not requested. */
100 u_strToUTF8(res,
BUF_SIZE, NULL, nbuf, nlen, &rc);
101 if (rc != U_ZERO_ERROR ) {
/*
 * Fragment of a routine whose header is not visible in this listing.
 * For a value `ch` it builds the UTF-8 text in `str` with ucs2utf() and
 * prints one line when the string `norm` differs from `str`.
 *
 * NOTE(review): the enclosing loop, the call that fills `norm` and the
 * condition guarding the error message are not shown; presumably `norm` is
 * produced by normalize() and the message reports its failure - confirm.
 */
/* Skip values that ICU's U_IS_UNICODE_CHAR macro rejects. */
113 if (!U_IS_UNICODE_CHAR(ch)) {
continue; }
114 ucs2utf(ch, (
unsigned char *)str);
/* NOTE(review): "occure" looks like a typo for "occurred"; left untouched
 * because the text is program output. */
116 printf(
"ch=%04x error occure\n", ch);
/* strcmp() is non-zero when the two strings differ. */
119 if (strcmp(norm, str)) {
/* Output columns: code point (hex), original text, normalized text. */
120 printf(
"%04x\t%s\t%s\n", ch, str, norm);
/*
 * Fragment of a routine whose header is not visible in this listing.
 * For a value `ch` it builds the UTF-8 text in `str` with ucs2utf() and
 * prints one line when the strings `nfd` and `nfc` differ.
 *
 * NOTE(review): the enclosing loop, the calls that fill `nfd` and `nfc`,
 * and the conditions guarding the two error messages are not shown;
 * presumably there is one message per normalization call - confirm.
 */
/* Skip values that ICU's U_IS_UNICODE_CHAR macro rejects. */
131 if (!U_IS_UNICODE_CHAR(ch)) {
continue; }
132 ucs2utf(ch, (
unsigned char *)str);
/* NOTE(review): "occure" looks like a typo for "occurred"; left untouched
 * because the text is program output. The same applies to the second
 * message below. */
134 printf(
"ch=%04x error occure\n", ch);
138 printf(
"ch=%04x error occure\n", ch);
/* strcmp() is non-zero when the two strings differ. */
141 if (strcmp(nfd, nfc)) {
/* Output columns: code point (hex), `nfd` text, `nfc` text. */
142 printf(
"%04x\t%s\t%s\n", ch, nfd, nfc);
/*
 * ctypes: file-local table of strings, indexed by `ctype` where the
 * classification result is printed (`ctypes[ctype]`).
 * NOTE(review): the table entries are not visible in this listing.
 */
158 static const char *ctypes[] = {
/*
 * Fragment of a routine whose header is not visible in this listing.
 * For a value `ch` it obtains the Unicode block (ublock_getCode) and the
 * general category (u_charType), and prints `ch` together with the string
 * `ctypes[ctype]`.
 *
 * NOTE(review): the enclosing loop, both `switch` heads and every statement
 * that assigns `ctype` are missing from this listing, so the mapping from
 * block/category to `ctype` cannot be stated here. The gaps in the embedded
 * line numbers suggest that each run of `case` labels shares one action.
 */
/* Byte buffer of 7 elements plus a cursor used to walk it below. */
173 unsigned char *p, src[7];
/* Skip values that ICU's U_IS_UNICODE_CHAR macro rejects. */
178 if (!U_IS_UNICODE_CHAR(ch)) {
continue; }
179 code = ublock_getCode(ch);
/* Case labels over UBlockCode values (presumably a switch on `code`). */
181 case UBLOCK_CJK_RADICALS_SUPPLEMENT:
182 case UBLOCK_KANGXI_RADICALS:
183 case UBLOCK_BOPOMOFO:
184 case UBLOCK_HANGUL_COMPATIBILITY_JAMO:
186 case UBLOCK_BOPOMOFO_EXTENDED:
187 case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
188 case UBLOCK_CJK_UNIFIED_IDEOGRAPHS:
189 case UBLOCK_YI_SYLLABLES:
190 case UBLOCK_YI_RADICALS:
191 case UBLOCK_HANGUL_SYLLABLES:
192 case UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS:
193 case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
194 case UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT:
195 case UBLOCK_CJK_STROKES:
198 case UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION:
199 case UBLOCK_ENCLOSED_CJK_LETTERS_AND_MONTHS:
200 case UBLOCK_CJK_COMPATIBILITY:
201 case UBLOCK_CJK_COMPATIBILITY_FORMS:
204 case UBLOCK_HIRAGANA:
207 case UBLOCK_KATAKANA:
208 case UBLOCK_KATAKANA_PHONETIC_EXTENSIONS:
/* General category lookup; the labels below are ICU category constants
 * (presumably a switch on `cat`). */
212 cat = u_charType(ch);
214 case U_UPPERCASE_LETTER:
215 case U_LOWERCASE_LETTER:
216 case U_TITLECASE_LETTER:
217 case U_MODIFIER_LETTER:
221 case U_DECIMAL_DIGIT_NUMBER:
222 case U_LETTER_NUMBER:
226 case U_DASH_PUNCTUATION:
227 case U_START_PUNCTUATION:
228 case U_END_PUNCTUATION:
229 case U_CONNECTOR_PUNCTUATION:
230 case U_OTHER_PUNCTUATION:
232 case U_CURRENCY_SYMBOL:
233 case U_MODIFIER_SYMBOL:
/* Walks `src` byte by byte until a zero byte; the loop body is not shown. */
245 for (p = src; *p; p++) {
/* Output columns: tab, code point (at least 4 hex digits), tab, type name. */
248 printf(
"\t%04x\t%s\n", ch, ctypes[ctype]);
/*
 * Long-option table entries, presumably the `options` array handed to
 * getopt_long() (the array declaration and any terminating entry are not
 * visible in this listing).
 *
 * With getopt_long's `struct option` field order {name, has_arg, flag, val},
 * each entry takes no argument (has_arg = 0) and, because `flag` is NULL,
 * makes getopt_long return the character in the last field:
 *   --bc -> 'b', --nfd -> 'd', --nfkd -> 'D', --nfc -> 'c',
 *   --nfkc -> 'C', --cc -> 'o', --gc -> 'g'
 */
255 {
"bc", 0, NULL,
'b'},
256 {
"nfd", 0, NULL,
'd'},
257 {
"nfkd", 0, NULL,
'D'},
258 {
"nfc", 0, NULL,
'c'},
259 {
"nfkc", 0, NULL,
'C'},
260 {
"cc", 0, NULL,
'o'},
261 {
"gc", 0, NULL,
'g'},
/*
 * Command-line dispatch fragment (presumably inside main, given `argc` and
 * `argv`; the function header and the `case` bodies are not visible).
 * The short-option string "bdDcCog" accepts the same seven letters that the
 * long options return, none of them taking an argument.
 * NOTE(review): only one getopt_long() call is visible; whether it sits in
 * a loop cannot be told from this listing.
 */
267 switch (getopt_long(argc, argv,
"bdDcCog",
options, NULL)) {
/* Usage text goes to stderr; the condition that reaches it is not shown. */
290 fputs(
"usage: icudump --[bc|nfd|nfkd|nfc|nfkc|cc|gc]\n", stderr);