Groonga 3.0.9 Source Code Document
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
icudump.c
Go to the documentation of this file.
1 /* -*- c-basic-offset: 2 -*- */
2 /* Copyright(C) 2010 Brazil
3 
4  This library is free software; you can redistribute it and/or
5  modify it under the terms of the GNU Lesser General Public
6  License version 2.1 as published by the Free Software Foundation.
7 
8  This library is distributed in the hope that it will be useful,
9  but WITHOUT ANY WARRANTY; without even the implied warranty of
10  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  Lesser General Public License for more details.
12 
13  You should have received a copy of the GNU Lesser General Public
14  License along with this library; if not, write to the Free Software
15  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17 #include <stdio.h>
18 #include <getopt.h>
19 #include <unistd.h>
20 #include <string.h>
21 #include <unicode/utf.h>
22 #include <unicode/uchar.h>
23 #include <unicode/unorm.h>
24 #include <unicode/ustring.h>
25 
26 #define MAX_UNICODE 0x110000
27 #define BUF_SIZE 0x100
28 
/*
 * Encode the code point `i` into `buf` as a NUL-terminated UTF-8 style byte
 * sequence and return the number of bytes written (excluding the NUL).
 *
 * The legacy 5- and 6-byte forms are still produced for values up to
 * 0x7fffffff, so `buf` must have room for at least 7 bytes.
 *
 * Values of 0x80000000 and above cannot be represented: `buf` is set to the
 * empty string and 0 is returned (previously continuation bytes were emitted
 * without any lead byte).
 */
static int
ucs2utf(unsigned int i, unsigned char *buf)
{
  unsigned char *p = buf;
  int ntrail; /* number of continuation bytes that follow the lead byte */

  if (i < 0x80) {
    *p++ = (unsigned char)i;
    ntrail = 0;
  } else if (i < 0x800) {
    *p++ = (unsigned char)((i >> 6) | 0xc0);
    ntrail = 1;
  } else if (i < 0x00010000) {
    *p++ = (unsigned char)((i >> 12) | 0xe0);
    ntrail = 2;
  } else if (i < 0x00200000) {
    *p++ = (unsigned char)((i >> 18) | 0xf0);
    ntrail = 3;
  } else if (i < 0x04000000) {
    *p++ = (unsigned char)((i >> 24) | 0xf8);
    ntrail = 4;
  } else if (i < 0x80000000) {
    *p++ = (unsigned char)((i >> 30) | 0xfc);
    ntrail = 5;
  } else {
    /* No lead byte exists for this range; do not emit a partial sequence. */
    *buf = '\0';
    return 0;
  }

  /* Emit the remaining payload six bits at a time, most significant first. */
  while (ntrail-- > 0) {
    *p++ = (unsigned char)(((i >> (6 * ntrail)) & 0x3f) | 0x80);
  }
  *p = '\0';
  return (int)(p - buf);
}
62 
63 void
64 blockcode(void)
65 {
66  UChar32 ch;
67  unsigned char *p, src[7];
68  UBlockCode code, lc = -1;
69  for (ch = 1; ch < MAX_UNICODE; ch++) {
70  if (!U_IS_UNICODE_CHAR(ch)) { continue; }
71  code = ublock_getCode(ch);
72  if (code != lc) {
73  ucs2utf(ch, src);
74  for (p = src; *p; p++) {
75  printf("%x:", *p);
76  }
77  printf("\t%04x\t%d\n", ch, code);
78  }
79  lc = code;
80  }
81 }
82 
83 int
84 normalize(const char *str, char *res, UNormalizationMode mode)
85 {
86  UErrorCode rc;
87  int32_t ulen, nlen;
88  UChar ubuf[BUF_SIZE], nbuf[BUF_SIZE];
89  rc = U_ZERO_ERROR;
90  u_strFromUTF8(ubuf, BUF_SIZE, &ulen, str, -1, &rc);
91  if (rc != U_ZERO_ERROR /*&& rc != U_STRING_NOT_TERMINATED_WARNING*/) {
92  return -1;
93  }
94  rc = U_ZERO_ERROR;
95  nlen = unorm_normalize(ubuf, ulen, mode, 0, nbuf, BUF_SIZE, &rc);
96  if (rc != U_ZERO_ERROR /*&& rc != U_STRING_NOT_TERMINATED_WARNING*/) {
97  return -1;
98  }
99  rc = U_ZERO_ERROR;
100  u_strToUTF8(res, BUF_SIZE, NULL, nbuf, nlen, &rc);
101  if (rc != U_ZERO_ERROR /*&& rc != U_BUFFER_OVERFLOW_ERROR*/) {
102  return -1;
103  }
104  return 0;
105 }
106 
107 void
108 dump(UNormalizationMode mode)
109 {
110  UChar32 ch;
111  char str[7], norm[BUF_SIZE];
112  for (ch = 1; ch < MAX_UNICODE; ch++) {
113  if (!U_IS_UNICODE_CHAR(ch)) { continue; }
114  ucs2utf(ch, (unsigned char *)str);
115  if (normalize(str, norm, mode)) {
116  printf("ch=%04x error occure\n", ch);
117  continue;
118  }
119  if (strcmp(norm, str)) {
120  printf("%04x\t%s\t%s\n", ch, str, norm);
121  }
122  }
123 }
124 
125 void
126 ccdump(void)
127 {
128  UChar32 ch;
129  char str[7], nfd[BUF_SIZE], nfc[BUF_SIZE];
130  for (ch = 1; ch < MAX_UNICODE; ch++) {
131  if (!U_IS_UNICODE_CHAR(ch)) { continue; }
132  ucs2utf(ch, (unsigned char *)str);
133  if (normalize(str, nfd, UNORM_NFD)) {
134  printf("ch=%04x error occure\n", ch);
135  continue;
136  }
137  if (normalize(str, nfc, UNORM_NFC)) {
138  printf("ch=%04x error occure\n", ch);
139  continue;
140  }
141  if (strcmp(nfd, nfc)) {
142  printf("%04x\t%s\t%s\n", ch, nfd, nfc);
143  }
144  }
145 }
146 
/*
 * Character classes assigned by gcdump().  The values are used as indices
 * into the ctypes[] name table, so the order here must stay in step with
 * the order of the strings in that table.
 */
enum {
  ctype_null = 0,
  ctype_alpha,
  ctype_digit,
  ctype_symbol,
  ctype_hiragana,
  ctype_katakana,
  ctype_kanji,
  ctype_others
};
157 
/*
 * Printable names of the character classes, indexed by the ctype_* value
 * computed in gcdump().  Both the strings and the pointer array itself are
 * read-only.
 */
static const char *const ctypes[] = {
  "grn_str_null",
  "grn_str_alpha",
  "grn_str_digit",
  "grn_str_symbol",
  "grn_str_hiragana",
  "grn_str_katakana",
  "grn_str_kanji",
  "grn_str_others"
};
168 
169 void
170 gcdump(void)
171 {
172  UChar32 ch;
173  unsigned char *p, src[7];
174  int ctype, lc = -1;
175  for (ch = 1; ch < MAX_UNICODE; ch++) {
176  UCharCategory cat;
177  UBlockCode code;
178  if (!U_IS_UNICODE_CHAR(ch)) { continue; }
179  code = ublock_getCode(ch);
180  switch (code) {
181  case UBLOCK_CJK_RADICALS_SUPPLEMENT: /* cjk radicals */
182  case UBLOCK_KANGXI_RADICALS: /* kanji radicals */
183  case UBLOCK_BOPOMOFO: /* bopomofo letter */
184  case UBLOCK_HANGUL_COMPATIBILITY_JAMO: /* hangul letter */
185  case UBLOCK_KANBUN: /* kaeri ten used in kanbun ex. re-ten */
186  case UBLOCK_BOPOMOFO_EXTENDED: /* bopomofo extended letter */
187  case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A: /* cjk letter */
188  case UBLOCK_CJK_UNIFIED_IDEOGRAPHS: /* cjk letter */
189  case UBLOCK_YI_SYLLABLES: /* Yi syllables */
190  case UBLOCK_YI_RADICALS: /* Yi radicals */
191  case UBLOCK_HANGUL_SYLLABLES: /* hangul syllables */
192  case UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS: /* cjk letter */
193  case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B: /* cjk letter */
194  case UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT: /* cjk letter */
195  case UBLOCK_CJK_STROKES: /* kakijun*/
196  ctype = ctype_kanji;
197  break;
198  case UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION: /* symbols ex. JIS mark */
199  case UBLOCK_ENCLOSED_CJK_LETTERS_AND_MONTHS: /* ex. (kabu) */
200  case UBLOCK_CJK_COMPATIBILITY: /* symbols ex. ton doll */
201  case UBLOCK_CJK_COMPATIBILITY_FORMS: /* symbols ex. tategaki kagi-kakko */
202  ctype = ctype_symbol;
203  break;
204  case UBLOCK_HIRAGANA:
205  ctype = ctype_hiragana;
206  break;
207  case UBLOCK_KATAKANA:
208  case UBLOCK_KATAKANA_PHONETIC_EXTENSIONS:
209  ctype = ctype_katakana;
210  break;
211  default:
212  cat = u_charType(ch);
213  switch (cat) {
214  case U_UPPERCASE_LETTER:
215  case U_LOWERCASE_LETTER:
216  case U_TITLECASE_LETTER:
217  case U_MODIFIER_LETTER:
218  case U_OTHER_LETTER:
219  ctype = ctype_alpha;
220  break;
221  case U_DECIMAL_DIGIT_NUMBER:
222  case U_LETTER_NUMBER:
223  case U_OTHER_NUMBER:
224  ctype = ctype_digit;
225  break;
226  case U_DASH_PUNCTUATION:
227  case U_START_PUNCTUATION:
228  case U_END_PUNCTUATION:
229  case U_CONNECTOR_PUNCTUATION:
230  case U_OTHER_PUNCTUATION:
231  case U_MATH_SYMBOL:
232  case U_CURRENCY_SYMBOL:
233  case U_MODIFIER_SYMBOL:
234  case U_OTHER_SYMBOL:
235  ctype = ctype_symbol;
236  break;
237  default:
238  ctype = ctype_others;
239  break;
240  }
241  break;
242  }
243  if (ctype != lc) {
244  ucs2utf(ch, src);
245  for (p = src; *p; p++) {
246  printf("%x:", *p);
247  }
248  printf("\t%04x\t%s\n", ch, ctypes[ctype]);
249  }
250  lc = ctype;
251  }
252 }
253 
/*
 * Long options accepted by main().  Each entry maps to the short option
 * character returned by getopt_long(); none of them takes an argument.
 *
 * getopt_long() requires the array to end with an all-zero element, which
 * marks the end of the table when an unknown long option is looked up.
 */
struct option options[] = {
  {"bc",   no_argument, NULL, 'b'},
  {"nfd",  no_argument, NULL, 'd'},
  {"nfkd", no_argument, NULL, 'D'},
  {"nfc",  no_argument, NULL, 'c'},
  {"nfkc", no_argument, NULL, 'C'},
  {"cc",   no_argument, NULL, 'o'},
  {"gc",   no_argument, NULL, 'g'},
  {NULL,   0,           NULL, 0}
};
263 
264 int
265 main(int argc, char **argv)
266 {
267  switch (getopt_long(argc, argv, "bdDcCog", options, NULL)) {
268  case 'b' :
269  blockcode();
270  break;
271  case 'd' :
272  dump(UNORM_NFD);
273  break;
274  case 'D' :
275  dump(UNORM_NFKD);
276  break;
277  case 'c' :
278  dump(UNORM_NFC);
279  break;
280  case 'C' :
281  dump(UNORM_NFKC);
282  break;
283  case 'o' :
284  ccdump();
285  break;
286  case 'g' :
287  gcdump();
288  break;
289  default :
290  fputs("usage: icudump --[bc|nfd|nfkd|nfc|nfkc|cc|gc]\n", stderr);
291  break;
292  }
293  return 0;
294 }