MySQL 5.6.14 Source Code Document
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
uctypedump.c
1 /* Copyright (c) 2006, 2011, Oracle and/or its affiliates. All rights reserved.
2 
3  This program is free software; you can redistribute it and/or modify
4  it under the terms of the GNU General Public License as published by
5  the Free Software Foundation; version 2 of the License.
6 
7  This program is distributed in the hope that it will be useful,
8  but WITHOUT ANY WARRANTY; without even the implied warranty of
9  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10  GNU General Public License for more details.
11 
12  You should have received a copy of the GNU General Public License
13  along with this program; if not, write to the Free Software
14  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
15 
16 /*
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 */
21 #include <my_global.h>
22 #include <m_string.h>
23 #include <m_ctype.h>
24 
25 
/*
  One entry of the table that maps a Unicode general category
  abbreviation (e.g. "Lu") to a combination of MySQL ctype flags.
*/
typedef struct my_ctype_name_st
{
  const char *name;  /* Two-letter general category abbreviation */
  int val;           /* MySQL ctype flags for this category      */
} MY_CTYPE_NAME_ST;
31 
32 
33 static MY_CTYPE_NAME_ST my_ctype_name[]=
34 {
35  {"Lu", _MY_U}, /* Letter, Uppercase */
36  {"Ll", _MY_L}, /* Letter, Lowercase */
37  {"Lt", _MY_U}, /* Letter, Titlecase */
38  {"Lm", _MY_L}, /* Letter, Modifier */
39  {"Lo", _MY_L}, /* Letter, other */
40 
41  {"Nd", _MY_NMR}, /* Number, Decimal Digit */
42  {"Nl", _MY_NMR|_MY_U|_MY_L}, /* Number, Letter */
43  {"No", _MY_NMR|_MY_PNT}, /* Number, Other */
44 
45  {"Mn", _MY_L|_MY_PNT}, /* Mark, Nonspacing */
46  {"Mc", _MY_L|_MY_PNT}, /* Mark, Spacing Combining */
47  {"Me", _MY_L|_MY_PNT}, /* Mark, Enclosing */
48 
49  {"Pc", _MY_PNT}, /* Punctuation, Connector */
50  {"Pd", _MY_PNT}, /* Punctuation, Dash */
51  {"Ps", _MY_PNT}, /* Punctuation, Open */
52  {"Pe", _MY_PNT}, /* Punctuation, Close */
53  {"Pi", _MY_PNT}, /* Punctuation, Initial quote */
54  {"Pf", _MY_PNT}, /* Punctuation, Final quote */
55  {"Po", _MY_PNT}, /* Punctuation, Other */
56 
57  {"Sm", _MY_PNT}, /* Symbol, Math */
58  {"Sc", _MY_PNT}, /* Symbol, Currency */
59  {"Sk", _MY_PNT}, /* Symbol, Modifier */
60  {"So", _MY_PNT}, /* Symbol, Other */
61 
62  {"Zs", _MY_SPC}, /* Separator, Space */
63  {"Zl", _MY_SPC}, /* Separator, Line */
64  {"Zp", _MY_SPC}, /* Separator, Paragraph */
65 
66  {"Cc", _MY_CTR}, /* Other, Control */
67  {"Cf", _MY_CTR}, /* Other, Format */
68  {"Cs", _MY_CTR}, /* Other, Surrogate */
69  {"Co", _MY_CTR}, /* Other, Private Use */
70  {"Cn", _MY_CTR}, /* Other, Not Assigned */
71  {NULL, 0}
72 };
73 
74 
75 static int
76 ctypestr2num(const char *tok)
77 {
79  for (p= my_ctype_name; p->name; p++)
80  {
81  if (!strncasecmp(p->name, tok, 2))
82  return p->val;
83  }
84  return 0;
85 }
86 
87 
/* Largest code point handled; per-character tables hold MAX_CHAR + 1 slots */
#define MAX_CHAR                  0x10FFFF
/* Number of code points stored per character decomposition */
#define MAX_DECOMPOSITION_LENGTH  2
90 
91 
92 typedef struct
93 {
94  uint code;
95  char *name;
96  char general_category[3];
97  int combining_class;
98  int bidirectional_category;
99  uint decomposition_mapping[MAX_DECOMPOSITION_LENGTH];
100  uint decimal_digit_value; /* 0-9 */
101  uint digit_value; /* 0-9 */
102  char *numeric_value; /* Examples: 0, 1, 10, 100, 1000, 1/2, 5/2 */
103  my_bool mirrored; /* Y or N */
104  char *unicode_1_0_name;
105  char *iso10646_comment_field;
106  uint uppercase_mapping;
107  uint lowercase_mapping;
108  uint titlecase_mapping;
109 
110  int mysql_ctype; /* ctype in MySQL format */
111 
113 
114 
/*
  Run-time parameters of the dump utility, filled in from the
  command line.
*/
typedef struct
{
  int maxchar;          /* Upper limit for processed code points (--maxchar=) */
  int debug;            /* Value of --debug=                                   */
  int ctype;            /* Non-zero: dump ctype data (--ctype=)                */
  int decomp;           /* Non-zero: dump decomposition data (--decomp=)       */
  const char *fname;    /* Input file name, NULL means stdin                   */
  const char *varname;  /* Text inserted into generated names (--name=)        */
} MY_UNIDATA_PARAM;
124 
125 
126 
127 static void
128 unidata_param_init(MY_UNIDATA_PARAM *p)
129 {
130  p->maxchar= MAX_CHAR;
131  p->debug= 0;
132  p->ctype= 1;
133  p->decomp= 1;
134  p->fname= NULL;
135  p->varname= "";
136 }
137 
138 
139 static void
140 load_unidata(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *chr)
141 {
142  char str[1024];
143  FILE *f= prm->fname ? fopen(prm->fname, "r") : stdin;
144  if (!f)
145  {
146  fprintf(stderr, "Can't open file %s\n", prm->fname);
147  exit(1);
148  }
149 
150  while (fgets(str, sizeof(str), f))
151  {
152  size_t n;
153  char *s, *e;
154  MY_UNIDATA_CHAR ch;
155  memset(&ch, 0, sizeof(ch));
156 
157  for(n= 0, s= str; s; n++)
158  {
159  char *end, tok[1024]= "";
160 
161  if((e= strchr(s, ';')))
162  {
163  strncpy(tok, s, (unsigned int) (e - s));
164  tok[e - s]= 0;
165  }
166  else
167  {
168  strcpy(tok, s);
169  }
170 
171  end= tok + strlen(tok);
172 
173  switch(n)
174  {
175  case 0: ch.code= strtol(tok, &end, 16); break;
176  case 1: break; /* Character name */
177  case 2: /* General category */
178  ch.general_category[0]= tok[0];
179  ch.general_category[1]= tok[1];
180  ch.general_category[2]= '\0';
181  ch.mysql_ctype= ctypestr2num(tok);
182  break;
183 
184  case 3: /* Canonical Combining Class */
185  ch.combining_class= atoi(tok);
186  /*
187  if (ch.combining_class)
188  printf("YYY[%04X]=%d\n", ch.code, ch.combining_class);
189  */
190  break;
191  case 4: break; /* Bidirectional Category */
192  case 5: /* Character Decomposition Mapping */
193  if (*tok != '<')
194  {
195  size_t i;
196  char *dec, *endptr;
197  for (dec= strtok_r(tok, " \t", &endptr), i= 0;
198  dec;
199  dec= strtok_r(NULL, " \t", &endptr), i++)
200  {
201  if (i >= MAX_DECOMPOSITION_LENGTH)
202  {
203  fprintf(stderr, "Decomposition length is too long for character %04X\n", ch.code);
204  exit(1);
205  }
206  ch.decomposition_mapping[i]= strtol(dec, NULL, 16);
207  }
208  }
209  break;
210 
211  case 6: /* Decimal digit value */
212  ch.decimal_digit_value= atoi(tok);
213  break;
214 
215  case 7: /* Digit value */
216  ch.digit_value= atoi(tok);
217  break;
218 
219  case 8: /* Numeric value */
220  break;
221 
222  case 9: break; /* Mirrored */
223  case 10: break; /* Unicode 1.0 Name */
224  case 11: break; /* 10646 comment field */
225  case 12: break; /* Uppercase */
226  case 13: break; /* Lowecase */
227  case 14: break; /* Titlecase */
228  }
229  s= e ? e + 1 : e;
230  }
231  if(ch.code <= prm->maxchar)
232  chr[ch.code]= ch;
233  }
234 }
235 
236 
237 static void
238 unidata_char_set_cjk(MY_UNIDATA_CHAR *unidata, int max_char, int cur_char)
239 {
240  if (cur_char < max_char)
241  {
242  MY_UNIDATA_CHAR *ch= &unidata[cur_char];
243  ch->mysql_ctype= _MY_L | _MY_U;
244  strcpy(ch->general_category, "Lo");
245  }
246 }
247 
248 
249 static void
250 fill_implicit_ctype(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata)
251 {
252  int i;
253  /* Fill digits */
254  for (i= '0'; i <= '9'; i++)
255  unidata[i].mysql_ctype= _MY_NMR;
256  /* Fill hex digits */
257  for (i= 'a'; i <= 'z'; i++)
258  unidata[i].mysql_ctype|= _MY_X;
259  for (i= 'A'; i <= 'Z'; i++)
260  unidata[i].mysql_ctype|= _MY_X;
261 
262  /* Fill ideographs */
263  /* CJK Ideographs Extension A (U+3400 - U+4DB5) */
264  for(i= 0x3400; i <= 0x4DB5; i++)
265  unidata_char_set_cjk(unidata, prm->maxchar, i);
266 
267  /* CJK Ideographs (U+4E00 - U+9FA5) */
268  for(i= 0x4E00; i <= 0x9FA5; i++) /* 9FCB in 5.2.0 */
269  unidata_char_set_cjk(unidata, prm->maxchar, i);
270 
271  /* Hangul Syllables (U+AC00 - U+D7A3) */
272  for(i= 0xAC00; i <= 0xD7A3; i++)
273  unidata_char_set_cjk(unidata, prm->maxchar, i);
274 
275  /*
276  20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
277  2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
278  */
279  for (i= 0x20000; i <= 0x2A6D6; i++)
280  unidata_char_set_cjk(unidata, prm->maxchar, i);
281 
282  /*
283  2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
284  2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
285  */
286  for (i= 0x2A700; i <= 0x2B734; i++)
287  unidata_char_set_cjk(unidata, prm->maxchar, i);
288 
289 
290  /*
291  TODO:
292  D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
293  DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
294  DB80;<Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
295  DBFF;<Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
296  DC00;<Low Surrogate, First>;Cs;0;L;;;;;N;;;;;
297  DFFF;<Low Surrogate, Last>;Cs;0;L;;;;;N;;;;;
298 
299  E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
300  F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
301  F0000;<Plane 15 Private Use, First>;Co;0;L;;;;;N;;;;;
302  FFFFD;<Plane 15 Private Use, Last>;Co;0;L;;;;;N;;;;;
303  100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
304  10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;0
305  */
306 }
307 
308 
309 /*
310  Check if ctype for the entire page consisting of "nchars"
311  characters is the same.
312  Return -1 otherwise.
313 */
314 static int
315 page_ctype(MY_UNIDATA_CHAR *data, size_t nchars)
316 {
317  size_t i;
318  for (i= 1; i < nchars; i++)
319  {
320  if (data[i].mysql_ctype != data->mysql_ctype)
321  return -1;
322  }
323  return data->mysql_ctype;
324 }
325 
326 
327 static void
328 dump_ctype(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata)
329 {
330  int page, max_page= (prm->maxchar + 255) / 256;
331 
332  printf("/*\n");
333  printf(" Unicode ctype data\n");
334  printf(" Generated from %s\n", prm->fname ? prm->fname : "stdin");
335  printf("*/\n");
336 
337  /* Dump planes with mixed ctype */
338  for(page= 0; page < max_page; page++)
339  {
340  if (page_ctype(unidata + page * 256, 256) < 0)
341  {
342  size_t charnum, num;
343  printf("static unsigned char uctype%s_page%02X[256]=\n{\n",
344  prm->varname, page);
345  for(num= 0, charnum=0; charnum < 256; charnum++)
346  {
347  printf(" %2d%s", unidata[page * 256 + charnum].mysql_ctype,
348  charnum < 255 ? "," : "");
349  if(++num == 16)
350  {
351  printf("\n");
352  num= 0;
353  }
354  }
355  printf("};\n\n");
356  }
357  }
358 
359  /* Dump ctype page index */
360  printf("MY_UNI_CTYPE my_uni_ctype%s[%d]={\n", prm->varname, max_page);
361  for(page= 0; page < max_page; page++)
362  {
363  char page_name[128]="NULL";
364  int ctype;
365  if ((ctype= page_ctype(unidata + page * 256, 256)) < 0)
366  {
367  sprintf(page_name,"uctype%s_page%02X", prm->varname, page);
368  ctype= 0;
369  }
370  printf("\t{%d,%s}%s\n", ctype, page_name, page < max_page - 1 ? "," : "");
371  }
372  printf("};\n\n\n");
373 }
374 
375 
376 /*
377 static int
378 decomposition_length(MY_UNIDATA_CHAR *ch)
379 {
380  if (ch->decomposition_mapping[1])
381  return 2;
382  if (ch->decomposition_mapping[0])
383  return 1;
384  return 0;
385 }
386 */
387 
388 static void
389 dump_decomposition_page(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata,
390  uint pageno, uint nchars)
391 {
392  uint i, ofs= pageno * 256;
393  printf("static MY_UNI_DECOMPOSITION decomp%s_p%02X[256]= {\n",
394  prm->varname, pageno);
395  for (i= 0; i < nchars; i++)
396  {
397  MY_UNIDATA_CHAR *ch= &unidata[ofs + i];
398 
399  printf("/* %04X */ {0x%04X,0x%04X},",
400  ofs + i, ch->decomposition_mapping[0], ch->decomposition_mapping[1]);
401 
402  if (ch->decomposition_mapping[0])
403  printf(" %s/* [%s-%s][%d-%d] */",
404  ch->decomposition_mapping[0] < 0x10000 ? " " : "",
405  unidata[ch->decomposition_mapping[0]].general_category,
406  unidata[ch->decomposition_mapping[1]].general_category,
407  unidata[ch->decomposition_mapping[0]].combining_class,
408  unidata[ch->decomposition_mapping[1]].combining_class);
409  printf("\n");
410  }
411  printf("};\n\n\n");
412 }
413 
414 
415 static size_t
416 calc_decompositions(MY_UNIDATA_CHAR *unidata, size_t nchars)
417 {
418  size_t i, n;
419  for (n= i= 0; i < nchars; i++)
420  {
421  if (unidata[i].decomposition_mapping[0])
422  n++;
423  }
424  return n;
425 }
426 
427 
428 static void
429 dump_decomposition(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata)
430 {
431  int i, npages= (prm->maxchar + 255) / 256;
432 
433  printf("/*\n");
434  printf(" Unicode canonical decomposition data\n");
435  printf(" Generated from %s\n", prm->fname ? prm->fname : "stdin");
436  printf("*/\n");
437 
438  /* Dump pages */
439  for (i= 0; i < npages; i++)
440  {
441  MY_UNIDATA_CHAR *page= unidata + i * 256;
442  if (calc_decompositions(page, 256))
443  dump_decomposition_page(prm, unidata, i, 256);
444  }
445 
446  /* Dump decompositions */
447  printf("static MY_UNI_DECOMPOSITION *my_uni_decomp%s[%d]=\n{\n",
448  prm->varname, npages);
449  for (i= 0; i < npages; i++)
450  {
451  MY_UNIDATA_CHAR *page= unidata + i * 256;
452  if (calc_decompositions(page, 256))
453  printf("decom%s_p%02X,", prm->varname, i);
454  else
455  printf("NULL,");
456  if ((i % 8) == 7)
457  printf("\n");
458  }
459  printf("};\n");
460 }
461 
462 
/*
  Print a short description of the command line to "f" and terminate
  the program.

  @param f   Stream to print to
  @param rc  Exit status of the program
*/
static void
usage(FILE *f, int rc)
{
  fprintf(f,
          "Usage: uctypedump [OPTIONS] [FILE]\n"
          "Options:\n"
          "  --name=STR    text inserted into the generated variable names\n"
          "  --maxchar=N   upper limit for processed code points,\n"
          "                decimal or hexadecimal with the 0x prefix\n"
          "  --ctype=N     dump ctype data if N is not 0\n"
          "  --decomp=N    dump decomposition data if N is not 0\n"
          "  --debug=N     debug value\n"
          "If FILE is not given, data is read from the standard input.\n");
  exit(rc);
}
468 
469 
/*
  Parse an integer command line option of the form "<name><value>".

  The value is decimal, or hexadecimal when it starts with "0x" or "0X".
  An empty value, trailing non-numeric characters or a number that does
  not fit into int are reported as an error, after which usage() is
  called with exit status 1.

  @param str   Command line argument, e.g. "--maxchar=0xFFFF"
  @param name  Option prefix including '=', e.g. "--maxchar="
  @param num   OUT: the parsed value; untouched if "str" is another option
  @return      1 if "str" starts with "name", 0 otherwise
*/
static int
get_int_option(const char *str, const char *name, int *num)
{
  size_t namelen= strlen(name);
  const char *val;
  char *end;
  long parsed;
  int base= 10;

  if (strncmp(str, name, namelen))
    return 0;

  val= str + namelen;
  if (val[0] == '0' && (val[1] == 'x' || val[1] == 'X'))
    base= 16;                      /* strtol() skips the prefix itself */

  parsed= strtol(val, &end, base);
  if (end == val || *end != '\0' || (long) (int) parsed != parsed)
  {
    fprintf(stderr, "\nBad numeric option value: %s\n\n", str);
    usage(stderr, 1);
  }
  *num= (int) parsed;
  return 1;
}
494 
495 
/*
  Parse a string command line option of the form "<name><value>".

  @param str   Command line argument, e.g. "--name=_520"
  @param name  Option prefix including '=', e.g. "--name="
  @param val   OUT: points into "str" right after the prefix;
               untouched if "str" does not start with "name"
  @return      1 if "str" starts with "name", 0 otherwise
*/
static int
get_const_str_option(const char *str, const char *name, const char **val)
{
  const size_t prefix_length= strlen(name);

  if (strncmp(str, name, prefix_length) != 0)
    return 0;

  *val= &str[prefix_length];
  return 1;
}
507 
508 
509 static void
510 process_options(MY_UNIDATA_PARAM *prm, int ac, char **av)
511 {
512  int i;
513  unidata_param_init(prm);
514  for (i= 1; i < ac ; i++)
515  {
516  /* printf("[%d]=%s\n", i, av[i]); */
517  if (av[i][0] != '-' || av[i][1] != '-')
518  break;
519  if (!get_const_str_option(av[i], "--name=", &prm->varname) &&
520  !get_int_option(av[i], "--maxchar=", &prm->maxchar) &&
521  !get_int_option(av[i], "--ctype=", &prm->ctype) &&
522  !get_int_option(av[i], "--decomp=", &prm->decomp) &&
523  !get_int_option(av[i], "--debug=", &prm->debug))
524  {
525  fprintf(stderr, "\nUnknown option: %s\n\n", av[i]);
526  usage(stderr, 1);
527  }
528  }
529  prm->fname= av[i];
530 }
531 
532 
533 int main(int ac, char ** av)
534 {
535  MY_UNIDATA_PARAM prm;
536  static MY_UNIDATA_CHAR unidata[MAX_CHAR + 1];
537 
538  process_options(&prm, ac, av);
539  memset(unidata, 0, sizeof(unidata));
540  fill_implicit_ctype(&prm, unidata);
541  load_unidata(&prm, unidata);
542 
543  if (prm.ctype)
544  dump_ctype(&prm, unidata);
545 
546  if (prm.decomp)
547  dump_decomposition(&prm, unidata);
548 
549  return 0;
550 }