MySQL 5.6.14 Source Code Document
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ctype.c
1 /* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
2 
3  This program is free software; you can redistribute it and/or modify
4  it under the terms of the GNU General Public License as published by
5  the Free Software Foundation; version 2 of the License.
6 
7  This program is distributed in the hope that it will be useful,
8  but WITHOUT ANY WARRANTY; without even the implied warranty of
9  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10  GNU General Public License for more details.
11 
12  You should have received a copy of the GNU General Public License
13  along with this program; if not, write to the Free Software
14  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
15 
16 #include <my_global.h>
17 #include <m_ctype.h>
18 #include <my_xml.h>
19 #ifndef SCO
20 #include <m_string.h>
21 #endif
22 
23 
24 /*
25 
26  This file implements routines which parse XML based
27  character set and collation description files.
28 
29  Unicode collations are encoded according to
30 
31  Unicode Technical Standard #35
32  Locale Data Markup Language (LDML)
33  http://www.unicode.org/reports/tr35/
34 
35  and converted into ICU string according to
36 
37  Collation Customization
38  http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
39 
40 */
41 
42 
/*
  Avoid using my_snprintf
  We cannot use my_snprintf() here, because ctype.o is
  used to build conf_to_src, which must require minimum
  dependency.

  The macro below turns any accidental my_snprintf(...) call in this
  file into a call on a string literal, which does not compile.
*/

#undef my_snprintf
#define my_snprintf "We cannot use my_snprintf in this file"
52 
53 
/*
  Global function pointer (int argument, int result), initially NULL.
  Nothing in this file assigns or calls it; presumably it is installed
  and used by other translation units - TODO confirm.
*/
int (*my_string_stack_guard)(int) = NULL;
55 
/*
  Copy the smaller of l1 and l2 bytes from src into str and
  NUL-terminate the result.

  @param str  Destination; needs room for min(l1, l2) + 1 bytes
  @param src  Source bytes (need not be NUL-terminated)
  @param l1   Number of bytes available in src
  @param l2   Maximum number of bytes to copy

  @return str
*/
static char *mstr(char *str,const char *src,size_t l1,size_t l2)
{
  size_t ncopy= l2;

  if (l1 < l2)
    ncopy= l1;
  memcpy(str, src, ncopy);
  str[ncopy]= '\0';
  return str;
}
63 
/*
  One row of the XML path lookup table: "str" is a slash-separated
  element path and "state" is the parser state code assigned to it.
*/
struct my_cs_file_section_st
{
  int state;        /* one of the _CS_* codes */
  const char *str;  /* element path; NULL marks the end of the table */
};
69 
/*
  Parser state codes. Every known XML element path is mapped to one of
  these values; 0 is never used as a code because the handlers treat 0
  as "unknown tag".

  The *_DIFF1 .. *_IDENTICAL runs must stay consecutive: the value
  handler indexes format tables with (state - first code of the run).
*/
enum
{
  _CS_MISC= 1,
  _CS_ID= 2,
  _CS_CSNAME= 3,
  _CS_FAMILY= 4,
  _CS_ORDER= 5,
  _CS_COLNAME= 6,
  _CS_FLAG= 7,
  _CS_CHARSET= 8,
  _CS_COLLATION= 9,
  _CS_UPPERMAP= 10,
  _CS_LOWERMAP= 11,
  _CS_UNIMAP= 12,
  _CS_COLLMAP= 13,
  _CS_CTYPEMAP= 14,
  _CS_PRIMARY_ID= 15,
  _CS_BINARY_ID= 16,
  _CS_CSDESCRIPT= 17,

  /* Special purpose commands */
  _CS_UCA_VERSION= 100,
  _CS_CL_SUPPRESS_CONTRACTIONS= 101,
  _CS_CL_OPTIMIZE= 102,
  _CS_CL_SHIFT_AFTER_METHOD= 103,

  /* Collation Settings */
  _CS_ST_SETTINGS= 200,
  _CS_ST_STRENGTH= 201,
  _CS_ST_ALTERNATE= 202,
  _CS_ST_BACKWARDS= 203,
  _CS_ST_NORMALIZATION= 204,
  _CS_ST_CASE_LEVEL= 205,
  _CS_ST_CASE_FIRST= 206,
  _CS_ST_HIRAGANA_QUATERNARY= 207,
  _CS_ST_NUMERIC= 208,
  _CS_ST_VARIABLE_TOP= 209,
  _CS_ST_MATCH_BOUNDARIES= 210,
  _CS_ST_MATCH_STYLE= 211,

  /* Rules */
  _CS_RULES= 300,
  _CS_RESET= 301,
  _CS_DIFF1= 302,
  _CS_DIFF2= 303,
  _CS_DIFF3= 304,
  _CS_DIFF4= 305,
  _CS_IDENTICAL= 306,

  /* Rules: Expansions */
  _CS_EXP_X= 320,
  _CS_EXP_EXTEND= 321,
  _CS_EXP_DIFF1= 322,
  _CS_EXP_DIFF2= 323,
  _CS_EXP_DIFF3= 324,
  _CS_EXP_DIFF4= 325,
  _CS_EXP_IDENTICAL= 326,

  /* Rules: Abbreviating Ordering Specifications */
  _CS_A_DIFF1= 351,
  _CS_A_DIFF2= 352,
  _CS_A_DIFF3= 353,
  _CS_A_DIFF4= 354,
  _CS_A_IDENTICAL= 355,

  /* Rules: previous context */
  _CS_CONTEXT= 370,

  /* Rules: Placing Characters Before Others */
  _CS_RESET_BEFORE= 380,

  /* Rules: Logical Reset Positions */
  _CS_RESET_FIRST_PRIMARY_IGNORABLE= 401,
  _CS_RESET_LAST_PRIMARY_IGNORABLE= 402,
  _CS_RESET_FIRST_SECONDARY_IGNORABLE= 403,
  _CS_RESET_LAST_SECONDARY_IGNORABLE= 404,
  _CS_RESET_FIRST_TERTIARY_IGNORABLE= 405,
  _CS_RESET_LAST_TERTIARY_IGNORABLE= 406,
  _CS_RESET_FIRST_TRAILING= 407,
  _CS_RESET_LAST_TRAILING= 408,
  _CS_RESET_FIRST_VARIABLE= 409,
  _CS_RESET_LAST_VARIABLE= 410,
  _CS_RESET_FIRST_NON_IGNORABLE= 411,
  _CS_RESET_LAST_NON_IGNORABLE= 412
};
155 
156 
157 
158 static struct my_cs_file_section_st sec[] =
159 {
160  {_CS_MISC, "xml"},
161  {_CS_MISC, "xml/version"},
162  {_CS_MISC, "xml/encoding"},
163  {_CS_MISC, "charsets"},
164  {_CS_MISC, "charsets/max-id"},
165  {_CS_MISC, "charsets/copyright"},
166  {_CS_MISC, "charsets/description"},
167  {_CS_CHARSET, "charsets/charset"},
168  {_CS_PRIMARY_ID, "charsets/charset/primary-id"},
169  {_CS_BINARY_ID, "charsets/charset/binary-id"},
170  {_CS_CSNAME, "charsets/charset/name"},
171  {_CS_FAMILY, "charsets/charset/family"},
172  {_CS_CSDESCRIPT, "charsets/charset/description"},
173  {_CS_MISC, "charsets/charset/alias"},
174  {_CS_MISC, "charsets/charset/ctype"},
175  {_CS_CTYPEMAP, "charsets/charset/ctype/map"},
176  {_CS_MISC, "charsets/charset/upper"},
177  {_CS_UPPERMAP, "charsets/charset/upper/map"},
178  {_CS_MISC, "charsets/charset/lower"},
179  {_CS_LOWERMAP, "charsets/charset/lower/map"},
180  {_CS_MISC, "charsets/charset/unicode"},
181  {_CS_UNIMAP, "charsets/charset/unicode/map"},
182  {_CS_COLLATION, "charsets/charset/collation"},
183  {_CS_COLNAME, "charsets/charset/collation/name"},
184  {_CS_ID, "charsets/charset/collation/id"},
185  {_CS_ORDER, "charsets/charset/collation/order"},
186  {_CS_FLAG, "charsets/charset/collation/flag"},
187  {_CS_COLLMAP, "charsets/charset/collation/map"},
188 
189  /* Special purpose commands */
190  {_CS_UCA_VERSION, "charsets/charset/collation/version"},
191  {_CS_CL_SUPPRESS_CONTRACTIONS, "charsets/charset/collation/suppress_contractions"},
192  {_CS_CL_OPTIMIZE, "charsets/charset/collation/optimize"},
193  {_CS_CL_SHIFT_AFTER_METHOD, "charsets/charset/collation/shift-after-method"},
194 
195  /* Collation Settings */
196  {_CS_ST_SETTINGS, "charsets/charset/collation/settings"},
197  {_CS_ST_STRENGTH, "charsets/charset/collation/settings/strength"},
198  {_CS_ST_ALTERNATE, "charsets/charset/collation/settings/alternate"},
199  {_CS_ST_BACKWARDS, "charsets/charset/collation/settings/backwards"},
200  {_CS_ST_NORMALIZATION, "charsets/charset/collation/settings/normalization"},
201  {_CS_ST_CASE_LEVEL, "charsets/charset/collation/settings/caseLevel"},
202  {_CS_ST_CASE_FIRST, "charsets/charset/collation/settings/caseFirst"},
203  {_CS_ST_HIRAGANA_QUATERNARY, "charsets/charset/collation/settings/hiraganaQuaternary"},
204  {_CS_ST_NUMERIC, "charsets/charset/collation/settings/numeric"},
205  {_CS_ST_VARIABLE_TOP, "charsets/charset/collation/settings/variableTop"},
206  {_CS_ST_MATCH_BOUNDARIES, "charsets/charset/collation/settings/match-boundaries"},
207  {_CS_ST_MATCH_STYLE, "charsets/charset/collation/settings/match-style"},
208 
209  /* Rules */
210  {_CS_RULES, "charsets/charset/collation/rules"},
211  {_CS_RESET, "charsets/charset/collation/rules/reset"},
212  {_CS_DIFF1, "charsets/charset/collation/rules/p"},
213  {_CS_DIFF2, "charsets/charset/collation/rules/s"},
214  {_CS_DIFF3, "charsets/charset/collation/rules/t"},
215  {_CS_DIFF4, "charsets/charset/collation/rules/q"},
216  {_CS_IDENTICAL, "charsets/charset/collation/rules/i"},
217 
218  /* Rules: expansions */
219  {_CS_EXP_X, "charsets/charset/collation/rules/x"},
220  {_CS_EXP_EXTEND, "charsets/charset/collation/rules/x/extend"},
221  {_CS_EXP_DIFF1, "charsets/charset/collation/rules/x/p"},
222  {_CS_EXP_DIFF2, "charsets/charset/collation/rules/x/s"},
223  {_CS_EXP_DIFF3, "charsets/charset/collation/rules/x/t"},
224  {_CS_EXP_DIFF4, "charsets/charset/collation/rules/x/q"},
225  {_CS_EXP_IDENTICAL, "charsets/charset/collation/rules/x/i"},
226 
227  /* Rules: previous context */
228  {_CS_CONTEXT, "charsets/charset/collation/rules/x/context"},
229 
230  /* Rules: Abbreviating Ordering Specifications */
231  {_CS_A_DIFF1, "charsets/charset/collation/rules/pc"},
232  {_CS_A_DIFF2, "charsets/charset/collation/rules/sc"},
233  {_CS_A_DIFF3, "charsets/charset/collation/rules/tc"},
234  {_CS_A_DIFF4, "charsets/charset/collation/rules/qc"},
235  {_CS_A_IDENTICAL, "charsets/charset/collation/rules/ic"},
236 
237  /* Rules: Placing Characters Before Others*/
238  {_CS_RESET_BEFORE, "charsets/charset/collation/rules/reset/before"},
239 
240  /* Rules: Logical Reset Positions */
241  {_CS_RESET_FIRST_NON_IGNORABLE, "charsets/charset/collation/rules/reset/first_non_ignorable"},
242  {_CS_RESET_LAST_NON_IGNORABLE, "charsets/charset/collation/rules/reset/last_non_ignorable"},
243  {_CS_RESET_FIRST_PRIMARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_primary_ignorable"},
244  {_CS_RESET_LAST_PRIMARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_primary_ignorable"},
245  {_CS_RESET_FIRST_SECONDARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_secondary_ignorable"},
246  {_CS_RESET_LAST_SECONDARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_secondary_ignorable"},
247  {_CS_RESET_FIRST_TERTIARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_tertiary_ignorable"},
248  {_CS_RESET_LAST_TERTIARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_tertiary_ignorable"},
249  {_CS_RESET_FIRST_TRAILING, "charsets/charset/collation/rules/reset/first_trailing"},
250  {_CS_RESET_LAST_TRAILING, "charsets/charset/collation/rules/reset/last_trailing"},
251  {_CS_RESET_FIRST_VARIABLE, "charsets/charset/collation/rules/reset/first_variable"},
252  {_CS_RESET_LAST_VARIABLE, "charsets/charset/collation/rules/reset/last_variable"},
253 
254  {0, NULL}
255 };
256 
257 static struct my_cs_file_section_st * cs_file_sec(const char *attr, size_t len)
258 {
259  struct my_cs_file_section_st *s;
260  for (s= sec; s->str; s++)
261  {
262  if (!strncmp(attr, s->str, len) && s->str[len] == 0)
263  return s;
264  }
265  return NULL;
266 }
267 
/*
  Buffer sizes used while parsing a charset file.
  MY_CS_TAILORING_SIZE is parenthesized so that it expands safely inside
  larger expressions (e.g. x / MY_CS_TAILORING_SIZE).
*/
#define MY_CS_CSDESCR_SIZE 64
#define MY_CS_TAILORING_SIZE (32*1024)
#define MY_CS_UCA_VERSION_SIZE 64
#define MY_CS_CONTEXT_SIZE 64
272 
273 typedef struct my_cs_file_info
274 {
275  char csname[MY_CS_NAME_SIZE];
276  char name[MY_CS_NAME_SIZE];
277  uchar ctype[MY_CS_CTYPE_TABLE_SIZE];
278  uchar to_lower[MY_CS_TO_LOWER_TABLE_SIZE];
279  uchar to_upper[MY_CS_TO_UPPER_TABLE_SIZE];
280  uchar sort_order[MY_CS_SORT_ORDER_TABLE_SIZE];
281  uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE];
282  char comment[MY_CS_CSDESCR_SIZE];
283  char *tailoring;
284  size_t tailoring_length;
285  size_t tailoring_alloced_length;
286  char context[MY_CS_CONTEXT_SIZE];
287  CHARSET_INFO cs;
288  MY_CHARSET_LOADER *loader;
290 
291 
292 static void
293 my_charset_file_reset_charset(MY_CHARSET_FILE *i)
294 {
295  memset(&i->cs, 0, sizeof(i->cs));
296 }
297 
298 
299 static void
300 my_charset_file_reset_collation(MY_CHARSET_FILE *i)
301 {
302  i->tailoring_length= 0;
303  i->context[0]= '\0';
304 }
305 
306 
307 static void
308 my_charset_file_init(MY_CHARSET_FILE *i)
309 {
310  my_charset_file_reset_charset(i);
311  my_charset_file_reset_collation(i);
312  i->tailoring= NULL;
313  i->tailoring_alloced_length= 0;
314 }
315 
316 
317 static void
318 my_charset_file_free(MY_CHARSET_FILE *i)
319 {
320  i->loader->free(i->tailoring);
321 }
322 
323 
324 static int
325 my_charset_file_tailoring_realloc(MY_CHARSET_FILE *i, size_t newlen)
326 {
327  if (i->tailoring_alloced_length > newlen ||
328  (i->tailoring= i->loader->realloc(i->tailoring,
329  (i->tailoring_alloced_length=
330  (newlen + 32*1024)))))
331  {
332  return MY_XML_OK;
333  }
334  return MY_XML_ERROR;
335 }
336 
337 
/*
  Parse whitespace-separated hexadecimal numbers from str[0..len) into
  the byte table a[0..size).

  Parsing stops at the end of input or as soon as "size" values have
  been stored; a[size] is never written.

  @param a     Destination table
  @param size  Number of elements in a
  @param str   Text to parse
  @param len   Length of str in bytes

  @return Always 0
*/
static int fill_uchar(uchar *a,uint size,const char *str, size_t len)
{
  uint i= 0;
  const char *s, *b, *e=str+len;

  for (s=str ; s < e ; i++)
  {
    /* Skip separators; strchr() also matches '\0', so NUL separates too */
    for ( ; (s < e) && strchr(" \t\r\n",s[0]); s++) ;
    b=s;
    /* Advance to the end of the token */
    for ( ; (s < e) && !strchr(" \t\r\n",s[0]); s++) ;
    /* Stop on an empty token (end of input) or when the table is full */
    if (s == b || i >= size)
      break;
    a[i]= (uchar) strtoul(b,NULL,16);
  }
  return 0;
}
354 
/*
  Parse whitespace-separated hexadecimal numbers from str[0..len) into
  the 16-bit table a[0..size).

  Parsing stops at the end of input or as soon as "size" values have
  been stored; a[size] is never written.

  @param a     Destination table
  @param size  Number of elements in a
  @param str   Text to parse
  @param len   Length of str in bytes

  @return Always 0
*/
static int fill_uint16(uint16 *a,uint size,const char *str, size_t len)
{
  uint i= 0;

  const char *s, *b, *e=str+len;
  for (s=str ; s < e ; i++)
  {
    /* Skip separators; strchr() also matches '\0', so NUL separates too */
    for ( ; (s < e) && strchr(" \t\r\n",s[0]); s++) ;
    b=s;
    /* Advance to the end of the token */
    for ( ; (s < e) && !strchr(" \t\r\n",s[0]); s++) ;
    /* Stop on an empty token (end of input) or when the table is full */
    if (s == b || i >= size)
      break;
    a[i]= (uint16) strtol(b,NULL,16);
  }
  return 0;
}
371 
372 
373 
374 
375 static int
376 tailoring_append(MY_XML_PARSER *st,
377  const char *fmt, size_t len, const char *attr)
378 {
379  struct my_cs_file_info *i= (struct my_cs_file_info *) st->user_data;
380  size_t newlen= i->tailoring_length + len + 64; /* 64 for format */
381  if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen))
382  {
383  char *dst= i->tailoring + i->tailoring_length;
384  sprintf(dst, fmt, (int) len, attr);
385  i->tailoring_length+= strlen(dst);
386  return MY_XML_OK;
387  }
388  return MY_XML_ERROR;
389 }
390 
391 
392 static int
393 tailoring_append2(MY_XML_PARSER *st,
394  const char *fmt,
395  size_t len1, const char *attr1,
396  size_t len2, const char *attr2)
397 {
398  struct my_cs_file_info *i= (struct my_cs_file_info *) st->user_data;
399  size_t newlen= i->tailoring_length + len1 + len2 + 64; /* 64 for format */
400  if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen))
401  {
402  char *dst= i->tailoring + i->tailoring_length;
403  sprintf(dst, fmt, (int) len1, attr1, (int) len2, attr2);
404  i->tailoring_length+= strlen(dst);
405  return MY_XML_OK;
406  }
407  return MY_XML_ERROR;
408 }
409 
410 
411 static size_t
412 scan_one_character(const char *s, const char *e, my_wc_t *wc)
413 {
414  CHARSET_INFO *cs= &my_charset_utf8_general_ci;
415  if (s >= e)
416  return 0;
417 
418  /* Escape sequence: \uXXXX */
419  if (s[0] == '\\' && s + 2 < e && s[1] == 'u' && my_isxdigit(cs, s[2]))
420  {
421  size_t len= 3; /* We have at least one digit */
422  for (s+= 3; s < e && my_isxdigit(cs, s[0]); s++, len++)
423  {
424  }
425  wc[0]= 0;
426  return len;
427  }
428  else if (s[0] > 0) /* 7-bit character */
429  {
430  wc[0]= 0;
431  return 1;
432  }
433  else /* Non-escaped character */
434  {
435  int rc= cs->cset->mb_wc(cs, wc, (uchar *) s, (uchar *) e);
436  if (rc > 0)
437  return (size_t) rc;
438  }
439  return 0;
440 }
441 
442 
443 static int
444 tailoring_append_abbreviation(MY_XML_PARSER *st,
445  const char *fmt, size_t len, const char *attr)
446 {
447  size_t clen;
448  const char *attrend= attr + len;
449  my_wc_t wc;
450 
451  for ( ; (clen= scan_one_character(attr, attrend, &wc)) > 0; attr+= clen)
452  {
453  DBUG_ASSERT(attr < attrend);
454  if (tailoring_append(st, fmt, clen, attr) != MY_XML_OK)
455  return MY_XML_ERROR;
456  }
457  return MY_XML_OK;
458 }
459 
460 
461 static int cs_enter(MY_XML_PARSER *st,const char *attr, size_t len)
462 {
463  struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data;
464  struct my_cs_file_section_st *s= cs_file_sec(attr,len);
465  int state= s ? s->state : 0;
466 
467  switch (state) {
468  case 0:
469  i->loader->reporter(WARNING_LEVEL, "Unknown LDML tag: '%.*s'", len, attr);
470  break;
471 
472  case _CS_CHARSET:
473  my_charset_file_reset_charset(i);
474  break;
475 
476  case _CS_COLLATION:
477  my_charset_file_reset_collation(i);
478  break;
479 
480  case _CS_RESET:
481  return tailoring_append(st, " &", 0, NULL);
482 
483  default:
484  break;
485  }
486  return MY_XML_OK;
487 }
488 
489 
490 static int cs_leave(MY_XML_PARSER *st,const char *attr, size_t len)
491 {
492  struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data;
493  struct my_cs_file_section_st *s= cs_file_sec(attr,len);
494  int state= s ? s->state : 0;
495  int rc;
496 
497  switch(state){
498  case _CS_COLLATION:
499  if (i->tailoring_length)
500  i->cs.tailoring= i->tailoring;
501  rc= i->loader->add_collation ? i->loader->add_collation(&i->cs) : MY_XML_OK;
502  break;
503 
504  /* Rules: Logical Reset Positions */
505  case _CS_RESET_FIRST_NON_IGNORABLE:
506  rc= tailoring_append(st, "[first non-ignorable]", 0, NULL);
507  break;
508 
509  case _CS_RESET_LAST_NON_IGNORABLE:
510  rc= tailoring_append(st, "[last non-ignorable]", 0, NULL);
511  break;
512 
513  case _CS_RESET_FIRST_PRIMARY_IGNORABLE:
514  rc= tailoring_append(st, "[first primary ignorable]", 0, NULL);
515  break;
516 
517  case _CS_RESET_LAST_PRIMARY_IGNORABLE:
518  rc= tailoring_append(st, "[last primary ignorable]", 0, NULL);
519  break;
520 
521  case _CS_RESET_FIRST_SECONDARY_IGNORABLE:
522  rc= tailoring_append(st, "[first secondary ignorable]", 0, NULL);
523  break;
524 
525  case _CS_RESET_LAST_SECONDARY_IGNORABLE:
526  rc= tailoring_append(st, "[last secondary ignorable]", 0, NULL);
527  break;
528 
529  case _CS_RESET_FIRST_TERTIARY_IGNORABLE:
530  rc= tailoring_append(st, "[first tertiary ignorable]", 0, NULL);
531  break;
532 
533  case _CS_RESET_LAST_TERTIARY_IGNORABLE:
534  rc= tailoring_append(st, "[last tertiary ignorable]", 0, NULL);
535  break;
536 
537  case _CS_RESET_FIRST_TRAILING:
538  rc= tailoring_append(st, "[first trailing]", 0, NULL);
539  break;
540 
541  case _CS_RESET_LAST_TRAILING:
542  rc= tailoring_append(st, "[last trailing]", 0, NULL);
543  break;
544 
545  case _CS_RESET_FIRST_VARIABLE:
546  rc= tailoring_append(st, "[first variable]", 0, NULL);
547  break;
548 
549  case _CS_RESET_LAST_VARIABLE:
550  rc= tailoring_append(st, "[last variable]", 0, NULL);
551  break;
552 
553  default:
554  rc=MY_XML_OK;
555  }
556  return rc;
557 }
558 
559 
/*
  Tailoring formats for the five difference levels, indexed by
  (state - first *_DIFF1 code of the run). Each takes one "%.*s".
*/
static const char *diff_fmt[5]=
{
  "<%.*s",      /* primary difference */
  "<<%.*s",     /* secondary difference */
  "<<<%.*s",    /* tertiary difference */
  "<<<<%.*s",   /* quaternary difference */
  "=%.*s"       /* identical */
};
568 
569 
/*
  Same five levels as diff_fmt, for rules that carry a previous
  context: the first "%.*s" is the context, the second the character.
*/
static const char *context_diff_fmt[5]=
{
  "<%.*s|%.*s",      /* primary difference */
  "<<%.*s|%.*s",     /* secondary difference */
  "<<<%.*s|%.*s",    /* tertiary difference */
  "<<<<%.*s|%.*s",   /* quaternary difference */
  "=%.*s|%.*s"       /* identical */
};
578 
579 
580 static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len)
581 {
582  struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data;
583  struct my_cs_file_section_st *s;
584  int state= (int)((s= cs_file_sec(st->attr.start,
585  st->attr.end - st->attr.start)) ?
586  s->state : 0);
587  int rc= MY_XML_OK;
588 
589  switch (state) {
590  case _CS_MISC:
591  case _CS_FAMILY:
592  case _CS_ORDER:
593  break;
594  case _CS_ID:
595  i->cs.number= strtol(attr,(char**)NULL,10);
596  break;
597  case _CS_BINARY_ID:
598  i->cs.binary_number= strtol(attr,(char**)NULL,10);
599  break;
600  case _CS_PRIMARY_ID:
601  i->cs.primary_number= strtol(attr,(char**)NULL,10);
602  break;
603  case _CS_COLNAME:
604  i->cs.name=mstr(i->name,attr,len,MY_CS_NAME_SIZE-1);
605  break;
606  case _CS_CSNAME:
607  i->cs.csname=mstr(i->csname,attr,len,MY_CS_NAME_SIZE-1);
608  break;
609  case _CS_CSDESCRIPT:
610  i->cs.comment=mstr(i->comment,attr,len,MY_CS_CSDESCR_SIZE-1);
611  break;
612  case _CS_FLAG:
613  if (!strncmp("primary",attr,len))
614  i->cs.state|= MY_CS_PRIMARY;
615  else if (!strncmp("binary",attr,len))
616  i->cs.state|= MY_CS_BINSORT;
617  else if (!strncmp("compiled",attr,len))
618  i->cs.state|= MY_CS_COMPILED;
619  break;
620  case _CS_UPPERMAP:
621  fill_uchar(i->to_upper,MY_CS_TO_UPPER_TABLE_SIZE,attr,len);
622  i->cs.to_upper=i->to_upper;
623  break;
624  case _CS_LOWERMAP:
625  fill_uchar(i->to_lower,MY_CS_TO_LOWER_TABLE_SIZE,attr,len);
626  i->cs.to_lower=i->to_lower;
627  break;
628  case _CS_UNIMAP:
629  fill_uint16(i->tab_to_uni,MY_CS_TO_UNI_TABLE_SIZE,attr,len);
630  i->cs.tab_to_uni=i->tab_to_uni;
631  break;
632  case _CS_COLLMAP:
633  fill_uchar(i->sort_order,MY_CS_SORT_ORDER_TABLE_SIZE,attr,len);
634  i->cs.sort_order=i->sort_order;
635  break;
636  case _CS_CTYPEMAP:
637  fill_uchar(i->ctype,MY_CS_CTYPE_TABLE_SIZE,attr,len);
638  i->cs.ctype=i->ctype;
639  break;
640 
641  /* Special purpose commands */
642  case _CS_UCA_VERSION:
643  rc= tailoring_append(st, "[version %.*s]", len, attr);
644  break;
645 
646  case _CS_CL_SUPPRESS_CONTRACTIONS:
647  rc= tailoring_append(st, "[suppress contractions %.*s]", len, attr);
648  break;
649 
650  case _CS_CL_OPTIMIZE:
651  rc= tailoring_append(st, "[optimize %.*s]", len, attr);
652  break;
653 
654  case _CS_CL_SHIFT_AFTER_METHOD:
655  rc= tailoring_append(st, "[shift-after-method %.*s]", len, attr);
656  break;
657 
658  /* Collation Settings */
659  case _CS_ST_STRENGTH:
660  /* 1, 2, 3, 4, 5, or primary, secondary, tertiary, quaternary, identical */
661  rc= tailoring_append(st, "[strength %.*s]", len, attr);
662  break;
663 
664  case _CS_ST_ALTERNATE:
665  /* non-ignorable, shifted */
666  rc= tailoring_append(st, "[alternate %.*s]", len, attr);
667  break;
668 
669  case _CS_ST_BACKWARDS:
670  /* on, off, 2 */
671  rc= tailoring_append(st, "[backwards %.*s]", len, attr);
672  break;
673 
674  case _CS_ST_NORMALIZATION:
675  /*
676  TODO for WL#896: check collations for normalization: vi.xml
677  We want precomposed characters work well at this point.
678  */
679  /* on, off */
680  rc= tailoring_append(st, "[normalization %.*s]", len, attr);
681  break;
682 
683  case _CS_ST_CASE_LEVEL:
684  /* on, off */
685  rc= tailoring_append(st, "[caseLevel %.*s]", len, attr);
686  break;
687 
688  case _CS_ST_CASE_FIRST:
689  /* upper, lower, off */
690  rc= tailoring_append(st, "[caseFirst %.*s]", len, attr);
691  break;
692 
693  case _CS_ST_HIRAGANA_QUATERNARY:
694  /* on, off */
695  rc= tailoring_append(st, "[hiraganaQ %.*s]", len, attr);
696  break;
697 
698  case _CS_ST_NUMERIC:
699  /* on, off */
700  rc= tailoring_append(st, "[numeric %.*s]", len, attr);
701  break;
702 
703  case _CS_ST_VARIABLE_TOP:
704  /* TODO for WL#896: check value format */
705  rc= tailoring_append(st, "[variableTop %.*s]", len, attr);
706  break;
707 
708  case _CS_ST_MATCH_BOUNDARIES:
709  /* none, whole-character, whole-word */
710  rc= tailoring_append(st, "[match-boundaries %.*s]", len, attr);
711  break;
712 
713  case _CS_ST_MATCH_STYLE:
714  /* minimal, medial, maximal */
715  rc= tailoring_append(st, "[match-style %.*s]", len, attr);
716  break;
717 
718 
719  /* Rules */
720  case _CS_RESET:
721  rc= tailoring_append(st, "%.*s", len, attr);
722  break;
723 
724  case _CS_DIFF1:
725  case _CS_DIFF2:
726  case _CS_DIFF3:
727  case _CS_DIFF4:
728  case _CS_IDENTICAL:
729  rc= tailoring_append(st, diff_fmt[state - _CS_DIFF1], len, attr);
730  break;
731 
732 
733  /* Rules: Expansion */
734  case _CS_EXP_EXTEND:
735  rc= tailoring_append(st, " / %.*s", len, attr);
736  break;
737 
738  case _CS_EXP_DIFF1:
739  case _CS_EXP_DIFF2:
740  case _CS_EXP_DIFF3:
741  case _CS_EXP_DIFF4:
742  case _CS_EXP_IDENTICAL:
743  if (i->context[0])
744  {
745  rc= tailoring_append2(st, context_diff_fmt[state - _CS_EXP_DIFF1],
746  strlen(i->context), i->context, len, attr);
747  i->context[0]= 0;
748  }
749  else
750  rc= tailoring_append(st, diff_fmt[state - _CS_EXP_DIFF1], len, attr);
751  break;
752 
753  /* Rules: Context */
754  case _CS_CONTEXT:
755  if (len < sizeof(i->context) + 1)
756  {
757  memcpy(i->context, attr, len);
758  i->context[len]= '\0';
759  }
760  break;
761 
762  /* Rules: Abbreviating Ordering Specifications */
763  case _CS_A_DIFF1:
764  case _CS_A_DIFF2:
765  case _CS_A_DIFF3:
766  case _CS_A_DIFF4:
767  case _CS_A_IDENTICAL:
768  rc= tailoring_append_abbreviation(st, diff_fmt[state - _CS_A_DIFF1], len, attr);
769  break;
770 
771  /* Rules: Placing Characters Before Others */
772  case _CS_RESET_BEFORE:
773  /*
774  TODO for WL#896: Add this check into text customization parser:
775  It is an error if the strength of the before relation is not identical
776  to the relation after the reset. We'll need this for WL#896.
777  */
778  rc= tailoring_append(st, "[before %.*s]", len, attr);
779  break;
780 
781 
782  default:
783  break;
784  }
785 
786  return rc;
787 }
788 
789 
790 my_bool
791 my_parse_charset_xml(MY_CHARSET_LOADER *loader, const char *buf, size_t len)
792 {
793  MY_XML_PARSER p;
794  struct my_cs_file_info info;
795  my_bool rc;
796 
797  my_charset_file_init(&info);
798  my_xml_parser_create(&p);
799  my_xml_set_enter_handler(&p,cs_enter);
800  my_xml_set_value_handler(&p,cs_value);
801  my_xml_set_leave_handler(&p,cs_leave);
802  info.loader= loader;
803  my_xml_set_user_data(&p, (void *) &info);
804  rc= (my_xml_parse(&p,buf,len) == MY_XML_OK) ? FALSE : TRUE;
805  my_xml_parser_free(&p);
806  my_charset_file_free(&info);
807  if (rc != MY_XML_OK)
808  {
809  const char *errstr= my_xml_error_string(&p);
810  if (sizeof(loader->error) > 32 + strlen(errstr))
811  {
812  /* We cannot use my_snprintf() here. See previous comment. */
813  sprintf(loader->error, "at line %d pos %d: %s",
814  my_xml_error_lineno(&p)+1,
815  (int) my_xml_error_pos(&p),
816  my_xml_error_string(&p));
817  }
818  }
819  return rc;
820 }
821 
822 
823 /*
824  Check repertoire: detect pure ascii strings
825 */
826 uint
827 my_string_repertoire(const CHARSET_INFO *cs, const char *str, ulong length)
828 {
829  const char *strend= str + length;
830  if (cs->mbminlen == 1)
831  {
832  for ( ; str < strend; str++)
833  {
834  if (((uchar) *str) > 0x7F)
835  return MY_REPERTOIRE_UNICODE30;
836  }
837  }
838  else
839  {
840  my_wc_t wc;
841  int chlen;
842  for (;
843  (chlen= cs->cset->mb_wc(cs, &wc, (uchar*) str, (uchar*) strend)) > 0;
844  str+= chlen)
845  {
846  if (wc > 0x7F)
847  return MY_REPERTOIRE_UNICODE30;
848  }
849  }
850  return MY_REPERTOIRE_ASCII;
851 }
852 
853 
854 /*
855  Returns repertoire for charset
856 */
857 uint my_charset_repertoire(const CHARSET_INFO *cs)
858 {
859  return cs->state & MY_CS_PUREASCII ?
860  MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
861 }
862 
863 
864 /*
865  Detect whether a character set is ASCII compatible.
866 
867  Returns TRUE for:
868 
869  - all 8bit character sets whose Unicode mapping of 0x7B is '{'
870  (ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS")
871 
872  - all multi-byte character sets having mbminlen == 1
873  (ignores ucs2 whose mbminlen is 2)
874 
875  TODO:
876 
877  When merging to 5.2, this function should be changed
878  to check a new flag MY_CS_NONASCII,
879 
880  return (cs->flag & MY_CS_NONASCII) ? 0 : 1;
881 
882  This flag was previously added into 5.2 under terms
883  of WL#3759 "Optimize identifier conversion in client-server protocol"
884  especially to mark character sets not compatible with ASCII.
885 
886  We won't backport this flag to 5.0 or 5.1.
887  This function is Ok for 5.0 and 5.1, because we're not going
888  to introduce new tricky character sets between 5.0 and 5.2.
889 */
890 my_bool
891 my_charset_is_ascii_based(const CHARSET_INFO *cs)
892 {
893  return
894  (cs->mbmaxlen == 1 && cs->tab_to_uni && cs->tab_to_uni['{'] == '{') ||
895  (cs->mbminlen == 1 && cs->mbmaxlen > 1);
896 }
897 
898 
899 /*
900  Detect if a character set is 8bit,
901  and it is pure ascii, i.e. doesn't have
902  characters outside U+0000..U+007F
903  This functions is shared between "conf_to_src"
904  and dynamic charsets loader in "mysqld".
905 */
906 my_bool
907 my_charset_is_8bit_pure_ascii(const CHARSET_INFO *cs)
908 {
909  size_t code;
910  if (!cs->tab_to_uni)
911  return 0;
912  for (code= 0; code < 256; code++)
913  {
914  if (cs->tab_to_uni[code] > 0x7F)
915  return 0;
916  }
917  return 1;
918 }
919 
920 
921 /*
922  Shared function between conf_to_src and mysys.
923  Check if a 8bit character set is compatible with
924  ascii on the range 0x00..0x7F.
925 */
926 my_bool
927 my_charset_is_ascii_compatible(const CHARSET_INFO *cs)
928 {
929  uint i;
930  if (!cs->tab_to_uni)
931  return 1;
932  for (i= 0; i < 128; i++)
933  {
934  if (cs->tab_to_uni[i] != i)
935  return 0;
936  }
937  return 1;
938 }
939 
940 
941 /*
942  Convert a string between two character sets.
943  'to' must be large enough to store (form_length * to_cs->mbmaxlen) bytes.
944 
945  @param to[OUT] Store result here
946  @param to_length Size of "to" buffer
947  @param to_cs Character set of result string
948  @param from Copy from here
949  @param from_length Length of the "from" string
950  @param from_cs Character set of the "from" string
951  @param errors[OUT] Number of conversion errors
952 
953  @return Number of bytes copied to 'to' string
954 */
955 
956 static uint32
957 my_convert_internal(char *to, uint32 to_length,
958  const CHARSET_INFO *to_cs,
959  const char *from, uint32 from_length,
960  const CHARSET_INFO *from_cs, uint *errors)
961 {
962  int cnvres;
963  my_wc_t wc;
964  const uchar *from_end= (const uchar*) from + from_length;
965  char *to_start= to;
966  uchar *to_end= (uchar*) to + to_length;
967  my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
968  my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
969  uint error_count= 0;
970 
971  while (1)
972  {
973  if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
974  from+= cnvres;
975  else if (cnvres == MY_CS_ILSEQ)
976  {
977  error_count++;
978  from++;
979  wc= '?';
980  }
981  else if (cnvres > MY_CS_TOOSMALL)
982  {
983  /*
984  A correct multibyte sequence detected
985  But it doesn't have Unicode mapping.
986  */
987  error_count++;
988  from+= (-cnvres);
989  wc= '?';
990  }
991  else
992  break; // Not enough characters
993 
994 outp:
995  if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
996  to+= cnvres;
997  else if (cnvres == MY_CS_ILUNI && wc != '?')
998  {
999  error_count++;
1000  wc= '?';
1001  goto outp;
1002  }
1003  else
1004  break;
1005  }
1006  *errors= error_count;
1007  return (uint32) (to - to_start);
1008 }
1009 
1010 
1011 /*
1012  Convert a string between two character sets.
1013  Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
1014  'to' must be large enough to store (form_length * to_cs->mbmaxlen) bytes.
1015 
1016  @param to[OUT] Store result here
1017  @param to_length Size of "to" buffer
1018  @param to_cs Character set of result string
1019  @param from Copy from here
1020  @param from_length Length of the "from" string
1021  @param from_cs Character set of the "from" string
1022  @param errors[OUT] Number of conversion errors
1023 
1024  @return Number of bytes copied to 'to' string
1025 */
1026 
1027 uint32
1028 my_convert(char *to, uint32 to_length, const CHARSET_INFO *to_cs,
1029  const char *from, uint32 from_length,
1030  const CHARSET_INFO *from_cs, uint *errors)
1031 {
1032  uint32 length, length2;
1033  /*
1034  If any of the character sets is not ASCII compatible,
1035  immediately switch to slow mb_wc->wc_mb method.
1036  */
1037  if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
1038  return my_convert_internal(to, to_length, to_cs,
1039  from, from_length, from_cs, errors);
1040 
1041  length= length2= MY_MIN(to_length, from_length);
1042 
1043 #if defined(__i386__)
1044  /*
1045  Special loop for i386, it allows to refer to a
1046  non-aligned memory block as UINT32, which makes
1047  it possible to copy four bytes at once. This
1048  gives about 10% performance improvement comparing
1049  to byte-by-byte loop.
1050  */
1051  for ( ; length >= 4; length-= 4, from+= 4, to+= 4)
1052  {
1053  if ((*(uint32*)from) & 0x80808080)
1054  break;
1055  *((uint32*) to)= *((const uint32*) from);
1056  }
1057 #endif /* __i386__ */
1058 
1059  for (; ; *to++= *from++, length--)
1060  {
1061  if (!length)
1062  {
1063  *errors= 0;
1064  return length2;
1065  }
1066  if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */
1067  {
1068  uint32 copied_length= length2 - length;
1069  to_length-= copied_length;
1070  from_length-= copied_length;
1071  return copied_length + my_convert_internal(to, to_length, to_cs,
1072  from, from_length, from_cs,
1073  errors);
1074  }
1075  }
1076 
1077  DBUG_ASSERT(FALSE); // Should never get to here
1078  return 0; // Make compiler happy
1079 }