MySQL 5.6.14 Source Code Document
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
xml.c
1 /* Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
2 
3  This program is free software; you can redistribute it and/or modify
4  it under the terms of the GNU General Public License as published by
5  the Free Software Foundation; version 2 of the License.
6 
7  This program is distributed in the hope that it will be useful,
8  but WITHOUT ANY WARRANTY; without even the implied warranty of
9  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10  GNU General Public License for more details.
11 
12  You should have received a copy of the GNU General Public License
13  along with this program; if not, write to the Free Software
14  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */
15 
16 #include "my_global.h"
17 #include "m_string.h"
18 #include "my_xml.h"
19 
20 
/*
  Lexeme codes returned by my_xml_scan().
  Single-character punctuation tokens ('=', '<', '>', '/', '?', '!') use the
  character itself as their code; the remaining token classes use
  distinct letters.
*/
21 #define MY_XML_UNKNOWN 'U'
22 #define MY_XML_EOF 'E'
23 #define MY_XML_STRING 'S'
24 #define MY_XML_IDENT 'I'
25 #define MY_XML_EQ '='
26 #define MY_XML_LT '<'
27 #define MY_XML_GT '>'
28 #define MY_XML_SLASH '/'
29 #define MY_XML_COMMENT 'C'
30 #define MY_XML_TEXT 'T'
31 #define MY_XML_QUESTION '?'
32 #define MY_XML_EXCLAM '!'
33 #define MY_XML_CDATA 'D'
34 
/*
  A span of the input being parsed: the bytes from beg up to, but not
  including, end. Used for tokens produced by my_xml_scan() and for text runs.
*/
35 typedef struct xml_attr_st
36 {
37  const char *beg;
38  const char *end;
39 } MY_XML_ATTR;
40 
41 
42 /*
43  XML ctype: per-byte classification bits stored in my_xml_ctype[] and
     tested through the my_xml_is_space/id0/id1 macros.
44 */
45 #define MY_XML_ID0 0x01 /* Identifier initial character */
46 #define MY_XML_ID1 0x02 /* Identifier medial character */
47 #define MY_XML_SPC 0x08 /* Spacing character */
48 
49 
50 /*
51  http://www.w3.org/TR/REC-xml/
52  [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' |
53  CombiningChar | Extender
54  [5] Name ::= (Letter | '_' | ':') (NameChar)*

     Classification table indexed by byte value. Entry values are
     combinations of the MY_XML_* ctype bits:
       0 - no class
       2 - MY_XML_ID1 only (digits, '-', '.')
       3 - MY_XML_ID0 | MY_XML_ID1 (ASCII letters, '_', ':', bytes 0x80..0xFF)
       8 - MY_XML_SPC (TAB, LF, CR, space)
55 */
56 
57 static char my_xml_ctype[256]=
58 {
59 /*00*/ 0,0,0,0,0,0,0,0,0,8,8,0,0,8,0,0,
60 /*10*/ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
61 /*20*/ 8,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0, /* !"#$%&'()*+,-./ */
62 /*30*/ 2,2,2,2,2,2,2,2,2,2,3,0,0,0,0,0, /* 0123456789:;<=>? */
63 /*40*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* @ABCDEFGHIJKLMNO */
64 /*50*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,3, /* PQRSTUVWXYZ[\]^_ */
65 /*60*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* `abcdefghijklmno */
66 /*70*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0, /* pqrstuvwxyz{|}~ */
67 /*80*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
68 /*90*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
69 /*A0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
70 /*B0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
71 /*C0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
72 /*D0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
73 /*E0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
74 /*F0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
75 };
76 
/*
  Character class tests. The argument is cast to uchar before indexing
  my_xml_ctype[], so bytes with the high bit set index 0x80..0xFF rather
  than a negative offset. Each macro yields non-zero when the bit is set.
*/
77 #define my_xml_is_space(c) (my_xml_ctype[(uchar) (c)] & MY_XML_SPC)
78 #define my_xml_is_id0(c) (my_xml_ctype[(uchar) (c)] & MY_XML_ID0)
79 #define my_xml_is_id1(c) (my_xml_ctype[(uchar) (c)] & MY_XML_ID1)
80 
81 
82 static const char *lex2str(int lex)
83 {
84  switch(lex)
85  {
86  case MY_XML_EOF: return "END-OF-INPUT";
87  case MY_XML_STRING: return "STRING";
88  case MY_XML_IDENT: return "IDENT";
89  case MY_XML_CDATA: return "CDATA";
90  case MY_XML_EQ: return "'='";
91  case MY_XML_LT: return "'<'";
92  case MY_XML_GT: return "'>'";
93  case MY_XML_SLASH: return "'/'";
94  case MY_XML_COMMENT: return "COMMENT";
95  case MY_XML_TEXT: return "TEXT";
96  case MY_XML_QUESTION: return "'?'";
97  case MY_XML_EXCLAM: return "'!'";
98  }
99  return "unknown token";
100 }
101 
102 static void my_xml_norm_text(MY_XML_ATTR *a)
103 {
104  for ( ; (a->beg < a->end) && my_xml_is_space(a->beg[0]) ; a->beg++ );
105  for ( ; (a->beg < a->end) && my_xml_is_space(a->end[-1]) ; a->end-- );
106 }
107 
108 
109 static inline my_bool
110 my_xml_parser_prefix_cmp(MY_XML_PARSER *p, const char *s, size_t slen)
111 {
112  return (p->cur + slen > p->end) || memcmp(p->cur, s, slen);
113 }
114 
115 
116 static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a)
117 {
118  int lex;
119 
120  for (; ( p->cur < p->end) && my_xml_is_space(p->cur[0]) ; p->cur++);
121 
122  if (p->cur >= p->end)
123  {
124  a->beg=p->end;
125  a->end=p->end;
126  lex=MY_XML_EOF;
127  goto ret;
128  }
129 
130  a->beg=p->cur;
131  a->end=p->cur;
132 
133  if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<!--")))
134  {
135  for (; p->cur < p->end; p->cur++)
136  {
137  if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("-->")))
138  {
139  p->cur+= 3;
140  break;
141  }
142  }
143  a->end=p->cur;
144  lex=MY_XML_COMMENT;
145  }
146  else if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<![CDATA[")))
147  {
148  p->cur+= 9;
149  for (; p->cur < p->end - 2 ; p->cur++)
150  {
151  if (p->cur[0] == ']' && p->cur[1] == ']' && p->cur[2] == '>')
152  {
153  p->cur+= 3;
154  a->end= p->cur;
155  break;
156  }
157  }
158  lex= MY_XML_CDATA;
159  }
160  else if (strchr("?=/<>!",p->cur[0]))
161  {
162  p->cur++;
163  a->end=p->cur;
164  lex=a->beg[0];
165  }
166  else if ( (p->cur[0] == '"') || (p->cur[0] == '\'') )
167  {
168  /*
169  "string" or 'string' found.
170  Scan until the closing quote/doublequote, or until the END-OF-INPUT.
171  */
172  p->cur++;
173  for (; ( p->cur < p->end ) && (p->cur[0] != a->beg[0]); p->cur++)
174  {}
175  a->end=p->cur;
176  if (p->cur < p->end) /* Closing quote or doublequote has been found */
177  p->cur++;
178  a->beg++;
179  if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION))
180  my_xml_norm_text(a);
181  lex=MY_XML_STRING;
182  }
183  else if (my_xml_is_id0(p->cur[0]))
184  {
185  p->cur++;
186  while (p->cur < p->end && my_xml_is_id1(p->cur[0]))
187  p->cur++;
188  a->end=p->cur;
189  my_xml_norm_text(a);
190  lex=MY_XML_IDENT;
191  }
192  else
193  lex= MY_XML_UNKNOWN;
194 
195 #if 0
196  printf("LEX=%s[%d]\n",lex2str(lex),a->end-a->beg);
197 #endif
198 
199 ret:
200  return lex;
201 }
202 
203 
204 static int my_xml_value(MY_XML_PARSER *st, const char *str, size_t len)
205 {
206  return (st->value) ? (st->value)(st,str,len) : MY_XML_OK;
207 }
208 
209 
222 static int my_xml_attr_ensure_space(MY_XML_PARSER *st, size_t len)
223 {
224  size_t ofs= st->attr.end - st->attr.start;
225  len++; // Add terminating zero.
226  if (ofs + len > st->attr.buffer_size)
227  {
228  st->attr.buffer_size= (SIZE_T_MAX - len) / 2 > st->attr.buffer_size ?
229  st->attr.buffer_size * 2 + len : SIZE_T_MAX;
230 
231  if (!st->attr.buffer)
232  {
233  st->attr.buffer= (char *) my_str_malloc(st->attr.buffer_size);
234  if (st->attr.buffer)
235  memcpy(st->attr.buffer, st->attr.static_buffer, ofs + 1 /*term. zero */);
236  }
237  else
238  st->attr.buffer= (char *) my_str_realloc(st->attr.buffer,
239  st->attr.buffer_size);
240  st->attr.start= st->attr.buffer;
241  st->attr.end= st->attr.start + ofs;
242 
243  return st->attr.buffer ? MY_XML_OK : MY_XML_ERROR;
244  }
245  return MY_XML_OK;
246 }
247 
248 
250 static void my_xml_attr_rewind(MY_XML_PARSER *p)
251 {
252  /* keep the buffer already allocated */
253  p->attr.end= p->attr.start;
254 }
255 
256 
257 static int my_xml_enter(MY_XML_PARSER *st, const char *str, size_t len)
258 {
259  if (my_xml_attr_ensure_space(st, len + 1 /* the separator char */))
260  return MY_XML_ERROR;
261 
262  if (st->attr.end > st->attr.start)
263  {
264  st->attr.end[0]= '/';
265  st->attr.end++;
266  }
267  memcpy(st->attr.end, str, len);
268  st->attr.end+= len;
269  st->attr.end[0]= '\0';
270  if (st->flags & MY_XML_FLAG_RELATIVE_NAMES)
271  return st->enter ? st->enter(st, str, len) : MY_XML_OK;
272  else
273  return st->enter ?
274  st->enter(st, st->attr.start, st->attr.end - st->attr.start) : MY_XML_OK;
275 }
276 
277 
/*
  Copy at most min(l1, l2) bytes from src to s and zero-terminate the result.

  @param s    Destination; must have room for min(l1, l2) + 1 bytes.
  @param src  Source bytes (need not be zero-terminated).
  @param l1   First length limit (typically the destination capacity - 1).
  @param l2   Second length limit (typically the source length).
*/
static void mstr(char *s,const char *src,size_t l1, size_t l2)
{
  size_t n= l2;

  if (l1 < l2)
    n= l1;

  memcpy(s, src, n);
  s[n]= '\0';
}
284 
285 
286 static int my_xml_leave(MY_XML_PARSER *p, const char *str, size_t slen)
287 {
288  char *e;
289  size_t glen;
290  char s[32];
291  char g[32];
292  int rc;
293 
294  /* Find previous '/' or beginning */
295  for (e= p->attr.end; (e > p->attr.start) && (e[0] != '/') ; e--);
296  glen= (size_t) ((e[0] == '/') ? (p->attr.end - e - 1) : p->attr.end - e);
297 
298  if (str && (slen != glen))
299  {
300  mstr(s,str,sizeof(s)-1,slen);
301  if (glen)
302  {
303  mstr(g,e+1,sizeof(g)-1,glen),
304  sprintf(p->errstr,"'</%s>' unexpected ('</%s>' wanted)",s,g);
305  }
306  else
307  sprintf(p->errstr,"'</%s>' unexpected (END-OF-INPUT wanted)", s);
308  return MY_XML_ERROR;
309  }
310 
311  if (p->flags & MY_XML_FLAG_RELATIVE_NAMES)
312  rc= p->leave_xml ? p->leave_xml(p, str, slen) : MY_XML_OK;
313  else
314  rc= (p->leave_xml ?
315  p->leave_xml(p, p->attr.start, p->attr.end - p->attr.start) :
316  MY_XML_OK);
317 
318  *e='\0';
319  p->attr.end= e;
320 
321  return rc;
322 }
323 
324 
/*
  Parse an XML document held in memory and report its structure through
  the handlers installed on the parser.

  @param p    Parser state; its tag path is reset and beg/cur/end are set
              to describe the input.
  @param str  Start of the input (need not be zero-terminated).
  @param len  Length of the input in bytes.
  @return     MY_XML_OK on success, MY_XML_ERROR otherwise. Syntax errors
              detected here store a message in p->errstr; p->cur is left at
              the point where parsing stopped.

  The main loop alternates between two cases: markup starting with '<' and
  character data up to the next '<'.
*/
325 int my_xml_parse(MY_XML_PARSER *p,const char *str, size_t len)
326 {
327 
328  my_xml_attr_rewind(p);
329 
330  p->beg=str;
331  p->cur=str;
332  p->end=str+len;
333 
334  while ( p->cur < p->end )
335  {
336  MY_XML_ATTR a;
337  if (p->cur[0] == '<')
338  {
339  int lex;
     /* Set when the markup started with "<?" or "<!" respectively. */
340  int question=0;
341  int exclam=0;
342 
     /* Yields '<' itself, or a whole comment / CDATA section. */
343  lex=my_xml_scan(p,&a);
344 
345  if (MY_XML_COMMENT == lex)
346  continue;
347 
348  if (lex == MY_XML_CDATA)
349  {
     /*
       Strip the 9-byte "<![CDATA[" prefix and the 3-byte "]]>" suffix.
       NOTE(review): if the scanned span is shorter than 12 bytes the
       length computed below wraps around to a huge size_t.
       The handler's return value is ignored here.
     */
350  a.beg+= 9;
351  a.end-= 3;
352  my_xml_value(p, a.beg, (size_t) (a.end-a.beg));
353  continue;
354  }
355 
356  lex=my_xml_scan(p,&a);
357 
     /* Closing tag: "</" ident, then fall through to the '>' check. */
358  if (MY_XML_SLASH == lex)
359  {
360  if (MY_XML_IDENT != (lex=my_xml_scan(p,&a)))
361  {
362  sprintf(p->errstr,"%s unexpected (ident wanted)",lex2str(lex));
363  return MY_XML_ERROR;
364  }
365  if (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg)))
366  return MY_XML_ERROR;
367  lex=my_xml_scan(p,&a);
368  goto gt;
369  }
370 
371  if (MY_XML_EXCLAM == lex)
372  {
373  lex=my_xml_scan(p,&a);
374  exclam=1;
375  }
376  else if (MY_XML_QUESTION == lex)
377  {
378  lex=my_xml_scan(p,&a);
379  question=1;
380  }
381 
     /* The tag name: push it on the path and call the enter handler. */
382  if (MY_XML_IDENT == lex)
383  {
384  p->current_node_type= MY_XML_NODE_TAG;
385  if (MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg)))
386  return MY_XML_ERROR;
387  }
388  else
389  {
390  sprintf(p->errstr,"%s unexpected (ident or '/' wanted)",
391  lex2str(lex));
392  return MY_XML_ERROR;
393  }
394 
     /*
       Attribute list. `a` holds the attribute name (or, inside "<!",
       possibly a string literal); `b` receives the token that follows it.
     */
395  while ((MY_XML_IDENT == (lex=my_xml_scan(p,&a))) ||
396  ((MY_XML_STRING == lex && exclam)))
397  {
398  MY_XML_ATTR b;
399  if (MY_XML_EQ == (lex=my_xml_scan(p,&b)))
400  {
     /* name = value: reported as enter(name), value(value), leave(name). */
401  lex=my_xml_scan(p,&b);
402  if ( (lex == MY_XML_IDENT) || (lex == MY_XML_STRING) )
403  {
404  p->current_node_type= MY_XML_NODE_ATTR;
405  if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) ||
406  (MY_XML_OK != my_xml_value(p,b.beg,(size_t) (b.end-b.beg))) ||
407  (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg))))
408  return MY_XML_ERROR;
409  }
410  else
411  {
412  sprintf(p->errstr,"%s unexpected (ident or string wanted)",
413  lex2str(lex));
414  return MY_XML_ERROR;
415  }
416  }
     /*
       No '=' after `a`, and the next token is another identifier: `a` is
       reported as a value-less attribute. The identifier already read
       into `b` is not processed further.
     */
417  else if (MY_XML_IDENT == lex)
418  {
419  p->current_node_type= MY_XML_NODE_ATTR;
420  if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) ||
421  (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg))))
422  return MY_XML_ERROR;
423  }
424  else if ((MY_XML_STRING == lex) && exclam)
425  {
426  /*
427  We are in <!DOCTYPE>, e.g.
428  <!DOCTYPE name SYSTEM "SystemLiteral">
429  <!DOCTYPE name PUBLIC "PubidLiteral" "SystemLiteral">
430  Just skip "SystemLiteral" and "PubidLiteral"
431  */
432  }
433  else
434  break;
435  }
436 
     /* Self-closing tag "/>": pop the name without a name check. */
437  if (lex == MY_XML_SLASH)
438  {
439  if (MY_XML_OK != my_xml_leave(p,NULL,0))
440  return MY_XML_ERROR;
441  lex=my_xml_scan(p,&a);
442  }
443 
444 gt:
     /* "<?name ... ?>" must end with '?', which also closes the node. */
445  if (question)
446  {
447  if (lex != MY_XML_QUESTION)
448  {
449  sprintf(p->errstr,"%s unexpected ('?' wanted)",lex2str(lex));
450  return MY_XML_ERROR;
451  }
452  if (MY_XML_OK != my_xml_leave(p,NULL,0))
453  return MY_XML_ERROR;
454  lex=my_xml_scan(p,&a);
455  }
456 
     /* "<!name ...>" has no closing tag, so its node is closed here. */
457  if (exclam)
458  {
459  if (MY_XML_OK != my_xml_leave(p,NULL,0))
460  return MY_XML_ERROR;
461  }
462 
463  if (lex != MY_XML_GT)
464  {
465  sprintf(p->errstr,"%s unexpected ('>' wanted)",lex2str(lex));
466  return MY_XML_ERROR;
467  }
468  }
469  else
470  {
     /*
       Character data: everything up to the next '<' (or end of input),
       trimmed unless normalization is disabled. Empty text is not
       reported, and the handler's return value is ignored.
     */
471  a.beg=p->cur;
472  for ( ; (p->cur < p->end) && (p->cur[0] != '<') ; p->cur++);
473  a.end=p->cur;
474 
475  if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION))
476  my_xml_norm_text(&a);
477  if (a.beg != a.end)
478  {
479  my_xml_value(p,a.beg,(size_t) (a.end-a.beg));
480  }
481  }
482  }
483 
     /* A non-empty path at end of input means some tag was never closed. */
484  if (p->attr.start[0])
485  {
486  sprintf(p->errstr,"unexpected END-OF-INPUT");
487  return MY_XML_ERROR;
488  }
489  return MY_XML_OK;
490 }
491 
492 
493 void my_xml_parser_create(MY_XML_PARSER *p)
494 {
495  memset(p, 0, sizeof(p[0]));
496  /*
497  Use static buffer while it's sufficient.
498  */
499  p->attr.start= p->attr.end= p->attr.static_buffer;
500  p->attr.buffer_size= sizeof(p->attr.static_buffer);
501 }
502 
503 
504 void my_xml_parser_free(MY_XML_PARSER *p)
505 {
506  if (p->attr.buffer)
507  {
508  my_str_free(p->attr.buffer);
509  p->attr.buffer= NULL;
510  }
511 }
512 
513 
514 void my_xml_set_value_handler(MY_XML_PARSER *p,
515  int (*action)(MY_XML_PARSER *p, const char *s,
516  size_t l))
517 {
518  p->value=action;
519 }
520 
521 void my_xml_set_enter_handler(MY_XML_PARSER *p,
522  int (*action)(MY_XML_PARSER *p, const char *s,
523  size_t l))
524 {
525  p->enter=action;
526 }
527 
528 
529 void my_xml_set_leave_handler(MY_XML_PARSER *p,
530  int (*action)(MY_XML_PARSER *p, const char *s,
531  size_t l))
532 {
533  p->leave_xml=action;
534 }
535 
536 
537 void my_xml_set_user_data(MY_XML_PARSER *p, void *user_data)
538 {
539  p->user_data=user_data;
540 }
541 
542 
543 const char *my_xml_error_string(MY_XML_PARSER *p)
544 {
545  return p->errstr;
546 }
547 
548 
549 size_t my_xml_error_pos(MY_XML_PARSER *p)
550 {
551  const char *beg=p->beg;
552  const char *s;
553  for ( s=p->beg ; s<p->cur; s++)
554  {
555  if (s[0] == '\n')
556  beg=s;
557  }
558  return (size_t) (p->cur-beg);
559 }
560 
561 uint my_xml_error_lineno(MY_XML_PARSER *p)
562 {
563  uint res=0;
564  const char *s;
565  for (s=p->beg ; s<p->cur; s++)
566  {
567  if (s[0] == '\n')
568  res++;
569  }
570  return res;
571 }