Groonga 3.0.9 Source Code Document
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
normalizer.c
Go to the documentation of this file.
1 /* -*- c-basic-offset: 2 -*- */
2 /*
3  Copyright(C) 2012 Brazil
4 
5  This library is free software; you can redistribute it and/or
6  modify it under the terms of the GNU Lesser General Public
7  License version 2.1 as published by the Free Software Foundation.
8 
9  This library is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  Lesser General Public License for more details.
13 
14  You should have received a copy of the GNU Lesser General Public
15  License along with this library; if not, write to the Free Software
16  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18 
19 #include <string.h>
20 
21 #include "normalizer_in.h"
22 #include "string_in.h"
23 #include <groonga/normalizer.h>
24 #include <groonga/tokenizer.h>
25 
/*
 * Registers a normalizer procedure under the given name.
 *
 * name_ptr / name_length: name of the normalizer. A negative name_length
 *   means name_ptr is NUL-terminated and its length is taken with strlen().
 * init / next / fin: callbacks handed unchanged to grn_proc_create().
 *
 * Returns GRN_SUCCESS when grn_proc_create() yields an object, otherwise
 * ctx->rc.
 *
 * NOTE(review): this listing skips source lines 27, 32, 46 and 51
 * (presumably the function-name/ctx line, the `fin` parameter, one
 * grn_proc_create() argument and the opening of the error-report call).
 * Restore them from the original file before compiling.
 */
26 grn_rc
28  const char *name_ptr,
29  int name_length,
30  grn_proc_func *init,
31  grn_proc_func *next,
33 {
34  grn_expr_var vars[] = {
35  { NULL, 0 }
36  };
37  GRN_PTR_INIT(&vars[0].value, 0, GRN_ID_NIL);
38 
39  if (name_length < 0) {
40  name_length = strlen(name_ptr);
41  }
42 
43  {
44  grn_obj * const normalizer = grn_proc_create(ctx,
45  name_ptr, name_length,
47  init, next, fin,
 /* Element count of vars: whole array size divided by one element size.
  * The previous operand order only produced 1 because the array has a
  * single entry; it would have produced 0 for any larger array. */
48  sizeof(vars) / sizeof(*vars),
49  vars);
50  if (!normalizer) {
52  "[normalizer] failed to register normalizer: <%.*s>",
53  name_length, name_ptr);
54  return ctx->rc;
55  }
56  }
57  return GRN_SUCCESS;
58 }
59 
/*
 * Hook that performs no work and reports GRN_SUCCESS.
 * NOTE(review): listing line 61, which carries the function name and
 * parameter list, is missing from this rendering -- confirm against the
 * original file.
 */
60 grn_rc
62 {
63  return GRN_SUCCESS;
64 }
65 
/*
 * Hook that performs no work and reports GRN_SUCCESS.
 * NOTE(review): listing line 67, which carries the function name and
 * parameter list, is missing from this rendering -- confirm against the
 * original file.
 */
66 grn_rc
68 {
69  return GRN_SUCCESS;
70 }
71 
/*
 * ASCII replacements for two-byte symbol characters. An entry of 0 means
 * "no replacement": the caller then keeps the original two-byte form.
 *
 * eucjp_normalize() indexes it with (c2 - 0xa4) for lead byte 0xa1.
 * sjis_normalize() indexes it with (c2 - 0x43) for trail bytes 0x43..0x7e
 * and with (c2 - 0x44) for trail bytes 0x7f..0x97, both for lead byte 0x81.
 */
72 static unsigned char symbol[] = {
73  ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0,
74  0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0,
75  '"', '(', ')', 0, 0, '[', ']', '{', '}', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
76  '+', '-', 0, 0, 0, '=', 0, '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
77  '$', 0, 0, '%', '#', '&', '*', '@', 0, 0, 0, 0, 0, 0, 0, 0
78 };
79 
/*
 * Normalizes the EUC-JP text in nstr->original into a newly allocated
 * nstr->normalized buffer (NUL-terminated).
 *
 * Depending on nstr->flags it also allocates and fills:
 *  - nstr->checks (GRN_STRING_WITH_CHECKS): at the first output byte of each
 *    emitted character, the number of source bytes consumed since the
 *    previous emitted character (skipped bytes included); 0 for the remaining
 *    output bytes of that character;
 *  - nstr->ctypes (GRN_STRING_WITH_TYPES): one GRN_CHAR_* value per emitted
 *    character, terminated by GRN_CHAR_NULL.
 * With GRN_STRING_REMOVE_BLANK, spaces are dropped instead of copied.
 * On completion nstr->n_characters and nstr->normalized_length_in_bytes are
 * set.
 *
 * Always returns NULL. After an allocation failure nstr->normalized is NULL.
 *
 * NOTE(review): this listing skips source lines 109, 118, 130, 198 and 276.
 * The first three look like the openings of the error-report calls whose
 * message arguments remain; 198 and 276 directly follow `*d = ' ';` and
 * presumably assign ctype for a kept blank. Restore them from the original
 * file.
 */
80 inline static grn_obj *
81 eucjp_normalize(grn_ctx *ctx, grn_string *nstr)
82 {
 /* Two-byte replacement codes for 0x8e-prefixed characters, indexed by
  * (second byte - 0xa0); presumably half-width katakana -- TODO confirm. */
83  static uint16_t hankana[] = {
84  0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3,
85  0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2,
86  0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3,
87  0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6,
88  0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5,
89  0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6,
90  0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab,
91  0xa1eb
92  };
 /* Replacement second byte when mark 0xa1ab follows an emitted 0xa5 XX
  * character; indexed by (XX - 0xa6), 0 means the pair does not merge. */
93  static unsigned char dakuten[] = {
94  0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0,
95  0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7,
96  0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 0xd9, 0,
97  0, 0xdc
98  };
 /* Same idea for mark 0xa1eb; indexed by (XX - 0xcf). */
99  static unsigned char handaku[] = {
100  0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd
101  };
102  int16_t *ch;
103  const unsigned char *s, *s_, *e;
104  unsigned char *d, *d0, *d_, b;
105  uint_least8_t *cp, *ctypes, ctype;
106  size_t size = nstr->original_length_in_bytes, length = 0;
107  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
 /* Output buffer is sized at twice the input plus the terminator. */
108  if (!(nstr->normalized = GRN_MALLOC(size * 2 + 1))) {
110  "[string][eucjp] failed to allocate normalized text space");
111  return NULL;
112  }
113  d0 = (unsigned char *) nstr->normalized;
114  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
115  if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
116  GRN_FREE(nstr->normalized);
117  nstr->normalized = NULL;
119  "[string][eucjp] failed to allocate checks space");
120  return NULL;
121  }
122  }
123  ch = nstr->checks;
124  if (nstr->flags & GRN_STRING_WITH_TYPES) {
125  if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
 /* NOTE(review): checks is freed here even when GRN_STRING_WITH_CHECKS was
  * not requested; utf8_normalize() guards the same call with
  * `if (nstr->checks)` -- confirm GRN_FREE accepts NULL. */
126  GRN_FREE(nstr->checks);
127  GRN_FREE(nstr->normalized);
128  nstr->checks = NULL;
129  nstr->normalized = NULL;
131  "[string][eucjp] failed to allocate character types space");
132  return NULL;
133  }
134  }
135  cp = ctypes = nstr->ctypes;
136  e = (unsigned char *)nstr->original + size;
 /* s: current source byte; s_: source position just after the last emitted
  * character; d: next output byte; d_: output position used to zero-fill
  * checks for multi-byte output. */
137  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
138  if ((*s & 0x80)) {
 /* Two-byte character: both bytes must have the high bit set. */
139  if (((s + 1) < e) && (*(s + 1) & 0x80)) {
140  unsigned char c1 = *s++, c2 = *s, c3 = 0;
141  switch (c1 >> 4) {
142  case 0x08 :
143  if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) {
144  uint16_t c = hankana[c2 - 0xa0];
145  switch (c) {
146  case 0xa1ab :
 /* Fold the mark into the previously emitted 0xa5 XX character when the
  * table has a merged form; nothing new is emitted, the last checks entry
  * grows by the two source bytes of the mark. */
147  if (d > d0 + 1 && d[-2] == 0xa5
148  && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) {
149  *(d - 1) = b;
150  if (ch) { ch[-1] += 2; s_ += 2; }
151  continue;
152  } else {
153  *d++ = c >> 8; *d = c & 0xff;
154  }
155  break;
156  case 0xa1eb :
157  if (d > d0 + 1 && d[-2] == 0xa5
158  && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) {
159  *(d - 1) = b;
160  if (ch) { ch[-1] += 2; s_ += 2; }
161  continue;
162  } else {
163  *d++ = c >> 8; *d = c & 0xff;
164  }
165  break;
166  default :
167  *d++ = c >> 8; *d = c & 0xff;
168  break;
169  }
170  ctype = GRN_CHAR_KATAKANA;
171  } else {
172  *d++ = c1; *d = c2;
173  ctype = GRN_CHAR_OTHERS;
174  }
175  break;
176  case 0x09 :
177  *d++ = c1; *d = c2;
178  ctype = GRN_CHAR_OTHERS;
179  break;
180  case 0x0a :
 /* Lead bytes 0xa1..0xaf: classify by the low nibble of the lead byte. */
181  switch (c1 & 0x0f) {
182  case 1 :
183  switch (c2) {
184  case 0xbc :
185  *d++ = c1; *d = c2;
186  ctype = GRN_CHAR_KATAKANA;
187  break;
188  case 0xb9 :
189  *d++ = c1; *d = c2;
190  ctype = GRN_CHAR_KANJI;
191  break;
192  case 0xa1 :
 /* 0xa1a1 becomes an ASCII space, or is dropped (flagging the previous
  * character as followed by a blank) when blanks are removed. */
193  if (removeblankp) {
194  if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
195  continue;
196  } else {
197  *d = ' ';
199  }
200  break;
201  default :
 /* Use the one-byte ASCII replacement from symbol[] when it has one. */
202  if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) {
203  *d = c3;
204  ctype = GRN_CHAR_SYMBOL;
205  } else {
206  *d++ = c1; *d = c2;
207  ctype = GRN_CHAR_OTHERS;
208  }
209  break;
210  }
211  break;
212  case 2 :
213  *d++ = c1; *d = c2;
214  ctype = GRN_CHAR_SYMBOL;
215  break;
216  case 3 :
 /* Lead byte 0xa3: (c2 - 0x80) is compared against ASCII letters and
  * digits; matches are emitted as one ASCII byte, upper case lowered. */
217  c3 = c2 - 0x80;
218  if ('a' <= c3 && c3 <= 'z') {
219  ctype = GRN_CHAR_ALPHA;
220  *d = c3;
221  } else if ('A' <= c3 && c3 <= 'Z') {
222  ctype = GRN_CHAR_ALPHA;
223  *d = c3 + 0x20;
224  } else if ('0' <= c3 && c3 <= '9') {
225  ctype = GRN_CHAR_DIGIT;
226  *d = c3;
227  } else {
228  ctype = GRN_CHAR_OTHERS;
229  *d++ = c1; *d = c2;
230  }
231  break;
232  case 4 :
233  *d++ = c1; *d = c2;
234  ctype = GRN_CHAR_HIRAGANA;
235  break;
236  case 5 :
237  *d++ = c1; *d = c2;
238  ctype = GRN_CHAR_KATAKANA;
239  break;
240  case 6 :
241  case 7 :
242  case 8 :
243  *d++ = c1; *d = c2;
244  ctype = GRN_CHAR_SYMBOL;
245  break;
246  default :
247  *d++ = c1; *d = c2;
248  ctype = GRN_CHAR_OTHERS;
249  break;
250  }
251  break;
252  default :
 /* Every other lead byte is copied through and typed as kanji. */
253  *d++ = c1; *d = c2;
254  ctype = GRN_CHAR_KANJI;
255  break;
256  }
257  } else {
258  /* skip invalid character */
259  continue;
260  }
261  } else {
 /* Single-byte (ASCII) character: classify by its high nibble. */
262  unsigned char c = *s;
263  switch (c >> 4) {
264  case 0 :
265  case 1 :
266  /* skip unprintable ascii */
267  if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
268  continue;
269  case 2 :
270  if (c == 0x20) {
271  if (removeblankp) {
272  if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
273  continue;
274  } else {
275  *d = ' ';
277  }
278  } else {
279  *d = c;
280  ctype = GRN_CHAR_SYMBOL;
281  }
282  break;
283  case 3 :
284  *d = c;
285  ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL;
286  break;
287  case 4 :
 /* '@' (0x40) is kept; 'A'..'O' are lowered by adding 0x20. */
288  *d = ('A' <= c) ? c + 0x20 : c;
289  ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
290  break;
291  case 5 :
 /* 'P'..'Z' are lowered; 0x5b..0x5f are kept as symbols. */
292  *d = (c <= 'Z') ? c + 0x20 : c;
293  ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL;
294  break;
295  case 6 :
296  *d = c;
297  ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
298  break;
299  case 7 :
300  *d = c;
301  ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL);
302  break;
303  default :
304  *d = c;
305  ctype = GRN_CHAR_OTHERS;
306  break;
307  }
308  }
 /* One character was emitted: d still points at its last byte. */
309  d++;
310  length++;
311  if (cp) { *cp++ = ctype; }
312  if (ch) {
313  *ch++ = (int16_t)(s + 1 - s_);
314  s_ = s + 1;
 /* Zero-fill checks for the remaining output bytes of this character. */
315  while (++d_ < d) { *ch++ = 0; }
316  }
317  }
318  if (cp) { *cp = GRN_CHAR_NULL; }
319  *d = '\0';
320  nstr->n_characters = length;
321  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
322  return NULL;
323 }
324 
/*
 * Normalizes the Shift_JIS text in nstr->original into a newly allocated
 * nstr->normalized buffer (NUL-terminated).
 *
 * Depending on nstr->flags it also allocates and fills:
 *  - nstr->checks (GRN_STRING_WITH_CHECKS): at the first output byte of each
 *    emitted character, the number of source bytes consumed since the
 *    previous emitted character (skipped bytes included); 0 for the remaining
 *    output bytes of that character;
 *  - nstr->ctypes (GRN_STRING_WITH_TYPES): one GRN_CHAR_* value per emitted
 *    character, terminated by GRN_CHAR_NULL.
 * With GRN_STRING_REMOVE_BLANK, spaces are dropped instead of copied.
 * On completion nstr->n_characters and nstr->normalized_length_in_bytes are
 * set.
 *
 * Always returns NULL. After an allocation failure nstr->normalized is NULL.
 *
 * NOTE(review): this listing skips source lines 354, 363, 375, 433 and 512.
 * The first three look like the openings of the error-report calls whose
 * message arguments remain; 433 and 512 directly follow `*d = ' ';` and
 * presumably assign ctype for a kept blank. Restore them from the original
 * file.
 */
325 inline static grn_obj *
326 sjis_normalize(grn_ctx *ctx, grn_string *nstr)
327 {
 /* Two-byte replacement codes for single bytes 0xa0..0xdf, indexed by
  * (byte - 0xa0); presumably half-width katakana -- TODO confirm. */
328  static uint16_t hankana[] = {
329  0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342,
330  0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341,
331  0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352,
332  0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365,
333  0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374,
334  0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386,
335  0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a,
336  0x814b
337  };
 /* Replacement second byte when mark 0x814a follows an emitted 0x83 XX
  * character; indexed by (XX - 0x45), 0 means the pair does not merge. */
338  static unsigned char dakuten[] = {
339  0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0,
340  0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66,
341  0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0,
342  0, 0x7b
343  };
 /* Same idea for mark 0x814b; indexed by (XX - 0x6e). */
344  static unsigned char handaku[] = {
345  0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c
346  };
347  int16_t *ch;
348  const unsigned char *s, *s_;
349  unsigned char *d, *d0, *d_, b, *e;
350  uint_least8_t *cp, *ctypes, ctype;
351  size_t size = nstr->original_length_in_bytes, length = 0;
352  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
 /* Output buffer is sized at twice the input plus the terminator (each
  * single byte in 0xa0..0xdf expands to two bytes). */
353  if (!(nstr->normalized = GRN_MALLOC(size * 2 + 1))) {
355  "[string][sjis] failed to allocate normalized text space");
356  return NULL;
357  }
358  d0 = (unsigned char *) nstr->normalized;
359  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
360  if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
361  GRN_FREE(nstr->normalized);
362  nstr->normalized = NULL;
364  "[string][sjis] failed to allocate checks space");
365  return NULL;
366  }
367  }
368  ch = nstr->checks;
369  if (nstr->flags & GRN_STRING_WITH_TYPES) {
370  if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
 /* NOTE(review): checks is freed here even when GRN_STRING_WITH_CHECKS was
  * not requested; utf8_normalize() guards the same call with
  * `if (nstr->checks)` -- confirm GRN_FREE accepts NULL. */
371  GRN_FREE(nstr->checks);
372  GRN_FREE(nstr->normalized);
373  nstr->checks = NULL;
374  nstr->normalized = NULL;
376  "[string][sjis] failed to allocate character types space");
377  return NULL;
378  }
379  }
380  cp = ctypes = nstr->ctypes;
381  e = (unsigned char *)nstr->original + size;
 /* s: current source byte; s_: source position just after the last emitted
  * character; d: next output byte; d_: output position used to zero-fill
  * checks for multi-byte output. */
382  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
383  if ((*s & 0x80)) {
384  if (0xa0 <= *s && *s <= 0xdf) {
 /* Single byte 0xa0..0xdf: replaced by its two-byte code from hankana[]. */
385  uint16_t c = hankana[*s - 0xa0];
386  switch (c) {
387  case 0x814a :
 /* Fold the mark into the previously emitted 0x83 XX character when the
  * table has a merged form; nothing new is emitted, the last checks entry
  * grows by the one source byte of the mark. */
388  if (d > d0 + 1 && d[-2] == 0x83
389  && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) {
390  *(d - 1) = b;
391  if (ch) { ch[-1]++; s_++; }
392  continue;
393  } else {
394  *d++ = c >> 8; *d = c & 0xff;
395  }
396  break;
397  case 0x814b :
398  if (d > d0 + 1 && d[-2] == 0x83
399  && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) {
400  *(d - 1) = b;
401  if (ch) { ch[-1]++; s_++; }
402  continue;
403  } else {
404  *d++ = c >> 8; *d = c & 0xff;
405  }
406  break;
407  default :
408  *d++ = c >> 8; *d = c & 0xff;
409  break;
410  }
411  ctype = GRN_CHAR_KATAKANA;
412  } else {
 /* Two-byte character: the trail byte must lie in 0x40..0xfc. */
413  if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) {
414  unsigned char c1 = *s++, c2 = *s, c3 = 0;
415  if (0x81 <= c1 && c1 <= 0x87) {
 /* Lead bytes 0x81..0x87: classify by the low nibble of the lead byte. */
416  switch (c1 & 0x0f) {
417  case 1 :
418  switch (c2) {
419  case 0x5b :
420  *d++ = c1; *d = c2;
421  ctype = GRN_CHAR_KATAKANA;
422  break;
423  case 0x58 :
424  *d++ = c1; *d = c2;
425  ctype = GRN_CHAR_KANJI;
426  break;
427  case 0x40 :
 /* 0x8140 becomes an ASCII space, or is dropped (flagging the previous
  * character as followed by a blank) when blanks are removed. */
428  if (removeblankp) {
429  if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
430  continue;
431  } else {
432  *d = ' ';
434  }
435  break;
436  default :
 /* Use the one-byte ASCII replacement from symbol[] when it has one; the
  * index offset differs by one between trail bytes below and from 0x7f. */
437  if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) {
438  *d = c3;
439  ctype = GRN_CHAR_SYMBOL;
440  } else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) {
441  *d = c3;
442  ctype = GRN_CHAR_SYMBOL;
443  } else {
444  *d++ = c1; *d = c2;
445  ctype = GRN_CHAR_OTHERS;
446  }
447  break;
448  }
449  break;
450  case 2 :
 /* Lead byte 0x82: three trail-byte ranges become one ASCII byte each
  * (digits via -0x1f, letters via +0x01 or -0x20). c3 is assigned here but
  * not read again in this case. */
451  c3 = c2 - 0x1f;
452  if (0x4f <= c2 && c2 <= 0x58) {
453  ctype = GRN_CHAR_DIGIT;
454  *d = c2 - 0x1f;
455  } else if (0x60 <= c2 && c2 <= 0x79) {
456  ctype = GRN_CHAR_ALPHA;
457  *d = c2 + 0x01;
458  } else if (0x81 <= c2 && c2 <= 0x9a) {
459  ctype = GRN_CHAR_ALPHA;
460  *d = c2 - 0x20;
461  } else if (0x9f <= c2 && c2 <= 0xf1) {
462  *d++ = c1; *d = c2;
463  ctype = GRN_CHAR_HIRAGANA;
464  } else {
465  *d++ = c1; *d = c2;
466  ctype = GRN_CHAR_OTHERS;
467  }
468  break;
469  case 3 :
470  if (0x40 <= c2 && c2 <= 0x96) {
471  *d++ = c1; *d = c2;
472  ctype = GRN_CHAR_KATAKANA;
473  } else {
474  *d++ = c1; *d = c2;
475  ctype = GRN_CHAR_SYMBOL;
476  }
477  break;
478  case 4 :
479  case 7 :
480  *d++ = c1; *d = c2;
481  ctype = GRN_CHAR_SYMBOL;
482  break;
483  default :
484  *d++ = c1; *d = c2;
485  ctype = GRN_CHAR_OTHERS;
486  break;
487  }
488  } else {
 /* Every other lead byte is copied through and typed as kanji. */
489  *d++ = c1; *d = c2;
490  ctype = GRN_CHAR_KANJI;
491  }
492  } else {
493  /* skip invalid character */
494  continue;
495  }
496  }
497  } else {
 /* Single-byte (ASCII) character: classify by its high nibble. */
498  unsigned char c = *s;
499  switch (c >> 4) {
500  case 0 :
501  case 1 :
502  /* skip unprintable ascii */
503  if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
504  continue;
505  case 2 :
506  if (c == 0x20) {
507  if (removeblankp) {
508  if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
509  continue;
510  } else {
511  *d = ' ';
513  }
514  } else {
515  *d = c;
516  ctype = GRN_CHAR_SYMBOL;
517  }
518  break;
519  case 3 :
520  *d = c;
521  ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL;
522  break;
523  case 4 :
 /* '@' (0x40) is kept; 'A'..'O' are lowered by adding 0x20. */
524  *d = ('A' <= c) ? c + 0x20 : c;
525  ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
526  break;
527  case 5 :
 /* 'P'..'Z' are lowered; 0x5b..0x5f are kept as symbols. */
528  *d = (c <= 'Z') ? c + 0x20 : c;
529  ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL;
530  break;
531  case 6 :
532  *d = c;
533  ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
534  break;
535  case 7 :
536  *d = c;
537  ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL);
538  break;
539  default :
540  *d = c;
541  ctype = GRN_CHAR_OTHERS;
542  break;
543  }
544  }
 /* One character was emitted: d still points at its last byte. */
545  d++;
546  length++;
547  if (cp) { *cp++ = ctype; }
548  if (ch) {
549  *ch++ = (int16_t)(s + 1 - s_);
550  s_ = s + 1;
 /* Zero-fill checks for the remaining output bytes of this character. */
551  while (++d_ < d) { *ch++ = 0; }
552  }
553  }
554  if (cp) { *cp = GRN_CHAR_NULL; }
555  *d = '\0';
556  nstr->n_characters = length;
557  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
558  return NULL;
559 }
560 
561 #ifdef GRN_WITH_NFKC
/*
 * NFKC lookup helpers defined outside this file. As used by utf8_normalize():
 *  - grn_nfkc_map1() returns a NUL-terminated replacement for the character
 *    at str, or NULL when the character is to be copied unchanged;
 *  - grn_nfkc_map2() returns a NUL-terminated replacement for the pair
 *    (previous output character, next character), or NULL when there is none.
 */
562 const char *grn_nfkc_map1(const unsigned char *str);
563 const char *grn_nfkc_map2(const unsigned char *prefix, const unsigned char *suffix);
564 
/*
 * Returns the byte length of the UTF-8 character starting at str, looking no
 * further than end (exclusive).
 *
 * Returns 0 when str is at or past end, when the first byte is NUL, or when
 * the sequence is rejected: a lead byte of the form 10xxxxxx, a sequence cut
 * off by end, a NUL inside the sequence, or a following byte that is not of
 * the form 10xxxxxx.
 *
 * NOTE(review): this listing skips source lines 578, 587, 595 and 601, which
 * look like the openings of the logging calls whose format/arguments remain.
 * Restore them from the original file.
 */
565 static inline int
566 grn_str_charlen_utf8(grn_ctx *ctx, const unsigned char *str, const unsigned char *end)
567 {
568  /* MEMO: This function allows non-null-terminated string as str. */
569  /* But requires the end of string. */
570  const unsigned char *p = str;
571  if (end <= p || !*p) { return 0; }
572  if (*p & 0x80) {
573  int b, w;
574  int size;
575  int i;
 /* Count the consecutive 1 bits below the top bit of the lead byte; that
  * count (w) is the number of bytes expected after the lead byte. */
576  for (b = 0x40, w = 0; b && (*p & b); b >>= 1, w++);
577  if (!w) {
579  "invalid utf8 string: the first bit is 0x80: <%.*s>: <%.*s>",
580  (int)(end - p), p,
581  (int)(end - str), str);
582  return 0;
583  }
584  size = w + 1;
 /* Validate each following byte: in range, non-NUL, and 10xxxxxx. */
585  for (i = 1; i < size; i++) {
586  if (++p >= end) {
588  "invalid utf8 string: too short: "
589  "%d byte is required but %d byte is given: <%.*s>",
590  size, i,
591  (int)(end - str), str);
592  return 0;
593  }
594  if (!*p) {
596  "invalid utf8 string: NULL character is found: <%.*s>",
597  (int)(end - str), str);
598  return 0;
599  }
600  if ((*p & 0xc0) != 0x80) {
602  "invalid utf8 string: 0x80 is not allowed: <%.*s>: <%.*s>",
603  (int)(end - p), p,
604  (int)(end - str), str);
605  return 0;
606  }
607  }
608  return size;
609  } else {
610  return 1;
611  }
 /* Not reached: both branches above return. */
612  return 0;
613 }
614 
/*
 * Normalizes the UTF-8 text in nstr->original into a newly allocated
 * nstr->normalized buffer (NUL-terminated), using grn_nfkc_map1() to replace
 * single characters and grn_nfkc_map2() to merge a character with the
 * previously emitted one.
 *
 * Depending on nstr->flags it also allocates and fills:
 *  - nstr->checks (GRN_STRING_WITH_CHECKS): at the first output byte of each
 *    emitted character, the number of source bytes consumed since the
 *    previous emitted character, or -1 when the character comes from the same
 *    source character as the one before it; 0 for its remaining output bytes;
 *  - nstr->ctypes (GRN_STRING_WITH_TYPES): one grn_nfkc_char_type() value per
 *    emitted character, terminated by GRN_CHAR_NULL.
 * Spaces (with GRN_STRING_REMOVE_BLANK) and bytes below 0x20 are dropped.
 * The buffers start at three times the input size and are grown on demand.
 * Scanning stops at the first position where grn_str_charlen_utf8() returns 0.
 *
 * Always returns NULL. After an allocation failure nstr->normalized is NULL.
 *
 * NOTE(review): this listing skips source lines 625, 627, 635, 645, 697, 710
 * and 723. Line 625 is the right-hand side of the
 * remove_tokenized_delimiter_p initializer; the others look like the openings
 * of the error-report calls whose message arguments remain. Restore them from
 * the original file.
 */
615 inline static grn_obj *
616 utf8_normalize(grn_ctx *ctx, grn_string *nstr)
617 {
618  int16_t *ch;
619  const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e;
620  unsigned char *d, *d_, *de;
621  uint_least8_t *cp;
622  size_t length = 0, ls, lp, size = nstr->original_length_in_bytes, ds = size * 3;
623  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
624  grn_bool remove_tokenized_delimiter_p =
626  if (!(nstr->normalized = GRN_MALLOC(ds + 1))) {
628  "[string][utf8] failed to allocate normalized text space");
629  return NULL;
630  }
631  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
632  if (!(nstr->checks = GRN_MALLOC(ds * sizeof(int16_t) + 1))) {
633  GRN_FREE(nstr->normalized);
634  nstr->normalized = NULL;
636  "[string][utf8] failed to allocate checks space");
637  return NULL;
638  }
639  }
640  ch = nstr->checks;
641  if (nstr->flags & GRN_STRING_WITH_TYPES) {
642  if (!(nstr->ctypes = GRN_MALLOC(ds + 1))) {
643  if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
644  GRN_FREE(nstr->normalized); nstr->normalized = NULL;
646  "[string][utf8] failed to allocate character types space");
647  return NULL;
648  }
649  }
650  cp = nstr->ctypes;
651  d = (unsigned char *)nstr->normalized;
 /* de: end of the output capacity; d_: start of the last emitted character
  * (NULL until something has been emitted). */
652  de = d + ds;
653  d_ = NULL;
654  e = (unsigned char *)nstr->original + size;
 /* Outer loop: one source character (ls bytes) per iteration. s_ is the
  * source position just after the last character accounted for in checks;
  * s__ remembers the s_ value before that, for the merge rewind below. */
655  for (s = s_ = (unsigned char *)nstr->original; ; s += ls) {
656  if (!(ls = grn_str_charlen_utf8(ctx, s, e))) {
657  break;
658  }
659  if (remove_tokenized_delimiter_p &&
660  grn_tokenizer_is_tokenized_delimiter(ctx, (const char *)s, ls,
661  GRN_ENC_UTF8)) {
662  continue;
663  }
 /* p..pe is the text to emit for this source character: the map1
  * replacement when there is one, otherwise the character itself. */
664  if ((p = (unsigned char *)grn_nfkc_map1(s))) {
665  pe = p + strlen((char *)p);
666  } else {
667  p = s;
668  pe = p + ls;
669  }
 /* If the previously emitted character and this text have a map2
  * replacement, rewind the output cursors to the start of that previous
  * character so the replacement overwrites it. */
670  if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) {
671  p = p2;
672  pe = p + strlen((char *)p);
673  if (cp) { cp--; }
674  if (ch) {
675  ch -= (d - d_);
 /* A non-negative entry means the rewound character opened a new source
  * span; restore s_ so the next checks entry covers both characters. */
676  if (ch[0] >= 0) {
677  s_ = s__;
678  }
679  }
680  d = d_;
681  length--;
682  }
 /* Inner loop: one character (lp bytes) of the text to emit. */
683  for (; ; p += lp) {
684  if (!(lp = grn_str_charlen_utf8(ctx, p, pe))) {
685  break;
686  }
687  if ((*p == ' ' && removeblankp) || *p < 0x20 /* skip unprintable ascii */ ) {
688  if (cp > nstr->ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
689  } else {
 /* Grow all buffers by half their size plus lp when the next character
  * would not fit, re-basing d, ch and cp onto the new allocations. */
690  if (de <= d + lp) {
691  unsigned char *normalized;
692  ds += (ds >> 1) + lp;
693  if (!(normalized = GRN_REALLOC(nstr->normalized, ds + 1))) {
694  if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
695  if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
696  GRN_FREE(nstr->normalized); nstr->normalized = NULL;
698  "[string][utf8] failed to expand normalized text space");
699  return NULL;
700  }
701  de = normalized + ds;
702  d = normalized + (d - (unsigned char *)nstr->normalized);
703  nstr->normalized = (char *)normalized;
704  if (ch) {
705  int16_t *checks;
706  if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(int16_t) + 1))) {
707  if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
708  GRN_FREE(nstr->checks); nstr->checks = NULL;
709  GRN_FREE(nstr->normalized); nstr->normalized = NULL;
711  "[string][utf8] failed to expand checks space");
712  return NULL;
713  }
714  ch = checks + (ch - nstr->checks);
715  nstr->checks = checks;
716  }
717  if (cp) {
718  uint_least8_t *ctypes;
719  if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) {
720  GRN_FREE(nstr->ctypes); nstr->ctypes = NULL;
721  if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
722  GRN_FREE(nstr->normalized); nstr->normalized = NULL;
724  "[string][utf8] failed to expand character types space");
725  return NULL;
726  }
727  cp = ctypes + (cp - nstr->ctypes);
728  nstr->ctypes = ctypes;
729  }
730  }
731  memcpy(d, p, lp);
 /* d_ is refreshed after any reallocation above, so it always points into
  * the current output buffer. */
732  d_ = d;
733  d += lp;
734  length++;
735  if (cp) { *cp++ = grn_nfkc_char_type(p); }
736  if (ch) {
737  size_t i;
 /* s_ == s + ls means this source character has already been accounted
  * for by an earlier output character: mark with -1. */
738  if (s_ == s + ls) {
739  *ch++ = -1;
740  } else {
741  *ch++ = (int16_t)(s + ls - s_);
742  s__ = s_;
743  s_ = s + ls;
744  }
745  for (i = lp; i > 1; i--) { *ch++ = 0; }
746  }
747  }
748  }
749  }
750  if (cp) { *cp = GRN_CHAR_NULL; }
751  *d = '\0';
752  nstr->n_characters = length;
753  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
754  return NULL;
755 }
756 #endif /* GRN_WITH_NFKC */
757 
/*
 * Normalizes single-byte text in nstr->original into a newly allocated
 * nstr->normalized buffer (NUL-terminated): bytes below 0x20 are dropped,
 * 'A'..'Z' are lowered, everything else is copied. Bytes 0x80 and above are
 * copied unchanged and typed GRN_CHAR_OTHERS.
 *
 * Depending on nstr->flags it also allocates and fills:
 *  - nstr->checks (GRN_STRING_WITH_CHECKS): for each emitted byte, the number
 *    of source bytes consumed since the previous emitted byte (skipped bytes
 *    included);
 *  - nstr->ctypes (GRN_STRING_WITH_TYPES): one GRN_CHAR_* value per emitted
 *    byte, terminated by GRN_CHAR_NULL.
 * With GRN_STRING_REMOVE_BLANK, spaces are dropped instead of copied.
 * On completion nstr->n_characters and nstr->normalized_length_in_bytes are
 * set.
 *
 * Always returns NULL. After an allocation failure nstr->normalized is NULL.
 *
 * NOTE(review): this listing skips source lines 768, 777, 789 and 811. The
 * first three look like the openings of the error-report calls whose message
 * arguments remain; 811 directly follows `*d = ' ';` and presumably assigns
 * ctype for a kept blank. Restore them from the original file.
 */
758 inline static grn_obj *
759 ascii_normalize(grn_ctx *ctx, grn_string *nstr)
760 {
761  int16_t *ch;
762  const unsigned char *s, *s_, *e;
763  unsigned char *d, *d0, *d_;
764  uint_least8_t *cp, *ctypes, ctype;
765  size_t size = nstr->original_length_in_bytes, length = 0;
766  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
767  if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
769  "[string][ascii] failed to allocate normalized text space");
770  return NULL;
771  }
772  d0 = (unsigned char *) nstr->normalized;
773  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
774  if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
775  GRN_FREE(nstr->normalized);
776  nstr->normalized = NULL;
778  "[string][ascii] failed to allocate checks space");
779  return NULL;
780  }
781  }
782  ch = nstr->checks;
783  if (nstr->flags & GRN_STRING_WITH_TYPES) {
784  if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
 /* NOTE(review): checks is freed here even when GRN_STRING_WITH_CHECKS was
  * not requested; utf8_normalize() guards the same call with
  * `if (nstr->checks)` -- confirm GRN_FREE accepts NULL. */
785  GRN_FREE(nstr->checks);
786  GRN_FREE(nstr->normalized);
787  nstr->checks = NULL;
788  nstr->normalized = NULL;
790  "[string][ascii] failed to allocate character types space");
791  return NULL;
792  }
793  }
794  cp = ctypes = nstr->ctypes;
795  e = (unsigned char *)nstr->original + size;
 /* s: current source byte; s_: source position just after the last emitted
  * byte; d: next output byte. */
796  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
797  unsigned char c = *s;
 /* Classify by the high nibble of the byte. */
798  switch (c >> 4) {
799  case 0 :
800  case 1 :
801  /* skip unprintable ascii */
802  if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
803  continue;
804  case 2 :
805  if (c == 0x20) {
806  if (removeblankp) {
807  if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
808  continue;
809  } else {
810  *d = ' ';
812  }
813  } else {
814  *d = c;
815  ctype = GRN_CHAR_SYMBOL;
816  }
817  break;
818  case 3 :
819  *d = c;
820  ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL;
821  break;
822  case 4 :
 /* '@' (0x40) is kept; 'A'..'O' are lowered by adding 0x20. */
823  *d = ('A' <= c) ? c + 0x20 : c;
824  ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
825  break;
826  case 5 :
 /* 'P'..'Z' are lowered; 0x5b..0x5f are kept as symbols. */
827  *d = (c <= 'Z') ? c + 0x20 : c;
828  ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL;
829  break;
830  case 6 :
831  *d = c;
832  ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
833  break;
834  case 7 :
835  *d = c;
836  ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL);
837  break;
838  default :
839  *d = c;
840  ctype = GRN_CHAR_OTHERS;
841  break;
842  }
843  d++;
844  length++;
845  if (cp) { *cp++ = ctype; }
846  if (ch) {
847  *ch++ = (int16_t)(s + 1 - s_);
848  s_ = s + 1;
 /* Output is always one byte per character here, so this loop only
  * advances d_ alongside d. */
849  while (++d_ < d) { *ch++ = 0; }
850  }
851  }
852  if (cp) { *cp = GRN_CHAR_NULL; }
853  *d = '\0';
854  nstr->n_characters = length;
855  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
856  return NULL;
857 }
858 
859 /* use cp1252 as latin1 */
/*
 * Normalizes single-byte text in nstr->original into a newly allocated
 * nstr->normalized buffer (NUL-terminated). Bytes below 0x20 are dropped and
 * 'A'..'Z' are lowered. In the upper half, 0x8a/0x8c/0x8e map to +0x10,
 * 0x9f maps to 0xff, 0xc0..0xde map to +0x20 except 0xd7, and the rest is
 * copied unchanged.
 *
 * Depending on nstr->flags it also allocates and fills:
 *  - nstr->checks (GRN_STRING_WITH_CHECKS): for each emitted byte, the number
 *    of source bytes consumed since the previous emitted byte (skipped bytes
 *    included);
 *  - nstr->ctypes (GRN_STRING_WITH_TYPES): one GRN_CHAR_* value per emitted
 *    byte, terminated by GRN_CHAR_NULL.
 * With GRN_STRING_REMOVE_BLANK, spaces are dropped instead of copied.
 * On completion nstr->n_characters and nstr->normalized_length_in_bytes are
 * set.
 *
 * Always returns NULL. After an allocation failure nstr->normalized is NULL.
 *
 * NOTE(review): this listing skips source lines 870, 879, 891 and 913. The
 * first three look like the openings of the error-report calls whose message
 * arguments remain; 913 directly follows `*d = ' ';` and presumably assigns
 * ctype for a kept blank. Restore them from the original file.
 */
860 inline static grn_obj *
861 latin1_normalize(grn_ctx *ctx, grn_string *nstr)
862 {
863  int16_t *ch;
864  const unsigned char *s, *s_, *e;
865  unsigned char *d, *d0, *d_;
866  uint_least8_t *cp, *ctypes, ctype;
867  size_t size = nstr->original_length_in_bytes, length = 0;
868  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
869  if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
871  "[string][latin1] failed to allocate normalized text space");
872  return NULL;
873  }
874  d0 = (unsigned char *) nstr->normalized;
875  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
876  if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
877  GRN_FREE(nstr->normalized);
878  nstr->normalized = NULL;
880  "[string][latin1] failed to allocate checks space");
881  return NULL;
882  }
883  }
884  ch = nstr->checks;
885  if (nstr->flags & GRN_STRING_WITH_TYPES) {
886  if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
 /* NOTE(review): checks is freed here even when GRN_STRING_WITH_CHECKS was
  * not requested; utf8_normalize() guards the same call with
  * `if (nstr->checks)` -- confirm GRN_FREE accepts NULL. */
887  GRN_FREE(nstr->checks);
888  GRN_FREE(nstr->normalized);
889  nstr->checks = NULL;
890  nstr->normalized = NULL;
 /* NOTE(review): this message is tagged "[normalizer][latin1]" while the
  * two messages above use "[string][latin1]". */
892  "[normalizer][latin1] failed to allocate character types space");
893  return NULL;
894  }
895  }
896  cp = ctypes = nstr->ctypes;
897  e = (unsigned char *)nstr->original + size;
 /* s: current source byte; s_: source position just after the last emitted
  * byte; d: next output byte. */
898  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
899  unsigned char c = *s;
 /* Classify by the high nibble of the byte. */
900  switch (c >> 4) {
901  case 0 :
902  case 1 :
903  /* skip unprintable ascii */
904  if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
905  continue;
906  case 2 :
907  if (c == 0x20) {
908  if (removeblankp) {
909  if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
910  continue;
911  } else {
912  *d = ' ';
914  }
915  } else {
916  *d = c;
917  ctype = GRN_CHAR_SYMBOL;
918  }
919  break;
920  case 3 :
921  *d = c;
922  ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL;
923  break;
924  case 4 :
 /* '@' (0x40) is kept; 'A'..'O' are lowered by adding 0x20. */
925  *d = ('A' <= c) ? c + 0x20 : c;
926  ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
927  break;
928  case 5 :
 /* 'P'..'Z' are lowered; 0x5b..0x5f are kept as symbols. */
929  *d = (c <= 'Z') ? c + 0x20 : c;
930  ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL;
931  break;
932  case 6 :
933  *d = c;
934  ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
935  break;
936  case 7 :
937  *d = c;
938  ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL);
939  break;
940  case 8 :
 /* 0x8a, 0x8c, 0x8e map to the byte 0x10 above; presumably the cp1252
  * upper/lower pairs -- TODO confirm against the code page. */
941  if (c == 0x8a || c == 0x8c || c == 0x8e) {
942  *d = c + 0x10;
943  ctype = GRN_CHAR_ALPHA;
944  } else {
945  *d = c;
946  ctype = GRN_CHAR_SYMBOL;
947  }
948  break;
949  case 9 :
 /* 0x9a, 0x9c, 0x9e are kept as letters; 0x9f maps to 0xff. */
950  if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) {
951  *d = (c == 0x9f) ? c + 0x60 : c;
952  ctype = GRN_CHAR_ALPHA;
953  } else {
954  *d = c;
955  ctype = GRN_CHAR_SYMBOL;
956  }
957  break;
958  case 0x0c :
959  *d = c + 0x20;
960  ctype = GRN_CHAR_ALPHA;
961  break;
962  case 0x0d :
 /* 0xd7 (typed as a symbol) and 0xdf are kept; the rest is shifted +0x20. */
963  *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20;
964  ctype = (c == 0xd7) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
965  break;
966  case 0x0e :
967  *d = c;
968  ctype = GRN_CHAR_ALPHA;
969  break;
970  case 0x0f :
971  *d = c;
972  ctype = (c == 0xf7) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
973  break;
974  default :
 /* Reached for 0xa0..0xbf: copied unchanged. */
975  *d = c;
976  ctype = GRN_CHAR_OTHERS;
977  break;
978  }
979  d++;
980  length++;
981  if (cp) { *cp++ = ctype; }
982  if (ch) {
983  *ch++ = (int16_t)(s + 1 - s_);
984  s_ = s + 1;
 /* Output is always one byte per character here, so this loop only
  * advances d_ alongside d. */
985  while (++d_ < d) { *ch++ = 0; }
986  }
987  }
988  if (cp) { *cp = GRN_CHAR_NULL; }
989  *d = '\0';
990  nstr->n_characters = length;
991  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
992  return NULL;
993 }
994 
/*
 * Normalizes single-byte text in nstr->original into a newly allocated
 * nstr->normalized buffer (NUL-terminated). Bytes below 0x20 are dropped and
 * 'A'..'Z' are lowered. In the upper half, 0xb3 maps to 0xa3, 0xe0..0xff map
 * to the byte 0x20 below, and the rest is copied unchanged.
 *
 * Depending on nstr->flags it also allocates and fills:
 *  - nstr->checks (GRN_STRING_WITH_CHECKS): for each emitted byte, the number
 *    of source bytes consumed since the previous emitted byte (skipped bytes
 *    included);
 *  - nstr->ctypes (GRN_STRING_WITH_TYPES): one GRN_CHAR_* value per emitted
 *    byte, terminated by GRN_CHAR_NULL.
 * With GRN_STRING_REMOVE_BLANK, spaces are dropped instead of copied.
 * On completion nstr->n_characters and nstr->normalized_length_in_bytes are
 * set.
 *
 * Always returns NULL. After an allocation failure nstr->normalized is NULL.
 *
 * NOTE(review): this listing skips source lines 1005, 1014, 1026 and 1048.
 * The first three look like the openings of the error-report calls whose
 * message arguments remain; 1048 directly follows `*d = ' ';` and presumably
 * assigns ctype for a kept blank. Restore them from the original file.
 */
995 inline static grn_obj *
996 koi8r_normalize(grn_ctx *ctx, grn_string *nstr)
997 {
998  int16_t *ch;
999  const unsigned char *s, *s_, *e;
1000  unsigned char *d, *d0, *d_;
1001  uint_least8_t *cp, *ctypes, ctype;
1002  size_t size = nstr->original_length_in_bytes, length = 0;
1003  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
1004  if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
1006  "[string][koi8r] failed to allocate normalized text space");
1007  return NULL;
1008  }
1009  d0 = (unsigned char *) nstr->normalized;
1010  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
1011  if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
1012  GRN_FREE(nstr->normalized);
1013  nstr->normalized = NULL;
1015  "[string][koi8r] failed to allocate checks space");
1016  return NULL;
1017  }
1018  }
1019  ch = nstr->checks;
1020  if (nstr->flags & GRN_STRING_WITH_TYPES) {
1021  if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
 /* NOTE(review): checks is freed here even when GRN_STRING_WITH_CHECKS was
  * not requested; utf8_normalize() guards the same call with
  * `if (nstr->checks)` -- confirm GRN_FREE accepts NULL. */
1022  GRN_FREE(nstr->checks);
1023  GRN_FREE(nstr->normalized);
1024  nstr->checks = NULL;
1025  nstr->normalized = NULL;
1027  "[string][koi8r] failed to allocate character types space");
1028  return NULL;
1029  }
1030  }
1031  cp = ctypes = nstr->ctypes;
1032  e = (unsigned char *)nstr->original + size;
 /* s: current source byte; s_: source position just after the last emitted
  * byte; d: next output byte. */
1033  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
1034  unsigned char c = *s;
 /* Classify by the high nibble of the byte. */
1035  switch (c >> 4) {
1036  case 0 :
1037  case 1 :
1038  /* skip unprintable ascii */
1039  if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
1040  continue;
1041  case 2 :
1042  if (c == 0x20) {
1043  if (removeblankp) {
1044  if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
1045  continue;
1046  } else {
1047  *d = ' ';
1049  }
1050  } else {
1051  *d = c;
1052  ctype = GRN_CHAR_SYMBOL;
1053  }
1054  break;
1055  case 3 :
1056  *d = c;
1057  ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL;
1058  break;
1059  case 4 :
 /* '@' (0x40) is kept; 'A'..'O' are lowered by adding 0x20. */
1060  *d = ('A' <= c) ? c + 0x20 : c;
1061  ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
1062  break;
1063  case 5 :
 /* 'P'..'Z' are lowered; 0x5b..0x5f are kept as symbols. */
1064  *d = (c <= 'Z') ? c + 0x20 : c;
1065  ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL;
1066  break;
1067  case 6 :
1068  *d = c;
1069  ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
1070  break;
1071  case 7 :
1072  *d = c;
1073  ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL);
1074  break;
1075  case 0x0a :
1076  *d = c;
1077  ctype = (c == 0xa3) ? GRN_CHAR_ALPHA : GRN_CHAR_OTHERS;
1078  break;
1079  case 0x0b :
 /* 0xb3 is folded onto 0xa3, the one byte typed as a letter in the 0xa0
  * row above. */
1080  if (c == 0xb3) {
1081  *d = c - 0x10;
1082  ctype = GRN_CHAR_ALPHA;
1083  } else {
1084  *d = c;
1085  ctype = GRN_CHAR_OTHERS;
1086  }
1087  break;
1088  case 0x0c :
1089  case 0x0d :
1090  *d = c;
1091  ctype = GRN_CHAR_ALPHA;
1092  break;
1093  case 0x0e :
1094  case 0x0f :
 /* 0xe0..0xff are folded onto 0xc0..0xdf; presumably upper case onto
  * lower case -- TODO confirm against the KOI8-R table. */
1095  *d = c - 0x20;
1096  ctype = GRN_CHAR_ALPHA;
1097  break;
1098  default :
 /* Reached for 0x80..0x9f: copied unchanged. */
1099  *d = c;
1100  ctype = GRN_CHAR_OTHERS;
1101  break;
1102  }
1103  d++;
1104  length++;
1105  if (cp) { *cp++ = ctype; }
1106  if (ch) {
1107  *ch++ = (int16_t)(s + 1 - s_);
1108  s_ = s + 1;
 /* Output is always one byte per character here, so this loop only
  * advances d_ alongside d. */
1109  while (++d_ < d) { *ch++ = 0; }
1110  }
1111  }
1112  if (cp) { *cp = GRN_CHAR_NULL; }
1113  *d = '\0';
1114  nstr->n_characters = length;
1115  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
1116  return NULL;
1117 }
1118 
/*
 * Procedure callback that normalizes the grn_string passed as args[0] with
 * the routine matching its encoding.
 *
 * UTF-8 uses utf8_normalize() only when built with GRN_WITH_NFKC and falls
 * back to ascii_normalize() otherwise; encodings without a dedicated case
 * also use ascii_normalize().
 *
 * nargs and user_data are not read, and args[0] is used without checking
 * nargs. The result of the chosen routine is ignored; always returns NULL.
 */
1119 static grn_obj *
1120 auto_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
1121 {
1122  grn_string *string = (grn_string *)(args[0]);
1123  switch (string->encoding) {
1124  case GRN_ENC_EUC_JP :
1125  eucjp_normalize(ctx, string);
1126  break;
1127  case GRN_ENC_UTF8 :
1128 #ifdef GRN_WITH_NFKC
1129  utf8_normalize(ctx, string);
1130 #else /* GRN_WITH_NFKC */
1131  ascii_normalize(ctx, string);
1132 #endif /* GRN_WITH_NFKC */
1133  break;
1134  case GRN_ENC_SJIS :
1135  sjis_normalize(ctx, string);
1136  break;
1137  case GRN_ENC_LATIN1 :
1138  latin1_normalize(ctx, string);
1139  break;
1140  case GRN_ENC_KOI8R :
1141  koi8r_normalize(ctx, string);
1142  break;
1143  default :
1144  ascii_normalize(ctx, string);
1145  break;
1146  }
1147  return NULL;
1148 }
1149 
1150 #ifdef GRN_WITH_NFKC
/*
 * Procedure callback that runs utf8_normalize() on the grn_string passed as
 * args[0], whatever its encoding field says.
 * nargs and user_data are not read. Always returns NULL.
 */
1151 static grn_obj *
1152 nfkc51_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
1153 {
1154  grn_string *string = (grn_string *)(args[0]);
1155  utf8_normalize(ctx, string);
1156  return NULL;
1157 }
1158 #endif /* GRN_WITH_NFKC */
1159 
/*
 * Runs a normalizer procedure on a string object: pushes `string` on the
 * context, calls `normalizer` through grn_proc_call() with that single
 * argument, pops it again and returns the grn_proc_call() result.
 *
 * NOTE(review): listing line 1161, which carries the function name and the
 * declarations of ctx, normalizer and string, is missing from this
 * rendering -- confirm against the original file.
 */
1160 grn_rc
1162 {
1163  grn_rc rc;
1164  int nargs = 0;
1165 
1166  grn_ctx_push(ctx, string);
1167  nargs++;
1168  rc = grn_proc_call(ctx, normalizer, nargs, NULL);
1169  grn_ctx_pop(ctx);
1170 
1171  return rc;
1172 }
1173 
/*
 * Registers the built-in normalizers: one whose `next` callback is
 * auto_next, and "NormalizerNFKC51", whose `next` callback is nfkc51_next
 * when built with GRN_WITH_NFKC and NULL otherwise.
 *
 * The results of the grn_normalizer_register() calls are ignored; always
 * returns GRN_SUCCESS.
 *
 * NOTE(review): this listing skips source lines 1175 and 1179 (the
 * function-name/parameter line and the opening of the first registration
 * call, including its name argument). Restore them from the original file.
 */
1174 grn_rc
1176 {
1177  const char *normalizer_nfkc51_name = "NormalizerNFKC51";
1178 
1180  NULL, auto_next, NULL);
1181 
1182 #ifdef GRN_WITH_NFKC
1183  grn_normalizer_register(ctx, normalizer_nfkc51_name, -1,
1184  NULL, nfkc51_next, NULL);
1185 #else /* GRN_WITH_NFKC */
 /* The name is still registered without NFKC support, but with no
  * callbacks at all. */
1186  grn_normalizer_register(ctx, normalizer_nfkc51_name, -1,
1187  NULL, NULL, NULL);
1188 #endif /* GRN_WITH_NFKC */
1189 /*
1190  grn_normalizer_register(ctx, "NormalizerUCA", -1,
1191  NULL, uca_next, NULL);
1192 */
1193 
1194  return GRN_SUCCESS;
1195 }