Groonga 3.0.9 Source Code Document
token.c
/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2009-2012 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "groonga_in.h"
#include <string.h>
#include <ctype.h>
#include "ctx_impl.h"
#include "token.h"
#include "pat.h"
#include "dat.h"
#include "hash.h"
#include "string_in.h"
#include "plugin_in.h"
#include <groonga/tokenizer.h>

grn_obj *grn_token_uvector = NULL;
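
/*
 * uvector tokenizer: treats the input as a packed vector of fixed-width
 * units (sizeof(grn_id) bytes each) and emits one unit per token.  It is
 * not registered as a named tokenizer; grn_token_init() below wires it
 * up as the built-in grn_token_uvector proc.
 */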
typedef struct {
  grn_tokenizer_token token;
  byte *curr;
  byte *tail;
  uint32_t unit;
} grn_uvector_tokenizer;

static grn_obj *
uvector_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *str, *flags;
  grn_uvector_tokenizer *tokenizer;
  if (!(flags = grn_ctx_pop(ctx))) {
    ERR(GRN_INVALID_ARGUMENT, "[tokenizer][uvector] missing argument: flags");
    return NULL;
  }
  if (!(str = grn_ctx_pop(ctx))) {
    ERR(GRN_INVALID_ARGUMENT, "[tokenizer][uvector] missing argument: string");
    return NULL;
  }
  if (!(tokenizer = GRN_MALLOC(sizeof(grn_uvector_tokenizer)))) {
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][uvector] "
        "memory allocation to grn_uvector_tokenizer failed");
    return NULL;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->curr = (byte *)GRN_TEXT_VALUE(str);
  tokenizer->tail = tokenizer->curr + GRN_TEXT_LEN(str);
  tokenizer->unit = sizeof(grn_id);
  return NULL;
}

static grn_obj *
uvector_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_uvector_tokenizer *tokenizer = user_data->ptr;
  byte *p = tokenizer->curr + tokenizer->unit;
  if (tokenizer->tail < p) {
    /* Not enough bytes left for a whole unit: emit an empty last token. */
    grn_tokenizer_token_push(ctx, &(tokenizer->token),
                             (const char *)tokenizer->curr, 0,
                             GRN_TOKENIZER_TOKEN_LAST);
  } else {
    grn_tokenizer_status status;
    if (tokenizer->tail == p) {
      status = GRN_TOKENIZER_TOKEN_LAST;
    } else {
      status = GRN_TOKENIZER_TOKEN_CONTINUE;
    }
    grn_tokenizer_token_push(ctx, &(tokenizer->token),
                             (const char *)tokenizer->curr, tokenizer->unit,
                             status);
    tokenizer->curr = p;
  }
  return NULL;
}

static grn_obj *
uvector_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_uvector_tokenizer *tokenizer = user_data->ptr;
  if (!tokenizer) {
    return NULL;
  }
  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  GRN_FREE(tokenizer);
  return NULL;
}

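/*
 * Delimited tokenizer: splits the normalized input on a fixed byte
 * sequence.  If the query already contains the special tokenized
 * delimiter, grn_tokenizer_tokenized_delimiter_next() is used to honor
 * the pre-tokenized boundaries instead.
 */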
typedef struct {
  const uint8_t *delimiter;
  uint32_t delimiter_len;
  const unsigned char *next;
  const unsigned char *end;
  grn_tokenizer_token token;
  grn_tokenizer_query *query;
  grn_bool have_tokenized_delimiter;
} grn_delimited_tokenizer;

static grn_obj *
delimited_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data,
               const uint8_t *delimiter, uint32_t delimiter_len)
{
  grn_tokenizer_query *query;
  unsigned int normalize_flags = 0;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_delimited_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  if (!(tokenizer = GRN_MALLOC(sizeof(grn_delimited_tokenizer)))) {
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][delimit] "
        "memory allocation to grn_delimited_tokenizer failed");
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }
  user_data->ptr = tokenizer;

  tokenizer->query = query;

  tokenizer->have_tokenized_delimiter =
    grn_tokenizer_have_tokenized_delimiter(ctx,
                                           tokenizer->query->ptr,
                                           tokenizer->query->length,
                                           tokenizer->query->encoding);
  tokenizer->delimiter = delimiter;
  tokenizer->delimiter_len = delimiter_len;
  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            NULL);
  tokenizer->next = (const unsigned char *)normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}

static grn_obj *
delimited_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_delimited_tokenizer *tokenizer = user_data->ptr;

  if (tokenizer->have_tokenized_delimiter) {
    unsigned int rest_length;
    rest_length = tokenizer->end - tokenizer->next;
    tokenizer->next =
      (const unsigned char *)grn_tokenizer_tokenized_delimiter_next(
        ctx,
        &(tokenizer->token),
        (const char *)tokenizer->next,
        rest_length,
        tokenizer->query->encoding);
  } else {
    size_t cl;
    const unsigned char *p = tokenizer->next, *r;
    const unsigned char *e = tokenizer->end;
    grn_tokenizer_status status;
    for (r = p; r < e; r += cl) {
      if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
        tokenizer->next = (unsigned char *)e;
        break;
      }
      {
        grn_bool found_delimiter = GRN_FALSE;
        const unsigned char *current_end = r;
        /* Skip runs of consecutive delimiters so no empty tokens are
           emitted between them. */
        while (current_end + tokenizer->delimiter_len <= e &&
               !memcmp(current_end,
                       tokenizer->delimiter, tokenizer->delimiter_len)) {
          current_end += tokenizer->delimiter_len;
          tokenizer->next = current_end;
          found_delimiter = GRN_TRUE;
        }
        if (found_delimiter) {
          break;
        }
      }
    }
    if (r == e) {
      status = GRN_TOKENIZER_LAST;
    } else {
      status = GRN_TOKENIZER_CONTINUE;
    }
    grn_tokenizer_token_push(ctx,
                             &(tokenizer->token),
                             (const char *)p,
                             r - p,
                             status);
  }

  return NULL;
}

static grn_obj *
delimited_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_delimited_tokenizer *tokenizer = user_data->ptr;
  if (!tokenizer) {
    return NULL;
  }
  grn_tokenizer_query_close(ctx, tokenizer->query);
  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  GRN_FREE(tokenizer);
  return NULL;
}

static grn_obj *
delimit_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  static const uint8_t delimiter[1] = {' '};
  return delimited_init(ctx, nargs, args, user_data, delimiter, 1);
}

static grn_obj *
delimit_null_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  static const uint8_t delimiter[1] = {'\0'};
  return delimited_init(ctx, nargs, args, user_data, delimiter, 1);
}
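
/*
 * For example, TokenDelimit splits "groonga full text search" into
 * "groonga", "full", "text", "search"; a run of two or more spaces
 * yields no empty token because delimited_next() skips consecutive
 * delimiters.  TokenDelimitNull does the same with NUL bytes as the
 * separator.
 */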

/* ngram tokenizer */

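/*
 * The grn_ngram_tokenizer state tracks both the byte cursor (next/end)
 * and the character position (pos/skip/tail) in the normalized text,
 * plus per-class grouping flags: uni_alpha/uni_digit/uni_symbol keep
 * runs of alphabetic/numeric/symbol characters as whole tokens, and
 * ignore_blank keeps blanks from terminating a token.
 */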
typedef struct {
  grn_tokenizer_token token;
  grn_tokenizer_query *query;
  uint8_t uni_alpha;
  uint8_t uni_digit;
  uint8_t uni_symbol;
  uint8_t ngram_unit;
  uint8_t ignore_blank;
  uint8_t overlap;
  int32_t pos;
  uint32_t skip;
  const unsigned char *next;
  const unsigned char *end;
  const uint_least8_t *ctypes;
  uint32_t len;
  uint32_t tail;
} grn_ngram_tokenizer;

static grn_obj *
ngram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data, uint8_t ngram_unit,
           uint8_t uni_alpha, uint8_t uni_digit, uint8_t uni_symbol, uint8_t ignore_blank)
{
  unsigned int normalize_flags =
    GRN_STRING_REMOVE_BLANK |
    GRN_STRING_WITH_TYPES |
    GRN_STRING_REMOVE_TOKENIZED_DELIMITER;
  grn_tokenizer_query *query;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_ngram_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  if (!(tokenizer = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][ngram] "
        "memory allocation to grn_ngram_tokenizer failed");
    return NULL;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;

  tokenizer->uni_alpha = uni_alpha;
  tokenizer->uni_digit = uni_digit;
  tokenizer->uni_symbol = uni_symbol;
  tokenizer->ngram_unit = ngram_unit;
  tokenizer->ignore_blank = ignore_blank;
  tokenizer->overlap = 0;
  tokenizer->pos = 0;
  tokenizer->skip = 0;

  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            &(tokenizer->len));
  tokenizer->next = (const unsigned char *)normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
  tokenizer->ctypes =
    grn_string_get_types(ctx, tokenizer->query->normalized_query);
  return NULL;
}

static grn_obj *
unigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 1, 1, 1, 1, 0); }

static grn_obj *
bigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 1, 0); }

static grn_obj *
trigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 3, 1, 1, 1, 0); }

static grn_obj *
bigrams_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 0, 0); }

static grn_obj *
bigramsa_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 1, 0, 0); }

static grn_obj *
bigramsad_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 0, 0, 0); }

static grn_obj *
bigrami_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 1, 1); }

static grn_obj *
bigramis_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 0, 1); }

static grn_obj *
bigramisa_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 1, 0, 1); }

static grn_obj *
bigramisad_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 0, 0, 1); }
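
/*
 * The wrappers above fix the ngram_init() parameters per tokenizer name:
 *
 *   tokenizer                                  unit alpha digit symbol blank
 *   TokenUnigram                                  1     1     1      1     0
 *   TokenBigram                                   2     1     1      1     0
 *   TokenTrigram                                  3     1     1      1     0
 *   TokenBigramSplitSymbol                        2     1     1      0     0
 *   TokenBigramSplitSymbolAlpha                   2     0     1      0     0
 *   TokenBigramSplitSymbolAlphaDigit              2     0     0      0     0
 *   TokenBigramIgnoreBlank                        2     1     1      1     1
 *   TokenBigramIgnoreBlankSplitSymbol             2     1     1      0     1
 *   TokenBigramIgnoreBlankSplitSymbolAlpha        2     0     1      0     1
 *   TokenBigramIgnoreBlankSplitSymbolAlphaDigit   2     0     0      0     1
 *
 * A set alpha/digit/symbol flag keeps runs of that character class
 * whole; the "SplitSymbol..." names clear the corresponding flags so
 * those classes are split into n-grams as well.
 */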
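/*
 * ngram_next() emits one token per call: a whole run for character
 * classes whose uni_* flag is set, otherwise a window of up to
 * ngram_unit characters.  Windows shorter than ngram_unit are flagged
 * UNMATURED, re-visited characters are flagged OVERLAP, and skip
 * advances the position by one character while overlapping or by len
 * otherwise.
 */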
static grn_obj *
ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  size_t cl;
  grn_ngram_tokenizer *tokenizer = user_data->ptr;
  const unsigned char *p = tokenizer->next, *r = p, *e = tokenizer->end;
  int32_t len = 0, pos = tokenizer->pos + tokenizer->skip, status = 0;
  const uint_least8_t *cp = tokenizer->ctypes ? tokenizer->ctypes + pos : NULL;
  if (cp && tokenizer->uni_alpha && GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) {
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (/* !tokenizer->ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_ALPHA) { break; }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else if (cp &&
             tokenizer->uni_digit &&
             GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) {
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (/* !tokenizer->ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_DIGIT) { break; }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else if (cp &&
             tokenizer->uni_symbol &&
             GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL) {
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_SYMBOL) { break; }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else {
#ifdef PRE_DEFINED_UNSPLIT_WORDS
    const unsigned char *key = NULL;
    // todo : grn_pat_lcp_search
    if ((tid = grn_sym_common_prefix_search(sym, p))) {
      if (!(key = _grn_sym_key(sym, tid))) {
        tokenizer->status = GRN_TOKEN_NOT_FOUND;
        return NULL;
      }
      len = grn_str_len(key, tokenizer->query->encoding, NULL);
    }
    r = p + grn_charlen_(ctx, p, e, tokenizer->query->encoding);
    if (tid && (len > 1 || r == p)) {
      if (r != p && pos + len - 1 <= tokenizer->tail) { continue; }
      p += strlen(key);
      if (!*p && tokenizer->mode == GRN_TOKEN_GET) {
        tokenizer->status = GRN_TOKEN_DONE;
      }
    }
#endif /* PRE_DEFINED_UNSPLIT_WORDS */
    if ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                           tokenizer->query->encoding))) {
      len++;
      r += cl;
      tokenizer->next = r;
      while (len < tokenizer->ngram_unit &&
             (cl = grn_charlen_(ctx, (char *)r, (char *)e,
                                tokenizer->query->encoding))) {
        if (cp) {
          if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
          cp++;
          if ((tokenizer->uni_alpha && GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) ||
              (tokenizer->uni_digit && GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) ||
              (tokenizer->uni_symbol && GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL)) {
            break;
          }
        }
        len++;
        r += cl;
      }
      if (tokenizer->overlap) {
        status |= GRN_TOKENIZER_TOKEN_OVERLAP;
      }
      if (len < tokenizer->ngram_unit) {
        status |= GRN_TOKENIZER_TOKEN_UNMATURED;
      }
      tokenizer->overlap = (len > 1) ? 1 : 0;
    }
  }
  tokenizer->pos = pos;
  tokenizer->len = len;
  tokenizer->tail = pos + len - 1;
  if (p == r || tokenizer->next == e) {
    tokenizer->skip = 0;
    status |= GRN_TOKENIZER_TOKEN_LAST;
  } else {
    tokenizer->skip = tokenizer->overlap ? 1 : len;
  }
  if (r == e) { status |= GRN_TOKENIZER_TOKEN_REACH_END; }
  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           (const char *)p,
                           r - p,
                           status);
  return NULL;
}

static grn_obj *
ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_ngram_tokenizer *tokenizer = user_data->ptr;
  if (!tokenizer) {
    return NULL;
  }
  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  grn_tokenizer_query_close(ctx, tokenizer->query);
  GRN_FREE(tokenizer);
  return NULL;
}

/* external */

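/*
 * grn_token_open()/grn_token_next()/grn_token_close() are the internal
 * tokenization entry points used for indexing and searching: open sets
 * up the table's tokenizer (or just a normalizer when the table has
 * none), next returns one lexicon id per token, and close releases the
 * state.
 */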
grn_rc
grn_token_init(void)
{
  static grn_proc _grn_token_uvector;
  _grn_token_uvector.obj.db = NULL;
  _grn_token_uvector.obj.id = GRN_ID_NIL;
  _grn_token_uvector.obj.header.domain = GRN_ID_NIL;
  _grn_token_uvector.obj.range = GRN_ID_NIL;
  _grn_token_uvector.funcs[PROC_INIT] = uvector_init;
  _grn_token_uvector.funcs[PROC_NEXT] = uvector_next;
  _grn_token_uvector.funcs[PROC_FIN] = uvector_fin;
  grn_token_uvector = (grn_obj *)&_grn_token_uvector;
  return GRN_SUCCESS;
}

grn_rc
grn_token_fin(void)
{
  return GRN_SUCCESS;
}

grn_token *
grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
               grn_token_mode mode, unsigned int flags)
{
  grn_token *token;
  grn_encoding encoding;
  grn_obj *tokenizer;
  grn_obj *normalizer;
  grn_obj_flags table_flags;
  if (grn_table_get_info(ctx, table, &table_flags, &encoding, &tokenizer,
                         &normalizer)) {
    return NULL;
  }
  if (!(token = GRN_MALLOC(sizeof(grn_token)))) { return NULL; }
  token->table = table;
  token->mode = mode;
  token->encoding = encoding;
  token->tokenizer = tokenizer;
  token->orig = (const unsigned char *)str;
  token->orig_blen = str_len;
  token->curr = NULL;
  token->nstr = NULL;
  token->curr_size = 0;
  token->pos = -1;
  token->status = GRN_TOKEN_DOING;
  token->force_prefix = 0;
  if (tokenizer) {
    grn_obj str_, flags_;
    GRN_TEXT_INIT(&str_, GRN_OBJ_DO_SHALLOW_COPY);
    GRN_TEXT_SET_REF(&str_, str, str_len);
    GRN_UINT32_INIT(&flags_, 0);
    GRN_UINT32_SET(ctx, &flags_, flags);
    token->pctx.caller = NULL;
    token->pctx.user_data.ptr = NULL;
    token->pctx.proc = (grn_proc *)tokenizer;
    token->pctx.hooks = NULL;
    token->pctx.currh = NULL;
    token->pctx.phase = PROC_INIT;
    /* The tokenizer's init proc pops these two values as its arguments. */
    grn_ctx_push(ctx, &str_);
    grn_ctx_push(ctx, &flags_);
    ((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx, 1, &table, &token->pctx.user_data);
    grn_obj_close(ctx, &flags_);
    grn_obj_close(ctx, &str_);
  } else {
    int nflags = 0;
    token->nstr = grn_string_open_(ctx, str, str_len,
                                   normalizer, nflags, token->encoding);
    if (token->nstr) {
      const char *normalized;
      grn_string_get_normalized(ctx, token->nstr,
                                &normalized, &(token->curr_size), NULL);
      token->curr = (const unsigned char *)normalized;
    } else {
      ERR(GRN_TOKENIZER_ERROR, "grn_string_open failed at grn_token_open");
    }
  }
  if (ctx->rc) {
    grn_token_close(ctx, token);
    token = NULL;
  }
  return token;
}

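/*
 * A minimal usage sketch (hypothetical: `lexicon` stands for any
 * GRN_TABLE_PAT_KEY/DAT_KEY/HASH_KEY table created with a tokenizer
 * such as TokenBigram; it is not defined in this file):
 *
 *   grn_id tid;
 *   grn_token *token = grn_token_open(ctx, lexicon, "Hello World", 11,
 *                                     GRN_TOKEN_ADD, 0);
 *   if (token) {
 *     while (token->status != GRN_TOKEN_DONE) {
 *       if ((tid = grn_token_next(ctx, token)) == GRN_ID_NIL) { break; }
 *       // tid is the lexicon id of the token just read
 *     }
 *     grn_token_close(ctx, token);
 *   }
 */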
grn_id
grn_token_next(grn_ctx *ctx, grn_token *token)
{
  int status;
  grn_id tid = GRN_ID_NIL;
  grn_obj *table = token->table;
  grn_obj *tokenizer = token->tokenizer;
  while (token->status != GRN_TOKEN_DONE) {
    if (tokenizer) {
      grn_obj *curr_, *stat_;
      /* The tokenizer's next proc pushes the token text and its status. */
      ((grn_proc *)tokenizer)->funcs[PROC_NEXT](ctx, 1, &table, &token->pctx.user_data);
      stat_ = grn_ctx_pop(ctx);
      curr_ = grn_ctx_pop(ctx);
      token->curr = (const unsigned char *)GRN_TEXT_VALUE(curr_);
      token->curr_size = GRN_TEXT_LEN(curr_);
      status = GRN_UINT32_VALUE(stat_);
      token->status = ((status & GRN_TOKENIZER_TOKEN_LAST) ||
                       (token->mode == GRN_TOKEN_GET &&
                        (status & GRN_TOKENIZER_TOKEN_REACH_END)))
        ? GRN_TOKEN_DONE : GRN_TOKEN_DOING;
      token->force_prefix = 0;
      if (token->curr_size == 0) {
        char tokenizer_name[GRN_TABLE_MAX_KEY_SIZE];
        int tokenizer_name_length;
        tokenizer_name_length =
          grn_obj_name(ctx, token->tokenizer,
                       tokenizer_name, GRN_TABLE_MAX_KEY_SIZE);
        GRN_LOG(ctx, GRN_WARN,
                "[token_next] ignore an empty token: <%.*s>: <%.*s>",
                tokenizer_name_length, tokenizer_name,
                token->orig_blen, token->orig);
        continue;
      }
      if (token->curr_size > GRN_TABLE_MAX_KEY_SIZE) {
        GRN_LOG(ctx, GRN_WARN,
                "[token_next] ignore too long token. "
                "Token must be less than or equal to %d: <%d>(<%.*s>)",
                GRN_TABLE_MAX_KEY_SIZE,
                token->curr_size,
                token->curr_size, token->curr);
        continue;
      }
      if (status & GRN_TOKENIZER_TOKEN_UNMATURED) {
        if (status & GRN_TOKENIZER_TOKEN_OVERLAP) {
          if (token->mode == GRN_TOKEN_GET) { token->pos++; continue; }
        } else {
          if (status & GRN_TOKENIZER_TOKEN_LAST) { token->force_prefix = 1; }
        }
      }
    } else {
      token->status = GRN_TOKEN_DONE;
    }
    if (token->mode == GRN_TOKEN_ADD) {
      switch (table->header.type) {
      case GRN_TABLE_PAT_KEY :
        if (grn_io_lock(ctx, ((grn_pat *)table)->io, 10000000)) {
          tid = GRN_ID_NIL;
        } else {
          tid = grn_pat_add(ctx, (grn_pat *)table, token->curr, token->curr_size,
                            NULL, NULL);
          grn_io_unlock(((grn_pat *)table)->io);
        }
        break;
      case GRN_TABLE_DAT_KEY :
        if (grn_io_lock(ctx, ((grn_dat *)table)->io, 10000000)) {
          tid = GRN_ID_NIL;
        } else {
          tid = grn_dat_add(ctx, (grn_dat *)table, token->curr, token->curr_size,
                            NULL, NULL);
          grn_io_unlock(((grn_dat *)table)->io);
        }
        break;
      case GRN_TABLE_HASH_KEY :
        if (grn_io_lock(ctx, ((grn_hash *)table)->io, 10000000)) {
          tid = GRN_ID_NIL;
        } else {
          tid = grn_hash_add(ctx, (grn_hash *)table, token->curr, token->curr_size,
                             NULL, NULL);
          grn_io_unlock(((grn_hash *)table)->io);
        }
        break;
      case GRN_TABLE_NO_KEY :
        if (token->curr_size == sizeof(grn_id)) {
          tid = *((grn_id *)token->curr);
        } else {
          tid = GRN_ID_NIL;
        }
        break;
      }
    } else {
      switch (table->header.type) {
      case GRN_TABLE_PAT_KEY :
        tid = grn_pat_get(ctx, (grn_pat *)table, token->curr, token->curr_size, NULL);
        break;
      case GRN_TABLE_DAT_KEY :
        tid = grn_dat_get(ctx, (grn_dat *)table, token->curr, token->curr_size, NULL);
        break;
      case GRN_TABLE_HASH_KEY :
        tid = grn_hash_get(ctx, (grn_hash *)table, token->curr, token->curr_size, NULL);
        break;
      case GRN_TABLE_NO_KEY :
        if (token->curr_size == sizeof(grn_id)) {
          tid = *((grn_id *)token->curr);
        } else {
          tid = GRN_ID_NIL;
        }
        break;
      }
    }
    if (tid == GRN_ID_NIL && token->status != GRN_TOKEN_DONE) {
      token->status = GRN_TOKEN_NOT_FOUND;
    }
    token->pos++;
    break;
  }
  return tid;
}

grn_rc
grn_token_close(grn_ctx *ctx, grn_token *token)
{
  if (token) {
    if (token->tokenizer) {
      ((grn_proc *)token->tokenizer)->funcs[PROC_FIN](ctx, 1, &token->table,
                                                      &token->pctx.user_data);
    }
    if (token->nstr) {
      grn_obj_close(ctx, token->nstr);
    }
    GRN_FREE(token);
    return GRN_SUCCESS;
  } else {
    return GRN_INVALID_ARGUMENT;
  }
}

grn_rc
grn_db_init_mecab_tokenizer(grn_ctx *ctx)
{
  switch (GRN_CTX_GET_ENCODING(ctx)) {
  case GRN_ENC_EUC_JP :
  case GRN_ENC_UTF8 :
  case GRN_ENC_SJIS :
    {
      const char *mecab_plugin_name = "tokenizers/mecab";
      char *path;
      path = grn_plugin_find_path(ctx, mecab_plugin_name);
      if (path) {
        GRN_FREE(path);
        return grn_plugin_register(ctx, mecab_plugin_name);
      } else {
        return GRN_NO_SUCH_FILE_OR_DIRECTORY;
      }
    }
    break;
  default :
    return GRN_OPERATION_NOT_SUPPORTED;
  }
}

#define DEF_TOKENIZER(name, init, next, fin, vars)\
  (grn_proc_create(ctx, (name), (sizeof(name) - 1),\
                   GRN_PROC_TOKENIZER, (init), (next), (fin), 3, (vars)))

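/*
 * DEF_TOKENIZER registers a tokenizer proc with the three variables
 * declared below: vars[0] and vars[1] are text values and vars[2] is a
 * uint32 flags value, as initialized in grn_db_init_builtin_tokenizers().
 */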
grn_rc
grn_db_init_builtin_tokenizers(grn_ctx *ctx)
{
  grn_obj *obj;
  grn_expr_var vars[] = {
    {NULL, 0},
    {NULL, 0},
    {NULL, 0}
  };
  GRN_TEXT_INIT(&vars[0].value, 0);
  GRN_TEXT_INIT(&vars[1].value, 0);
  GRN_UINT32_INIT(&vars[2].value, 0);

  /* The first four tokenizers must receive their fixed built-in ids;
     the rest are registered without an id check. */
  obj = DEF_TOKENIZER("TokenDelimit",
                      delimit_init, delimited_next, delimited_fin, vars);
  if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_DELIMIT) { return GRN_FILE_CORRUPT; }
  obj = DEF_TOKENIZER("TokenUnigram",
                      unigram_init, ngram_next, ngram_fin, vars);
  if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_UNIGRAM) { return GRN_FILE_CORRUPT; }
  obj = DEF_TOKENIZER("TokenBigram",
                      bigram_init, ngram_next, ngram_fin, vars);
  if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_BIGRAM) { return GRN_FILE_CORRUPT; }
  obj = DEF_TOKENIZER("TokenTrigram",
                      trigram_init, ngram_next, ngram_fin, vars);
  if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_TRIGRAM) { return GRN_FILE_CORRUPT; }

  DEF_TOKENIZER("TokenBigramSplitSymbol",
                bigrams_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramSplitSymbolAlpha",
                bigramsa_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramSplitSymbolAlphaDigit",
                bigramsad_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramIgnoreBlank",
                bigrami_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramIgnoreBlankSplitSymbol",
                bigramis_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramIgnoreBlankSplitSymbolAlpha",
                bigramisa_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramIgnoreBlankSplitSymbolAlphaDigit",
                bigramisad_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenDelimitNull",
                delimit_null_init, delimited_next, delimited_fin, vars);
  return GRN_SUCCESS;
}