54 "[tokenizer][uvector] "
55 "memory allocation to grn_uvector_tokenizer failed");
58 user_data->
ptr = tokenizer;
72 if (tokenizer->
tail < p) {
74 (
const char *)tokenizer->
curr, 0,
78 if (tokenizer->
tail == p) {
84 (
const char *)tokenizer->
curr, tokenizer->
unit,
115 const uint8_t *delimiter, uint32_t delimiter_len)
118 unsigned int normalize_flags = 0;
119 const char *normalized;
120 unsigned int normalized_length_in_bytes;
130 "[tokenizer][delimit] "
131 "memory allocation to grn_delimited_tokenizer failed");
135 user_data->
ptr = tokenizer;
137 tokenizer->
query = query;
147 &normalized, &normalized_length_in_bytes,
149 tokenizer->
next = (
const unsigned char *)normalized;
150 tokenizer->
end = tokenizer->
next + normalized_length_in_bytes;
163 unsigned int rest_length;
164 rest_length = tokenizer->
end - tokenizer->
next;
169 (
const char *)tokenizer->
next,
174 const unsigned char *p = tokenizer->
next, *r;
175 const unsigned char *e = tokenizer->
end;
177 for (r = p; r < e; r += cl) {
180 tokenizer->
next = (
unsigned char *)e;
185 const unsigned char *current_end = r;
190 tokenizer->
next = current_end;
193 if (found_delimiter) {
229 static const uint8_t delimiter[1] = {
' '};
230 return delimited_init(ctx, nargs, args, user_data, delimiter, 1);
236 static const uint8_t delimiter[1] = {
'\0'};
237 return delimited_init(ctx, nargs, args, user_data, delimiter, 1);
262 uint8_t uni_alpha, uint8_t uni_digit, uint8_t uni_symbol, uint8_t ignore_blank)
264 unsigned int normalize_flags =
269 const char *normalized;
270 unsigned int normalized_length_in_bytes;
281 "[tokenizer][ngram] "
282 "memory allocation to grn_ngram_tokenizer failed");
285 user_data->
ptr = tokenizer;
288 tokenizer->
query = query;
300 &normalized, &normalized_length_in_bytes,
302 tokenizer->
next = (
const unsigned char *)normalized;
303 tokenizer->
end = tokenizer->
next + normalized_length_in_bytes;
311 {
return ngram_init(ctx, nargs, args, user_data, 1, 1, 1, 1, 0); }
315 {
return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 1, 0); }
319 {
return ngram_init(ctx, nargs, args, user_data, 3, 1, 1, 1, 0); }
323 {
return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 0, 0); }
327 {
return ngram_init(ctx, nargs, args, user_data, 2, 0, 1, 0, 0); }
331 {
return ngram_init(ctx, nargs, args, user_data, 2, 0, 0, 0, 0); }
335 {
return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 1, 1); }
339 {
return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 0, 1); }
343 {
return ngram_init(ctx, nargs, args, user_data, 2, 0, 1, 0, 1); }
347 {
return ngram_init(ctx, nargs, args, user_data, 2, 0, 0, 0, 1); }
354 const unsigned char *p = tokenizer->
next, *r = p, *e = tokenizer->
end;
355 int32_t len = 0, pos = tokenizer->
pos + tokenizer->
skip, status = 0;
392 #ifdef PRE_DEFINED_UNSPLIT_WORDS
393 const unsigned char *key = NULL;
395 if ((tid = grn_sym_common_prefix_search(
sym, p))) {
396 if (!(key = _grn_sym_key(
sym, tid))) {
403 if (tid && (len > 1 || r == p)) {
404 if (r != p && pos + len - 1 <= tokenizer->
tail) {
continue; }
416 while (len < tokenizer->ngram_unit &&
434 if (len < tokenizer->ngram_unit) {
437 tokenizer->
overlap = (len > 1) ? 1 : 0;
440 tokenizer->
pos = pos;
441 tokenizer->
len = len;
442 tokenizer->
tail = pos + len - 1;
443 if (p == r || tokenizer->
next == e) {
477 _grn_token_uvector.
obj.
db = NULL;
484 grn_token_uvector = (
grn_obj *)&_grn_token_uvector;
508 token->
table = table;
512 token->
orig = (
const unsigned char *)str;
540 normalizer, nflags, token->
encoding);
542 const char *normalized;
545 token->
curr = (
const unsigned char *)normalized;
580 int tokenizer_name_length;
581 tokenizer_name_length =
585 "[token_next] ignore an empty token: <%.*s>: <%.*s>",
586 tokenizer_name_length, tokenizer_name,
592 "[token_next] ignore too long token. "
593 "Token must be less than or equal to %d: <%d>(<%.*s>)",
701 const char *mecab_plugin_name =
"tokenizers/mecab";
/* NOTE(review): the leading "717"/"718"/"719" tokens on the lines below look
 * like line-number residue from text extraction, not part of the macro —
 * confirm against the original file; as written they would not compile. */
/* DEF_TOKENIZER(name, init, next, fin, vars):
 * Registers a built-in tokenizer by calling grn_proc_create with proc type
 * GRN_PROC_TOKENIZER.  The name length is taken from the string literal via
 * sizeof(name) - 1, so `name` must be a literal, not a char pointer.  Expands
 * using `ctx` from the call site (it is not a macro parameter), and always
 * passes 3 as the variable count for `vars`. */
717 #define DEF_TOKENIZER(name, init, next, fin, vars)\
718 (grn_proc_create(ctx, (name), (sizeof(name) - 1),\
719 GRN_PROC_TOKENIZER, (init), (next), (fin), 3, (vars)))
735 delimit_init, delimited_next, delimited_fin, vars);
738 unigram_init, ngram_next, ngram_fin, vars);
741 bigram_init, ngram_next, ngram_fin, vars);
744 trigram_init, ngram_next, ngram_fin, vars);
748 bigrams_init, ngram_next, ngram_fin, vars);
750 bigramsa_init, ngram_next, ngram_fin, vars);
752 bigramsad_init, ngram_next, ngram_fin, vars);
754 bigrami_init, ngram_next, ngram_fin, vars);
756 bigramis_init, ngram_next, ngram_fin, vars);
758 bigramisa_init, ngram_next, ngram_fin, vars);
760 bigramisad_init, ngram_next, ngram_fin, vars);
762 delimit_null_init, delimited_next, delimited_fin, vars);