/* Process-wide shared MeCab handle, lazily created on first use and reused
 * by every tokenizer instance (NOTE(review): a companion mutex
 * `sole_mecab_mutex` is referenced later in this chunk — presumably it
 * guards this handle; confirm in the elided lines). */
28 static mecab_t *sole_mecab = NULL;
/*
 * Map a MeCab dictionary charset name to the corresponding Groonga
 * encoding constant.  Matching is case-insensitive (strcasecmp) and
 * accepts common alias spellings per charset family.
 *
 * NOTE(review): this chunk elides the function's return type, opening
 * brace, the `return` statements inside each branch, and any fallback
 * branch — the exact GRN_ENC_* values returned (and the default for an
 * unrecognized charset) cannot be confirmed from here.
 */
42 translate_mecab_charset_to_grn_encoding(
const char *charset)
/* "euc-jp" family. */
44 if (strcasecmp(charset,
"euc-jp") == 0) {
46 }
/* "utf-8" family; both hyphenated and bare spellings accepted. */
else if (strcasecmp(charset,
"utf-8") == 0 ||
47 strcasecmp(charset,
"utf8") == 0) {
49 }
/* Shift_JIS family; three alias spellings accepted. */
else if (strcasecmp(charset,
"shift_jis") == 0 ||
50 strcasecmp(charset,
"shift-jis") == 0 ||
51 strcasecmp(charset,
"sjis") == 0) {
/*
 * Determine the Groonga encoding of the dictionary loaded by `mecab`,
 * by querying mecab_dictionary_info() and translating its `charset`
 * string via translate_mecab_charset_to_grn_encoding().
 *
 * NOTE(review): the return type, the declaration/initial value of
 * `encoding`, and the behavior when mecab_dictionary_info() returns
 * NULL are elided in this chunk — presumably `encoding` keeps a default
 * value in that case; confirm against the full source.
 */
58 get_mecab_encoding(mecab_t *mecab)
61 const mecab_dictionary_info_t *dictionary_info;
62 dictionary_info = mecab_dictionary_info(mecab);
63 if (dictionary_info) {
64 const char *charset = dictionary_info->charset;
65 encoding = translate_mecab_charset_to_grn_encoding(charset);
/*
 * Tokenizer init callback (fragment).  Visible responsibilities:
 *   1. Lazily create the shared `sole_mecab` handle with
 *      mecab_new2("-Owakati") — wakati (word-split) output mode — and
 *      cache its dictionary encoding in `sole_mecab_encoding`.
 *   2. Reject the query if its encoding differs from the MeCab
 *      dictionary's encoding.
 *   3. Allocate a grn_mecab_tokenizer, normalize the query string, run
 *      mecab_sparse_tostr2() on it, trim trailing whitespace from the
 *      result, and expose the remaining range via tokenizer->next/end.
 *   4. Publish the tokenizer through user_data->ptr.
 *
 * NOTE(review): the function signature, error-return paths, locking
 * around `sole_mecab` (the mutex referenced elsewhere in this file),
 * and several intermediate statements are elided in this chunk — the
 * comments below describe only what the visible lines establish.
 */
81 unsigned int normalizer_flags = 0;
84 const char *normalized_string;
85 unsigned int normalized_string_length;
/* Lazy creation of the shared handle; "-Owakati" selects
 * whitespace-separated word output. */
94 sole_mecab = mecab_new2(
"-Owakati");
/* Error message arguments for a failed mecab_new2(); the reporting
 * call itself is elided. */
98 "mecab_new2() failed on mecab_init(): %s",
99 mecab_strerror(NULL));
101 sole_mecab_encoding = get_mecab_encoding(sole_mecab);
/* Refuse to tokenize when the table encoding and the dictionary
 * encoding disagree — mixing them would produce garbage tokens. */
111 if (query->
encoding != sole_mecab_encoding) {
114 "[tokenizer][mecab] "
115 "MeCab dictionary charset (%s) does not match "
116 "the table encoding: <%s>",
/* Allocation-failure message for the tokenizer struct (the allocation
 * call and error call are elided). */
125 "[tokenizer][mecab] "
126 "memory allocation to grn_mecab_tokenizer failed");
129 tokenizer->
mecab = sole_mecab;
130 tokenizer->
query = query;
/* Normalization of the query string; the call receiving these output
 * arguments is elided. */
136 &normalized_string_length,
140 tokenizer->
next = normalized_string;
141 tokenizer->
end = tokenizer->
next + normalized_string_length;
142 }
/* Empty input: point next/end at an empty string so iteration ends
 * immediately. */
else if (normalized_string_length == 0) {
143 tokenizer->
next =
"";
144 tokenizer->
end = tokenizer->
next;
/* Run MeCab segmentation on the normalized string. */
147 s = mecab_sparse_tostr2(tokenizer->
mecab,
149 normalized_string_length);
152 "[tokenizer][mecab] "
153 "mecab_sparse_tostr() failed len=%d err=%s",
154 normalized_string_length,
155 mecab_strerror(tokenizer->
mecab));
167 unsigned int bufsize;
/* Trim trailing whitespace: scan backward from the last character
 * before the terminator.  The cast to unsigned char keeps isspace()
 * well-defined for high-bit bytes. */
172 for (p = buf + bufsize - 2;
173 buf <= p && isspace(*(
unsigned char *)p);
175 tokenizer->
next = buf;
176 tokenizer->
end = p + 1;
179 user_data->
ptr = tokenizer;
/*
 * Tokenizer next callback (fragment): yield the next token from the
 * wakati-segmented buffer.  Scans [tokenizer->next, tokenizer->end),
 * advancing by character length `cl` and skipping inter-token
 * whitespace with grn_isspace().
 *
 * NOTE(review): the function signature, the token-emission call, and
 * the end-of-data handling are elided in this chunk.
 */
201 tokenizer->
end - tokenizer->
next,
205 const char *p = tokenizer->
next, *r;
206 const char *e = tokenizer->
end;
/* Walk the buffer one (possibly multi-byte) character at a time. */
209 for (r = p; r < e; r += cl) {
/* Skip the run of whitespace separating tokens; grn_isspace returns
 * the byte length of the space character, 0 at a non-space. */
216 while ((cl =
grn_isspace(q, encoding))) { q += cl; }
/*
 * Verify at plugin-init time that MeCab's default dictionary uses the
 * same encoding as the Groonga context.  Only compiled when
 * HAVE_MECAB_DICTIONARY_INFO_T is available.
 *
 * Creates a temporary wakati-mode handle, compares its dictionary
 * encoding against the context encoding, destroys the handle, and
 * reports an error on mismatch or on mecab_new2() failure.
 *
 * NOTE(review): the return type, the source of `encoding`, and the
 * error-reporting calls wrapping the message fragments below are
 * elided in this chunk.
 */
251 check_mecab_dictionary_encoding(
grn_ctx *ctx)
253 #ifdef HAVE_MECAB_DICTIONARY_INFO_T
/* Temporary handle used only for this check. */
256 mecab = mecab_new2(
"-Owakati");
259 int have_same_encoding_dictionary = 0;
262 have_same_encoding_dictionary = encoding == get_mecab_encoding(mecab);
263 mecab_destroy(mecab);
265 if (!have_same_encoding_dictionary) {
267 "[tokenizer][mecab] "
268 "MeCab has no dictionary that uses the context encoding"
/* mecab_new2() itself failed — surface MeCab's own error string. */
274 "[tokenizer][mecab] "
275 "mecab_new2 failed in check_mecab_dictionary_encoding: %s",
276 mecab_strerror(NULL));
/*
 * Plugin initialization / registration (fragment): open the mutex that
 * guards the shared MeCab handle, validate the dictionary encoding,
 * and register the tokenizer with its init/next/fin callbacks.
 *
 * NOTE(review): the enclosing GRN_PLUGIN_INIT/GRN_PLUGIN_REGISTER
 * function bodies, the mutex-open call, and the registration call
 * surrounding these lines are elided in this chunk.
 */
290 if (!sole_mecab_mutex) {
292 "[tokenizer][mecab] grn_plugin_mutex_open() failed");
296 check_mecab_dictionary_encoding(ctx);
/* Callback triple passed to the tokenizer-registration call. */
309 mecab_init, mecab_next, mecab_fin);
/*
 * Plugin finalization (fragment): release the shared MeCab handle and
 * tear down its guard mutex, NULLing the pointer to prevent reuse.
 *
 * NOTE(review): the enclosing GRN_PLUGIN_FIN body, the NULLing of
 * `sole_mecab`, and the mutex-close call are elided in this chunk.
 */
329 mecab_destroy(sole_mecab);
332 if (sole_mecab_mutex) {
334 sole_mecab_mutex = NULL;