Groonga 3.0.9 Source Code Document
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
mecab.c
Go to the documentation of this file.
1 /* -*- c-basic-offset: 2 -*- */
2 /* Copyright(C) 2009-2012 Brazil
3 
4  This library is free software; you can redistribute it and/or
5  modify it under the terms of the GNU Lesser General Public
6  License version 2.1 as published by the Free Software Foundation.
7 
8  This library is distributed in the hope that it will be useful,
9  but WITHOUT ANY WARRANTY; without even the implied warranty of
10  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  Lesser General Public License for more details.
12 
13  You should have received a copy of the GNU Lesser General Public
14  License along with this library; if not, write to the Free Software
15  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17 
18 #include <str.h>
19 
20 #include <groonga.h>
21 #include <groonga/tokenizer.h>
22 
23 #include <mecab.h>
24 
25 #include <string.h>
26 #include <ctype.h>
27 
/* MeCab instance shared by all tokenizations; created on first use in
   mecab_init() and destroyed by the plugin finalizer at the end of the file. */
28 static mecab_t *sole_mecab = NULL;
/* Mutex held while sole_mecab is being created and around each
   mecab_sparse_tostr2() call in mecab_init(). */
29 static grn_plugin_mutex *sole_mecab_mutex = NULL;
/* Encoding reported by the dictionary of sole_mecab; assigned right after
   sole_mecab is created, GRN_ENC_NONE until then. */
30 static grn_encoding sole_mecab_encoding = GRN_ENC_NONE;
31 
/*
  State kept for one tokenization.
  NOTE(review): listing lines 34 and 37-39 are missing from this extract, so
  some members and the closing line of the typedef are not visible here. The
  functions below reach `buf`, `query` and `token` members through
  grn_mecab_tokenizer pointers, so this is presumably that type - confirm
  against the upstream file.
 */
32 typedef struct {
/* MeCab instance used for this tokenization; mecab_init() stores sole_mecab here. */
33  mecab_t *mecab;
/* Start of the text that has not been returned as a token yet. */
35  const char *next;
/* End of the text to tokenize; scanning in mecab_next() stops here. */
36  const char *end;
40 
41 static grn_encoding
42 translate_mecab_charset_to_grn_encoding(const char *charset)
43 {
44  if (strcasecmp(charset, "euc-jp") == 0) {
45  return GRN_ENC_EUC_JP;
46  } else if (strcasecmp(charset, "utf-8") == 0 ||
47  strcasecmp(charset, "utf8") == 0) {
48  return GRN_ENC_UTF8;
49  } else if (strcasecmp(charset, "shift_jis") == 0 ||
50  strcasecmp(charset, "shift-jis") == 0 ||
51  strcasecmp(charset, "sjis") == 0) {
52  return GRN_ENC_SJIS;
53  }
54  return GRN_ENC_NONE;
55 }
56 
57 static grn_encoding
58 get_mecab_encoding(mecab_t *mecab)
59 {
60  grn_encoding encoding = GRN_ENC_NONE;
61  const mecab_dictionary_info_t *dictionary_info;
62  dictionary_info = mecab_dictionary_info(mecab);
63  if (dictionary_info) {
64  const char *charset = dictionary_info->charset;
65  encoding = translate_mecab_charset_to_grn_encoding(charset);
66  }
67  return encoding;
68 }
69 
70 /*
71  This function is called for a full text search query or a document to be
72  indexed. This means that both short/long strings are given.
73  The return value of this function is ignored. When an error occurs in this
74  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
75  */
/*
  Prepares the state for one tokenization: opens the tokenizer query, creates
  the shared MeCab instance if it does not exist yet, checks that the query
  encoding equals the dictionary encoding, allocates a grn_mecab_tokenizer and
  fills its next/end range. The state is handed back through user_data->ptr;
  the function itself returns NULL on every path.
 */
76 static grn_obj *
77 mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
78 {
79  const char *s;
80  grn_mecab_tokenizer *tokenizer;
81  unsigned int normalizer_flags = 0;
82  grn_tokenizer_query *query;
83  grn_obj *normalized_query;
84  const char *normalized_string;
85  unsigned int normalized_string_length;
86 
87  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
88  if (!query) {
89  return NULL;
90  }
/* Create the shared MeCab instance on first use. The test is repeated after
   taking sole_mecab_mutex so that only one caller performs the creation. */
91  if (!sole_mecab) {
92  grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
93  if (!sole_mecab) {
94  sole_mecab = mecab_new2("-Owakati");
95  if (!sole_mecab) {
/* NOTE(review): listing line 96 is missing from this extract; the three lines
   below look like the trailing arguments of an error-reporting call - confirm
   against the upstream file. */
97  "[tokenizer][mecab] "
98  "mecab_new2() failed on mecab_init(): %s",
99  mecab_strerror(NULL));
100  } else {
101  sole_mecab_encoding = get_mecab_encoding(sole_mecab);
102  }
103  }
104  grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
105  }
/* Creation failed (now or in an earlier call): release the query and give up. */
106  if (!sole_mecab) {
107  grn_tokenizer_query_close(ctx, query);
108  return NULL;
109  }
110 
/* The dictionary encoding must equal the query encoding.
   NOTE(review): listing lines 113 and 118 are missing. The message has two %s
   conversions but only one argument is visible, and the query is closed before
   the message is built; if the missing argument reads from `query`, that is a
   use after close - confirm against the upstream file. */
111  if (query->encoding != sole_mecab_encoding) {
112  grn_tokenizer_query_close(ctx, query);
114  "[tokenizer][mecab] "
115  "MeCab dictionary charset (%s) does not match "
116  "the table encoding: <%s>",
117  grn_encoding_to_string(sole_mecab_encoding),
119  return NULL;
120  }
121 
122  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
123  grn_tokenizer_query_close(ctx, query);
/* NOTE(review): listing line 124 is missing; presumably the start of an
   error-reporting call taking the message below. */
125  "[tokenizer][mecab] "
126  "memory allocation to grn_mecab_tokenizer failed");
127  return NULL;
128  }
129  tokenizer->mecab = sole_mecab;
130  tokenizer->query = query;
131 
132  normalized_query = query->normalized_query;
/* NOTE(review): listing line 133 is missing; the arguments below presumably
   belong to a call that stores the normalized text pointer and its length in
   normalized_string / normalized_string_length - confirm upstream. */
134  normalized_query,
135  &normalized_string,
136  &normalized_string_length,
137  NULL);
138  GRN_TEXT_INIT(&(tokenizer->buf), 0);
/* Three cases: pre-delimited input is scanned directly from the normalized
   string, an empty string yields an empty range, anything else is run
   through MeCab first. */
139  if (query->have_tokenized_delimiter) {
140  tokenizer->next = normalized_string;
141  tokenizer->end = tokenizer->next + normalized_string_length;
142  } else if (normalized_string_length == 0) {
143  tokenizer->next = "";
144  tokenizer->end = tokenizer->next;
145  } else {
/* The MeCab call and the copy of its result into tokenizer->buf both happen
   while sole_mecab_mutex is held. */
146  grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
147  s = mecab_sparse_tostr2(tokenizer->mecab,
148  normalized_string,
149  normalized_string_length);
150  if (!s) {
/* NOTE(review): listing line 151 is missing; presumably the start of an
   error-reporting call taking the arguments below. */
152  "[tokenizer][mecab] "
153  "mecab_sparse_tostr() failed len=%d err=%s",
154  normalized_string_length,
155  mecab_strerror(tokenizer->mecab));
156  } else {
157  GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
158  }
159  grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
/* On failure undo what was set up so far; user_data->ptr is left untouched. */
160  if (!s) {
161  grn_tokenizer_query_close(ctx, tokenizer->query);
162  GRN_PLUGIN_FREE(ctx, tokenizer);
163  return NULL;
164  }
165  {
166  char *buf, *p;
167  unsigned int bufsize;
168 
169  buf = GRN_TEXT_VALUE(&(tokenizer->buf));
170  bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
171  /* A certain version of mecab returns trailing lf or spaces. */
/* The backward scan starts at buf + bufsize - 2, so the final byte of buf is
   never inspected and always ends up outside [next, end).
   NOTE(review): for bufsize < 2 the start pointer lies before buf - confirm
   that MeCab output is never that short here. */
172  for (p = buf + bufsize - 2;
173  buf <= p && isspace(*(unsigned char *)p);
174  p--) { *p = '\0'; }
175  tokenizer->next = buf;
176  tokenizer->end = p + 1;
177  }
178  }
179  user_data->ptr = tokenizer;
180 
181  grn_tokenizer_token_init(ctx, &(tokenizer->token));
182 
183  return NULL;
184 }
185 
186 /*
187  This function returns tokens one by one.
188  */
/*
  Produces one token per call from the range [tokenizer->next, tokenizer->end)
  and advances tokenizer->next. The token is delivered through
  tokenizer->token; the function itself always returns NULL.
 */
189 static grn_obj *
190 mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
191 {
192  /* grn_obj *table = args[0]; */
193  grn_mecab_tokenizer *tokenizer = user_data->ptr;
194  grn_encoding encoding = tokenizer->query->encoding;
195 
196  if (tokenizer->query->have_tokenized_delimiter) {
/* NOTE(review): listing line 198 is missing; the arguments below presumably
   belong to a helper that pushes the next delimiter-separated token and
   returns the new scan position - confirm against the upstream file. */
197  tokenizer->next =
199  &(tokenizer->token),
200  tokenizer->next,
201  tokenizer->end - tokenizer->next,
202  encoding);
203  } else {
204  size_t cl;
205  const char *p = tokenizer->next, *r;
206  const char *e = tokenizer->end;
207  grn_tokenizer_status status;
208 
/* Walk forward one character at a time; the token is [p, r). */
209  for (r = p; r < e; r += cl) {
/* grn_charlen_() returned 0 (presumably an invalid or truncated character):
   stop here and move the scan position to the end. */
210  if (!(cl = grn_charlen_(ctx, r, e, encoding))) {
211  tokenizer->next = e;
212  break;
213  }
/* Whitespace ends the token. Skip the whole run so the next call starts at
   the following token.
   NOTE(review): the inner loop is not bounded by e - confirm the text is
   always terminated by a non-space byte. */
214  if (grn_isspace(r, encoding)) {
215  const char *q = r;
216  while ((cl = grn_isspace(q, encoding))) { q += cl; }
217  tokenizer->next = q;
218  break;
219  }
220  }
221 
/* Reaching e without a break means nothing is left after this token. */
222  if (r == e) {
223  status = GRN_TOKENIZER_LAST;
224  } else {
225  status = GRN_TOKENIZER_CONTINUE;
226  }
227  grn_tokenizer_token_push(ctx, &(tokenizer->token), p, r - p, status);
228  }
229 
230  return NULL;
231 }
232 
233 /*
234  This function finalizes a tokenization.
235  */
236 static grn_obj *
237 mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
238 {
239  grn_mecab_tokenizer *tokenizer = user_data->ptr;
240  if (!tokenizer) {
241  return NULL;
242  }
243  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
244  grn_tokenizer_query_close(ctx, tokenizer->query);
245  grn_obj_unlink(ctx, &(tokenizer->buf));
246  GRN_PLUGIN_FREE(ctx, tokenizer);
247  return NULL;
248 }
249 
/*
  Creates a temporary MeCab instance, compares the encoding of its dictionary
  with the encoding of `ctx`, and destroys the instance again. A mismatch or
  a creation failure is reported through the message arguments visible below.
  The whole body is compiled only when HAVE_MECAB_DICTIONARY_INFO_T is
  defined; otherwise the function does nothing.
 */
250 static void
251 check_mecab_dictionary_encoding(grn_ctx *ctx)
252 {
253 #ifdef HAVE_MECAB_DICTIONARY_INFO_T
254  mecab_t *mecab;
255 
256  mecab = mecab_new2("-Owakati");
257  if (mecab) {
258  grn_encoding encoding;
259  int have_same_encoding_dictionary = 0;
260 
261  encoding = GRN_CTX_GET_ENCODING(ctx);
262  have_same_encoding_dictionary = encoding == get_mecab_encoding(mecab);
/* The instance is only needed for the comparison above. */
263  mecab_destroy(mecab);
264 
265  if (!have_same_encoding_dictionary) {
/* NOTE(review): listing line 266 is missing; presumably the start of an
   error-reporting call taking the arguments below - confirm upstream. */
267  "[tokenizer][mecab] "
268  "MeCab has no dictionary that uses the context encoding"
269  ": <%s>",
270  grn_encoding_to_string(encoding));
271  }
272  } else {
/* NOTE(review): listing line 273 is missing; presumably the start of an
   error-reporting call taking the arguments below - confirm upstream. */
274  "[tokenizer][mecab] "
275  "mecab_new2 failed in check_mecab_dictionary_encoding: %s",
276  mecab_strerror(NULL));
277  }
278 #endif
279 }
280 
281 /*
282  This function initializes a plugin. This function fails if there is no
283  dictionary that uses the context encoding of groonga.
284  */
/*
  Plugin initialization entry point: resets sole_mecab, opens
  sole_mecab_mutex and runs the dictionary encoding check. Returns ctx->rc.
  NOTE(review): listing line 286, which carries the function name and
  parameter list, is missing from this extract - confirm against upstream.
 */
285 grn_rc
287 {
288  sole_mecab = NULL;
289  sole_mecab_mutex = grn_plugin_mutex_open(ctx);
290  if (!sole_mecab_mutex) {
/* NOTE(review): listing line 291 is missing; presumably the start of an
   error-reporting call taking the message below. */
292  "[tokenizer][mecab] grn_plugin_mutex_open() failed");
293  return ctx->rc;
294  }
295 
/* The check returns nothing; its outcome is read back from ctx->rc below. */
296  check_mecab_dictionary_encoding(ctx);
297  return ctx->rc;
298 }
299 
300 /*
301  This function registers a plugin to a database.
302  */
/*
  Plugin registration entry point: registers mecab_init, mecab_next and
  mecab_fin as the tokenizer named "TokenMecab" and returns the resulting
  grn_rc.
  NOTE(review): listing line 304, which carries the function name and
  parameter list, is missing from this extract - confirm against upstream.
 */
303 grn_rc
305 {
306  grn_rc rc;
307 
/* 10 is the byte length of the name "TokenMecab". */
308  rc = grn_tokenizer_register(ctx, "TokenMecab", 10,
309  mecab_init, mecab_next, mecab_fin);
310  if (rc == GRN_SUCCESS) {
311  grn_obj *token_mecab;
312  token_mecab = grn_ctx_get(ctx, "TokenMecab", 10);
313  /* Just for backward compatibility. TokenMecab was built-in not plugin. */
/* A registered object whose id differs from GRN_DB_MECAB is reported as
   GRN_FILE_CORRUPT. */
314  if (token_mecab && grn_obj_id(ctx, token_mecab) != GRN_DB_MECAB) {
315  rc = GRN_FILE_CORRUPT;
316  }
317  }
318 
319  return rc;
320 }
321 
322 /*
323  This function finalizes a plugin.
324  */
/*
  Plugin finalization entry point: destroys the shared MeCab instance and
  closes the mutex if they exist, resetting both pointers to NULL.
  Always returns GRN_SUCCESS.
  NOTE(review): listing line 326, which carries the function name and
  parameter list, is missing from this extract - confirm against upstream.
 */
325 grn_rc
327 {
328  if (sole_mecab) {
329  mecab_destroy(sole_mecab);
330  sole_mecab = NULL;
331  }
332  if (sole_mecab_mutex) {
333  grn_plugin_mutex_close(ctx, sole_mecab_mutex);
334  sole_mecab_mutex = NULL;
335  }
336 
337  return GRN_SUCCESS;
338 }