Groonga 3.0.9 Source Code Document
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
kytea.cpp
Go to the documentation of this file.
1 /* -*- c-basic-offset: 2 -*- */
2 /* Copyright(C) 2012 Brazil
3 
4  This library is free software; you can redistribute it and/or
5  modify it under the terms of the GNU Lesser General Public
6  License version 2.1 as published by the Free Software Foundation.
7 
8  This library is distributed in the hope that it will be useful,
9  but WITHOUT ANY WARRANTY; without even the implied warranty of
10  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  Lesser General Public License for more details.
12 
13  You should have received a copy of the GNU Lesser General Public
14  License along with this library; if not, write to the Free Software
15  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17 
18 #include <groonga/tokenizer.h>
19 
20 #include <kytea/kytea.h>
21 #include <kytea/string-util.h>
22 
23 #include <string.h>
24 
25 #include <string>
26 #include <vector>
27 
28 namespace {
29 
// Plugin-wide state shared by every tokenizer instance. All four pointers
// start as NULL; kytea_init() fills them in and kytea_fin() resets them.

// Locked by grn_kytea_init() while it runs the KyTea segmentation calls.
30 grn_plugin_mutex *kytea_mutex = NULL;
// Configuration object; its model file name is passed to readModel().
31 kytea::KyteaConfig *kytea_config = NULL;
// The tagger that performs word segmentation (calculateWS()).
32 kytea::Kytea *kytea_tagger = NULL;
// Obtained from kytea_tagger->getStringUtil(); kytea_fin() only clears this
// pointer and does not release it separately.
33 kytea::StringUtil *kytea_util = NULL;
34 
// Forward declarations: kytea_init() calls kytea_fin() on its failure paths,
// and kytea_fin() is defined after it.
35 void kytea_init(grn_ctx *ctx);
36 void kytea_fin(grn_ctx *ctx);
37 
// Builds the plugin-wide KyTea state in order: mutex, configuration, tagger
// (with its model loaded) and string utility. On any failure the partially
// built state is torn down with kytea_fin() and the function returns early.
// Nothing is returned; the string pairs in each failure branch look like the
// trailing arguments of an error-reporting call.
// NOTE(review): the embedded listing numbers skip 40, 49, 59, 74, 82, 92,
// 105, 113 and 123, so the opening line of each of those calls appears to be
// missing from this listing -- restore them from the upstream file.
38 void kytea_init(grn_ctx *ctx) {
// Refuse to initialize twice: any non-NULL global means state already exists.
39  if (kytea_mutex || kytea_config || kytea_tagger || kytea_util) {
41  "[tokenizer][kytea] "
42  "TokenKytea is already initialized");
43  return;
44  }
45 
46  kytea_mutex = grn_plugin_mutex_open(ctx);
47  if (!kytea_mutex) {
48  kytea_fin(ctx);
50  "[tokenizer][kytea] "
51  "grn_plugin_mutex_open() failed");
52  return;
53  }
54 
// Raw storage comes from GRN_PLUGIN_MALLOC; the object is constructed in it
// with placement new below.
55  kytea::KyteaConfig * const config = static_cast<kytea::KyteaConfig *>(
56  GRN_PLUGIN_MALLOC(ctx, sizeof(kytea::KyteaConfig)));
57  if (!config) {
58  kytea_fin(ctx);
60  "[tokenizer][kytea] "
61  "memory allocation to kytea::KyteaConfig failed");
62  return;
63  }
64 
65  try {
66  new (config) kytea::KyteaConfig;
// Publish the pointer right after construction so that kytea_fin() destroys
// and frees the object if one of the following settings calls throws.
67  kytea_config = config;
68  try {
69  kytea_config->setDebug(0);
70  kytea_config->setOnTraining(false);
71  kytea_config->parseRunCommandLine(0, NULL);
72  } catch (...) {
73  kytea_fin(ctx);
75  "[tokenizer][kytea] "
76  "kytea::KyteaConfig settings failed");
77  return;
78  }
79  } catch (...) {
// Reached when the constructor itself threw: kytea_config was never assigned,
// so the raw storage is freed directly before the common teardown.
80  GRN_PLUGIN_FREE(ctx, config);
81  kytea_fin(ctx);
83  "[tokenizer][kytea] "
84  "kytea::KyteaConfig initialization failed");
85  return;
86  }
87 
// Same allocate / placement-new / publish sequence for the tagger.
88  kytea::Kytea * const tagger = static_cast<kytea::Kytea *>(
89  GRN_PLUGIN_MALLOC(ctx, sizeof(kytea::Kytea)));
90  if (!tagger) {
91  kytea_fin(ctx);
93  "[tokenizer][kytea] "
94  "memory allocation to kytea::Kytea failed");
95  return;
96  }
97 
98  try {
99  new (tagger) kytea::Kytea;
100  kytea_tagger = tagger;
101  try {
// The model path is taken from the configuration object built above.
102  kytea_tagger->readModel(kytea_config->getModelFile().c_str());
103  } catch (...) {
104  kytea_fin(ctx);
106  "[tokenizer][kytea] "
107  "kytea::Kytea::readModel() failed");
108  return;
109  }
110  } catch (...) {
// Constructor threw: kytea_tagger was never assigned, so free the storage here.
111  GRN_PLUGIN_FREE(ctx, tagger);
112  kytea_fin(ctx);
114  "[tokenizer][kytea] "
115  "kytea::Kytea initialization failed");
116  return;
117  }
118 
119  try {
// The string utility is fetched from the tagger rather than allocated here.
120  kytea_util = kytea_tagger->getStringUtil();
121  } catch (...) {
122  kytea_fin(ctx);
124  "[tokenizer][kytea] "
125  "kytea::Kytea::getStringUtil() failed");
126  return;
127  }
128 }
129 
130 void kytea_fin(grn_ctx *ctx) {
131  kytea_util = NULL;
132 
133  if (kytea_tagger) {
134  kytea_tagger->~Kytea();
135  GRN_PLUGIN_FREE(ctx, kytea_tagger);
136  kytea_tagger = NULL;
137  }
138 
139  if (kytea_config) {
140  kytea_config->~KyteaConfig();
141  GRN_PLUGIN_FREE(ctx, kytea_config);
142  kytea_config = NULL;
143  }
144 
145  if (kytea_mutex) {
146  grn_plugin_mutex_close(ctx, kytea_mutex);
147  kytea_mutex = NULL;
148  }
149 }
150 
151 struct grn_tokenizer_kytea {
152  grn_tokenizer_query *query;
153  kytea::KyteaSentence sentence;
154  std::vector<std::string> tokens;
155  std::size_t id;
156  grn_tokenizer_token token;
157  const char *rest_query_string;
158  unsigned int rest_query_string_length;
159 
160  grn_tokenizer_kytea() :
161  query(NULL),
162  sentence(),
163  tokens(),
164  id(0),
165  token(),
166  rest_query_string(NULL)
167  {
168  }
169  ~grn_tokenizer_kytea() {}
170 };
171 
172 void grn_tokenizer_kytea_init(grn_ctx *ctx, grn_tokenizer_kytea *tokenizer) {
173  new (tokenizer) grn_tokenizer_kytea;
174  grn_tokenizer_token_init(ctx, &tokenizer->token);
175 }
176 
177 void grn_tokenizer_kytea_fin(grn_ctx *ctx, grn_tokenizer_kytea *tokenizer) {
178  grn_tokenizer_token_fin(ctx, &tokenizer->token);
179  if (tokenizer->query) {
180  grn_tokenizer_query_close(ctx, tokenizer->query);
181  }
182  tokenizer->~grn_tokenizer_kytea();
183 }
184 
// Tokenizer "init" callback: opens the query, allocates the per-query state
// and either records the raw string (tokenized-delimiter case) or segments it
// with KyTea and collects the resulting tokens. Always returns NULL; on
// success the state is handed back through user_data->ptr.
// NOTE(review): the embedded listing numbers skip 198, 208, 219, 237 and 265,
// so the opening lines of several calls appear to be missing from this
// listing -- restore them from the upstream file.
185 grn_obj *grn_kytea_init(grn_ctx *ctx, int num_args, grn_obj **args,
186  grn_user_data *user_data) {
187  unsigned int normalizer_flags = 0;
188  grn_tokenizer_query * const query =
189  grn_tokenizer_query_open(ctx, num_args, args, normalizer_flags);
190  if (!query) {
191  return NULL;
192  }
193 
194  grn_tokenizer_kytea * const tokenizer = static_cast<grn_tokenizer_kytea *>(
195  GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_kytea)));
196  if (!tokenizer) {
197  grn_tokenizer_query_close(ctx, query);
199  "[tokenizer][kytea] "
200  "memory allocation to grn_tokenizer_kytea failed");
201  return NULL;
202  }
203 
204  try {
205  grn_tokenizer_kytea_init(ctx, tokenizer);
206  } catch (...) {
// NOTE(review): no GRN_PLUGIN_FREE(ctx, tokenizer) is visible on this path,
// so the storage allocated above looks leaked -- confirm against upstream.
207  grn_tokenizer_query_close(ctx, query);
209  "[tokenizer][kytea] "
210  "tokenizer initialization failed");
211  return NULL;
212  }
213 
// From here on the tokenizer holds the query; grn_tokenizer_kytea_fin() closes it.
214  tokenizer->query = query;
215 
216  grn_obj *normalized_query = query->normalized_query;
217  const char *normalized_string;
218  unsigned int normalized_string_length;
// The call that fills normalized_string / normalized_string_length from
// normalized_query starts on the missing line 219.
220  normalized_query,
221  &normalized_string,
222  &normalized_string_length,
223  NULL);
224  if (tokenizer->query->have_tokenized_delimiter) {
// Delimiters are already in the query: just remember the string and let
// grn_kytea_next() walk through it.
225  tokenizer->rest_query_string = normalized_string;
226  tokenizer->rest_query_string_length = normalized_string_length;
227  } else {
// The shared tagger and string utility are used under the plugin mutex.
228  grn_plugin_mutex_lock(ctx, kytea_mutex);
229  try {
230  const std::string str(normalized_string, normalized_string_length);
231  const kytea::KyteaString &surface_str = kytea_util->mapString(str);
232  const kytea::KyteaString &normalized_str = kytea_util->normalize(surface_str);
233  tokenizer->sentence = kytea::KyteaSentence(surface_str, normalized_str);
234  kytea_tagger->calculateWS(tokenizer->sentence);
235  } catch (...) {
// NOTE(review): the tokenizer (and the query it holds) is neither finalized
// nor freed here, and user_data->ptr is not set -- looks like a leak; confirm.
236  grn_plugin_mutex_unlock(ctx, kytea_mutex);
238  "[tokenizer][kytea] "
239  "tokenization failed");
240  return NULL;
241  }
242  grn_plugin_mutex_unlock(ctx, kytea_mutex);
243 
244  try {
// Keep a word only if it can be scanned to its end without hitting a
// zero-length character or a whitespace character.
// NOTE(review): showString() is called here after the mutex was released.
245  for (std::size_t i = 0; i < tokenizer->sentence.words.size(); ++i) {
246  const std::string &token =
247  kytea_util->showString(tokenizer->sentence.words[i].surface);
248  const char *ptr = token.c_str();
249  unsigned int left = static_cast<unsigned int>(token.length());
250  while (left > 0) {
251  const int char_length =
252  grn_tokenizer_charlen(ctx, ptr, left, query->encoding);
253  if ((char_length == 0) ||
254  (grn_tokenizer_isspace(ctx, ptr, left, query->encoding) != 0)) {
255  break;
256  }
257  ptr += char_length;
258  left -= char_length;
259  }
// left == 0 means the scan above consumed the whole word without breaking.
260  if (left == 0) {
261  tokenizer->tokens.push_back(token);
262  }
263  }
264  } catch (...) {
// NOTE(review): same concern as above -- the tokenizer is not released on
// this path.
266  "[tokenizer][kytea] "
267  "adjustment failed");
268  return NULL;
269  }
270  }
271 
272  user_data->ptr = tokenizer;
273  return NULL;
274 }
275 
// Tokenizer "next" callback: emits one token per call through
// tokenizer->token, using the state stored in user_data->ptr. Always returns
// NULL.
// NOTE(review): the embedded listing numbers skip 285 and 298, so the name of
// the delimiter-stepping call and the two alternatives of the status
// conditional are missing from this listing -- restore them from upstream.
276 grn_obj *grn_kytea_next(grn_ctx *ctx, int num_args, grn_obj **args,
277  grn_user_data *user_data) {
278  grn_tokenizer_kytea * const tokenizer =
279  static_cast<grn_tokenizer_kytea *>(user_data->ptr);
280 
281  if (tokenizer->query->have_tokenized_delimiter) {
// Delimiter mode: step through the remaining query string. The call (its
// first line is missing) receives the token buffer, the remaining string and
// its length, and the encoding, and yields the new remaining-string pointer.
282  unsigned int rest_query_string_length =
283  tokenizer->rest_query_string_length;
284  const char *rest_query_string =
286  &(tokenizer->token),
287  tokenizer->rest_query_string,
288  rest_query_string_length,
289  tokenizer->query->encoding);
// Shrink the remaining length by however far the pointer advanced; a NULL
// result leaves the length untouched.
290  if (rest_query_string) {
291  tokenizer->rest_query_string_length -=
292  rest_query_string - tokenizer->rest_query_string;
293  }
294  tokenizer->rest_query_string = rest_query_string;
295  } else {
// KyTea mode: the status depends on whether another token follows the
// current one.
296  const grn_tokenizer_status status =
297  ((tokenizer->id + 1) < tokenizer->tokens.size()) ?
// Push the next collected token and advance, or push an empty token when the
// list is exhausted.
299  if (tokenizer->id < tokenizer->tokens.size()) {
300  const std::string &token = tokenizer->tokens[tokenizer->id++];
301  grn_tokenizer_token_push(ctx, &tokenizer->token,
302  token.c_str(), token.length(), status);
303  } else {
304  grn_tokenizer_token_push(ctx, &tokenizer->token, "", 0, status);
305  }
306  }
307 
308  return NULL;
309 }
310 
311 grn_obj *grn_kytea_fin(grn_ctx *ctx, int num_args, grn_obj **args,
312  grn_user_data *user_data) {
313  grn_tokenizer_kytea * const tokenizer =
314  static_cast<grn_tokenizer_kytea *>(user_data->ptr);
315  if (tokenizer) {
316  grn_tokenizer_kytea_fin(ctx, tokenizer);
317  GRN_PLUGIN_FREE(ctx, tokenizer);
318  }
319  return NULL;
320 }
321 
322 } // namespace
323 
324 extern "C" {
325 
326 /*
327  GRN_PLUGIN_INIT() is called to initialize this plugin. Note that an error
328  code must be set in `ctx->rc' on failure.
329  */
// NOTE(review): the embedded listing numbers skip 330, so the function's
// signature line appears to be missing from this listing -- restore it from
// the upstream file.
// Runs the plugin-wide setup and returns the code left in ctx->rc.
331  kytea_init(ctx);
332  return ctx->rc;
333 }
334 
335 /*
336  GRN_PLUGIN_REGISTER() registers this plugin to the database associated with
337  `ctx'. The registration requires the plugin name and the functions to be
338  called for tokenization.
339  */
// NOTE(review): the embedded listing numbers skip 340, so the function's
// signature line appears to be missing from this listing -- restore it from
// the upstream file.
// The 10 matches the character count of "TokenKytea"; presumably it is the
// name length -- confirm against the grn_tokenizer_register() declaration.
341  return grn_tokenizer_register(ctx, "TokenKytea", 10, grn_kytea_init,
342  grn_kytea_next, grn_kytea_fin);
343 }
344 
345 /*
346  GRN_PLUGIN_FIN() is called to finalize the plugin that was initialized by
347  GRN_PLUGIN_INIT().
348  */
// NOTE(review): the embedded listing numbers skip 349, so the function's
// signature line appears to be missing from this listing -- restore it from
// the upstream file.
// Releases the plugin-wide state and always reports success.
350  kytea_fin(ctx);
351  return GRN_SUCCESS;
352 }
353 
354 } // extern "C"