20 #include <kytea/kytea.h>
21 #include <kytea/string-util.h>
// Shared KyTea objects used by the tokenizer code in this file. All start as
// NULL; the initialization guard treats any non-NULL value as "already
// initialized".
31 kytea::KyteaConfig *kytea_config = NULL;
32 kytea::Kytea *kytea_tagger = NULL;
// Assigned from kytea_tagger->getStringUtil() during initialization.
33 kytea::StringUtil *kytea_util = NULL;
// Guard against repeated initialization: if any of the shared objects is
// already set, the "already initialized" error below is reported.
39 if (kytea_mutex || kytea_config || kytea_tagger || kytea_util) {
42 "TokenKytea is already initialized");
51 "grn_plugin_mutex_open() failed");
// Storage for the KyteaConfig object is obtained and cast to the right
// pointer type; the allocation call itself is not visible in this excerpt.
55 kytea::KyteaConfig *
const config =
static_cast<kytea::KyteaConfig *
>(
61 "memory allocation to kytea::KyteaConfig failed");
// Placement-new constructs the KyteaConfig in that storage, after which it is
// published through the kytea_config global.
66 new (config) kytea::KyteaConfig;
67 kytea_config = config;
// Settings applied to the configuration: debug set to 0, training mode off,
// and the run command line parsed with no arguments (0, NULL).
69 kytea_config->setDebug(0);
70 kytea_config->setOnTraining(
false);
71 kytea_config->parseRunCommandLine(0, NULL);
76 "kytea::KyteaConfig settings failed");
84 "kytea::KyteaConfig initialization failed");
// Storage for the tagger is obtained and cast to kytea::Kytea*; the
// allocation call itself is not visible in this excerpt.
88 kytea::Kytea *
const tagger =
static_cast<kytea::Kytea *
>(
94 "memory allocation to kytea::Kytea failed");
// Construct the tagger in place, then publish it through kytea_tagger.
99 new (tagger) kytea::Kytea;
100 kytea_tagger = tagger;
// Load the model whose path is taken from the configuration object.
102 kytea_tagger->readModel(kytea_config->getModelFile().c_str());
106 "[tokenizer][kytea] "
107 "kytea::Kytea::readModel() failed");
114 "[tokenizer][kytea] "
115 "kytea::Kytea initialization failed");
// Keep the tagger's StringUtil in a global; it is used later for mapString(),
// normalize() and showString().
120 kytea_util = kytea_tagger->getStringUtil();
124 "[tokenizer][kytea] "
125 "kytea::Kytea::getStringUtil() failed");
// Explicit destructor calls rather than delete: both objects are constructed
// with placement-new elsewhere in this file. Releasing the underlying storage
// is not visible in this excerpt.
134 kytea_tagger->~Kytea();
140 kytea_config->~KyteaConfig();
// Per-tokenization state shared by the tokenizer callbacks.
// NOTE(review): this excerpt omits some members; `query` and `id` are used
// elsewhere in the file (tokenizer->query, tokenizer->id), so they must be
// declared on lines not shown here.
151 struct grn_tokenizer_kytea {
// Sentence built from the surface and normalized strings; calculateWS() is
// run on it and its `words` are read back afterwards.
153 kytea::KyteaSentence sentence;
// Token strings collected from sentence.words, one push_back per word.
154 std::vector<std::string> tokens;
// Cursor and remaining length over the normalized input; set and advanced
// only when the query reports have_tokenized_delimiter.
157 const char *rest_query_string;
158 unsigned int rest_query_string_length;
160 grn_tokenizer_kytea() :
166 rest_query_string(NULL)
169 ~grn_tokenizer_kytea() {}
// Constructs a grn_tokenizer_kytea in the storage that `tokenizer` points to,
// using placement-new. The visible line does not use `ctx`.
172 void grn_tokenizer_kytea_init(
grn_ctx *ctx, grn_tokenizer_kytea *tokenizer) {
173 new (tokenizer) grn_tokenizer_kytea;
// Tears down a tokenizer that was built by grn_tokenizer_kytea_init().
177 void grn_tokenizer_kytea_fin(
grn_ctx *ctx, grn_tokenizer_kytea *tokenizer) {
// The query is handled only when one is attached; what is done with it is on
// lines not shown in this excerpt.
179 if (tokenizer->query) {
// Explicit destructor call, matching the placement-new construction. No
// storage is released on the visible lines.
182 tokenizer->~grn_tokenizer_kytea();
// NOTE(review): fragment of the tokenizer's init callback; the function
// header and several statements are missing from this excerpt.
187 unsigned int normalizer_flags = 0;
// Storage for the per-call tokenizer state, cast to the struct type; the
// allocation call itself is not visible here.
194 grn_tokenizer_kytea *
const tokenizer =
static_cast<grn_tokenizer_kytea *
>(
199 "[tokenizer][kytea] "
200 "memory allocation to grn_tokenizer_kytea failed");
// Construct the state object in the storage obtained above.
205 grn_tokenizer_kytea_init(ctx, tokenizer);
209 "[tokenizer][kytea] "
210 "tokenizer initialization failed");
// Attach the query to the tokenizer; grn_tokenizer_kytea_fin() checks this
// field during teardown.
214 tokenizer->query = query;
// Filled through an out-parameter (see &normalized_string_length below); the
// call that fills them is not visible in this excerpt.
217 const char *normalized_string;
218 unsigned int normalized_string_length;
222 &normalized_string_length,
// When the query already carries tokenized delimiters, remember the
// normalized text as the remaining input to be consumed later.
224 if (tokenizer->query->have_tokenized_delimiter) {
225 tokenizer->rest_query_string = normalized_string;
226 tokenizer->rest_query_string_length = normalized_string_length;
// Copy the normalized bytes into a std::string, map it to a KyteaString,
// derive its normalized form, and build the sentence from both.
230 const std::string str(normalized_string, normalized_string_length);
231 const kytea::KyteaString &surface_str = kytea_util->mapString(str);
232 const kytea::KyteaString &normalized_str = kytea_util->normalize(surface_str);
233 tokenizer->sentence = kytea::KyteaSentence(surface_str, normalized_str);
// Run KyTea on the sentence; the resulting sentence.words are read later.
234 kytea_tagger->calculateWS(tokenizer->sentence);
238 "[tokenizer][kytea] "
239 "tokenization failed");
// Visit every word KyTea produced and convert its surface form back to a
// std::string token.
245 for (std::size_t
i = 0;
i < tokenizer->sentence.words.size(); ++
i) {
246 const std::string &token =
247 kytea_util->showString(tokenizer->sentence.words[
i].surface);
// ptr/left track a position and remaining byte count within the token.
// NOTE(review): the per-character check is only partly visible here
// (char_length == 0 is one of its conditions) — confirm against the full file.
248 const char *ptr = token.c_str();
249 unsigned int left =
static_cast<unsigned int>(token.length());
251 const int char_length =
253 if ((char_length == 0) ||
261 tokenizer->tokens.push_back(token);
266 "[tokenizer][kytea] "
267 "adjustment failed");
// Store the tokenizer state in user_data->ptr; the other callbacks read it
// back from there.
272 user_data->
ptr = tokenizer;
// NOTE(review): presumably the body of grn_kytea_next (the name appears in
// the registration call at the end of the file); the function header is not
// visible in this excerpt.
// Recover the tokenizer state stored in user_data->ptr.
278 grn_tokenizer_kytea *
const tokenizer =
279 static_cast<grn_tokenizer_kytea *
>(user_data->
ptr);
// Delimiter mode: work on the remaining normalized input instead of the
// token vector.
281 if (tokenizer->query->have_tokenized_delimiter) {
282 unsigned int rest_query_string_length =
283 tokenizer->rest_query_string_length;
// A helper call (not visible here) receives the current cursor, the remaining
// length and the query encoding; its result becomes the new cursor.
284 const char *rest_query_string =
287 tokenizer->rest_query_string,
288 rest_query_string_length,
289 tokenizer->query->encoding);
// Shrink the remaining length by the distance the cursor advanced.
290 if (rest_query_string) {
291 tokenizer->rest_query_string_length -=
292 rest_query_string - tokenizer->rest_query_string;
294 tokenizer->rest_query_string = rest_query_string;
// Token-vector mode: this condition tests whether another token follows the
// current one. The two alternatives it selects between are not visible.
297 ((tokenizer->id + 1) < tokenizer->tokens.size()) ?
// Emit the token at index `id` and post-increment `id` for the next call.
299 if (tokenizer->id < tokenizer->tokens.size()) {
300 const std::string &token = tokenizer->tokens[tokenizer->id++];
302 token.c_str(), token.length(), status);
// NOTE(review): presumably the body of grn_kytea_fin (the name appears in the
// registration call at the end of the file); the function header is not
// visible in this excerpt.
// Recover the tokenizer state from user_data->ptr and tear it down.
313 grn_tokenizer_kytea *
const tokenizer =
314 static_cast<grn_tokenizer_kytea *
>(user_data->
ptr);
316 grn_tokenizer_kytea_fin(ctx, tokenizer);
// Tail of a call that passes the next and fin callbacks; the callee and the
// preceding arguments are not visible in this excerpt.
342 grn_kytea_next, grn_kytea_fin);