Groonga 3.0.9 Source Code Document
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
tokenizer.c
Go to the documentation of this file.
1 /* -*- c-basic-offset: 2 -*- */
2 /*
3  Copyright(C) 2012 Brazil
4 
5  This library is free software; you can redistribute it and/or
6  modify it under the terms of the GNU Lesser General Public
7  License version 2.1 as published by the Free Software Foundation.
8 
9  This library is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  Lesser General Public License for more details.
13 
14  You should have received a copy of the GNU Lesser General Public
15  License along with this library; if not, write to the Free Software
16  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18 #include "groonga_in.h"
19 #include "groonga/tokenizer.h"
20 
21 #include <stdarg.h>
22 #include <stdio.h>
23 #include <string.h>
24 
25 #include "ctx.h"
26 #include "db.h"
27 #include "str.h"
28 #include "string_in.h"
29 #include "token.h"
30 
31 /*
32  Just for backward compatibility. See grn_plugin_charlen() instead.
33  */
34 int
35 grn_tokenizer_charlen(grn_ctx *ctx, const char *str_ptr,
36  unsigned int str_length, grn_encoding encoding)
37 {
38  return grn_plugin_charlen(ctx, str_ptr, str_length, encoding);
39 }
40 
41 /*
42  Just for backward compatibility. See grn_plugin_isspace() instead.
43  */
44 int
45 grn_tokenizer_isspace(grn_ctx *ctx, const char *str_ptr,
46  unsigned int str_length, grn_encoding encoding)
47 {
48  return grn_plugin_isspace(ctx, str_ptr, str_length, encoding);
49 }
50 
/*
 NOTE(review): this definition is incomplete in the listing. The embedded
 numbering jumps 50 -> 53, 60 -> 62 and 65 -> 68, so the return type, the
 function name, the first parameter, the condition of the second `if' and
 the remaining memcmp() arguments are not visible. Presumably this is
 grn_tokenizer_is_tokenized_delimiter(), which is called elsewhere in this
 file with (ctx, pointer, length, encoding) -- confirm against the real
 source.

 Visible behaviour: the function returns GRN_FALSE for every encoding other
 than GRN_ENC_UTF8, returns GRN_FALSE again when a second (not shown) check
 fails, and otherwise returns a value computed from a memcmp() that starts
 at `str_ptr'.
 */
53  const char *str_ptr,
54  unsigned int str_length,
55  grn_encoding encoding)
56 {
 /* Anything that is not UTF-8 is rejected up front. */
57  if (encoding != GRN_ENC_UTF8) {
58  return GRN_FALSE;
59  }
60 
 /* NOTE(review): the condition guarding this return is missing here. */
62  return GRN_FALSE;
63  }
64 
 /* NOTE(review): the other memcmp() arguments are missing here. */
65  return memcmp(str_ptr,
68 }
69 
/*
 NOTE(review): this definition is incomplete in the listing. The embedded
 numbering jumps 69 -> 72 and 88 -> 90, so the return type, the function
 name, the first parameter and the opening of the `if' inside the loop are
 not visible. Presumably this is grn_tokenizer_have_tokenized_delimiter()
 -- confirm against the real source.

 Visible behaviour: returns GRN_FALSE when `encoding' is not GRN_ENC_UTF8
 or when `str_length' is 0. Otherwise it walks the buffer one character at
 a time, using grn_charlen_() to obtain each character's length, and
 returns GRN_TRUE as soon as the (not shown) per-character test on
 (current, char_length, encoding) succeeds. It returns GRN_FALSE when the
 walk ends without a match.
 */
72  const char *str_ptr,
73  unsigned int str_length,
74  grn_encoding encoding)
75 {
76  int char_length;
77  const char *current = str_ptr;
78  const char *end = str_ptr + str_length;
79 
80  if (encoding != GRN_ENC_UTF8) {
81  return GRN_FALSE;
82  }
83 
84  if (str_length == 0) {
85  return GRN_FALSE;
86  }
87 
 /* The loop stops once grn_charlen_() reports a length that is not > 0. */
88  while ((char_length = grn_charlen_(ctx, current, end, encoding)) > 0) {
 /* NOTE(review): the call that opens this `if' is missing here. */
90  current, char_length,
91  encoding)) {
92  return GRN_TRUE;
93  }
 /* Step over the character that was just examined. */
94  current += char_length;
95  }
96  return GRN_FALSE;
97 }
98 
/*
 grn_tokenizer_query_open() builds a grn_tokenizer_query from two objects
 popped off `ctx' and from the table passed as args[0].

 NOTE(review): several lines of this definition are missing from the
 listing (the embedded numbering skips 99, 118, 140, 160, 173, 177, 183 and
 188). That hides the return type, the statement that allocates `query',
 the openers of two error-reporting calls, the condition of the last
 if/else, and the body of its `else' branch.

 Visible behaviour:
  - The first object popped is used as `flags', the second as `query_str'.
  - Reports GRN_INVALID_ARGUMENT and returns NULL when `query_str' is NULL,
    or when `num_args' < 1, `args' is NULL or args[0] is NULL.
  - Returns NULL when `query' is NULL after the (not shown) allocation.
  - query->flags is taken from `flags' via GRN_UINT32_VALUE, or set to 0
    when `flags' is NULL.
  - On failure to allocate `query_buf' or to open the normalized string,
    everything allocated so far is released with GRN_PLUGIN_FREE and NULL
    is returned.
  - On success the query owns a NUL-terminated copy of the query text
    (query->query_buf, also exposed as query->ptr / query->length), the
    normalized string object and the table encoding.
 */
100 grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
101  unsigned int normalize_flags)
102 {
 /* Both pops happen before any validation of the arguments. */
103  grn_obj *flags = grn_ctx_pop(ctx);
104  grn_obj *query_str = grn_ctx_pop(ctx);
105 
106  if (query_str == NULL) {
107  GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
108  return NULL;
109  }
110 
111  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
112  GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
113  return NULL;
114  }
115 
116  {
 /* NOTE(review): the initializer that allocates `query' is missing here. */
117  grn_tokenizer_query * const query =
119  if (query == NULL) {
120  return NULL;
121  }
 /* Start with NULL members; they are assigned further down on success. */
122  query->normalized_query = NULL;
123  query->query_buf = NULL;
124  if (flags) {
125  query->flags = GRN_UINT32_VALUE(flags);
126  } else {
127  query->flags = 0;
128  }
129 
130  {
131  grn_obj * const table = args[0];
132  grn_obj_flags table_flags;
133  grn_encoding table_encoding;
134  unsigned int query_length = GRN_TEXT_LEN(query_str);
 /* One extra byte is reserved for the terminating '\0' written below. */
135  char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
136  grn_obj *normalizer = NULL;
137 
138  if (query_buf == NULL) {
139  GRN_PLUGIN_FREE(ctx, query);
 /* NOTE(review): the opener of this error-reporting call is missing. */
141  "[tokenizer] failed to duplicate query");
142  return NULL;
143  }
144  grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL,
145  &normalizer);
146  {
147  grn_obj *normalized_query;
 /* GRN_OBJ_KEY_NORMALIZE replaces the normalizer reported for the table. */
148  if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
149  normalizer = GRN_NORMALIZER_AUTO;
150  }
151  normalized_query = grn_string_open_(ctx,
152  GRN_TEXT_VALUE(query_str),
153  GRN_TEXT_LEN(query_str),
154  normalizer,
155  normalize_flags,
156  table_encoding);
157  if (!normalized_query) {
158  GRN_PLUGIN_FREE(ctx, query_buf);
159  GRN_PLUGIN_FREE(ctx, query);
 /* NOTE(review): the opener of this error-reporting call is missing. */
161  "[tokenizer] failed to open normalized string");
162  return NULL;
163  }
164  query->normalized_query = normalized_query;
 /* Keep a private, NUL-terminated copy of the original query text. */
165  memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
166  query_buf[query_length] = '\0';
167  query->query_buf = query_buf;
168  query->ptr = query_buf;
169  query->length = query_length;
170  }
171  query->encoding = table_encoding;
172 
 /*
  NOTE(review): the `if (...) {' that opens this branch, the callees of
  the two calls below and the body of the `else' branch are missing. The
  visible part stores a value computed from the normalized string into
  query->have_tokenized_delimiter.
  */
174  const char *normalized_string;
175  unsigned int normalized_string_length;
176 
178  query->normalized_query,
179  &normalized_string,
180  &normalized_string_length,
181  NULL);
182  query->have_tokenized_delimiter =
184  normalized_string,
185  normalized_string_length,
186  query->encoding);
187  } else {
189  }
190  }
191  return query;
192  }
193 }
194 
/*
 grn_tokenizer_query_create() forwards `ctx', `num_args' and `args' to
 grn_tokenizer_query_open() with 0 as the normalize flags, and returns
 whatever that call returns.

 NOTE(review): the return-type line (195 in the embedded numbering) is
 missing from the listing.
 */
196 grn_tokenizer_query_create(grn_ctx *ctx, int num_args, grn_obj **args)
197 {
198  return grn_tokenizer_query_open(ctx, num_args, args, 0);
199 }
200 
/*
 Releases a query object and what it holds. A NULL `query' is accepted and
 ignored. For a non-NULL `query', the normalized string is unlinked with
 grn_obj_unlink() and the query buffer is freed with GRN_PLUGIN_FREE, each
 only when the member is non-NULL; then `query' itself is freed.

 NOTE(review): the line carrying the function name and parameters (202 in
 the embedded numbering) is missing from the listing. Presumably this is
 grn_tokenizer_query_close(ctx, query), which is called elsewhere in this
 file -- confirm against the real source.
 */
201 void
203 {
204  if (query != NULL) {
205  if (query->normalized_query != NULL) {
206  grn_obj_unlink(ctx, query->normalized_query);
207  }
208  if (query->query_buf != NULL) {
209  GRN_PLUGIN_FREE(ctx, query->query_buf);
210  }
 /* The container goes last, after the members it points to. */
211  GRN_PLUGIN_FREE(ctx, query);
212  }
213 }
214 
/*
 Thin wrapper: passes `ctx' and `query' straight to
 grn_tokenizer_query_close() and does nothing else.

 NOTE(review): the line carrying the function name and parameters (216 in
 the embedded numbering) is missing from the listing. Presumably this is
 grn_tokenizer_query_destroy() -- confirm against the real source.
 */
215 void
217 {
218  grn_tokenizer_query_close(ctx, query);
219 }
220 
/*
 Token initialisation. The only visible statement initialises
 token->status with GRN_UINT32_INIT.

 NOTE(review): the signature line (222) and the first statement (224) are
 missing from the listing. Presumably this is grn_tokenizer_token_init()
 and the missing statement sets up token->str -- confirm against the real
 source.
 */
221 void
223 {
225  GRN_UINT32_INIT(&token->status, 0);
226 }
227 
/*
 Token finalisation: runs GRN_OBJ_FIN on token->str and then on
 token->status.

 NOTE(review): the line carrying the function name and parameters (229 in
 the embedded numbering) is missing from the listing. Presumably this is
 grn_tokenizer_token_fin() -- confirm against the real source.
 */
228 void
230 {
231  GRN_OBJ_FIN(ctx, &(token->str));
232  GRN_OBJ_FIN(ctx, &(token->status));
233 }
234 
/*
 Publishes one token through `ctx'. token->str is set with
 GRN_TEXT_SET_REF from (`str_ptr', `str_length') and token->status is set
 to `status'; both objects are then pushed onto `ctx', the string first
 and the status second.

 NOTE(review): the line carrying the function name and the leading
 parameters (236 in the embedded numbering) is missing from the listing.
 Presumably this is grn_tokenizer_token_push(ctx, token, ...), which is
 called elsewhere in this file -- confirm against the real source.
 */
235 void
237  const char *str_ptr, unsigned int str_length,
238  grn_tokenizer_status status)
239 {
240  GRN_TEXT_SET_REF(&token->str, str_ptr, str_length);
241  GRN_UINT32_SET(ctx, &token->status, status);
 /* Push order: string, then status. */
242  grn_ctx_push(ctx, &token->str);
243  grn_ctx_push(ctx, &token->status);
244 }
245 
/*
 Emits the next token from a buffer whose tokens are separated by the
 tokenized delimiter.

 The buffer [str_ptr, str_ptr + str_length) is scanned one character at a
 time, using grn_charlen_() for each character's length. Scanning stops at
 the first character accepted by grn_tokenizer_is_tokenized_delimiter(),
 when grn_charlen_() reports a length of 0, or at the end of the buffer.
 The bytes before the stop position are pushed as a token through
 grn_tokenizer_token_push().

 Status is GRN_TOKENIZER_LAST when the scan reached the end of the buffer
 and GRN_TOKENIZER_CONTINUE otherwise.

 Returns a pointer to the byte just after the delimiter, or NULL when no
 delimiter was found.

 NOTE(review): the line carrying the function name and first parameter
 (247 in the embedded numbering) is missing from the listing. Presumably
 this is grn_tokenizer_tokenized_delimiter_next(ctx, ...) -- confirm
 against the real source.
 */
246 const char *
248  grn_tokenizer_token *token,
249  const char *str_ptr,
250  unsigned int str_length,
251  grn_encoding encoding)
252 {
253  size_t char_length = 0;
254  const char *start = str_ptr;
255  const char *current;
256  const char *end = str_ptr + str_length;
257  const char *next_start = NULL;
258  unsigned int token_length;
259  grn_tokenizer_status status;
260 
261  for (current = start; current < end; current += char_length) {
262  char_length = grn_charlen_(ctx, current, end, encoding);
 /* A length of 0 ends the scan early; `current' stays before `end'. */
263  if (char_length == 0) {
264  break;
265  }
266  if (grn_tokenizer_is_tokenized_delimiter(ctx, current, char_length,
267  encoding)) {
 /* The next token begins right after the delimiter character. */
268  next_start = str_ptr + (current - start + char_length);
269  break;
270  }
271  }
272 
 /* The delimiter itself is excluded from the token. */
273  token_length = current - start;
274  if (current == end) {
275  status = GRN_TOKENIZER_LAST;
276  } else {
277  status = GRN_TOKENIZER_CONTINUE;
278  }
279  grn_tokenizer_token_push(ctx, token, start, token_length, status);
280 
281  return next_start;
282 }
283 
/*
 grn_tokenizer_register() creates a proc object named by
 (`plugin_name_ptr', `plugin_name_length') through grn_proc_create(),
 passing the `init', `next' and `fin' callbacks together with three
 expression variables.

 Returns GRN_SUCCESS when grn_proc_create() returns an object. Otherwise it
 reports GRN_TOKENIZER_ERROR through GRN_PLUGIN_ERROR and returns ctx->rc.

 NOTE(review): two lines are missing from the listing (288 and 306 in the
 embedded numbering): the end of the parameter list, which presumably
 declares `fin', and one argument line of the grn_proc_create() call.
 Confirm both against the real source.
 */
284 grn_rc
285 grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
286  unsigned int plugin_name_length,
287  grn_proc_func *init, grn_proc_func *next,
289 {
 /* Three unnamed variables: two text values and one uint32 value. */
290  grn_expr_var vars[] = {
291  { NULL, 0 },
292  { NULL, 0 },
293  { NULL, 0 }
294  };
295  GRN_TEXT_INIT(&vars[0].value, 0);
296  GRN_TEXT_INIT(&vars[1].value, 0);
297  GRN_UINT32_INIT(&vars[2].value, 0);
298 
299  {
300  /*
301  grn_proc_create() registers a plugin to the database which is associated
302  with `ctx'. A returned object must not be finalized here.
303  */
304  grn_obj * const obj = grn_proc_create(ctx, plugin_name_ptr,
305  plugin_name_length,
307  init, next, fin, 3, vars);
308  if (obj == NULL) {
309  GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, "grn_proc_create() failed");
310  return ctx->rc;
311  }
312  }
313  return GRN_SUCCESS;
314 }