Groonga 3.0.9 Source Code Document
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
suggest.c
Go to the documentation of this file.
1 /* -*- c-basic-offset: 2; indent-tabs-mode: nil -*- */
2 /* Copyright(C) 2010-2013 Brazil
3 
4  This library is free software; you can redistribute it and/or
5  modify it under the terms of the GNU Lesser General Public
6  License version 2.1 as published by the Free Software Foundation.
7 
8  This library is distributed in the hope that it will be useful,
9  but WITHOUT ANY WARRANTY; without even the implied warranty of
10  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  Lesser General Public License for more details.
12 
13  You should have received a copy of the GNU Lesser General Public
14  License along with this library; if not, write to the Free Software
15  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17 
18 #include "ctx.h"
19 #include "db.h"
20 #include "ii.h"
21 #include "token.h"
22 #include "output.h"
23 #include <groonga/plugin.h>
24 #include <string.h>
25 
26 #ifndef HAVE_STRNCASECMP
27 # ifdef HAVE__STRNICMP
28 # define strncasecmp(s1,s2,n) _strnicmp(s1,s2,n)
29 # endif /* HAVE__STRNICMP */
30 #endif /* HAVE_STRNCASECMP */
31 
32 #define VAR GRN_PROC_GET_VAR_BY_OFFSET
33 #define CONST_STR_LEN(x) x, x ? sizeof(x) - 1 : 0
34 #define TEXT_VALUE_LEN(x) GRN_TEXT_VALUE(x), GRN_TEXT_LEN(x)
35 
36 #define MIN_LEARN_DISTANCE (60 * GRN_TIME_USEC_PER_SEC)
37 
38 #define COMPLETE 1
39 #define CORRECT 2
40 #define SUGGEST 4
41 
42 typedef enum {
47 
48 typedef struct {
55 
57 
62  int64_t post_time_value;
63 
80 
82 
84 
87 
88  uint64_t key_prefix;
91 
92 static int
93 grn_parse_suggest_types(grn_obj *text)
94 {
95  const char *nptr = GRN_TEXT_VALUE(text);
96  const char *end = GRN_BULK_CURR(text);
97  int types = 0;
98  while (nptr < end) {
99  if (*nptr == '|') {
100  nptr += 1;
101  continue;
102  }
103  {
104  const char string[] = "complete";
105  size_t length = sizeof(string) - 1;
106  if (nptr + length <= end && memcmp(nptr, string, length) == 0) {
107  types |= COMPLETE;
108  nptr += length;
109  continue;
110  }
111  }
112  {
113  const char string[] = "correct";
114  size_t length = sizeof(string) - 1;
115  if (nptr + length <= end && memcmp(nptr, string, length) == 0) {
116  types |= CORRECT;
117  nptr += length;
118  continue;
119  }
120  }
121  {
122  const char string[] = "suggest";
123  size_t length = sizeof(string) - 1;
124  if (nptr + length <= end && memcmp(nptr, string, length) == 0) {
125  types |= SUGGEST;
126  nptr += length;
127  continue;
128  }
129  }
130  break;
131  }
132  return types;
133 }
134 
135 static int32_t
136 cooccurrence_search(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_id id,
137  grn_obj *res, int query_type, int frequency_threshold,
138  double conditional_probability_threshold)
139 {
140  int32_t max_score = 0;
141  if (id) {
142  grn_ii_cursor *c;
143  grn_obj *co = grn_obj_column(ctx, items, CONST_STR_LEN("co"));
144  grn_obj *pairs = grn_ctx_at(ctx, grn_obj_get_range(ctx, co));
145  grn_obj *items_freq = grn_obj_column(ctx, items, CONST_STR_LEN("freq"));
146  grn_obj *items_freq2 = grn_obj_column(ctx, items, CONST_STR_LEN("freq2"));
147  grn_obj *pairs_freq, *pairs_post = grn_obj_column(ctx, pairs, CONST_STR_LEN("post"));
148  switch (query_type) {
149  case COMPLETE :
150  pairs_freq = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq0"));
151  break;
152  case CORRECT :
153  pairs_freq = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq1"));
154  break;
155  case SUGGEST :
156  pairs_freq = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq2"));
157  break;
158  default :
159  return max_score;
160  }
161  if ((c = grn_ii_cursor_open(ctx, (grn_ii *)co, id, GRN_ID_NIL, GRN_ID_MAX,
162  ((grn_ii *)co)->n_elements - 1, 0))) {
163  grn_ii_posting *p;
164  grn_obj post, pair_freq, item_freq, item_freq2, item_boost;
165  GRN_RECORD_INIT(&post, 0, grn_obj_id(ctx, items));
166  GRN_INT32_INIT(&pair_freq, 0);
167  GRN_INT32_INIT(&item_freq, 0);
168  GRN_INT32_INIT(&item_freq2, 0);
169  GRN_INT32_INIT(&item_boost, 0);
170  while ((p = grn_ii_cursor_next(ctx, c))) {
171  grn_id post_id;
172  int pfreq, ifreq, ifreq2, boost;
173  double conditional_probability;
174  GRN_BULK_REWIND(&post);
175  GRN_BULK_REWIND(&pair_freq);
176  GRN_BULK_REWIND(&item_freq);
177  GRN_BULK_REWIND(&item_freq2);
178  GRN_BULK_REWIND(&item_boost);
179  grn_obj_get_value(ctx, pairs_post, p->rid, &post);
180  grn_obj_get_value(ctx, pairs_freq, p->rid, &pair_freq);
181  post_id = GRN_RECORD_VALUE(&post);
182  grn_obj_get_value(ctx, items_freq, post_id, &item_freq);
183  grn_obj_get_value(ctx, items_freq2, post_id, &item_freq2);
184  grn_obj_get_value(ctx, items_boost, post_id, &item_boost);
185  pfreq = GRN_INT32_VALUE(&pair_freq);
186  ifreq = GRN_INT32_VALUE(&item_freq);
187  ifreq2 = GRN_INT32_VALUE(&item_freq2);
188  if (ifreq2 > 0) {
189  conditional_probability = (double)pfreq / (double)ifreq2;
190  } else {
191  conditional_probability = 0.0;
192  }
193  boost = GRN_INT32_VALUE(&item_boost);
194  if (pfreq >= frequency_threshold && ifreq >= frequency_threshold &&
195  conditional_probability >= conditional_probability_threshold &&
196  boost >= 0) {
197  grn_rset_recinfo *ri;
198  void *value;
199  int32_t score = pfreq;
200  int added;
201  if (max_score < score + boost) { max_score = score + boost; }
202  /* put any formula if desired */
203  if (grn_hash_add(ctx, (grn_hash *)res,
204  &post_id, sizeof(grn_id), &value, &added)) {
205  ri = value;
206  ri->score += score;
207  if (added) {
208  ri->score += boost;
209  }
210  }
211  }
212  }
213  GRN_OBJ_FIN(ctx, &post);
214  GRN_OBJ_FIN(ctx, &pair_freq);
215  GRN_OBJ_FIN(ctx, &item_freq);
216  GRN_OBJ_FIN(ctx, &item_freq2);
217  GRN_OBJ_FIN(ctx, &item_boost);
218  grn_ii_cursor_close(ctx, c);
219  }
220  }
221  return max_score;
222 }
223 
224 #define DEFAULT_LIMIT 10
225 #define DEFAULT_SORTBY "-_score"
226 #define DEFAULT_OUTPUT_COLUMNS "_key,_score"
227 #define DEFAULT_FREQUENCY_THRESHOLD 100
228 #define DEFAULT_CONDITIONAL_PROBABILITY_THRESHOLD 0.2
229 
230 static void
231 output(grn_ctx *ctx, grn_obj *table, grn_obj *res, grn_id tid,
232  grn_obj *sortby, grn_obj *output_columns, int offset, int limit)
233 {
234  grn_obj *sorted;
235  if ((sorted = grn_table_create(ctx, NULL, 0, NULL, GRN_OBJ_TABLE_NO_KEY, NULL, res))) {
236  uint32_t nkeys;
237  grn_obj_format format;
238  grn_table_sort_key *keys;
239  const char *sortby_val = GRN_TEXT_VALUE(sortby);
240  unsigned int sortby_len = GRN_TEXT_LEN(sortby);
241  const char *oc_val = GRN_TEXT_VALUE(output_columns);
242  unsigned int oc_len = GRN_TEXT_LEN(output_columns);
243  if (!sortby_val || !sortby_len) {
244  sortby_val = DEFAULT_SORTBY;
245  sortby_len = sizeof(DEFAULT_SORTBY) - 1;
246  }
247  if (!oc_val || !oc_len) {
248  oc_val = DEFAULT_OUTPUT_COLUMNS;
249  oc_len = sizeof(DEFAULT_OUTPUT_COLUMNS) - 1;
250  }
251  if ((keys = grn_table_sort_key_from_str(ctx, sortby_val, sortby_len, res, &nkeys))) {
252  grn_table_sort(ctx, res, offset, limit, sorted, keys, nkeys);
254  ":", "sort(%d)", limit);
255  GRN_OBJ_FORMAT_INIT(&format, grn_table_size(ctx, res), 0, limit, offset);
256  format.flags =
259  grn_obj_columns(ctx, sorted, oc_val, oc_len, &format.columns);
260  GRN_OUTPUT_OBJ(sorted, &format);
261  GRN_OBJ_FORMAT_FIN(ctx, &format);
262  grn_table_sort_key_close(ctx, keys, nkeys);
263  }
264  grn_obj_unlink(ctx, sorted);
265  } else {
266  ERR(GRN_UNKNOWN_ERROR, "cannot create temporary sort table.");
267  }
268 }
269 
270 static inline void
271 complete_add_item(grn_ctx *ctx, grn_id id, grn_obj *res, int frequency_threshold,
272  grn_obj *items_freq, grn_obj *items_boost,
273  grn_obj *item_freq, grn_obj *item_boost)
274 {
275  GRN_BULK_REWIND(item_freq);
276  GRN_BULK_REWIND(item_boost);
277  grn_obj_get_value(ctx, items_freq, id, item_freq);
278  grn_obj_get_value(ctx, items_boost, id, item_boost);
279  if (GRN_INT32_VALUE(item_boost) >= 0) {
280  int32_t score;
281  score = 1 +
282  GRN_INT32_VALUE(item_freq) +
283  GRN_INT32_VALUE(item_boost);
284  if (score >= frequency_threshold) {
285  void *value;
286  if (grn_hash_add(ctx, (grn_hash *)res, &id, sizeof(grn_id),
287  &value, NULL)) {
288  grn_rset_recinfo *ri;
289  ri = value;
290  ri->score += score;
291  }
292  }
293  }
294 }
295 
296 static void
297 complete(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_obj *col,
298  grn_obj *query, grn_obj *sortby,
299  grn_obj *output_columns, int offset, int limit,
300  int frequency_threshold, double conditional_probability_threshold,
301  grn_suggest_search_mode prefix_search_mode)
302 {
303  grn_obj *res;
304  grn_obj *items_freq = grn_obj_column(ctx, items, CONST_STR_LEN("freq"));
305  grn_obj item_freq, item_boost;
306  GRN_INT32_INIT(&item_freq, 0);
307  GRN_INT32_INIT(&item_boost, 0);
308  if ((res = grn_table_create(ctx, NULL, 0, NULL,
309  GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) {
310  grn_id tid = grn_table_get(ctx, items, TEXT_VALUE_LEN(query));
311  grn_obj *string;
312  if (GRN_TEXT_LEN(query) &&
313  (string = grn_string_open(ctx, TEXT_VALUE_LEN(query),
314  GRN_NORMALIZER_AUTO, 0))) {
315  grn_table_cursor *cur;
316  /* RK search + prefix search */
317  grn_obj *index;
318  const char *normalized;
319  unsigned int normalized_length_in_bytes;
320  grn_string_get_normalized(ctx, string,
321  &normalized,
322  &normalized_length_in_bytes,
323  NULL);
324  /* FIXME: support index selection */
325  if (grn_column_index(ctx, col, GRN_OP_PREFIX, &index, 1, NULL)) {
326  if ((cur = grn_table_cursor_open(ctx, grn_ctx_at(ctx, index->header.domain),
327  normalized,
328  normalized_length_in_bytes,
329  NULL, 0, 0, -1,
331  grn_id id;
332  while ((id = grn_table_cursor_next(ctx, cur))) {
333  grn_ii_cursor *icur;
334  if ((icur = grn_ii_cursor_open(ctx, (grn_ii *)index, id,
335  GRN_ID_NIL, GRN_ID_MAX, 1, 0))) {
336  grn_ii_posting *p;
337  while ((p = grn_ii_cursor_next(ctx, icur))) {
338  complete_add_item(ctx, p->rid, res, frequency_threshold,
339  items_freq, items_boost,
340  &item_freq, &item_boost);
341  }
342  grn_ii_cursor_close(ctx, icur);
343  }
344  }
345  grn_table_cursor_close(ctx, cur);
346  } else {
347  ERR(GRN_UNKNOWN_ERROR, "cannot open cursor for prefix RK search.");
348  }
349  } else {
350  ERR(GRN_UNKNOWN_ERROR, "cannot find index for prefix RK search.");
351  }
352  cooccurrence_search(ctx, items, items_boost, tid, res, COMPLETE,
353  frequency_threshold,
354  conditional_probability_threshold);
355  if (((prefix_search_mode == GRN_SUGGEST_SEARCH_YES) ||
356  (prefix_search_mode == GRN_SUGGEST_SEARCH_AUTO &&
357  !grn_table_size(ctx, res))) &&
358  (cur = grn_table_cursor_open(ctx, items,
359  normalized,
360  normalized_length_in_bytes,
361  NULL, 0, 0, -1, GRN_CURSOR_PREFIX))) {
362  grn_id id;
363  while ((id = grn_table_cursor_next(ctx, cur))) {
364  complete_add_item(ctx, id, res, frequency_threshold,
365  items_freq, items_boost, &item_freq, &item_boost);
366  }
367  grn_table_cursor_close(ctx, cur);
368  }
369  grn_obj_close(ctx, string);
370  }
371  output(ctx, items, res, tid, sortby, output_columns, offset, limit);
372  grn_obj_close(ctx, res);
373  } else {
374  ERR(GRN_UNKNOWN_ERROR, "cannot create temporary table.");
375  }
376  GRN_OBJ_FIN(ctx, &item_boost);
377  GRN_OBJ_FIN(ctx, &item_freq);
378 }
379 
380 static void
381 correct(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost,
382  grn_obj *query, grn_obj *sortby,
383  grn_obj *output_columns, int offset, int limit,
384  int frequency_threshold, double conditional_probability_threshold,
385  grn_suggest_search_mode similar_search_mode)
386 {
387  grn_obj *res;
388  grn_obj *items_freq2 = grn_obj_column(ctx, items, CONST_STR_LEN("freq2"));
389  grn_obj item_freq2, item_boost;
390  GRN_INT32_INIT(&item_freq2, 0);
391  GRN_INT32_INIT(&item_boost, 0);
392  if ((res = grn_table_create(ctx, NULL, 0, NULL,
393  GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) {
394  grn_id tid = grn_table_get(ctx, items, TEXT_VALUE_LEN(query));
395  int32_t max_score;
396  max_score = cooccurrence_search(ctx, items, items_boost, tid, res, CORRECT,
397  frequency_threshold,
398  conditional_probability_threshold);
400  ":", "cooccur(%d)", max_score);
401  if (GRN_TEXT_LEN(query) &&
402  ((similar_search_mode == GRN_SUGGEST_SEARCH_YES) ||
403  (similar_search_mode == GRN_SUGGEST_SEARCH_AUTO &&
404  max_score < frequency_threshold))) {
405  grn_obj *key, *index;
406  if ((key = grn_obj_column(ctx, items, CONST_STR_LEN("_key")))) {
407  if (grn_column_index(ctx, key, GRN_OP_MATCH, &index, 1, NULL)) {
408  grn_select_optarg optarg;
409  memset(&optarg, 0, sizeof(grn_select_optarg));
410  optarg.mode = GRN_OP_SIMILAR;
411  optarg.similarity_threshold = 0;
412  optarg.max_size = 2;
413  grn_ii_select(ctx, (grn_ii *)index, TEXT_VALUE_LEN(query),
414  (grn_hash *)res, GRN_OP_OR, &optarg);
415  grn_obj_unlink(ctx, index);
417  ":", "similar(%d)", grn_table_size(ctx, res));
418  {
419  grn_hash_cursor *hc = grn_hash_cursor_open(ctx, (grn_hash *)res, NULL,
420  0, NULL, 0, 0, -1, 0);
421  if (hc) {
422  while (grn_hash_cursor_next(ctx, hc)) {
423  void *key, *value;
424  if (grn_hash_cursor_get_key_value(ctx, hc, &key, NULL, &value)) {
425  grn_id *rp;
426  rp = key;
427  GRN_BULK_REWIND(&item_freq2);
428  GRN_BULK_REWIND(&item_boost);
429  grn_obj_get_value(ctx, items_freq2, *rp, &item_freq2);
430  grn_obj_get_value(ctx, items_boost, *rp, &item_boost);
431  if (GRN_INT32_VALUE(&item_boost) >= 0) {
432  int32_t score;
433  grn_rset_recinfo *ri;
434  score = 1 +
435  (GRN_INT32_VALUE(&item_freq2) >> 4) +
436  GRN_INT32_VALUE(&item_boost);
437  ri = value;
438  ri->score += score;
439  if (score >= frequency_threshold) { continue; }
440  }
441  /* score < frequency_threshold || item_boost < 0 */
442  grn_hash_cursor_delete(ctx, hc, NULL);
443  }
444  }
445  grn_hash_cursor_close(ctx, hc);
446  }
447  }
449  ":", "filter(%d)", grn_table_size(ctx, res));
450  {
451  /* exec _score -= edit_distance(_key, "query string") for all records */
452  grn_obj *var;
453  grn_obj *expr;
454 
455  GRN_EXPR_CREATE_FOR_QUERY(ctx, res, expr, var);
456  if (expr) {
457  grn_table_cursor *tc;
458  grn_obj *score = grn_obj_column(ctx, res, CONST_STR_LEN("_score"));
459  grn_obj *key = grn_obj_column(ctx, res, CONST_STR_LEN("_key"));
460  grn_expr_append_obj(ctx, expr,
461  score,
462  GRN_OP_GET_VALUE, 1);
463  grn_expr_append_obj(ctx, expr,
464  grn_ctx_get(ctx, CONST_STR_LEN("edit_distance")),
465  GRN_OP_PUSH, 1);
466  grn_expr_append_obj(ctx, expr,
467  key,
468  GRN_OP_GET_VALUE, 1);
469  grn_expr_append_const(ctx, expr, query, GRN_OP_PUSH, 1);
470  grn_expr_append_op(ctx, expr, GRN_OP_CALL, 2);
472 
473  if ((tc = grn_table_cursor_open(ctx, res, NULL, 0, NULL, 0, 0, -1, 0))) {
474  grn_id id;
475  grn_obj score_value;
476  GRN_INT32_INIT(&score_value, 0);
477  while ((id = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) {
478  GRN_RECORD_SET(ctx, var, id);
479  grn_expr_exec(ctx, expr, 0);
480  GRN_BULK_REWIND(&score_value);
481  grn_obj_get_value(ctx, score, id, &score_value);
482  if (GRN_INT32_VALUE(&score_value) < frequency_threshold) {
483  grn_table_cursor_delete(ctx, tc);
484  }
485  }
486  grn_obj_unlink(ctx, &score_value);
487  grn_table_cursor_close(ctx, tc);
488  }
489  grn_obj_unlink(ctx, score);
490  grn_obj_unlink(ctx, key);
491  grn_obj_unlink(ctx, expr);
492  } else {
494  "error on building expr. for calicurating edit distance");
495  }
496  }
497  }
498  grn_obj_unlink(ctx, key);
499  }
500  }
501  output(ctx, items, res, tid, sortby, output_columns, offset, limit);
502  grn_obj_close(ctx, res);
503  } else {
504  ERR(GRN_UNKNOWN_ERROR, "cannot create temporary table.");
505  }
506  GRN_OBJ_FIN(ctx, &item_boost);
507  GRN_OBJ_FIN(ctx, &item_freq2);
508 }
509 
510 static void
511 suggest(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost,
512  grn_obj *query, grn_obj *sortby,
513  grn_obj *output_columns, int offset, int limit,
514  int frequency_threshold, double conditional_probability_threshold)
515 {
516  grn_obj *res;
517  if ((res = grn_table_create(ctx, NULL, 0, NULL,
518  GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) {
519  grn_id tid = grn_table_get(ctx, items, TEXT_VALUE_LEN(query));
520  cooccurrence_search(ctx, items, items_boost, tid, res, SUGGEST,
521  frequency_threshold, conditional_probability_threshold);
522  output(ctx, items, res, tid, sortby, output_columns, offset, limit);
523  grn_obj_close(ctx, res);
524  } else {
525  ERR(GRN_UNKNOWN_ERROR, "cannot create temporary table.");
526  }
527 }
528 
530 parse_search_mode(grn_ctx *ctx, grn_obj *mode_text)
531 {
533  int mode_length;
534 
535  mode_length = GRN_TEXT_LEN(mode_text);
536  if (mode_length == 3 &&
537  strncasecmp("yes", GRN_TEXT_VALUE(mode_text), 3) == 0) {
538  mode = GRN_SUGGEST_SEARCH_YES;
539  } else if (mode_length == 2 &&
540  strncasecmp("no", GRN_TEXT_VALUE(mode_text), 2) == 0) {
541  mode = GRN_SUGGEST_SEARCH_NO;
542  } else {
544  }
545 
546  return mode;
547 }
548 
549 static grn_obj *
550 command_suggest(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
551 {
552  grn_obj *items, *col, *items_boost;
553  int types;
554  int offset = 0;
555  int limit = DEFAULT_LIMIT;
556  int frequency_threshold = DEFAULT_FREQUENCY_THRESHOLD;
557  double conditional_probability_threshold =
559  grn_suggest_search_mode prefix_search_mode;
560  grn_suggest_search_mode similar_search_mode;
561 
562  types = grn_parse_suggest_types(VAR(0));
563  if (GRN_TEXT_LEN(VAR(6)) > 0) {
564  offset = grn_atoi(GRN_TEXT_VALUE(VAR(6)), GRN_BULK_CURR(VAR(6)), NULL);
565  }
566  if (GRN_TEXT_LEN(VAR(7)) > 0) {
567  limit = grn_atoi(GRN_TEXT_VALUE(VAR(7)), GRN_BULK_CURR(VAR(7)), NULL);
568  }
569  if (GRN_TEXT_LEN(VAR(8)) > 0) {
570  frequency_threshold = grn_atoi(GRN_TEXT_VALUE(VAR(8)), GRN_BULK_CURR(VAR(8)), NULL);
571  }
572  if (GRN_TEXT_LEN(VAR(9)) > 0) {
573  GRN_TEXT_PUTC(ctx, VAR(9), '\0');
574  conditional_probability_threshold = strtod(GRN_TEXT_VALUE(VAR(9)), NULL);
575  }
576 
577  prefix_search_mode = parse_search_mode(ctx, VAR(10));
578  similar_search_mode = parse_search_mode(ctx, VAR(11));
579 
580  if ((items = grn_ctx_get(ctx, TEXT_VALUE_LEN(VAR(1))))) {
581  if ((items_boost = grn_obj_column(ctx, items, CONST_STR_LEN("boost")))) {
582  GRN_OUTPUT_MAP_OPEN("RESULT_SET", -1);
583  if (types & COMPLETE) {
584  if ((col = grn_obj_column(ctx, items, TEXT_VALUE_LEN(VAR(2))))) {
585  GRN_OUTPUT_CSTR("complete");
586  complete(ctx, items, items_boost, col, VAR(3), VAR(4),
587  VAR(5), offset, limit,
588  frequency_threshold, conditional_probability_threshold,
589  prefix_search_mode);
590  } else {
591  ERR(GRN_INVALID_ARGUMENT, "invalid column.");
592  }
593  }
594  if (types & CORRECT) {
595  GRN_OUTPUT_CSTR("correct");
596  correct(ctx, items, items_boost, VAR(3), VAR(4),
597  VAR(5), offset, limit,
598  frequency_threshold, conditional_probability_threshold,
599  similar_search_mode);
600  }
601  if (types & SUGGEST) {
602  GRN_OUTPUT_CSTR("suggest");
603  suggest(ctx, items, items_boost, VAR(3), VAR(4),
604  VAR(5), offset, limit,
605  frequency_threshold, conditional_probability_threshold);
606  }
608  } else {
609  ERR(GRN_INVALID_ARGUMENT, "nonexistent column: <%.*s.boost>",
610  (int)GRN_TEXT_LEN(VAR(1)), GRN_TEXT_VALUE(VAR(1)));
611  }
612  grn_obj_unlink(ctx, items);
613  } else {
614  ERR(GRN_INVALID_ARGUMENT, "nonexistent table: <%.*s>",
615  (int)GRN_TEXT_LEN(VAR(1)), GRN_TEXT_VALUE(VAR(1)));
616  }
617  return NULL;
618 }
619 
620 static void
621 learner_init_values(grn_ctx *ctx, grn_suggest_learner *learner)
622 {
623  learner->post_event_id = GRN_RECORD_VALUE(learner->post_event);
624  learner->post_type_id = GRN_RECORD_VALUE(learner->post_type);
625  learner->post_item_id = GRN_RECORD_VALUE(learner->post_item);
626  learner->seq_id = GRN_RECORD_VALUE(learner->seq);
627  learner->post_time_value = GRN_TIME_VALUE(learner->post_time);
628 }
629 
630 static void
631 learner_init(grn_ctx *ctx, grn_suggest_learner *learner,
632  grn_obj *post_event, grn_obj *post_type, grn_obj *post_item,
633  grn_obj *seq, grn_obj *post_time, grn_obj *pairs)
634 {
635  learner->post_event = post_event;
636  learner->post_type = post_type;
637  learner->post_item = post_item;
638  learner->seq = seq;
639  learner->post_time = post_time;
640  learner->pairs = pairs;
641 
642  learner->learn_distance_in_seconds = 0;
643 
644  learner_init_values(ctx, learner);
645 }
646 
647 static void
648 learner_init_columns(grn_ctx *ctx, grn_suggest_learner *learner)
649 {
650  grn_id events_id, event_types_id;
651  grn_obj *seqs, *events, *post_item, *items, *pairs;
652 
653  learner->seqs = seqs = grn_ctx_at(ctx, GRN_OBJ_GET_DOMAIN(learner->seq));
654  learner->seqs_events = grn_obj_column(ctx, seqs, CONST_STR_LEN("events"));
655 
656  events_id = grn_obj_get_range(ctx, learner->seqs_events);
657  learner->events = events = grn_ctx_at(ctx, events_id);
658  learner->events_item = grn_obj_column(ctx, events, CONST_STR_LEN("item"));
659  learner->events_type = grn_obj_column(ctx, events, CONST_STR_LEN("type"));
660  learner->events_time = grn_obj_column(ctx, events, CONST_STR_LEN("time"));
661 
662  event_types_id = grn_obj_get_range(ctx, learner->events_type);
663  learner->event_types = grn_obj_column(ctx, events, CONST_STR_LEN("time"));
664 
665  post_item = learner->post_item;
666  learner->items = items = grn_ctx_at(ctx, GRN_OBJ_GET_DOMAIN(post_item));
667  learner->items_freq = grn_obj_column(ctx, items, CONST_STR_LEN("freq"));
668  learner->items_freq2 = grn_obj_column(ctx, items, CONST_STR_LEN("freq2"));
669  learner->items_last = grn_obj_column(ctx, items, CONST_STR_LEN("last"));
670 
671  pairs = learner->pairs;
672  learner->pairs_pre = grn_obj_column(ctx, pairs, CONST_STR_LEN("pre"));
673  learner->pairs_post = grn_obj_column(ctx, pairs, CONST_STR_LEN("post"));
674  learner->pairs_freq0 = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq0"));
675  learner->pairs_freq1 = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq1"));
676  learner->pairs_freq2 = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq2"));
677 }
678 
679 static void
680 learner_fin_columns(grn_ctx *ctx, grn_suggest_learner *learner)
681 {
682  grn_obj_unlink(ctx, learner->seqs);
683  grn_obj_unlink(ctx, learner->seqs_events);
684 
685  grn_obj_unlink(ctx, learner->events);
686  grn_obj_unlink(ctx, learner->events_item);
687  grn_obj_unlink(ctx, learner->events_type);
688  grn_obj_unlink(ctx, learner->events_time);
689 
690  grn_obj_unlink(ctx, learner->event_types);
691 
692  grn_obj_unlink(ctx, learner->items);
693  grn_obj_unlink(ctx, learner->items_freq);
694  grn_obj_unlink(ctx, learner->items_freq2);
695  grn_obj_unlink(ctx, learner->items_last);
696 
697  grn_obj_unlink(ctx, learner->pairs_pre);
698  grn_obj_unlink(ctx, learner->pairs_post);
699  grn_obj_unlink(ctx, learner->pairs_freq0);
700  grn_obj_unlink(ctx, learner->pairs_freq1);
701  grn_obj_unlink(ctx, learner->pairs_freq2);
702 }
703 
704 static void
705 learner_init_weight(grn_ctx *ctx, grn_suggest_learner *learner)
706 {
707  grn_obj *weight_column = NULL;
708  unsigned int weight = 1;
709 
710  if (learner->configuration) {
711  weight_column = grn_obj_column(ctx,
712  learner->configuration,
713  CONST_STR_LEN("weight"));
714  }
715  if (weight_column) {
716  grn_id id;
717  id = grn_table_get(ctx, learner->configuration,
718  GRN_TEXT_VALUE(&(learner->dataset_name)),
719  GRN_TEXT_LEN(&(learner->dataset_name)));
720  if (id != GRN_ID_NIL) {
721  grn_obj weight_value;
722  GRN_UINT32_INIT(&weight_value, 0);
723  grn_obj_get_value(ctx, weight_column, id, &weight_value);
724  weight = GRN_UINT32_VALUE(&weight_value);
725  GRN_OBJ_FIN(ctx, &weight_value);
726  }
727  grn_obj_unlink(ctx, weight_column);
728  }
729 
730  GRN_UINT32_INIT(&(learner->weight), 0);
731  GRN_UINT32_SET(ctx, &(learner->weight), weight);
732 }
733 
734 static void
735 learner_init_dataset_name(grn_ctx *ctx, grn_suggest_learner *learner)
736 {
737  char events_name[GRN_TABLE_MAX_KEY_SIZE];
738  unsigned int events_name_size;
739  unsigned int events_name_prefix_size;
740 
741  events_name_size = grn_obj_name(ctx, learner->events,
742  events_name, GRN_TABLE_MAX_KEY_SIZE);
743  GRN_TEXT_INIT(&(learner->dataset_name), 0);
744  events_name_prefix_size = strlen("event_");
745  if (events_name_size > events_name_prefix_size) {
746  GRN_TEXT_PUT(ctx,
747  &(learner->dataset_name),
748  events_name + events_name_prefix_size,
749  events_name_size - events_name_prefix_size);
750  }
751 }
752 
753 static void
754 learner_fin_dataset_name(grn_ctx *ctx, grn_suggest_learner *learner)
755 {
756  GRN_OBJ_FIN(ctx, &(learner->dataset_name));
757 }
758 
759 static void
760 learner_init_configuration(grn_ctx *ctx, grn_suggest_learner *learner)
761 {
762  learner->configuration = grn_ctx_get(ctx, "configuration", -1);
763 }
764 
765 static void
766 learner_fin_configuration(grn_ctx *ctx, grn_suggest_learner *learner)
767 {
768  if (learner->configuration) {
769  grn_obj_unlink(ctx, learner->configuration);
770  }
771 }
772 
773 static void
774 learner_init_buffers(grn_ctx *ctx, grn_suggest_learner *learner)
775 {
776  learner_init_weight(ctx, learner);
777  GRN_RECORD_INIT(&(learner->pre_events), 0, grn_obj_id(ctx, learner->events));
778 }
779 
780 static void
781 learner_fin_buffers(grn_ctx *ctx, grn_suggest_learner *learner)
782 {
783  grn_obj_unlink(ctx, &(learner->weight));
784  grn_obj_unlink(ctx, &(learner->pre_events));
785 }
786 
787 static void
788 learner_init_submit_learn(grn_ctx *ctx, grn_suggest_learner *learner)
789 {
790  grn_id items_id;
791 
792  learner->key_prefix = ((uint64_t)learner->post_item_id) << 32;
793 
794  items_id = grn_obj_get_range(ctx, learner->events_item);
795  GRN_RECORD_INIT(&(learner->pre_item), 0, items_id);
796 
797  grn_obj_get_value(ctx, learner->seqs_events, learner->seq_id,
798  &(learner->pre_events));
799 }
800 
801 static void
802 learner_fin_submit_learn(grn_ctx *ctx, grn_suggest_learner *learner)
803 {
804  grn_obj_unlink(ctx, &(learner->pre_item));
805  GRN_BULK_REWIND(&(learner->pre_events));
806 }
807 
808 static grn_bool
809 learner_is_valid_input(grn_ctx *ctx, grn_suggest_learner *learner)
810 {
811  return learner->post_event_id && learner->post_item_id && learner->seq_id;
812 }
813 
814 static void
815 learner_increment(grn_ctx *ctx, grn_suggest_learner *learner,
816  grn_obj *column, grn_id record_id)
817 {
818  grn_obj_set_value(ctx, column, record_id, &(learner->weight), GRN_OBJ_INCR);
819 }
820 
821 static void
822 learner_increment_item_freq(grn_ctx *ctx, grn_suggest_learner *learner,
823  grn_obj *column)
824 {
825  learner_increment(ctx, learner, column, learner->post_item_id);
826 }
827 
828 static void
829 learner_set_last_post_time(grn_ctx *ctx, grn_suggest_learner *learner)
830 {
831  grn_obj_set_value(ctx, learner->items_last, learner->post_item_id,
832  learner->post_time, GRN_OBJ_SET);
833 }
834 
835 static void
836 learner_learn_for_complete_and_correcnt(grn_ctx *ctx,
837  grn_suggest_learner *learner)
838 {
839  grn_obj *pre_item, *post_item, *pre_events;
840  grn_obj pre_type, pre_time;
841  grn_id *ep, *es;
842  uint64_t key;
843  int64_t post_time_value;
844 
845  pre_item = &(learner->pre_item);
846  post_item = learner->post_item;
847  pre_events = &(learner->pre_events);
848  post_time_value = learner->post_time_value;
849  GRN_RECORD_INIT(&pre_type, 0, grn_obj_get_range(ctx, learner->events_type));
850  GRN_TIME_INIT(&pre_time, 0);
851  ep = (grn_id *)GRN_BULK_CURR(pre_events);
852  es = (grn_id *)GRN_BULK_HEAD(pre_events);
853  while (es < ep--) {
854  grn_id pair_id;
855  int added;
856  int64_t learn_distance;
857 
858  GRN_BULK_REWIND(&pre_type);
859  GRN_BULK_REWIND(&pre_time);
860  GRN_BULK_REWIND(pre_item);
861  grn_obj_get_value(ctx, learner->events_type, *ep, &pre_type);
862  grn_obj_get_value(ctx, learner->events_time, *ep, &pre_time);
863  grn_obj_get_value(ctx, learner->events_item, *ep, pre_item);
864  learn_distance = post_time_value - GRN_TIME_VALUE(&pre_time);
865  if (learn_distance >= MIN_LEARN_DISTANCE) {
866  learner->learn_distance_in_seconds =
867  (int)(learn_distance / GRN_TIME_USEC_PER_SEC);
868  break;
869  }
870  key = learner->key_prefix + GRN_RECORD_VALUE(pre_item);
871  pair_id = grn_table_add(ctx, learner->pairs, &key, sizeof(uint64_t),
872  &added);
873  if (added) {
874  grn_obj_set_value(ctx, learner->pairs_pre, pair_id, pre_item,
875  GRN_OBJ_SET);
876  grn_obj_set_value(ctx, learner->pairs_post, pair_id, post_item,
877  GRN_OBJ_SET);
878  }
879  if (GRN_RECORD_VALUE(&pre_type)) {
880  learner_increment(ctx, learner, learner->pairs_freq1, pair_id);
881  break;
882  } else {
883  learner_increment(ctx, learner, learner->pairs_freq0, pair_id);
884  }
885  }
886  GRN_OBJ_FIN(ctx, &pre_type);
887  GRN_OBJ_FIN(ctx, &pre_time);
888 }
889 
890 static void
891 learner_learn_for_suggest(grn_ctx *ctx, grn_suggest_learner *learner)
892 {
893  char keybuf[GRN_TABLE_MAX_KEY_SIZE];
894  int keylen = grn_table_get_key(ctx, learner->items, learner->post_item_id,
895  keybuf, GRN_TABLE_MAX_KEY_SIZE);
896  unsigned int token_flags = 0;
897  grn_token *token = grn_token_open(ctx, learner->items, keybuf, keylen,
898  GRN_TOKEN_ADD, token_flags);
899  if (token) {
900  grn_id tid;
901  grn_obj *pre_item = &(learner->pre_item);
902  grn_obj *post_item = learner->post_item;
903  grn_hash *token_ids = NULL;
904  while ((tid = grn_token_next(ctx, token)) && tid != learner->post_item_id) {
905  uint64_t key;
906  int added;
907  grn_id pair_id;
908  key = learner->key_prefix + tid;
909  pair_id = grn_table_add(ctx, learner->pairs, &key, sizeof(uint64_t),
910  &added);
911  if (added) {
912  GRN_RECORD_SET(ctx, pre_item, tid);
913  grn_obj_set_value(ctx, learner->pairs_pre, pair_id,
914  pre_item, GRN_OBJ_SET);
915  grn_obj_set_value(ctx, learner->pairs_post, pair_id,
916  post_item, GRN_OBJ_SET);
917  }
918  if (!token_ids) {
919  token_ids = grn_hash_create(ctx, NULL, sizeof(grn_id), 0,
921  }
922  if (token_ids) {
923  int token_added;
924  grn_hash_add(ctx, token_ids, &tid, sizeof(grn_id), NULL, &token_added);
925  if (token_added) {
926  learner_increment(ctx, learner, learner->pairs_freq2, pair_id);
927  }
928  }
929  }
930  if (token_ids) {
931  grn_hash_close(ctx, token_ids);
932  }
933  grn_token_close(ctx, token);
934  }
935 }
936 
937 static void
938 learner_append_post_event(grn_ctx *ctx, grn_suggest_learner *learner)
939 {
940  GRN_RECORD_SET(ctx, &(learner->pre_events), learner->post_event_id);
941  grn_obj_set_value(ctx, learner->seqs_events, learner->seq_id,
942  &(learner->pre_events), GRN_OBJ_APPEND);
943 }
944 
945 static void
946 learner_learn(grn_ctx *ctx, grn_suggest_learner *learner)
947 {
948  if (learner_is_valid_input(ctx, learner)) {
949  learner_init_columns(ctx, learner);
950  learner_init_dataset_name(ctx, learner);
951  learner_init_configuration(ctx, learner);
952  learner_init_buffers(ctx, learner);
953  learner_increment_item_freq(ctx, learner, learner->items_freq);
954  learner_set_last_post_time(ctx, learner);
955  if (learner->post_type_id) {
956  learner_init_submit_learn(ctx, learner);
957  learner_increment_item_freq(ctx, learner, learner->items_freq2);
958  learner_learn_for_complete_and_correcnt(ctx, learner);
959  learner_learn_for_suggest(ctx, learner);
960  learner_fin_submit_learn(ctx, learner);
961  }
962  learner_append_post_event(ctx, learner);
963  learner_fin_buffers(ctx, learner);
964  learner_fin_configuration(ctx, learner);
965  learner_fin_dataset_name(ctx, learner);
966  learner_fin_columns(ctx, learner);
967  }
968 }
969 
970 static grn_obj *
971 func_suggest_preparer(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
972 {
973  int learn_distance_in_seconds = 0;
974  grn_obj *obj;
975  if (nargs == 6) {
976  grn_obj *post_event = args[0];
977  grn_obj *post_type = args[1];
978  grn_obj *post_item = args[2];
979  grn_obj *seq = args[3];
980  grn_obj *post_time = args[4];
981  grn_obj *pairs = args[5];
982  grn_suggest_learner learner;
983  learner_init(ctx, &learner,
984  post_event, post_type, post_item, seq, post_time, pairs);
985  learner_learn(ctx, &learner);
986  learn_distance_in_seconds = learner.learn_distance_in_seconds;
987  }
988  if ((obj = GRN_PROC_ALLOC(GRN_DB_UINT32, 0))) {
989  GRN_UINT32_SET(ctx, obj, learn_distance_in_seconds);
990  }
991  return obj;
992 }
993 
994 grn_rc
996 {
997  return GRN_SUCCESS;
998 }
999 
1000 grn_rc
1002 {
1003  grn_expr_var vars[] = {
1004  {CONST_STR_LEN("types")},
1005  {CONST_STR_LEN("table")},
1006  {CONST_STR_LEN("column")},
1007  {CONST_STR_LEN("query")},
1008  {CONST_STR_LEN("sortby")},
1009  {CONST_STR_LEN("output_columns")},
1010  {CONST_STR_LEN("offset")},
1011  {CONST_STR_LEN("limit")},
1012  {CONST_STR_LEN("frequency_threshold")},
1013  {CONST_STR_LEN("conditional_probability_threshold")},
1014  {CONST_STR_LEN("prefix_search")},
1015  {CONST_STR_LEN("similar_search")}
1016  };
1017  GRN_TEXT_INIT(&vars[0].value, 0);
1018  GRN_TEXT_INIT(&vars[1].value, 0);
1019  GRN_TEXT_INIT(&vars[2].value, 0);
1020  GRN_TEXT_INIT(&vars[3].value, 0);
1021  GRN_TEXT_INIT(&vars[4].value, 0);
1022  GRN_TEXT_INIT(&vars[5].value, 0);
1023  GRN_TEXT_INIT(&vars[6].value, 0);
1024  GRN_TEXT_INIT(&vars[7].value, 0);
1025  GRN_TEXT_INIT(&vars[8].value, 0);
1026  GRN_TEXT_INIT(&vars[9].value, 0);
1027  GRN_TEXT_INIT(&vars[10].value, 0);
1028  GRN_TEXT_INIT(&vars[11].value, 0);
1030  command_suggest, NULL, NULL, 12, vars);
1031 
1032  grn_proc_create(ctx, CONST_STR_LEN("suggest_preparer"), GRN_PROC_FUNCTION,
1033  func_suggest_preparer, NULL, NULL, 0, NULL);
1034  return ctx->rc;
1035 }
1036 
1037 grn_rc
1039 {
1040  return GRN_SUCCESS;
1041 }