MySQL 5.6.14 Source Code Document
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
fts0que.cc
Go to the documentation of this file.
1 /*****************************************************************************
2 
3 Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify it under
6 the terms of the GNU General Public License as published by the Free Software
7 Foundation; version 2 of the License.
8 
9 This program is distributed in the hope that it will be useful, but WITHOUT
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12 
13 You should have received a copy of the GNU General Public License along with
14 this program; if not, write to the Free Software Foundation, Inc.,
15 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
16 
17 *****************************************************************************/
18 
19 /**************************************************/
27 #include "dict0dict.h" /* dict_table_get_n_rows() */
28 #include "ut0rbt.h"
29 #include "row0sel.h"
30 #include "fts0fts.h"
31 #include "fts0priv.h"
32 #include "fts0ast.h"
33 #include "fts0pars.h"
34 #include "fts0types.h"
35 #include "ha_prototypes.h"
36 #include <ctype.h>
37 
38 #ifndef UNIV_NONINL
39 #include "fts0types.ic"
40 #include "fts0vlc.ic"
41 #endif
42 
43 #include <string>
44 #include <vector>
45 #include <map>
46 
47 #define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
48 
49 #define RANK_DOWNGRADE (-1.0F)
50 #define RANK_UPGRADE (1.0F)
51 
52 /* Maximum number of words supported in a proximity search.
53 FIXME, this limitation can be removed easily. Need to see
54 if we want to enforce such limitation */
55 #define MAX_PROXIMITY_ITEM 128
56 
57 /* Memory used by rbt itself for create and node add */
58 #define SIZEOF_RBT_CREATE sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2
59 #define SIZEOF_RBT_NODE_ADD sizeof(ib_rbt_node_t)
60 
61 /*Initial byte length for 'words' in fts_ranking_t */
62 #define RANKING_WORDS_INIT_LEN 4
63 
64 /* Coeffecient to use for normalize relevance ranking. */
65 static const double FTS_NORMALIZE_COEFF = 0.0115F;
66 
67 // FIXME: Need to have a generic iterator that traverses the ilist.
68 
69 typedef std::map<std::string, ulint> word_map_t;
70 typedef std::vector<std::string> word_vector_t;
71 
72 struct fts_word_freq_t;
73 
/* NOTE(review): the doxygen extraction dropped many member lines here
(see the jumps in the embedded line numbering, e.g. 75 -> 82); this is
NOT the complete struct. Only the visible members are annotated, and
hedged where the declaration alone does not prove the semantics. */
 75 struct fts_query_t {
 82  fts_table_t fts_common_table; /* handle for the FTS common auxiliary table -- assumption, verify */
 83 
 86  ulint total_size; /* bytes of memory charged to this query; adjusted throughout this file with SIZEOF_RBT_CREATE / SIZEOF_RBT_NODE_ADD */
 95  word_map_t* word_map; /* word -> bit-position dictionary (see the bitmap optimization comment above fts_ranking_words_add) */
 98  word_vector_t* word_vector; /* positional backup of word_map: bit position -> word */
 112  que_t* read_nodes_graph; /* presumably a prepared query graph for reading index nodes -- TODO confirm */
 113 
 118  ibool collect_positions; /* whether word positions are collected -- assumption, verify against proximity/phrase code */
 119 
 120  ulint flags; /* query flags; compared elsewhere in this file against FTS_OPT_RANKING, FTS_PROXIMITY, FTS_PHRASE */
 124  ulint distance; /* proximity distance -- assumption, verify */
 144  ib_uint64_t total_docs; /* total documents -- assumption based on name, verify */
 146  ulint total_words; /* total words -- assumption based on name, verify */
 155  bool multi_exist; /* true when multiple '+' (FTS_EXIST) terms are present; see the comment in fts_query_intersect_doc_id */
 156 };
157 
/* NOTE(review): extraction dropped member lines (numbering jumps
160 -> 163 -> 169); the upstream struct has more members -- TODO
confirm against the original fts0que.cc. */
 160 struct fts_match_t {
 163  ulint start; /* start position of the match -- assumption based on name */
 169 };
170 
/* NOTE(review): incomplete -- extraction dropped member lines
(numbering jumps 174 -> 177 -> 181 -> 187). */
 174 struct fts_select_t {
 177  ulint min_pos; /* minimum position -- assumption based on name */
 181  ibool found; /* TRUE if the lookup succeeded -- assumption */
 187 };
188 
/* NOTE(review): the struct header line was dropped by the extraction;
by the members and its use in fts_proximity_get_positions this is
presumably `struct fts_proximity_t {` -- TODO confirm. */
 195  ulint n_pos; /* number of position ranges -- assumption */
 198  ulint* min_pos; /* array of range start positions -- assumption */
 200  ulint* max_pos; /* array of range end positions -- assumption */
 202 };
203 
/* NOTE(review): incomplete -- the member names for the two pointer
fields below (lines 209/212 in the original) were dropped by the
extraction; do not edit this struct without the original source. */
 205 struct fts_phrase_t {
 206  ibool found; /* TRUE if the phrase was matched -- assumption */
 208  const fts_match_t*
 211  const ib_vector_t*
 214  ulint distance; /* proximity distance -- assumption */
 218  ulint zip_size; /* compressed page size -- assumption based on InnoDB naming */
 222 };
223 
/* NOTE(review): the struct header and the doc_id member were dropped
by the extraction; fts_freq_doc_id_cmp and fts_query_add_doc_freq both
access a doc_id field, so this is presumably
`struct fts_doc_freq_t { doc_id_t doc_id; ... };` -- TODO confirm. */
 227  ulint freq; /* word frequency within the document -- assumption */
 228 };
229 
/* NOTE(review): the struct header was dropped by the extraction
(presumably `struct fts_word_freq_t {`), as was the doc_freqs rb-tree
member that fts_query_add_word_freq initializes -- TODO confirm. */
 232  byte* word; /* the word itself; NUL-terminated copy made in fts_query_add_word_freq */
 238  ib_uint64_t doc_count; /* number of documents containing the word; zeroed on creation */
 240  double idf; /* inverse document frequency -- assumption based on name */
 241 };
242 
243 /********************************************************************
244 Callback function to fetch the rows in an FTS INDEX record.
245 @return always TRUE */
246 static
247 ibool
248 fts_query_index_fetch_nodes(
249 /*========================*/
250  void* row,
251  void* user_arg);
253 /********************************************************************
254 Read and filter nodes.
255 @return fts_node_t instance */
256 static
257 dberr_t
258 fts_query_filter_doc_ids(
259 /*=====================*/
260  fts_query_t* query,
261  const byte* word,
262  fts_word_freq_t*word_freq,
263  const fts_node_t*
264  node,
265  void* data,
266  ulint len,
267  ibool calc_doc_count);
270 #if 0
271 /*****************************************************************//***
272 Find a doc_id in a word's ilist.
273 @return TRUE if found. */
274 static
275 ibool
276 fts_query_find_doc_id(
277 /*==================*/
278  fts_select_t* select,
280  void* data,
281  ulint len);
282 #endif
283 
284 /*************************************************************/
290 static
291 dberr_t
292 fts_expand_query(
293 /*=============*/
295  fts_query_t* query)
297  __attribute__((nonnull, warn_unused_result));
298 /*************************************************************/
304 static
305 ibool
306 fts_phrase_or_proximity_search(
307 /*===========================*/
308  fts_query_t* query,
311  ib_vector_t* tokens);
312 /*************************************************************/
319 static
320 bool
321 fts_proximity_get_positions(
322 /*========================*/
323  fts_match_t** match,
324  ulint num_match,
326  ulint distance,
328  fts_proximity_t* qualified_pos);
331 #if 0
332 /********************************************************************
333 Get the total number of words in a documents. */
334 static
335 ulint
336 fts_query_terms_in_document(
337 /*========================*/
340  fts_query_t* query,
341  doc_id_t doc_id,
342  ulint* total);
343 #endif
344 
345 /********************************************************************
346 Compare two fts_doc_freq_t doc_ids.
347 @return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
348 UNIV_INLINE
349 int
/* NOTE(review): the extraction dropped original line 350 here, which
carried the function name (fts_freq_doc_id_cmp in the upstream source
-- TODO confirm against the original file). */
351 /*================*/
352  const void* p1,
353  const void* p2)
354 {
 /* Casts assume both elements are fts_doc_freq_t, as stored by the
 doc_freqs rb trees in this file -- verify callers. */
355  const fts_doc_freq_t* fq1 = (const fts_doc_freq_t*) p1;
356  const fts_doc_freq_t* fq2 = (const fts_doc_freq_t*) p2;
357 
 /* Sign-of-difference comparison. NOTE(review): doc_id appears to be
 a 64-bit type elsewhere (doc_id_t); the (int) cast can truncate large
 differences -- TODO confirm doc_id width and value range. */
358  return((int) (fq1->doc_id - fq2->doc_id));
359 }
360 
361 #if 0
362 /*******************************************************************/
364 static
365 void
366 fts_print_lcs_table(
367 /*================*/
368  const ulint* table,
369  ulint n_rows,
370  ulint n_cols)
371 {
372  ulint i;
373 
374  for (i = 0; i < n_rows; ++i) {
375  ulint j;
376 
377  printf("\n");
378 
379  for (j = 0; j < n_cols; ++j) {
380 
381  printf("%2lu ", FTS_ELEM(table, n_cols, i, j));
382  }
383  }
384 }
385 
386 /********************************************************************
387 Find the longest common subsequence between the query string and
388 the document. */
389 static
390 ulint
391 fts_query_lcs(
392 /*==========*/
395  const ulint* p1,
396  ulint len_p1,
397  const ulint* p2,
398  ulint len_p2)
399 {
400  int i;
401  ulint len = 0;
402  ulint r = len_p1;
403  ulint c = len_p2;
404  ulint size = (r + 1) * (c + 1) * sizeof(ulint);
405  ulint* table = (ulint*) ut_malloc(size);
406 
407  /* Traverse the table backwards, from the last row to the first and
408  also from the last column to the first. We compute the smaller
409  common subsequeces first, then use the caluclated values to determine
410  the longest common subsequence. The result will be in TABLE[0][0]. */
411  for (i = r; i >= 0; --i) {
412  int j;
413 
414  for (j = c; j >= 0; --j) {
415 
416  if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) {
417 
418  FTS_ELEM(table, c, i, j) = 0;
419 
420  } else if (p1[i] == p2[j]) {
421 
422  FTS_ELEM(table, c, i, j) = FTS_ELEM(
423  table, c, i + 1, j + 1) + 1;
424 
425  } else {
426 
427  ulint value;
428 
429  value = ut_max(
430  FTS_ELEM(table, c, i + 1, j),
431  FTS_ELEM(table, c, i, j + 1));
432 
433  FTS_ELEM(table, c, i, j) = value;
434  }
435  }
436  }
437 
438  len = FTS_ELEM(table, c, 0, 0);
439 
440  fts_print_lcs_table(table, r, c);
441  printf("\nLen=%lu\n", len);
442 
443  ut_free(table);
444 
445  return(len);
446 }
447 #endif
448 
449 /*******************************************************************/
453 static
454 int
455 fts_query_compare_rank(
456 /*===================*/
457  const void* p1,
458  const void* p2)
459 {
460  const fts_ranking_t* r1 = (const fts_ranking_t*) p1;
461  const fts_ranking_t* r2 = (const fts_ranking_t*) p2;
462 
463  if (r2->rank < r1->rank) {
464  return(-1);
465  } else if (r2->rank == r1->rank) {
466 
467  if (r1->doc_id < r2->doc_id) {
468  return(1);
469  } else if (r1->doc_id > r2->doc_id) {
470  return(1);
471  }
472 
473  return(0);
474  }
475 
476  return(1);
477 }
478 
479 #ifdef FTS_UTF8_DEBUG
480 /*******************************************************************/
/* Debug-only (FTS_UTF8_DEBUG): make a lowercased, NUL-terminated heap
copy of 'src'. Caller owns the returned buffer and must ut_free() it. */
484 static
485 byte*
486 fts_tolower(
487 /*========*/
488  const byte* src,
489  ulint len)
490 {
491  fts_string_t str;
492  byte* lc_str = ut_malloc(len + 1);
493 
494  str.f_len = len;
495  str.f_str = lc_str;
496 
497  memcpy(str.f_str, src, len);
498 
499  /* Make sure the last byte is NUL terminated */
500  str.f_str[len] = '\0';
501 
 /* Lowercase in place; fts_utf8_tolower presumably relies on the NUL
 terminator rather than f_len -- see the comment in fts_utf8_strcmp. */
502  fts_utf8_tolower(&str);
503 
504  return(lc_str);
505 }
506 
507 /*******************************************************************/
/* Debug-only (FTS_UTF8_DEBUG): compare str1 against a lowercased str2,
over str2's length only. NOTE(review): str2 is temporarily mutated
(NUL-terminated, lowercased IN PLACE, terminator restored) -- the
lowercasing itself is NOT undone, so str2's bytes may change. The
extraction dropped the parameter doc lines (517/519/520). */
511 static
512 int
513 fts_utf8_strcmp(
514 /*============*/
515  const fts_string_t*
516  str1,
518  fts_string_t* str2)
521 {
 /* Save the byte just past str2 so it can be restored after we
 temporarily write a NUL there. */
522  byte b = str2->f_str[str2->f_len];
523 
524  ut_a(str2->f_len <= str1->f_len);
525 
526  /* We need to write a NUL byte at the end of the string because the
527  string is converted to lowercase by a MySQL function which doesn't
528  care about the length. */
529  str2->f_str[str2->f_len] = 0;
530 
531  fts_utf8_tolower(str2);
532 
533  /* Restore the value we replaced above. */
534  str2->f_str[str2->f_len] = b;
535 
 /* Prefix comparison: only str2->f_len bytes are compared. */
536  return(memcmp(str1->f_str, str2->f_str, str2->f_len));
537 }
538 #endif
539 
540 /*******************************************************************/
542 static
543 void
544 fts_ranking_words_create(
545 /*=====================*/
546  fts_query_t* query,
547  fts_ranking_t* ranking)
548 {
549  ranking->words = static_cast<byte*>(
550  mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN));
551  ranking->words_len = RANKING_WORDS_INIT_LEN;
552 }
553 
554 /*
555 The optimization here is using a char array(bitmap) to replace words rb tree
556 in fts_ranking_t.
557 
558 It can save lots of memory except in some cases of QUERY EXPANSION.
559 
560 'word_map' is used as a word dictionary, in which the key is a word, the value
561 is a number. In 'fts_ranking_words_add', we first check if the word is in 'word_map'.
562 if not, we add it into 'word_map', and give it a position(actually a number).
563 then we set the corresponding bit to '1' at the position in the char array 'words'.
564 
565 'word_vector' is a useful backup of 'word_map', and we can get a word by its position,
566 more quickly than searching by value in 'word_map'. we use 'word_vector'
567 in 'fts_query_calculate_ranking' and 'fts_expand_query'. In the two functions, we need
568 to scan the bitmap 'words', and get a word when a bit is '1', then we get word_freq
569 by the word.
570 */
571 
572 /*******************************************************************/
574 static
575 void
576 fts_ranking_words_add(
577 /*==================*/
578  fts_query_t* query,
579  fts_ranking_t* ranking,
580  const char* word)
581 {
582  ulint pos;
583  ulint byte_offset;
584  ulint bit_offset;
585  word_map_t::iterator it;
586 
587  /* Note: we suppose the word map and vector are append-only */
588  /* Check if need to add it to word map */
589  it = query->word_map->lower_bound(word);
590  if (it != query->word_map->end()
591  && !query->word_map->key_comp()(word, it->first)) {
592  pos = it->second;
593  } else {
594  pos = query->word_map->size();
595  query->word_map->insert(it,
596  std::pair<std::string, ulint>(word, pos));
597 
598  query->word_vector->push_back(word);
599  }
600 
601  /* Check words len */
602  byte_offset = pos / CHAR_BIT;
603  if (byte_offset >= ranking->words_len) {
604  byte* words = ranking->words;
605  ulint words_len = ranking->words_len;
606 
607  while (byte_offset >= words_len) {
608  words_len *= 2;
609  }
610 
611  ranking->words = static_cast<byte*>(
612  mem_heap_zalloc(query->heap, words_len));
613  ut_memcpy(ranking->words, words, ranking->words_len);
614  ranking->words_len = words_len;
615  }
616 
617  /* Set ranking words */
618  ut_ad(byte_offset < ranking->words_len);
619  bit_offset = pos % CHAR_BIT;
620  ranking->words[byte_offset] |= 1 << bit_offset;
621 }
622 
623 /*******************************************************************/
626 static
627 bool
628 fts_ranking_words_get_next(
629 /*=======================*/
630  const fts_query_t* query,
631  fts_ranking_t* ranking,
632  ulint* pos,
633  byte** word)
634 {
635  bool ret = false;
636  ulint max_pos = ranking->words_len * CHAR_BIT;
637 
638  /* Search for next word */
639  while (*pos < max_pos) {
640  ulint byte_offset = *pos / CHAR_BIT;
641  ulint bit_offset = *pos % CHAR_BIT;
642 
643  if (ranking->words[byte_offset] & (1 << bit_offset)) {
644  ret = true;
645  break;
646  }
647 
648  *pos += 1;
649  };
650 
651  /* Get next word from word vector */
652  if (ret) {
653  ut_ad(*pos < query->word_vector->size());
654  *word = (byte*)query->word_vector->at((size_t)*pos).c_str();
655  *pos += 1;
656  }
657 
658  return ret;
659 }
660 
661 /*******************************************************************/
/* Look up 'word' in the query's word-frequency rb tree, creating a
zeroed entry (with a NUL-terminated heap copy of the word and a fresh
doc_freqs rb tree) if absent; returns the tree node's value.
NOTE(review): the extraction dropped line 666 (the fts_word_freq_t*
return type) and line 690 (the rbt_create() argument list) -- do not
edit this function without the original source. */
665 static
667 fts_query_add_word_freq(
668 /*====================*/
669  fts_query_t* query,
670  const byte* word)
671 {
672  ib_rbt_bound_t parent;
673 
674  /* Lookup the word in our rb tree and add if it doesn't exist. */
675  if (rbt_search(query->word_freqs, &parent, word) != 0) {
676  fts_word_freq_t word_freq;
677  ulint len = ut_strlen((char*) word) + 1;
678 
679  memset(&word_freq, 0, sizeof(word_freq));
680 
681  word_freq.word = static_cast<byte*>(
682  mem_heap_alloc(query->heap, len));
683 
684  /* Need to copy the NUL character too. */
685  memcpy(word_freq.word, word, len);
686 
687  word_freq.doc_count = 0;
688 
689  word_freq.doc_freqs = rbt_create(
691 
 /* rbt_add_node leaves 'parent.last' pointing at the new node, so the
 return below works for both the found and the just-added case. */
692  parent.last = rbt_add_node(
693  query->word_freqs, &parent, &word_freq);
694 
 /* Account the memory charged to this query. */
695  query->total_size += len
696  + SIZEOF_RBT_CREATE
697  + SIZEOF_RBT_NODE_ADD
698  + sizeof(fts_word_freq_t);
699  }
700 
701  return(rbt_value(fts_word_freq_t, parent.last));
702 }
703 
704 /*******************************************************************/
/* Look up 'doc_id' in the given per-word doc-frequency rb tree,
creating a zeroed entry if absent; returns the tree node's value.
NOTE(review): the extraction dropped line 708, which carried the
return type (fts_doc_freq_t* in the upstream source -- TODO confirm). */
707 static
709 fts_query_add_doc_freq(
710 /*===================*/
711  fts_query_t* query,
712  ib_rbt_t* doc_freqs,
713  doc_id_t doc_id)
714 {
715  ib_rbt_bound_t parent;
716 
717  /* Lookup the doc id in our rb tree and add if it doesn't exist. */
718  if (rbt_search(doc_freqs, &parent, &doc_id) != 0) {
719  fts_doc_freq_t doc_freq;
720 
721  memset(&doc_freq, 0, sizeof(doc_freq));
722 
723  doc_freq.freq = 0;
724  doc_freq.doc_id = doc_id;
725 
 /* rbt_add_node leaves 'parent.last' at the new node, so the return
 below is valid for both the found and the just-added case. */
726  parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq);
727 
728  query->total_size += SIZEOF_RBT_NODE_ADD
729  + sizeof(fts_doc_freq_t);
730  }
731 
732  return(rbt_value(fts_doc_freq_t, parent.last));
733 }
734 
735 /*******************************************************************/
738 static
739 void
740 fts_query_union_doc_id(
741 /*===================*/
742  fts_query_t* query,
743  doc_id_t doc_id,
744  fts_rank_t rank)
746 {
747  ib_rbt_bound_t parent;
748  ulint size = ib_vector_size(query->deleted->doc_ids);
749  fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
750 
751  /* Check if the doc id is deleted and it's not already in our set. */
752  if (fts_bsearch(array, 0, size, doc_id) < 0
753  && rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
754 
755  fts_ranking_t ranking;
756 
757  ranking.rank = rank;
758  ranking.doc_id = doc_id;
759  fts_ranking_words_create(query, &ranking);
760 
761  rbt_add_node(query->doc_ids, &parent, &ranking);
762 
763  query->total_size += SIZEOF_RBT_NODE_ADD
764  + sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN;
765  }
766 }
767 
768 /*******************************************************************/
771 static
772 void
773 fts_query_remove_doc_id(
774 /*====================*/
775  fts_query_t* query,
776  doc_id_t doc_id)
777 {
778  ib_rbt_bound_t parent;
779  ulint size = ib_vector_size(query->deleted->doc_ids);
780  fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
781 
782  /* Check if the doc id is deleted and it's in our set. */
783  if (fts_bsearch(array, 0, size, doc_id) < 0
784  && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
785  ut_free(rbt_remove_node(query->doc_ids, parent.last));
786 
787  ut_ad(query->total_size >
788  SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
789  query->total_size -= SIZEOF_RBT_NODE_ADD
790  + sizeof(fts_ranking_t);
791  }
792 }
793 
794 /*******************************************************************/
/* Adjust the rank of a doc id already in the result set: subtract
RANK_DOWNGRADE when 'downgrade' is TRUE, otherwise add RANK_UPGRADE,
clamping the result to [-1.0, 1.0]. No-op if the doc is deleted or
not in the set. */
800 static
801 void
802 fts_query_change_ranking(
803 /*====================*/
804  fts_query_t* query,
805  doc_id_t doc_id,
806  ibool downgrade)
807 {
808  ib_rbt_bound_t parent;
809  ulint size = ib_vector_size(query->deleted->doc_ids);
810  fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
811 
812  /* Only adjust doc ids that are NOT in the deleted list (bsearch
 miss, < 0) and that ARE in our result set (rbt_search hit, == 0). */
813  if (fts_bsearch(array, 0, size, doc_id) < 0
814  && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
815 
816  fts_ranking_t* ranking;
817 
818  ranking = rbt_value(fts_ranking_t, parent.last);
819 
820  ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE;
821 
822  /* Clamp to [-1.0, 1.0]. NOTE(review): the original comment
823  claimed the step was +/-0.5, but RANK_DOWNGRADE/RANK_UPGRADE
 are defined as -1.0F/+1.0F at the top of this file, so a single
 adjustment already saturates the clamp. */
824  if (ranking->rank >= 1.0F) {
825  ranking->rank = 1.0F;
826  } else if (ranking->rank <= -1.0F) {
827  ranking->rank = -1.0F;
828  }
829  }
830 }
831 
832 /*******************************************************************/
/* Add a doc id to the intersection set built for an FTS_EXIST ('+')
term. Handles three cases (see the comment below). Note the ownership
transfer: when a doc is promoted from doc_ids to the intersection, its
words bitmap pointer moves and the doc_ids copy is set to NULL. */
836 static
837 void
838 fts_query_intersect_doc_id(
839 /*=======================*/
840  fts_query_t* query,
841  doc_id_t doc_id,
842  fts_rank_t rank)
844 {
845  ib_rbt_bound_t parent;
846  ulint size = ib_vector_size(query->deleted->doc_ids);
847  fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
848  fts_ranking_t* ranking= NULL;
849 
850  /* There are three types of intersect:
851  1. '+a': doc_ids is empty, add doc into intersect if it matches 'a'.
852  2. 'a +b': docs match 'a' is in doc_ids, add doc into intersect
853  if it matches 'b'. if the doc is also in doc_ids, then change the
854  doc's rank, and add 'a' in doc's words.
855  3. '+a +b': docs matching '+a' is in doc_ids, add doc into intsersect
856  if it matches 'b' and it's in doc_ids.(multi_exist = true). */
857 
858  /* Proceed only if the doc id is NOT in the deleted list. */
859  if (fts_bsearch(array, 0, size, doc_id) < 0) {
860  fts_ranking_t new_ranking;
861 
862  if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
 /* Not in doc_ids: with multi_exist every candidate must already be
 in doc_ids, so drop it; otherwise start a fresh ranking. */
863  if (query->multi_exist) {
864  return;
865  } else {
866  new_ranking.words = NULL;
867  }
868  } else {
869  ranking = rbt_value(fts_ranking_t, parent.last);
870 
871  /* We've just checked the doc id before: a NULL words pointer
 means the intersection already owns this doc's bitmap. */
872  if (ranking->words == NULL) {
873  ut_ad(rbt_search(query->intersection, &parent,
874  ranking) == 0);
875  return;
876  }
877 
878  /* Merge rank, clamped to [-1.0, 1.0]. */
879  rank += ranking->rank;
880  if (rank >= 1.0F) {
881  rank = 1.0F;
882  } else if (rank <= -1.0F) {
883  rank = -1.0F;
884  }
885 
886  /* Take words */
887  new_ranking.words = ranking->words;
888  new_ranking.words_len = ranking->words_len;
889  }
890 
891  new_ranking.rank = rank;
892  new_ranking.doc_id = doc_id;
893 
894  if (rbt_search(query->intersection, &parent,
895  &new_ranking) != 0) {
896  if (new_ranking.words == NULL) {
897  fts_ranking_words_create(query, &new_ranking);
898 
899  query->total_size += RANKING_WORDS_INIT_LEN;
900  } else {
901  /* Note that the intersection has taken
902  ownership of the ranking data. */
903  ranking->words = NULL;
904  }
905 
906  rbt_add_node(query->intersection,
907  &parent, &new_ranking);
908 
909  query->total_size += SIZEOF_RBT_NODE_ADD
910  + sizeof(fts_ranking_t);
911  }
912  }
913 }
914 
915 /*******************************************************************/
917 static
918 void
919 fts_query_free_doc_ids(
920 /*===================*/
921  fts_query_t* query,
922  ib_rbt_t* doc_ids)
923 {
924  const ib_rbt_node_t* node;
925 
926  for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) {
927 
928  fts_ranking_t* ranking;
929 
930  ranking = rbt_value(fts_ranking_t, node);
931 
932  if (ranking->words) {
933  ranking->words = NULL;
934  }
935 
936  ut_free(rbt_remove_node(doc_ids, node));
937 
938  ut_ad(query->total_size >
939  SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
940  query->total_size -= SIZEOF_RBT_NODE_ADD
941  + sizeof(fts_ranking_t);
942  }
943 
944  rbt_free(doc_ids);
945 
946  ut_ad(query->total_size > SIZEOF_RBT_CREATE);
947  query->total_size -= SIZEOF_RBT_CREATE;
948 }
949 
950 /*******************************************************************/
953 static
954 void
955 fts_query_add_word_to_document(
956 /*===========================*/
957  fts_query_t* query,
958  doc_id_t doc_id,
959  const byte* word)
960 {
961  ib_rbt_bound_t parent;
962  fts_ranking_t* ranking = NULL;
963 
964  if (query->flags == FTS_OPT_RANKING) {
965  return;
966  }
967 
968  /* First we search the intersection RB tree as it could have
969  taken ownership of the words rb tree instance. */
970  if (query->intersection
971  && rbt_search(query->intersection, &parent, &doc_id) == 0) {
972 
973  ranking = rbt_value(fts_ranking_t, parent.last);
974  }
975 
976  if (ranking == NULL
977  && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
978 
979  ranking = rbt_value(fts_ranking_t, parent.last);
980  }
981 
982  if (ranking != NULL) {
983  fts_ranking_words_add(query, ranking, (char*)word);
984  }
985 }
986 
987 /*******************************************************************/
/* Process one FTS index node for 'token': skip it when an FTS_EXIST
query's doc-id window proves the node cannot contribute, otherwise
filter its ilist into the query state via fts_query_filter_doc_ids.
Any error is reported through query->error. */
989 static
990 void
991 fts_query_check_node(
992 /*=================*/
993  fts_query_t* query,
994  const fts_string_t* token,
995  const fts_node_t* node)
996 {
997  /* Skip nodes whose doc ids are out range. */
998  if (query->oper == FTS_EXIST
999  && ((query->upper_doc_id > 0
1000  && node->first_doc_id > query->upper_doc_id)
1001  || (query->lower_doc_id > 0
1002  && node->last_doc_id < query->lower_doc_id))) {
1003 
1004  /* Ignore */
1005 
1006  } else {
1007  int ret;
1008  ib_rbt_bound_t parent;
1009  ulint ilist_size = node->ilist_size;
1010  fts_word_freq_t*word_freqs;
1011 
1012  /* The word must exist. */
1013  ret = rbt_search(query->word_freqs, &parent, token->f_str);
1014  ut_a(ret == 0);
1015 
1016  word_freqs = rbt_value(fts_word_freq_t, parent.last);
1017 
 /* TRUE: also update the per-word document count while filtering. */
1018  query->error = fts_query_filter_doc_ids(
1019  query, token->f_str, word_freqs, node,
1020  node->ilist, ilist_size, TRUE);
1021  }
1022 }
1023 
1024 /*****************************************************************/
/* Prefix ('abc%') search of the index cache: strip the trailing '%',
find the nearest cache word, then walk matching neighbours first
backwards and then forwards from the start node, filtering each
matched word's ilist into the query. Returns the number of words
matched (0 on error).
NOTE(review): the extraction dropped line 1051 (the remaining
rbt_search_cmp arguments), line 1060 (the loop head, presumably
`while (innobase_fts_text_cmp_prefix(` or similar -- TODO confirm) and
line 1081 (the rbt_value type argument, presumably fts_word_freq_t).
Do not edit this function without the original source. */
1027 static
1028 ulint
1029 fts_cache_find_wildcard(
1030 /*====================*/
1031  fts_query_t* query,
1032  const fts_index_cache_t*index_cache,
1033  const fts_string_t* token)
1034 {
1035  ib_rbt_bound_t parent;
1036  const ib_vector_t* nodes = NULL;
1037  fts_string_t srch_text;
1038  byte term[FTS_MAX_WORD_LEN + 1];
1039  ulint num_word = 0;
1040 
 /* Drop the trailing '%' wildcard, if present, from the search text. */
1041  srch_text.f_len = (token->f_str[token->f_len - 1] == '%')
1042  ? token->f_len - 1
1043  : token->f_len;
1044 
1045  strncpy((char*) term, (char*) token->f_str, srch_text.f_len);
1046  term[srch_text.f_len] = '\0';
1047  srch_text.f_str = term;
1048 
1049  /* Lookup the word in the rb tree */
1050  if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL,
1052  const fts_tokenizer_word_t* word;
1053  ulint i;
1054  const ib_rbt_node_t* cur_node;
1055  ibool forward = FALSE;
1056 
1057  word = rbt_value(fts_tokenizer_word_t, parent.last);
1058  cur_node = parent.last;
1059 
1061  index_cache->charset, &srch_text, &word->text) == 0) {
1062 
1063  nodes = word->nodes;
1064 
1065  for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
1066  int ret;
1067  const fts_node_t* node;
1068  ib_rbt_bound_t freq_parent;
1069  fts_word_freq_t* word_freqs;
1070 
1071  node = static_cast<const fts_node_t*>(
1072  ib_vector_get_const(nodes, i));
1073 
 /* The frequency entry for the search text must already exist. */
1074  ret = rbt_search(query->word_freqs,
1075  &freq_parent,
1076  srch_text.f_str);
1077 
1078  ut_a(ret == 0);
1079 
1080  word_freqs = rbt_value(
1082  freq_parent.last);
1083 
1084  query->error = fts_query_filter_doc_ids(
1085  query, srch_text.f_str,
1086  word_freqs, node,
1087  node->ilist, node->ilist_size, TRUE);
1088 
1089  if (query->error != DB_SUCCESS) {
1090  return(0);
1091  }
1092  }
1093 
1094  num_word++;
1095 
 /* Walk backwards from the start node first, then forwards
 (see the goto below, which restarts at parent.last). */
1096  if (!forward) {
1097  cur_node = rbt_prev(
1098  index_cache->words, cur_node);
1099  } else {
1100 cont_search:
1101  cur_node = rbt_next(
1102  index_cache->words, cur_node);
1103  }
1104 
1105  if (!cur_node) {
1106  break;
1107  }
1108 
1109  word = rbt_value(fts_tokenizer_word_t, cur_node);
1110  }
1111 
1112  if (!forward) {
1113  forward = TRUE;
1114  cur_node = parent.last;
1115  goto cont_search;
1116  }
1117  }
1118 
1119  return(num_word);
1120 }
1121 
1122 /*****************************************************************/
/* Handle an FTS_IGNORE ('-') term: search the cache (under the cache
x-lock) and then the on-disk index for 'token', filtering matches so
that matching doc ids are removed from the current result set. A
no-op when the current set is empty. Errors flow via query->error. */
1125 static __attribute__((nonnull, warn_unused_result))
1126 dberr_t
1127 fts_query_difference(
1128 /*=================*/
1129  fts_query_t* query,
1130  const fts_string_t* token)
1131 {
1132  ulint n_doc_ids= 0;
1133  trx_t* trx = query->trx;
1134  dict_table_t* table = query->index->table;
1135 
1136  ut_a(query->oper == FTS_IGNORE);
1137 
1138 #ifdef FTS_INTERNAL_DIAG_PRINT
1139  fprintf(stderr, "DIFFERENCE: Searching: '%.*s'\n",
1140  (int) token->f_len, token->f_str);
1141 #endif
1142 
1143  if (query->doc_ids) {
1144  n_doc_ids = rbt_size(query->doc_ids);
1145  }
1146 
1147  /* There is nothing we can substract from an empty set. */
1148  if (query->doc_ids && !rbt_empty(query->doc_ids)) {
1149  ulint i;
1150  fts_fetch_t fetch;
1151  const ib_vector_t* nodes;
1152  const fts_index_cache_t*index_cache;
1153  que_t* graph = NULL;
1154  fts_cache_t* cache = table->fts->cache;
1155  dberr_t error;
1156 
1157  rw_lock_x_lock(&cache->lock);
1158 
1159  index_cache = fts_find_index_cache(cache, query->index);
1160 
1161  /* Must find the index cache */
1162  ut_a(index_cache != NULL);
1163 
1164  /* Search the cache for a matching word first. */
1165  if (query->cur_node->term.wildcard
1166  && query->flags != FTS_PROXIMITY
1167  && query->flags != FTS_PHRASE) {
1168  fts_cache_find_wildcard(query, index_cache, token);
1169  } else {
1170  nodes = fts_cache_find_word(index_cache, token);
1171 
1172  for (i = 0; nodes && i < ib_vector_size(nodes)
1173  && query->error == DB_SUCCESS; ++i) {
1174  const fts_node_t* node;
1175 
1176  node = static_cast<const fts_node_t*>(
1177  ib_vector_get_const(nodes, i));
1178 
1179  fts_query_check_node(query, token, node);
1180  }
1181  }
1182 
1183  rw_lock_x_unlock(&cache->lock);
1184 
1185  /* error is passed by 'query->error' */
1186  if (query->error != DB_SUCCESS) {
1187  ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1188  return(query->error);
1189  }
1190 
1191  /* Setup the callback args for filtering and
1192  consolidating the ilist. */
1193  fetch.read_arg = query;
1194  fetch.read_record = fts_query_index_fetch_nodes;
1195 
 /* Now query the on-disk FTS index for the same token. */
1196  error = fts_index_fetch_nodes(
1197  trx, &graph, &query->fts_index_table, token, &fetch);
1198 
1199  /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1200  ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1201  if (error != DB_SUCCESS) {
1202  query->error = error;
1203  }
1204 
1205  fts_que_graph_free(graph);
1206  }
1207 
1208  /* The size can't increase. */
1209  ut_a(rbt_size(query->doc_ids) <= n_doc_ids);
1210 
1211  return(query->error);
1212 }
1213 
1214 /*****************************************************************/
/* Handle an FTS_EXIST ('+') term: build a new 'intersection' rb tree
of doc ids that match 'token' AND (when multi_exist) are already in
doc_ids, searching the cache under the cache x-lock and then the
on-disk index; on success the intersection replaces doc_ids.
NOTE(review): the extraction dropped line 1253 (the rbt_create()
argument list) -- do not edit without the original source. */
1217 static __attribute__((nonnull, warn_unused_result))
1218 dberr_t
1219 fts_query_intersect(
1220 /*================*/
1221  fts_query_t* query,
1222  const fts_string_t* token)
1223 {
1224  trx_t* trx = query->trx;
1225  dict_table_t* table = query->index->table;
1226 
1227  ut_a(query->oper == FTS_EXIST);
1228 
1229 #ifdef FTS_INTERNAL_DIAG_PRINT
1230  fprintf(stderr, "INTERSECT: Searching: '%.*s'\n",
1231  (int) token->f_len, token->f_str);
1232 #endif
1233 
1234  /* If the words set is not empty and multi exist is true,
1235  we know the intersection set is empty in advance. */
1236  if (!(rbt_empty(query->doc_ids) && query->multi_exist)) {
1237  ulint n_doc_ids = 0;
1238  ulint i;
1239  fts_fetch_t fetch;
1240  const ib_vector_t* nodes;
1241  const fts_index_cache_t*index_cache;
1242  que_t* graph = NULL;
1243  fts_cache_t* cache = table->fts->cache;
1244  dberr_t error;
1245 
1246  ut_a(!query->intersection);
1247 
1248  n_doc_ids = rbt_size(query->doc_ids);
1249 
1250  /* Create the rb tree that will hold the doc ids of
1251  the intersection. */
1252  query->intersection = rbt_create(
1254 
1255  query->total_size += SIZEOF_RBT_CREATE;
1256 
1257  /* This is to avoid decompressing the ilist if the
1258  node's ilist doc ids are out of range. */
1259  if (!rbt_empty(query->doc_ids) && query->multi_exist) {
1260  const ib_rbt_node_t* node;
1261  doc_id_t* doc_id;
1262 
 /* Remember the [lower, upper] doc-id window of the current set;
 fts_query_check_node uses it to skip out-of-range index nodes. */
1263  node = rbt_first(query->doc_ids);
1264  doc_id = rbt_value(doc_id_t, node);
1265  query->lower_doc_id = *doc_id;
1266 
1267  node = rbt_last(query->doc_ids);
1268  doc_id = rbt_value(doc_id_t, node);
1269  query->upper_doc_id = *doc_id;
1270 
1271  } else {
1272  query->lower_doc_id = 0;
1273  query->upper_doc_id = 0;
1274  }
1275 
1276  /* Search the cache for a matching word first. */
1277 
1278  rw_lock_x_lock(&cache->lock);
1279 
1280  /* Search for the index specific cache. */
1281  index_cache = fts_find_index_cache(cache, query->index);
1282 
1283  /* Must find the index cache. */
1284  ut_a(index_cache != NULL);
1285 
1286  if (query->cur_node->term.wildcard) {
1287  /* Wildcard search the index cache */
1288  fts_cache_find_wildcard(query, index_cache, token);
1289  } else {
1290  nodes = fts_cache_find_word(index_cache, token);
1291 
1292  for (i = 0; nodes && i < ib_vector_size(nodes)
1293  && query->error == DB_SUCCESS; ++i) {
1294  const fts_node_t* node;
1295 
1296  node = static_cast<const fts_node_t*>(
1297  ib_vector_get_const(nodes, i));
1298 
1299  fts_query_check_node(query, token, node);
1300  }
1301  }
1302 
1303  rw_lock_x_unlock(&cache->lock);
1304 
1305  /* error is passed by 'query->error' */
1306  if (query->error != DB_SUCCESS) {
1307  ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1308  return(query->error);
1309  }
1310 
1311  /* Setup the callback args for filtering and
1312  consolidating the ilist. */
1313  fetch.read_arg = query;
1314  fetch.read_record = fts_query_index_fetch_nodes;
1315 
1316  error = fts_index_fetch_nodes(
1317  trx, &graph, &query->fts_index_table, token, &fetch);
1318 
1319  /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1320  ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1321  if (error != DB_SUCCESS) {
1322  query->error = error;
1323  }
1324 
1325  fts_que_graph_free(graph);
1326 
1327  if (query->error == DB_SUCCESS) {
1328  /* Make the intesection (rb tree) the current doc id
1329  set and free the old set. */
1330  fts_query_free_doc_ids(query, query->doc_ids);
1331  query->doc_ids = query->intersection;
1332  query->intersection = NULL;
1333 
 /* With multi_exist the intersection can only shrink the set. */
1334  ut_a(!query->multi_exist || (query->multi_exist
1335  && rbt_size(query->doc_ids) <= n_doc_ids));
1336  }
1337  }
1338 
1339  return(query->error);
1340 }
1341 
1342 /*****************************************************************/
/* Search the in-memory index cache for 'token' (wildcard search when
the current AST term has one and the query is not proximity/phrase),
filtering each matching node's ilist into the query state under the
cache x-lock. Returns query->error. */
1345 static
1346 dberr_t
1347 fts_query_cache(
1348 /*============*/
1349  fts_query_t* query,
1350  const fts_string_t* token)
1351 {
1352  const fts_index_cache_t*index_cache;
1353  dict_table_t* table = query->index->table;
1354  fts_cache_t* cache = table->fts->cache;
1355 
1356  /* Search the cache for a matching word first. */
1357  rw_lock_x_lock(&cache->lock);
1358 
1359  /* Search for the index specific cache. */
1360  index_cache = fts_find_index_cache(cache, query->index);
1361 
1362  /* Must find the index cache. */
1363  ut_a(index_cache != NULL);
1364 
1365  if (query->cur_node->term.wildcard
1366  && query->flags != FTS_PROXIMITY
1367  && query->flags != FTS_PHRASE) {
1368  /* Wildcard search the index cache */
1369  fts_cache_find_wildcard(query, index_cache, token);
1370  } else {
1371  const ib_vector_t* nodes;
1372  ulint i;
1373 
1374  nodes = fts_cache_find_word(index_cache, token);
1375 
 /* Stop early if a previous node already set query->error. */
1376  for (i = 0; nodes && i < ib_vector_size(nodes)
1377  && query->error == DB_SUCCESS; ++i) {
1378  const fts_node_t* node;
1379 
1380  node = static_cast<const fts_node_t*>(
1381  ib_vector_get_const(nodes, i));
1382 
1383  fts_query_check_node(query, token, node);
1384  }
1385  }
1386 
1387  rw_lock_x_unlock(&cache->lock);
1388 
1389  return(query->error);
1390 }
1391 
1392 /*****************************************************************/
1395 static __attribute__((nonnull, warn_unused_result))
1396 dberr_t
1397 fts_query_union(
1398 /*============*/
1399  fts_query_t* query,
1400  fts_string_t* token)
1401 {
1402  fts_fetch_t fetch;
1403  ulint n_doc_ids = 0;
1404  trx_t* trx = query->trx;
1405  que_t* graph = NULL;
1406  dberr_t error;
1407 
1408  ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING ||
1409  query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING);
1410 
1411 #ifdef FTS_INTERNAL_DIAG_PRINT
1412  fprintf(stderr, "UNION: Searching: '%.*s'\n",
1413  (int) token->f_len, token->f_str);
1414 #endif
1415 
1416  if (query->doc_ids) {
1417  n_doc_ids = rbt_size(query->doc_ids);
1418  }
1419 
1420  if (token->f_len == 0) {
1421  return(query->error);
1422  }
1423 
1424  /* Single '%' would confuse parser in pars_like_rebind(). In addition,
1425  our wildcard search only supports prefix search */
1426  ut_ad(*token->f_str != '%');
1427 
1428  fts_query_cache(query, token);
1429 
1430  /* Setup the callback args for filtering and
1431  consolidating the ilist. */
1432  fetch.read_arg = query;
1433  fetch.read_record = fts_query_index_fetch_nodes;
1434 
1435  /* Read the nodes from disk. */
1436  error = fts_index_fetch_nodes(
1437  trx, &graph, &query->fts_index_table, token, &fetch);
1438 
1439  /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1440  ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1441  if (error != DB_SUCCESS) {
1442  query->error = error;
1443  }
1444 
1445  fts_que_graph_free(graph);
1446 
1447  if (query->error == DB_SUCCESS) {
1448 
1449  /* The size can't decrease. */
1450  ut_a(rbt_size(query->doc_ids) >= n_doc_ids);
1451 
1452  /* Calulate the number of doc ids that were added to
1453  the current doc id set. */
1454  if (query->doc_ids) {
1455  n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids;
1456  }
1457  }
1458 
1459  return(query->error);
1460 }
1461 
1462 /*****************************************************************/
1466 static
1467 dberr_t
1468 fts_query_process_doc_id(
1469 /*=====================*/
1470  fts_query_t* query,
1471  doc_id_t doc_id,
1472  fts_rank_t rank)
1474 {
1475  if (query->flags == FTS_OPT_RANKING) {
1476  return(DB_SUCCESS);
1477  }
1478 
1479  switch (query->oper) {
1480  case FTS_NONE:
1481  fts_query_union_doc_id(query, doc_id, rank);
1482  break;
1483 
1484  case FTS_EXIST:
1485  fts_query_intersect_doc_id(query, doc_id, rank);
1486  break;
1487 
1488  case FTS_IGNORE:
1489  fts_query_remove_doc_id(query, doc_id);
1490  break;
1491 
1492  case FTS_NEGATE:
1493  fts_query_change_ranking(query, doc_id, TRUE);
1494  break;
1495 
1496  case FTS_DECR_RATING:
1497  fts_query_union_doc_id(query, doc_id, rank);
1498  fts_query_change_ranking(query, doc_id, TRUE);
1499  break;
1500 
1501  case FTS_INCR_RATING:
1502  fts_query_union_doc_id(query, doc_id, rank);
1503  fts_query_change_ranking(query, doc_id, FALSE);
1504  break;
1505 
1506  default:
1507  ut_error;
1508  }
1509 
1510  if (query->total_size > fts_result_cache_limit) {
1512  } else {
1513  return(DB_SUCCESS);
1514  }
1515 }
1516 
1517 /*****************************************************************/
1519 static
1520 dberr_t
1521 fts_merge_doc_ids(
1522 /*==============*/
1523  fts_query_t* query,
1524  const ib_rbt_t* doc_ids)
1525 {
1526  const ib_rbt_node_t* node;
1527 
1528  ut_a(!rbt_empty(doc_ids));
1529  ut_a(!query->intersection);
1530 
1531  /* To process FTS_EXIST operation (intersection), we need
1532  to create a new result set for fts_query_intersect(). */
1533  if (query->oper == FTS_EXIST) {
1534 
1535  query->intersection = rbt_create(
1537 
1538  query->total_size += SIZEOF_RBT_CREATE;
1539  }
1540 
1541  /* Merge the elements to the result set. */
1542  for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) {
1543  fts_ranking_t* ranking;
1544  ulint pos = 0;
1545  byte* word = NULL;
1546 
1547  ranking = rbt_value(fts_ranking_t, node);
1548 
1549  query->error = fts_query_process_doc_id(
1550  query, ranking->doc_id, ranking->rank);
1551 
1552  if (query->error != DB_SUCCESS) {
1553  return(query->error);
1554  }
1555 
1556  /* Merge words. Don't need to take operator into account. */
1557  ut_a(ranking->words);
1558  while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
1559  fts_query_add_word_to_document(query, ranking->doc_id,
1560  word);
1561  }
1562  }
1563 
1564  /* If it is an intersection operation, reset query->doc_ids
1565  to query->intersection and free the old result list. */
1566  if (query->oper == FTS_EXIST && query->intersection != NULL) {
1567  fts_query_free_doc_ids(query, query->doc_ids);
1568  query->doc_ids = query->intersection;
1569  query->intersection = NULL;
1570  }
1571 
1572  return(DB_SUCCESS);
1573 }
1574 
1575 /*****************************************************************/
1578 UNIV_INLINE
1579 byte*
1580 fts_query_skip_word(
1581 /*================*/
1582  byte* ptr,
1583  const byte* end)
1584 {
1585  /* TODO: Does this have to be UTF-8 too ? */
1586  while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) {
1587  ++ptr;
1588  }
1589 
1590  return(ptr);
1591 }
1592 
1593 /*****************************************************************/
1596 static
1597 ibool
1598 fts_query_match_phrase_terms(
1599 /*=========================*/
1600  fts_phrase_t* phrase,
1601  byte** start,
1605  const byte* end,
1607  mem_heap_t* heap)
1608 {
1609  ulint i;
1610  byte* ptr = *start;
1611  const ib_vector_t* tokens = phrase->tokens;
1612  ulint distance = phrase->distance;
1613 
1614  /* We check only from the second term onwards, since the first
1615  must have matched otherwise we wouldn't be here. */
1616  for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) {
1618  fts_string_t cmp_str;
1619  const fts_string_t* token;
1620  int result;
1621  ulint ret;
1622  ulint offset;
1623 
1625  phrase->charset, ptr, (byte*) end,
1626  &match, &offset);
1627 
1628  if (match.f_len > 0) {
1629  /* Get next token to match. */
1630  token = static_cast<const fts_string_t*>(
1631  ib_vector_get_const(tokens, i));
1632 
1633  fts_utf8_string_dup(&cmp_str, &match, heap);
1634 
1635  result = innobase_fts_text_case_cmp(
1636  phrase->charset, token, &cmp_str);
1637 
1638  /* Skip the rest of the tokens if this one doesn't
1639  match and the proximity distance is exceeded. */
1640  if (result
1641  && (distance == ULINT_UNDEFINED
1642  || distance == 0)) {
1643 
1644  break;
1645  }
1646 
1647  /* This token matched move to the next token. */
1648  if (result == 0) {
1649  /* Advance the text to search by the length
1650  of the last token. */
1651  ptr += ret;
1652 
1653  /* Advance to the next token. */
1654  ++i;
1655  } else {
1656 
1657  ut_a(distance != ULINT_UNDEFINED);
1658 
1659  ptr = fts_query_skip_word(ptr, end);
1660  }
1661 
1662  /* Distance can be 0 for exact matches. */
1663  if (distance != ULINT_UNDEFINED && distance > 0) {
1664  --distance;
1665  }
1666  } else {
1667  ptr += ret;
1668  }
1669  }
1670 
1671  *start = ptr;
1672 
1673  /* Can't be greater than the number of elements. */
1674  ut_a(i <= ib_vector_size(tokens));
1675 
1676  /* This is the case for multiple words. */
1677  if (i == ib_vector_size(tokens)) {
1678  phrase->found = TRUE;
1679  }
1680 
1681  return(phrase->found);
1682 }
1683 
1684 /*****************************************************************/
1688 static
1689 bool
1690 fts_proximity_is_word_in_range(
1691 /*===========================*/
1692  const fts_phrase_t*
1693  phrase,
1694  byte* start,
1695  ulint total_len)
1696 {
1697  fts_proximity_t* proximity_pos = phrase->proximity_pos;
1698 
1699  /* Search each matched position pair (with min and max positions)
1700  and count the number of words in the range */
1701  for (ulint i = 0; i < proximity_pos->n_pos; i++) {
1702  ulint cur_pos = proximity_pos->min_pos[i];
1703  ulint n_word = 0;
1704 
1705  ut_ad(proximity_pos->max_pos[i] <= total_len);
1706 
1707  /* Walk through words in the range and count them */
1708  while (cur_pos <= proximity_pos->max_pos[i]) {
1709  ulint len;
1710  fts_string_t str;
1711  ulint offset = 0;
1712 
1714  phrase->charset,
1715  start + cur_pos,
1716  start + total_len, &str, &offset);
1717 
1718  if (len == 0) {
1719  break;
1720  }
1721 
1722  /* Advances position with "len" bytes */
1723  cur_pos += len;
1724 
1725  /* Record the number of words */
1726  if (str.f_n_char > 0) {
1727  n_word++;
1728  }
1729 
1730  if (n_word > phrase->distance) {
1731  break;
1732  }
1733  }
1734 
1735  /* Check if the number of words is less than specified
1736  "distance" */
1737  if (n_word && n_word <= phrase->distance) {
1738  return(true);
1739  }
1740  }
1741 
1742  return(false);
1743 }
1744 
1745 /*****************************************************************/
1748 static
1749 ibool
1750 fts_query_match_phrase(
1751 /*===================*/
1752  fts_phrase_t* phrase,
1753  byte* start,
1756  ulint cur_len,
1757  ulint prev_len,
1759  mem_heap_t* heap) /* heap */
1760 {
1761  ulint i;
1762  const fts_string_t* first;
1763  const byte* end = start + cur_len;
1764  const ib_vector_t* tokens = phrase->tokens;
1765  const ib_vector_t* positions = phrase->match->positions;
1766 
1767  ut_a(!phrase->found);
1768  ut_a(phrase->match->doc_id > 0);
1769  ut_a(ib_vector_size(tokens) > 0);
1770  ut_a(ib_vector_size(positions) > 0);
1771 
1772  first = static_cast<const fts_string_t*>(
1773  ib_vector_get_const(tokens, 0));
1774 
1775  ut_a(phrase->match->start < ib_vector_size(positions));
1776 
1777  for (i = phrase->match->start; i < ib_vector_size(positions); ++i) {
1778  ulint pos;
1779  fts_string_t match;
1780  fts_string_t cmp_str;
1781  byte* ptr = start;
1782  ulint ret;
1783  ulint offset;
1784 
1785  pos = *(ulint*) ib_vector_get_const(positions, i);
1786 
1787  if (pos == ULINT_UNDEFINED) {
1788  break;
1789  }
1790 
1791  if (pos < prev_len) {
1792  continue;
1793  }
1794 
1795  /* Document positions are calculated from the beginning
1796  of the first field, need to save the length for each
1797  searched field to adjust the doc position when search
1798  phrases. */
1799  pos -= prev_len;
1800  ptr = match.f_str = start + pos;
1801 
1802  /* Within limits ? */
1803  if (ptr >= end) {
1804  break;
1805  }
1806 
1808  phrase->charset, start + pos, (byte*) end,
1809  &match, &offset);
1810 
1811  if (match.f_len == 0) {
1812  break;
1813  }
1814 
1815  fts_utf8_string_dup(&cmp_str, &match, heap);
1816 
1818  phrase->charset, first, &cmp_str) == 0) {
1819 
1820  /* This is the case for the single word
1821  in the phrase. */
1822  if (ib_vector_size(phrase->tokens) == 1) {
1823  phrase->found = TRUE;
1824  break;
1825  }
1826 
1827  ptr += ret;
1828 
1829  /* Match the remaining terms in the phrase. */
1830  if (fts_query_match_phrase_terms(phrase, &ptr,
1831  end, heap)) {
1832  break;
1833  }
1834  }
1835  }
1836 
1837  return(phrase->found);
1838 }
1839 
1840 /*****************************************************************/
1843 static
1844 ibool
1845 fts_query_fetch_document(
1846 /*=====================*/
1847  void* row,
1848  void* user_arg)
1849 {
1850 
1851  que_node_t* exp;
1852  sel_node_t* node = static_cast<sel_node_t*>(row);
1853  fts_phrase_t* phrase = static_cast<fts_phrase_t*>(user_arg);
1854  ulint prev_len = 0;
1855  ulint total_len = 0;
1856  byte* document_text = NULL;
1857 
1858  exp = node->select_list;
1859 
1860  phrase->found = FALSE;
1861 
1862  /* For proximity search, we will need to get the whole document
1863  from all fields, so first count the total length of the document
1864  from all the fields */
1865  if (phrase->proximity_pos) {
1866  while (exp) {
1867  ulint field_len;
1868  dfield_t* dfield = que_node_get_val(exp);
1869  byte* data = static_cast<byte*>(
1870  dfield_get_data(dfield));
1871 
1872  if (dfield_is_ext(dfield)) {
1873  ulint local_len = dfield_get_len(dfield);
1874 
1875  local_len -= BTR_EXTERN_FIELD_REF_SIZE;
1876 
1877  field_len = mach_read_from_4(
1878  data + local_len + BTR_EXTERN_LEN + 4);
1879  } else {
1880  field_len = dfield_get_len(dfield);
1881  }
1882 
1883  if (field_len != UNIV_SQL_NULL) {
1884  total_len += field_len + 1;
1885  }
1886 
1887  exp = que_node_get_next(exp);
1888  }
1889 
1890  document_text = static_cast<byte*>(mem_heap_zalloc(
1891  phrase->heap, total_len));
1892 
1893  if (!document_text) {
1894  return(FALSE);
1895  }
1896  }
1897 
1898  exp = node->select_list;
1899 
1900  while (exp) {
1901  dfield_t* dfield = que_node_get_val(exp);
1902  byte* data = static_cast<byte*>(
1903  dfield_get_data(dfield));
1904  ulint cur_len;
1905 
1906  if (dfield_is_ext(dfield)) {
1908  &cur_len, data, phrase->zip_size,
1909  dfield_get_len(dfield), phrase->heap);
1910  } else {
1911  cur_len = dfield_get_len(dfield);
1912  }
1913 
1914  if (cur_len != UNIV_SQL_NULL && cur_len != 0) {
1915  if (phrase->proximity_pos) {
1916  memcpy(document_text + prev_len, data, cur_len);
1917  } else {
1918  /* For phrase search */
1919  phrase->found =
1920  fts_query_match_phrase(
1921  phrase,
1922  static_cast<byte*>(data),
1923  cur_len, prev_len,
1924  phrase->heap);
1925  }
1926  }
1927 
1928  if (phrase->found) {
1929  break;
1930  }
1931 
1932  /* Document positions are calculated from the beginning
1933  of the first field, need to save the length for each
1934  searched field to adjust the doc position when search
1935  phrases. */
1936  prev_len += cur_len + 1;
1937  exp = que_node_get_next(exp);
1938  }
1939 
1940  if (phrase->proximity_pos) {
1941  ut_ad(prev_len <= total_len);
1942 
1943  phrase->found = fts_proximity_is_word_in_range(
1944  phrase, document_text, total_len);
1945  }
1946 
1947  return(phrase->found);
1948 }
1949 
1950 #if 0
/********************************************************************
Callback function to check whether a record was found or not.
NOTE(review): this function lives inside an '#if 0' block — it is
dead code kept for reference and is not compiled.
Reads DOC_COUNT (column 0) into select->word_freq->doc_count and
scans ILIST (column 1) via fts_query_find_doc_id(). */
static
ibool
fts_query_select(
/*=============*/
	void*		row,		/*!< in: sel_node_t* */
	void*		user_arg)	/*!< in/out: fts_select_t* */
{
	int		i;
	que_node_t*	exp;
	sel_node_t*	node = row;
	fts_select_t*	select = user_arg;

	ut_a(select->word_freq);
	ut_a(select->word_freq->doc_freqs);

	exp = node->select_list;

	/* Stop early once the doc id has been found. */
	for (i = 0; exp && !select->found; ++i) {
		dfield_t*	dfield = que_node_get_val(exp);
		void*		data = dfield_get_data(dfield);
		ulint		len = dfield_get_len(dfield);

		switch (i) {
		case 0: /* DOC_COUNT */
			if (len != UNIV_SQL_NULL && len != 0) {

				select->word_freq->doc_count +=
					mach_read_from_4(data);
			}
			break;

		case 1: /* ILIST */
			if (len != UNIV_SQL_NULL && len != 0) {

				fts_query_find_doc_id(select, data, len);
			}
			break;

		default:
			ut_error;
		}

		exp = que_node_get_next(exp);
	}

	/* Always return FALSE: fetch all rows of the cursor. */
	return(FALSE);
}
2000 
/********************************************************************
Read the rows from the FTS index, that match word and where the
doc id is between first and last doc id.
NOTE(review): this function lives inside an '#if 0' block — it is
dead code kept for reference and is not compiled.
@return DB_SUCCESS if all go well else error code */
static __attribute__((nonnull, warn_unused_result))
dberr_t
fts_query_find_term(
/*================*/
	fts_query_t*		query,		/*!< in: query instance */
	que_t**			graph,		/*!< in/out: prepared
						statement, created on
						first call and reused */
	const fts_string_t*	word,		/*!< in: the word to fetch */
	doc_id_t		doc_id,		/*!< in: doc id to match */
	ulint*			min_pos,	/*!< in/out: minimum position
						to find */
	ibool*			found)		/*!< out: TRUE if found */
{
	pars_info_t*		info;
	dberr_t			error;
	fts_select_t		select;
	doc_id_t		match_doc_id;
	trx_t*			trx = query->trx;

	trx->op_info = "fetching FTS index matching nodes";

	/* Reuse the bind info of an existing prepared statement. */
	if (*graph) {
		info = (*graph)->info;
	} else {
		info = pars_info_create();
	}

	select.found = FALSE;
	select.doc_id = doc_id;
	select.min_pos = *min_pos;
	select.word_freq = fts_query_add_word_freq(query, word->f_str);

	pars_info_bind_function(info, "my_func", fts_query_select, &select);
	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);

	/* Convert to "storage" byte order. */
	fts_write_doc_id((byte*) &match_doc_id, doc_id);

	fts_bind_doc_id(info, "min_doc_id", &match_doc_id);

	fts_bind_doc_id(info, "max_doc_id", &match_doc_id);

	if (!*graph) {
		ulint	selected;

		/* Pick the auxiliary index table by the word's first
		character. */
		selected = fts_select_index(*word->f_str);

		query->fts_index_table.suffix = fts_get_suffix(selected);

		*graph = fts_parse_sql(
			&query->fts_index_table,
			info,
			"DECLARE FUNCTION my_func;\n"
			"DECLARE CURSOR c IS"
			" SELECT doc_count, ilist\n"
			" FROM %s\n"
			" WHERE word LIKE :word AND "
			" first_doc_id <= :min_doc_id AND "
			" last_doc_id >= :max_doc_id\n"
			" ORDER BY first_doc_id;\n"
			"BEGIN\n"
			"\n"
			"OPEN c;\n"
			"WHILE 1 = 1 LOOP\n"
			" FETCH c INTO my_func();\n"
			" IF c % NOTFOUND THEN\n"
			" EXIT;\n"
			" END IF;\n"
			"END LOOP;\n"
			"CLOSE c;");
	}

	/* Retry on lock wait timeout, abort on any other error. */
	for(;;) {
		error = fts_eval_sql(trx, *graph);

		if (error == DB_SUCCESS) {

			break;			/* Exit the loop. */
		} else {
			ut_print_timestamp(stderr);

			if (error == DB_LOCK_WAIT_TIMEOUT) {
				fprintf(stderr, " InnoDB: Warning: lock wait "
					"timeout reading FTS index. "
					"Retrying!\n");

				trx->error_state = DB_SUCCESS;
			} else {
				/* NOTE(review): 'error' is a dberr_t enum
				printed with %lu — relies on enum width;
				would warn under -Wformat. */
				fprintf(stderr, " InnoDB: Error: %lu "
					"while reading FTS index.\n", error);

				break;		/* Exit the loop. */
			}
		}
	}

	/* Value to return */
	*found = select.found;

	if (*found) {
		*min_pos = select.min_pos;
	}

	return(error);
}
2109 
/********************************************************************
Callback aggregator for int columns.
NOTE(review): this function lives inside an '#if 0' block — it is
dead code kept for reference and is not compiled.
Sums every non-NULL 4-byte integer column of the row into the
ulint pointed to by user_arg. */
static
ibool
fts_query_sum(
/*==========*/
	void*		row,		/*!< in: sel_node_t* */
	void*		user_arg)	/*!< in/out: ulint* running total */
{

	que_node_t*	exp;
	sel_node_t*	node = row;
	ulint*		total = user_arg;

	exp = node->select_list;

	while (exp) {
		dfield_t*	dfield = que_node_get_val(exp);
		void*		data = dfield_get_data(dfield);
		ulint		len = dfield_get_len(dfield);

		if (len != UNIV_SQL_NULL && len != 0) {
			*total += mach_read_from_4(data);
		}

		exp = que_node_get_next(exp);
	}

	/* Return TRUE: keep fetching rows. */
	return(TRUE);
}
2141 
/********************************************************************
Calculate the total documents that contain a particular word (term).
NOTE(review): this function lives inside an '#if 0' block — it is
dead code kept for reference and is not compiled.
@return DB_SUCCESS if all go well else error code */
static __attribute__((nonnull, warn_unused_result))
dberr_t
fts_query_total_docs_containing_term(
/*=================================*/
	fts_query_t*		query,	/*!< in: query instance */
	const fts_string_t*	word,	/*!< in: word to count */
	ulint*			total)	/*!< out: total doc count */
{
	pars_info_t*	info;
	dberr_t		error;
	que_t*		graph;
	ulint		selected;
	trx_t*		trx = query->trx;

	trx->op_info = "fetching FTS index document count";

	*total = 0;

	info = pars_info_create();

	/* Accumulate doc_count values into *total via the callback. */
	pars_info_bind_function(info, "my_func", fts_query_sum, total);
	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);

	/* Pick the auxiliary index table by the word's first character. */
	selected = fts_select_index(*word->f_str);

	query->fts_index_table.suffix = fts_get_suffix(selected);

	graph = fts_parse_sql(
		&query->fts_index_table,
		info,
		"DECLARE FUNCTION my_func;\n"
		"DECLARE CURSOR c IS"
		" SELECT doc_count\n"
		" FROM %s\n"
		" WHERE word = :word "
		" ORDER BY first_doc_id;\n"
		"BEGIN\n"
		"\n"
		"OPEN c;\n"
		"WHILE 1 = 1 LOOP\n"
		" FETCH c INTO my_func();\n"
		" IF c % NOTFOUND THEN\n"
		" EXIT;\n"
		" END IF;\n"
		"END LOOP;\n"
		"CLOSE c;");

	/* Retry on lock wait timeout, abort on any other error. */
	for(;;) {
		error = fts_eval_sql(trx, graph);

		if (error == DB_SUCCESS) {

			break;			/* Exit the loop. */
		} else {
			ut_print_timestamp(stderr);

			if (error == DB_LOCK_WAIT_TIMEOUT) {
				fprintf(stderr, " InnoDB: Warning: lock wait "
					"timeout reading FTS index. "
					"Retrying!\n");

				trx->error_state = DB_SUCCESS;
			} else {
				fprintf(stderr, " InnoDB: Error: %lu "
					"while reading FTS index.\n", error);

				break;		/* Exit the loop. */
			}
		}
	}

	fts_que_graph_free(graph);

	return(error);
}
2220 
/********************************************************************
Get the total number of words in a documents.
NOTE(review): this function lives inside an '#if 0' block — it is
dead code kept for reference and is not compiled.
@return DB_SUCCESS if all go well else error code */
static __attribute__((nonnull, warn_unused_result))
dberr_t
fts_query_terms_in_document(
/*========================*/
	fts_query_t*	query,	/*!< in: query instance */
	doc_id_t	doc_id,	/*!< in: doc id whose word count to get */
	ulint*		total)	/*!< out: total word count in the doc */
{
	pars_info_t*	info;
	dberr_t		error;
	que_t*		graph;
	doc_id_t	read_doc_id;
	trx_t*		trx = query->trx;

	trx->op_info = "fetching FTS document term count";

	*total = 0;

	info = pars_info_create();

	/* Accumulate the count column into *total via the callback. */
	pars_info_bind_function(info, "my_func", fts_query_sum, total);

	/* Convert to "storage" byte order. */
	fts_write_doc_id((byte*) &read_doc_id, doc_id);
	fts_bind_doc_id(info, "doc_id", &read_doc_id);

	query->fts_index_table.suffix = "DOC_ID";

	/* NOTE(review): the SELECT statement below appears to lack a
	terminating ";\n" after ':doc_id ' before "BEGIN" — would fail
	to parse if ever re-enabled; verify against upstream. */
	graph = fts_parse_sql(
		&query->fts_index_table,
		info,
		"DECLARE FUNCTION my_func;\n"
		"DECLARE CURSOR c IS"
		" SELECT count\n"
		" FROM %s\n"
		" WHERE doc_id = :doc_id "
		"BEGIN\n"
		"\n"
		"OPEN c;\n"
		"WHILE 1 = 1 LOOP\n"
		" FETCH c INTO my_func();\n"
		" IF c % NOTFOUND THEN\n"
		" EXIT;\n"
		" END IF;\n"
		"END LOOP;\n"
		"CLOSE c;");

	/* Retry on lock wait timeout, abort on any other error. */
	for(;;) {
		error = fts_eval_sql(trx, graph);

		if (error == DB_SUCCESS) {

			break;			/* Exit the loop. */
		} else {
			ut_print_timestamp(stderr);

			if (error == DB_LOCK_WAIT_TIMEOUT) {
				fprintf(stderr, " InnoDB: Warning: lock wait "
					"timeout reading FTS doc id table. "
					"Retrying!\n");

				trx->error_state = DB_SUCCESS;
			} else {
				fprintf(stderr, " InnoDB: Error: %lu "
					"while reading FTS doc id table.\n",
					error);

				break;		/* Exit the loop. */
			}
		}
	}

	fts_que_graph_free(graph);

	return(error);
}
2300 #endif
2301 
2302 /*****************************************************************/
2305 static __attribute__((nonnull, warn_unused_result))
2306 dberr_t
2307 fts_query_match_document(
2308 /*=====================*/
2309  ib_vector_t* tokens,
2310  fts_get_doc_t* get_doc,
2311  fts_match_t* match,
2312  ulint distance,
2313  ibool* found)
2314 {
2315  dberr_t error;
2316  fts_phrase_t phrase;
2317 
2318  memset(&phrase, 0x0, sizeof(phrase));
2319 
2320  phrase.match = match; /* Positions to match */
2321  phrase.tokens = tokens; /* Tokens to match */
2322  phrase.distance = distance;
2323  phrase.charset = get_doc->index_cache->charset;
2324  phrase.zip_size = dict_table_zip_size(
2325  get_doc->index_cache->index->table);
2326  phrase.heap = mem_heap_create(512);
2327 
2328  *found = phrase.found = FALSE;
2329 
2330  error = fts_doc_fetch_by_doc_id(
2331  get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2332  fts_query_fetch_document, &phrase);
2333 
2334  if (error != DB_SUCCESS) {
2335  ut_print_timestamp(stderr);
2336  fprintf(stderr, "InnoDB: Error: (%s) matching document.\n",
2337  ut_strerr(error));
2338  } else {
2339  *found = phrase.found;
2340  }
2341 
2342  mem_heap_free(phrase.heap);
2343 
2344  return(error);
2345 }
2346 
2347 /*****************************************************************/
2351 static __attribute__((nonnull, warn_unused_result))
2352 bool
2353 fts_query_is_in_proximity_range(
2354 /*============================*/
2355  const fts_query_t* query,
2356  fts_match_t** match,
2357  fts_proximity_t* qualified_pos)
2359 {
2360  fts_get_doc_t get_doc;
2361  fts_cache_t* cache = query->index->table->fts->cache;
2362  dberr_t err;
2363  fts_phrase_t phrase;
2364 
2365  memset(&get_doc, 0x0, sizeof(get_doc));
2366  memset(&phrase, 0x0, sizeof(phrase));
2367 
2368  rw_lock_x_lock(&cache->lock);
2369  get_doc.index_cache = fts_find_index_cache(cache, query->index);
2370  rw_lock_x_unlock(&cache->lock);
2371  ut_a(get_doc.index_cache != NULL);
2372 
2373  phrase.distance = query->distance;
2374  phrase.charset = get_doc.index_cache->charset;
2375  phrase.zip_size = dict_table_zip_size(
2376  get_doc.index_cache->index->table);
2377  phrase.heap = mem_heap_create(512);
2378  phrase.proximity_pos = qualified_pos;
2379  phrase.found = FALSE;
2380 
2382  &get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2383  fts_query_fetch_document, &phrase);
2384 
2385  if (err != DB_SUCCESS) {
2386  ib_logf(IB_LOG_LEVEL_ERROR,
2387  "Error: (%s) in verification phase of proximity "
2388  "search", ut_strerr(err));
2389  }
2390 
2391  /* Free the prepared statement. */
2392  if (get_doc.get_document_graph) {
2393  fts_que_graph_free(get_doc.get_document_graph);
2394  get_doc.get_document_graph = NULL;
2395  }
2396 
2397  mem_heap_free(phrase.heap);
2398 
2399  return(err == DB_SUCCESS && phrase.found);
2400 }
2401 
2402 /*****************************************************************/
2406 static __attribute__((nonnull, warn_unused_result))
2407 dberr_t
2408 fts_query_search_phrase(
2409 /*====================*/
2410  fts_query_t* query,
2411  ib_vector_t* orig_tokens,
2414  ib_vector_t* tokens)
2418 {
2419  ulint i;
2420  fts_get_doc_t get_doc;
2421  ulint n_matched;
2422  fts_cache_t* cache = query->index->table->fts->cache;
2423 
2424  n_matched = ib_vector_size(query->matched);
2425 
2426  /* Setup the doc retrieval infrastructure. */
2427  memset(&get_doc, 0x0, sizeof(get_doc));
2428 
2429  rw_lock_x_lock(&cache->lock);
2430 
2431  get_doc.index_cache = fts_find_index_cache(cache, query->index);
2432 
2433  /* Must find the index cache */
2434  ut_a(get_doc.index_cache != NULL);
2435 
2436  rw_lock_x_unlock(&cache->lock);
2437 
2438 #ifdef FTS_INTERNAL_DIAG_PRINT
2439  ut_print_timestamp(stderr);
2440  fprintf(stderr, " Start phrase search\n");
2441 #endif
2442 
2443  /* Read the document from disk and do the actual
2444  match, matching documents will be added to the current
2445  doc id set. */
2446  for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) {
2447  fts_match_t* match;
2448  ibool found = FALSE;
2449 
2450  match = static_cast<fts_match_t*>(
2451  ib_vector_get(query->matched, i));
2452 
2453  /* Skip the document ids that were filtered out by
2454  an earlier pass. */
2455  if (match->doc_id != 0) {
2456 
2457  query->error = fts_query_match_document(
2458  orig_tokens, &get_doc,
2459  match, query->distance, &found);
2460 
2461  if (query->error == DB_SUCCESS && found) {
2462  ulint z;
2463 
2464  query->error = fts_query_process_doc_id(query,
2465  match->doc_id, 0);
2466  if (query->error != DB_SUCCESS) {
2467  goto func_exit;
2468  }
2469 
2470  for (z = 0; z < ib_vector_size(tokens); z++) {
2471  fts_string_t* token;
2472  token = static_cast<fts_string_t*>(
2473  ib_vector_get(tokens, z));
2474  fts_query_add_word_to_document(
2475  query, match->doc_id,
2476  token->f_str);
2477  }
2478  }
2479  }
2480  }
2481 
2482 func_exit:
2483  /* Free the prepared statement. */
2484  if (get_doc.get_document_graph) {
2485  fts_que_graph_free(get_doc.get_document_graph);
2486  get_doc.get_document_graph = NULL;
2487  }
2488 
2489  return(query->error);
2490 }
2491 
2492 /*****************************************************************/
2495 static __attribute__((nonnull, warn_unused_result))
2496 dberr_t
2497 fts_query_phrase_search(
2498 /*====================*/
2499  fts_query_t* query,
2500  const fts_string_t* phrase)
2501 {
2502  ib_vector_t* tokens;
2503  ib_vector_t* orig_tokens;
2504  mem_heap_t* heap = mem_heap_create(sizeof(fts_string_t));
2505  ulint len = phrase->f_len;
2506  ulint cur_pos = 0;
2507  ib_alloc_t* heap_alloc;
2508  ulint num_token;
2509  CHARSET_INFO* charset;
2510 
2511  charset = query->fts_index_table.charset;
2512 
2513  heap_alloc = ib_heap_allocator_create(heap);
2514 
2515  tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2516  orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2517 
2518  if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
2519  query->flags = FTS_PROXIMITY;
2520  } else {
2521  query->flags = FTS_PHRASE;
2522  }
2523 
2524  /* Split the phrase into tokens. */
2525  while (cur_pos < len) {
2526  fts_cache_t* cache = query->index->table->fts->cache;
2527  ib_rbt_bound_t parent;
2528  ulint offset;
2529  ulint cur_len;
2530  fts_string_t result_str;
2531 
2532  cur_len = innobase_mysql_fts_get_token(
2533  charset,
2534  reinterpret_cast<const byte*>(phrase->f_str) + cur_pos,
2535  reinterpret_cast<const byte*>(phrase->f_str) + len,
2536  &result_str, &offset);
2537 
2538  if (cur_len == 0) {
2539  break;
2540  }
2541 
2542  cur_pos += cur_len;
2543 
2544  if (result_str.f_n_char == 0) {
2545  continue;
2546  }
2547 
2548  fts_string_t* token = static_cast<fts_string_t*>(
2549  ib_vector_push(tokens, NULL));
2550 
2551  token->f_str = static_cast<byte*>(
2552  mem_heap_alloc(heap, result_str.f_len + 1));
2553  ut_memcpy(token->f_str, result_str.f_str, result_str.f_len);
2554 
2555  token->f_len = result_str.f_len;
2556  token->f_str[token->f_len] = 0;
2557 
2558  if (cache->stopword_info.cached_stopword
2560  &parent, token) != 0
2561  && result_str.f_n_char >= fts_min_token_size
2562  && result_str.f_n_char <= fts_max_token_size) {
2563  /* Add the word to the RB tree so that we can
2564  calculate it's frequencey within a document. */
2565  fts_query_add_word_freq(query, token->f_str);
2566  } else {
2567  ib_vector_pop(tokens);
2568  }
2569 
2570  /* we will start to store all words including stopwords
2571  in the "orig_tokens" vector, but skip any leading words
2572  that are stopwords */
2573  if (!ib_vector_is_empty(tokens)) {
2574  fts_string_t* orig_token = static_cast<fts_string_t*>(
2575  ib_vector_push(orig_tokens, NULL));
2576 
2577  orig_token->f_str = token->f_str;
2578  orig_token->f_len = token->f_len;
2579  }
2580  }
2581 
2582  num_token = ib_vector_size(tokens);
2583  ut_ad(ib_vector_size(orig_tokens) >= num_token);
2584 
2585  /* Ignore empty strings. */
2586  if (num_token > 0) {
2587  fts_string_t* token;
2588  fts_fetch_t fetch;
2589  trx_t* trx = query->trx;
2590  fts_ast_oper_t oper = query->oper;
2591  que_t* graph = NULL;
2592  ulint i;
2593  dberr_t error;
2594 
2595  /* Create the vector for storing matching document ids
2596  and the positions of the first token of the phrase. */
2597  if (!query->matched) {
2598  ib_alloc_t* heap_alloc;
2599 
2600  heap_alloc = ib_heap_allocator_create(heap);
2601 
2602  if (!(query->flags & FTS_PROXIMITY)
2603  && !(query->flags & FTS_PHRASE)) {
2604  query->matched = ib_vector_create(
2605  heap_alloc, sizeof(fts_match_t),
2606  64);
2607  } else {
2608  ut_a(num_token < MAX_PROXIMITY_ITEM);
2609  query->match_array =
2611  heap,
2612  num_token *
2613  sizeof(query->matched));
2614 
2615  for (i = 0; i < num_token; i++) {
2616  query->match_array[i] =
2617  ib_vector_create(
2618  heap_alloc, sizeof(fts_match_t),
2619  64);
2620  }
2621 
2622  query->matched = query->match_array[0];
2623  }
2624  }
2625 
2626  /* Setup the callback args for filtering and consolidating
2627  the ilist. */
2628  fetch.read_arg = query;
2629  fetch.read_record = fts_query_index_fetch_nodes;
2630 
2631  for (i = 0; i < num_token; i++) {
2632  /* Search for the first word from the phrase. */
2633  token = static_cast<fts_string_t*>(
2634  ib_vector_get(tokens, i));
2635 
2636  if (query->flags & FTS_PROXIMITY
2637  || query->flags & FTS_PHRASE) {
2638  query->matched = query->match_array[i];
2639  }
2640 
2641  error = fts_index_fetch_nodes(
2642  trx, &graph, &query->fts_index_table,
2643  token, &fetch);
2644 
2645  /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
2646  ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
2647  if (error != DB_SUCCESS) {
2648  query->error = error;
2649  }
2650 
2651  fts_que_graph_free(graph);
2652  graph = NULL;
2653 
2654  fts_query_cache(query, token);
2655 
2656  if (!(query->flags & FTS_PHRASE)
2657  && !(query->flags & FTS_PROXIMITY)) {
2658  break;
2659  }
2660 
2661  /* If any of the token can't be found,
2662  no need to continue match */
2663  if (ib_vector_is_empty(query->match_array[i])
2664  || query->error != DB_SUCCESS) {
2665  goto func_exit;
2666  }
2667  }
2668 
2669  /* Just a single word, no need to fetch the original
2670  documents to do phrase matching */
2671  if (ib_vector_size(orig_tokens) == 1
2672  && !ib_vector_is_empty(query->match_array[0])) {
2673  fts_match_t* match;
2674  ulint n_matched;
2675 
2676  n_matched = ib_vector_size(query->match_array[0]);
2677 
2678  for (i = 0; i < n_matched; i++) {
2679  match = static_cast<fts_match_t*>(
2680  ib_vector_get(
2681  query->match_array[0], i));
2682 
2683  query->error = fts_query_process_doc_id(
2684  query, match->doc_id, 0);
2685  if (query->error != DB_SUCCESS) {
2686  goto func_exit;
2687  }
2688 
2689  fts_query_add_word_to_document(
2690  query, match->doc_id, token->f_str);
2691  }
2692  query->oper = oper;
2693  goto func_exit;
2694  }
2695 
2696  /* If we are doing proximity search, verify the distance
2697  between all words, and check they are in specified distance. */
2698  if (query->flags & FTS_PROXIMITY) {
2699  fts_phrase_or_proximity_search(query, tokens);
2700  } else {
2701  ibool matched;
2702 
2703  /* Phrase Search case:
2704  We filter out the doc ids that don't contain
2705  all the tokens in the phrase. It's cheaper to
2706  search the ilist than bringing the documents in
2707  and then doing a search through the text. Isolated
2708  testing shows this also helps in mitigating disruption
2709  of the buffer cache. */
2710  matched = fts_phrase_or_proximity_search(query, tokens);
2711  query->matched = query->match_array[0];
2712 
2713  /* Read the actual text in and search for the phrase. */
2714  if (matched) {
2715  ut_ad(query->error == DB_SUCCESS);
2716  query->error = fts_query_search_phrase(
2717  query, orig_tokens, tokens);
2718  }
2719  }
2720 
2721  /* Restore original operation. */
2722  query->oper = oper;
2723 
2724  if (query->error != DB_SUCCESS) {
2725  goto func_exit;
2726  }
2727  }
2728 
2729 func_exit:
2730  mem_heap_free(heap);
2731 
2732  /* Don't need it anymore. */
2733  query->matched = NULL;
2734 
2735  return(query->error);
2736 }
2737 
2738 /*****************************************************************/
2741 static __attribute__((nonnull, warn_unused_result))
2742 dberr_t
2743 fts_query_execute(
2744 /*==============*/
2745  fts_query_t* query,
2746  fts_string_t* token)
2747 {
2748  switch (query->oper) {
2749  case FTS_NONE:
2750  case FTS_NEGATE:
2751  case FTS_INCR_RATING:
2752  case FTS_DECR_RATING:
2753  query->error = fts_query_union(query, token);
2754  break;
2755 
2756  case FTS_EXIST:
2757  query->error = fts_query_intersect(query, token);
2758  break;
2759 
2760  case FTS_IGNORE:
2761  query->error = fts_query_difference(query, token);
2762  break;
2763 
2764  default:
2765  ut_error;
2766  }
2767 
2768  return(query->error);
2769 }
2770 
2771 /*****************************************************************/
2775 static
2776 byte*
2777 fts_query_get_token(
2778 /*================*/
2779  fts_ast_node_t* node,
2780  fts_string_t* token)
2781 {
2782  ulint str_len;
2783  byte* new_ptr = NULL;
2784 
2785  str_len = ut_strlen((char*) node->term.ptr);
2786 
2787  ut_a(node->type == FTS_AST_TERM);
2788 
2789  token->f_len = str_len;
2790  token->f_str = node->term.ptr;
2791 
2792  if (node->term.wildcard) {
2793 
2794  token->f_str = static_cast<byte*>(ut_malloc(str_len + 2));
2795  token->f_len = str_len + 1;
2796 
2797  /* Need to copy the NUL character too. */
2798  memcpy(token->f_str, node->term.ptr, str_len + 1);
2799 
2800  token->f_str[str_len] = '%';
2801  token->f_str[token->f_len] = 0;
2802 
2803  new_ptr = token->f_str;
2804  }
2805 
2806  return(new_ptr);
2807 }
2808 
2809 /*****************************************************************/
2811 static
2812 dberr_t
2813 fts_query_visitor(
2814 /*==============*/
2815  fts_ast_oper_t oper,
2816  fts_ast_node_t* node,
2817  void* arg)
2818 {
2819  byte* ptr;
2820  fts_string_t token;
2821  fts_query_t* query = static_cast<fts_query_t*>(arg);
2822 
2823  ut_a(node);
2824 
2825  token.f_n_char = 0;
2826 
2827  query->oper = oper;
2828 
2829  query->cur_node = node;
2830 
2831  switch (node->type) {
2832  case FTS_AST_TEXT:
2833  token.f_str = node->text.ptr;
2834  token.f_len = ut_strlen((char*) token.f_str);
2835 
2836  if (query->oper == FTS_EXIST) {
2837  ut_ad(query->intersection == NULL);
2838  query->intersection = rbt_create(
2840  }
2841 
2842  /* Set the current proximity distance. */
2843  query->distance = node->text.distance;
2844 
2845  /* Force collection of doc ids and the positions. */
2846  query->collect_positions = TRUE;
2847 
2848  query->error = fts_query_phrase_search(query, &token);
2849 
2850  query->collect_positions = FALSE;
2851 
2852  if (query->oper == FTS_EXIST) {
2853  fts_query_free_doc_ids(query, query->doc_ids);
2854  query->doc_ids = query->intersection;
2855  query->intersection = NULL;
2856  }
2857 
2858  break;
2859 
2860  case FTS_AST_TERM:
2861 
2862  /* Add the word to our RB tree that will be used to
2863  calculate this terms per document frequency. */
2864  fts_query_add_word_freq(query, node->term.ptr);
2865 
2866  ptr = fts_query_get_token(node, &token);
2867  query->error = fts_query_execute(query, &token);
2868 
2869  if (ptr) {
2870  ut_free(ptr);
2871  }
2872  break;
2873 
2874  default:
2875  ut_error;
2876  }
2877 
2878  if (query->oper == FTS_EXIST) {
2879  query->multi_exist = true;
2880  }
2881 
2882  return(query->error);
2883 }
2884 
2885 /*****************************************************************/
2890 UNIV_INTERN
2891 dberr_t
2893 /*==================*/
2894  fts_ast_node_t* node,
2895  fts_ast_callback visitor,
2896  void* arg)
2897 {
2898  fts_ast_oper_t cur_oper;
2899  fts_query_t* query = static_cast<fts_query_t*>(arg);
2900  ib_rbt_t* parent_doc_ids;
2901  ib_rbt_t* subexpr_doc_ids;
2902  dberr_t error = DB_SUCCESS;
2903  bool will_be_ignored = false;
2904  bool multi_exist;
2905 
2906  ut_a(node->type == FTS_AST_SUBEXP_LIST);
2907 
2908  node = node->list.head;
2909 
2910  if (!node || !node->next) {
2911  return(error);
2912  }
2913 
2914  cur_oper = node->oper;
2915 
2916  /* Save current result set */
2917  parent_doc_ids = query->doc_ids;
2918 
2919  /* Create new result set to store the sub-expression result. We
2920  will merge this result set with the parent after processing. */
2921  query->doc_ids = rbt_create(sizeof(fts_ranking_t),
2923 
2924  query->total_size += SIZEOF_RBT_CREATE;
2925 
2926  multi_exist = query->multi_exist;
2927  query->multi_exist = false;
2928  /* Process nodes in current sub-expression and store its
2929  result set in query->doc_ids we created above. */
2930  error = fts_ast_visit(FTS_NONE, node->next, visitor,
2931  arg, &will_be_ignored);
2932 
2933  /* Reinstate parent node state and prepare for merge. */
2934  query->multi_exist = multi_exist;
2935  query->oper = cur_oper;
2936  subexpr_doc_ids = query->doc_ids;
2937 
2938  /* Restore current result set. */
2939  query->doc_ids = parent_doc_ids;
2940 
2941  /* Merge the sub-expression result with the parent result set. */
2942  if (error == DB_SUCCESS && !rbt_empty(subexpr_doc_ids)) {
2943  error = fts_merge_doc_ids(query, subexpr_doc_ids);
2944  }
2945 
2946  if (query->oper == FTS_EXIST) {
2947  query->multi_exist = true;
2948  }
2949 
2950  /* Free current result set. Result already merged into parent. */
2951  fts_query_free_doc_ids(query, subexpr_doc_ids);
2952 
2953  return(error);
2954 }
2955 
#if 0
/*****************************************************************//***
Check if the doc id exists in the ilist.
@return TRUE if doc id found */
static
ulint
fts_query_find_doc_id(
/*==================*/
	fts_select_t*	select,		/*!< in/out: search criterion; found
					flag and min_pos are updated */
	void*		data,		/*!< in: doc id ilist */
	ulint		len)		/*!< in: length of ilist in bytes */
{
	byte*		ptr = data;
	doc_id_t	doc_id = 0;
	ulint		decoded = 0;

	/* Decode the ilist and search for selected doc_id. We also
	calculate the frequency of the word in the document if found. */
	while (decoded < len && !select->found) {
		ulint		freq = 0;
		ulint		min_pos = 0;
		ulint		last_pos = 0;
		ulint		pos = fts_decode_vlc(&ptr);

		/* Add the delta. */
		doc_id += pos;

		/* Walk the (delta-encoded) position list of this doc. */
		while (*ptr) {
			++freq;
			last_pos += fts_decode_vlc(&ptr);

			/* Only if min_pos is not set and the current
			term exists in a position greater than the
			min_pos of the previous term. */
			if (min_pos == 0 && last_pos > select->min_pos) {
				min_pos = last_pos;
			}
		}

		/* Skip the end of word position marker. */
		++ptr;

		/* Bytes decoded so far. */
		decoded = ptr - (byte*) data;

		/* A word may exist in the document but we only consider a
		match if it exists in a position that is greater than the
		position of the previous term. */
		if (doc_id == select->doc_id && min_pos > 0) {
			fts_doc_freq_t*	doc_freq;

			/* Add the doc id to the doc freq rb tree, if
			the doc id doesn't exist it will be created. */
			doc_freq = fts_query_add_doc_freq(
				select->word_freq->doc_freqs, doc_id);

			/* Avoid duplicating the frequency tally */
			if (doc_freq->freq == 0) {
				doc_freq->freq = freq;
			}

			select->found = TRUE;
			select->min_pos = min_pos;
		}
	}

	return(select->found);
}
#endif
3027 
3028 /*****************************************************************/
3032 static
3033 dberr_t
3034 fts_query_filter_doc_ids(
3035 /*=====================*/
3036  fts_query_t* query,
3037  const byte* word,
3038  fts_word_freq_t*word_freq,
3039  const fts_node_t*
3040  node,
3041  void* data,
3042  ulint len,
3043  ibool calc_doc_count)
3044 {
3045  byte* ptr = static_cast<byte*>(data);
3046  doc_id_t doc_id = 0;
3047  ulint decoded = 0;
3048  ib_rbt_t* doc_freqs = word_freq->doc_freqs;
3049 
3050  /* Decode the ilist and add the doc ids to the query doc_id set. */
3051  while (decoded < len) {
3052  ulint freq = 0;
3053  fts_doc_freq_t* doc_freq;
3054  fts_match_t* match = NULL;
3055  ulint last_pos = 0;
3056  ulint pos = fts_decode_vlc(&ptr);
3057 
3058  /* Some sanity checks. */
3059  if (doc_id == 0) {
3060  ut_a(pos == node->first_doc_id);
3061  }
3062 
3063  /* Add the delta. */
3064  doc_id += pos;
3065 
3066  if (calc_doc_count) {
3067  word_freq->doc_count++;
3068  }
3069 
3070  /* We simply collect the matching instances here. */
3071  if (query->collect_positions) {
3072  ib_alloc_t* heap_alloc;
3073 
3074  /* Create a new fts_match_t instance. */
3075  match = static_cast<fts_match_t*>(
3076  ib_vector_push(query->matched, NULL));
3077 
3078  match->start = 0;
3079  match->doc_id = doc_id;
3080  heap_alloc = ib_vector_allocator(query->matched);
3081 
3082  /* Allocate from the same heap as the
3083  parent container. */
3084  match->positions = ib_vector_create(
3085  heap_alloc, sizeof(ulint), 64);
3086 
3087  query->total_size += sizeof(fts_match_t)
3088  + sizeof(ib_vector_t)
3089  + sizeof(ulint) * 64;
3090  }
3091 
3092  /* Unpack the positions within the document. */
3093  while (*ptr) {
3094  last_pos += fts_decode_vlc(&ptr);
3095 
3096  /* Collect the matching word positions, for phrase
3097  matching later. */
3098  if (query->collect_positions) {
3099  ib_vector_push(match->positions, &last_pos);
3100  }
3101 
3102  ++freq;
3103  }
3104 
3105  /* End of list marker. */
3106  last_pos = (ulint) -1;
3107 
3108  if (query->collect_positions) {
3109  ut_a(match != NULL);
3110  ib_vector_push(match->positions, &last_pos);
3111  }
3112 
3113  /* Add the doc id to the doc freq rb tree, if the doc id
3114  doesn't exist it will be created. */
3115  doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id);
3116 
3117  /* Avoid duplicating frequency tally. */
3118  if (doc_freq->freq == 0) {
3119  doc_freq->freq = freq;
3120  }
3121 
3122  /* Skip the end of word position marker. */
3123  ++ptr;
3124 
3125  /* Bytes decoded so far */
3126  decoded = ptr - (byte*) data;
3127 
3128  /* We simply collect the matching documents and the
3129  positions here and match later. */
3130  if (!query->collect_positions) {
3131  /* We ignore error here and will check it later */
3132  fts_query_process_doc_id(query, doc_id, 0);
3133 
3134  /* Add the word to the document's matched RB tree. */
3135  fts_query_add_word_to_document(query, doc_id, word);
3136  }
3137  }
3138 
3139  /* Some sanity checks. */
3140  ut_a(doc_id == node->last_doc_id);
3141 
3142  if (query->total_size > fts_result_cache_limit) {
3144  } else {
3145  return(DB_SUCCESS);
3146  }
3147 }
3148 
3149 /*****************************************************************/
3152 static
3153 dberr_t
3154 fts_query_read_node(
3155 /*================*/
3156  fts_query_t* query,
3157  const fts_string_t* word,
3158  que_node_t* exp)
3159 {
3160  int i;
3161  int ret;
3162  fts_node_t node;
3163  ib_rbt_bound_t parent;
3164  fts_word_freq_t* word_freq;
3165  ibool skip = FALSE;
3166  byte term[FTS_MAX_WORD_LEN + 1];
3167  dberr_t error = DB_SUCCESS;
3168 
3169  ut_a(query->cur_node->type == FTS_AST_TERM ||
3170  query->cur_node->type == FTS_AST_TEXT);
3171 
3172  memset(&node, 0, sizeof(node));
3173 
3174  /* Need to consider the wildcard search case, the word frequency
3175  is created on the search string not the actual word. So we need
3176  to assign the frequency on search string behalf. */
3177  if (query->cur_node->type == FTS_AST_TERM
3178  && query->cur_node->term.wildcard) {
3179 
3180  /* These cast are safe since we only care about the
3181  terminating NUL character as an end of string marker. */
3182  ut_strcpy((char*) term, (char*) query->cur_node->term.ptr);
3183  } else {
3184  /* Need to copy the NUL character too. */
3185  memcpy(term, word->f_str, word->f_len);
3186  term[word->f_len] = 0;
3187  }
3188 
3189  /* Lookup the word in our rb tree, it must exist. */
3190  ret = rbt_search(query->word_freqs, &parent, term);
3191 
3192  ut_a(ret == 0);
3193 
3194  word_freq = rbt_value(fts_word_freq_t, parent.last);
3195 
3196  /* Start from 1 since the first column has been read by the caller.
3197  Also, we rely on the order of the columns projected, to filter
3198  out ilists that are out of range and we always want to read
3199  the doc_count irrespective of the suitablility of the row. */
3200 
3201  for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) {
3202 
3203  dfield_t* dfield = que_node_get_val(exp);
3204  byte* data = static_cast<byte*>(
3205  dfield_get_data(dfield));
3206  ulint len = dfield_get_len(dfield);
3207 
3208  ut_a(len != UNIV_SQL_NULL);
3209 
3210  /* Note: The column numbers below must match the SELECT. */
3211 
3212  switch (i) {
3213  case 1: /* DOC_COUNT */
3214  word_freq->doc_count += mach_read_from_4(data);
3215  break;
3216 
3217  case 2: /* FIRST_DOC_ID */
3218  node.first_doc_id = fts_read_doc_id(data);
3219 
3220  /* Skip nodes whose doc ids are out range. */
3221  if (query->oper == FTS_EXIST
3222  && query->upper_doc_id > 0
3223  && node.first_doc_id > query->upper_doc_id) {
3224  skip = TRUE;
3225  }
3226  break;
3227 
3228  case 3: /* LAST_DOC_ID */
3229  node.last_doc_id = fts_read_doc_id(data);
3230 
3231  /* Skip nodes whose doc ids are out range. */
3232  if (query->oper == FTS_EXIST
3233  && query->lower_doc_id > 0
3234  && node.last_doc_id < query->lower_doc_id) {
3235  skip = TRUE;
3236  }
3237  break;
3238 
3239  case 4: /* ILIST */
3240 
3241  error = fts_query_filter_doc_ids(
3242  query, word_freq->word, word_freq,
3243  &node, data, len, FALSE);
3244 
3245  break;
3246 
3247  default:
3248  ut_error;
3249  }
3250  }
3251 
3252  if (!skip) {
3253  /* Make sure all columns were read. */
3254 
3255  ut_a(i == 5);
3256  }
3257 
3258  return error;
3259 }
3260 
3261 /*****************************************************************/
3264 static
3265 ibool
3266 fts_query_index_fetch_nodes(
3267 /*========================*/
3268  void* row,
3269  void* user_arg)
3270 {
3271  fts_string_t key;
3272  sel_node_t* sel_node = static_cast<sel_node_t*>(row);
3273  fts_fetch_t* fetch = static_cast<fts_fetch_t*>(user_arg);
3274  fts_query_t* query = static_cast<fts_query_t*>(fetch->read_arg);
3275  que_node_t* exp = sel_node->select_list;
3276  dfield_t* dfield = que_node_get_val(exp);
3277  void* data = dfield_get_data(dfield);
3278  ulint dfield_len = dfield_get_len(dfield);
3279 
3280  key.f_str = static_cast<byte*>(data);
3281  key.f_len = dfield_len;
3282 
3283  ut_a(dfield_len <= FTS_MAX_WORD_LEN);
3284 
3285  /* Note: we pass error out by 'query->error' */
3286  query->error = fts_query_read_node(query, &key, que_node_get_next(exp));
3287 
3288  if (query->error != DB_SUCCESS) {
3290  return(FALSE);
3291  } else {
3292  return(TRUE);
3293  }
3294 }
3295 
3296 /*****************************************************************/
3298 static
3299 void
3300 fts_query_calculate_idf(
3301 /*====================*/
3302  fts_query_t* query)
3303 {
3304  const ib_rbt_node_t* node;
3305  ib_uint64_t total_docs = query->total_docs;
3306 
3307  /* We need to free any instances of fts_doc_freq_t that we
3308  may have allocated. */
3309  for (node = rbt_first(query->word_freqs);
3310  node;
3311  node = rbt_next(query->word_freqs, node)) {
3312 
3313  fts_word_freq_t* word_freq;
3314 
3315  word_freq = rbt_value(fts_word_freq_t, node);
3316 
3317  if (word_freq->doc_count > 0) {
3318  if (total_docs == word_freq->doc_count) {
3319  /* QP assume ranking > 0 if we find
3320  a match. Since Log10(1) = 0, we cannot
3321  make IDF a zero value if do find a
3322  word in all documents. So let's make
3323  it an arbitrary very small number */
3324  word_freq->idf = log10(1.0001);
3325  } else {
3326  word_freq->idf = log10(
3327  total_docs
3328  / (double) word_freq->doc_count);
3329  }
3330  }
3331 
3332  if (fts_enable_diag_print) {
3333  fprintf(stderr,"'%s' -> " UINT64PF "/" UINT64PF
3334  " %6.5lf\n",
3335  word_freq->word,
3336  query->total_docs, word_freq->doc_count,
3337  word_freq->idf);
3338  }
3339  }
3340 }
3341 
3342 /*****************************************************************/
3344 static
3345 void
3346 fts_query_calculate_ranking(
3347 /*========================*/
3348  const fts_query_t* query,
3349  fts_ranking_t* ranking)
3350 {
3351  ulint pos = 0;
3352  byte* word = NULL;
3353 
3354  /* At this stage, ranking->rank should not exceed the 1.0
3355  bound */
3356  ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0);
3357  ut_ad(query->word_map->size() == query->word_vector->size());
3358 
3359  while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
3360  int ret;
3361  ib_rbt_bound_t parent;
3362  double weight;
3363  fts_doc_freq_t* doc_freq;
3364  fts_word_freq_t* word_freq;
3365 
3366  ut_ad(word != NULL);
3367  ret = rbt_search(query->word_freqs, &parent, word);
3368 
3369  /* It must exist. */
3370  ut_a(ret == 0);
3371 
3372  word_freq = rbt_value(fts_word_freq_t, parent.last);
3373 
3374  ret = rbt_search(
3375  word_freq->doc_freqs, &parent, &ranking->doc_id);
3376 
3377  /* It must exist. */
3378  ut_a(ret == 0);
3379 
3380  doc_freq = rbt_value(fts_doc_freq_t, parent.last);
3381 
3382  weight = (double) doc_freq->freq * word_freq->idf;
3383 
3384  ranking->rank += (fts_rank_t) (weight * word_freq->idf);
3385  }
3386 }
3387 
3388 /*****************************************************************/
3390 static
3391 void
3392 fts_query_add_ranking(
3393 /*==================*/
3394  fts_query_t* query,
3395  ib_rbt_t* ranking_tree,
3396  const fts_ranking_t* new_ranking)
3397 {
3398  ib_rbt_bound_t parent;
3399 
3400  /* Lookup the ranking in our rb tree and add if it doesn't exist. */
3401  if (rbt_search(ranking_tree, &parent, new_ranking) == 0) {
3402  fts_ranking_t* ranking;
3403 
3404  ranking = rbt_value(fts_ranking_t, parent.last);
3405 
3406  ranking->rank += new_ranking->rank;
3407 
3408  ut_a(ranking->words == NULL);
3409  } else {
3410  rbt_add_node(ranking_tree, &parent, new_ranking);
3411 
3412  query->total_size += SIZEOF_RBT_NODE_ADD
3413  + sizeof(fts_ranking_t);
3414  }
3415 }
3416 
3417 /*****************************************************************/
3421 float
3423 /*=================*/
3424  fts_result_t* result,
3425  doc_id_t doc_id)
3426 {
3427  ib_rbt_bound_t parent;
3428  fts_ranking_t new_ranking;
3429 
3430  if (!result || !result->rankings_by_id) {
3431  return(0);
3432  }
3433 
3434  new_ranking.doc_id = doc_id;
3435 
3436  /* Lookup the ranking in our rb tree */
3437  if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) {
3438  fts_ranking_t* ranking;
3439 
3440  ranking = rbt_value(fts_ranking_t, parent.last);
3441 
3442  return(ranking->rank);
3443  }
3444 
3445  return(0);
3446 }
3447 
3448 /*****************************************************************/
3450 static
3451 fts_result_t*
3452 fts_query_prepare_result(
3453 /*=====================*/
3454  fts_query_t* query,
3455  fts_result_t* result)
3458 {
3459  const ib_rbt_node_t* node;
3460  bool result_is_null = false;
3461 
3462  if (result == NULL) {
3463  result = static_cast<fts_result_t*>(ut_malloc(sizeof(*result)));
3464 
3465  memset(result, 0x0, sizeof(*result));
3466 
3467  result->rankings_by_id = rbt_create(
3469 
3470  query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE;
3471  result_is_null = true;
3472  }
3473 
3474  if (query->flags == FTS_OPT_RANKING) {
3475  fts_word_freq_t* word_freq;
3476  ulint size = ib_vector_size(query->deleted->doc_ids);
3477  fts_update_t* array =
3478  (fts_update_t*) query->deleted->doc_ids->data;
3479 
3480  node = rbt_first(query->word_freqs);
3481  ut_ad(node);
3482  word_freq = rbt_value(fts_word_freq_t, node);
3483 
3484  for (node = rbt_first(word_freq->doc_freqs);
3485  node;
3486  node = rbt_next(word_freq->doc_freqs, node)) {
3487  fts_doc_freq_t* doc_freq;
3488  fts_ranking_t ranking;
3489 
3490  doc_freq = rbt_value(fts_doc_freq_t, node);
3491 
3492  /* Don't put deleted docs into result */
3493  if (fts_bsearch(array, 0, size, doc_freq->doc_id)
3494  >= 0) {
3495  continue;
3496  }
3497 
3498  ranking.doc_id = doc_freq->doc_id;
3499  ranking.rank = doc_freq->freq * word_freq->idf
3500  * word_freq->idf;
3501  ranking.words = NULL;
3502 
3503  fts_query_add_ranking(query, result->rankings_by_id,
3504  &ranking);
3505 
3506  if (query->total_size > fts_result_cache_limit) {
3508  fts_query_free_result(result);
3509  return(NULL);
3510  }
3511  }
3512 
3513  return(result);
3514  }
3515 
3516  ut_a(rbt_size(query->doc_ids) > 0);
3517 
3518  for (node = rbt_first(query->doc_ids);
3519  node;
3520  node = rbt_next(query->doc_ids, node)) {
3521 
3522  fts_ranking_t* ranking;
3523 
3524  ranking = rbt_value(fts_ranking_t, node);
3525  fts_query_calculate_ranking(query, ranking);
3526 
3527  // FIXME: I think we may requre this information to improve the
3528  // ranking of doc ids which have more word matches from
3529  // different FTS indexes.
3530 
3531  /* We don't need these anymore free the resources. */
3532  ranking->words = NULL;
3533 
3534  if (!result_is_null) {
3535  fts_query_add_ranking(query, result->rankings_by_id, ranking);
3536 
3537  if (query->total_size > fts_result_cache_limit) {
3539  fts_query_free_result(result);
3540  return(NULL);
3541  }
3542  }
3543  }
3544 
3545  if (result_is_null) {
3546  /* Use doc_ids directly */
3547  rbt_free(result->rankings_by_id);
3548  result->rankings_by_id = query->doc_ids;
3549  query->doc_ids = NULL;
3550  }
3551 
3552  return(result);
3553 }
3554 
3555 /*****************************************************************/
3557 static
3558 fts_result_t*
3559 fts_query_get_result(
3560 /*=================*/
3561  fts_query_t* query,
3562  fts_result_t* result)
3563 {
3564  if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) {
3565  /* Copy the doc ids to the result. */
3566  result = fts_query_prepare_result(query, result);
3567  } else {
3568  /* Create an empty result instance. */
3569  result = static_cast<fts_result_t*>(ut_malloc(sizeof(*result)));
3570  memset(result, 0, sizeof(*result));
3571  }
3572 
3573  return(result);
3574 }
3575 
3576 /*****************************************************************/
3578 static
3579 void
3580 fts_query_free(
3581 /*===========*/
3582  fts_query_t* query)
3583 {
3584 
3585  if (query->read_nodes_graph) {
3586  fts_que_graph_free(query->read_nodes_graph);
3587  }
3588 
3589  if (query->root) {
3590  fts_ast_free_node(query->root);
3591  }
3592 
3593  if (query->deleted) {
3594  fts_doc_ids_free(query->deleted);
3595  }
3596 
3597  if (query->doc_ids) {
3598  fts_query_free_doc_ids(query, query->doc_ids);
3599  }
3600 
3601  if (query->word_freqs) {
3602  const ib_rbt_node_t* node;
3603 
3604  /* We need to free any instances of fts_doc_freq_t that we
3605  may have allocated. */
3606  for (node = rbt_first(query->word_freqs);
3607  node;
3608  node = rbt_next(query->word_freqs, node)) {
3609 
3610  fts_word_freq_t* word_freq;
3611 
3612  word_freq = rbt_value(fts_word_freq_t, node);
3613 
3614  /* We need to cast away the const. */
3615  rbt_free(word_freq->doc_freqs);
3616  }
3617 
3618  rbt_free(query->word_freqs);
3619  }
3620 
3621  ut_a(!query->intersection);
3622 
3623  if (query->heap) {
3624  mem_heap_free(query->heap);
3625  }
3626 
3627  if (query->word_map) {
3628  delete query->word_map;
3629  }
3630 
3631  if (query->word_vector) {
3632  delete query->word_vector;
3633  }
3634 
3635  memset(query, 0, sizeof(*query));
3636 }
3637 
3638 /*****************************************************************/
3640 static
3642 fts_query_parse(
3643 /*============*/
3644  fts_query_t* query,
3645  byte* query_str,
3646  ulint query_len)
3647 {
3648  int error;
3649  fts_ast_state_t state;
3650  bool mode = query->boolean_mode;
3651 
3652  memset(&state, 0x0, sizeof(state));
3653 
3654  /* Setup the scanner to use, this depends on the mode flag. */
3655  state.lexer = fts_lexer_create(mode, query_str, query_len);
3656  state.charset = query->fts_index_table.charset;
3657  error = fts_parse(&state);
3658  fts_lexer_free(state.lexer);
3659  state.lexer = NULL;
3660 
3661  /* Error during parsing ? */
3662  if (error) {
3663  /* Free the nodes that were allocated during parsing. */
3664  fts_ast_state_free(&state);
3665  } else {
3666  query->root = state.root;
3667  }
3668 
3669  return(state.root);
3670 }
3671 
3672 /*******************************************************************/
3675 static
3676 void
3677 fts_query_can_optimize(
3678 /*===================*/
3679  fts_query_t* query,
3680  uint flags)
3681 {
3682  fts_ast_node_t* node = query->root;
3683 
3684  if (flags & FTS_EXPAND) {
3685  return;
3686  }
3687 
3688  /* Check if it has only a term without oper */
3689  ut_ad(node->type == FTS_AST_LIST);
3690  node = node->list.head;
3691  if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) {
3692  query->flags = FTS_OPT_RANKING;
3693  }
3694 }
3695 
3696 /*******************************************************************/
3702 static
3703 byte*
3704 fts_query_str_preprocess(
3705 /*=====================*/
3706  const byte* query_str,
3707  ulint query_len,
3708  ulint *result_len,
3709  CHARSET_INFO* charset,
3710  bool boolean_mode)
3711 {
3712  ulint cur_pos = 0;
3713  ulint str_len;
3714  byte* str_ptr;
3715  bool in_phrase = false;
3716 
3717  /* Convert the query string to lower case before parsing. We own
3718  the ut_malloc'ed result and so remember to free it before return. */
3719 
3720  str_len = query_len * charset->casedn_multiply + 1;
3721  str_ptr = static_cast<byte*>(ut_malloc(str_len));
3722 
3723  *result_len = innobase_fts_casedn_str(
3724  charset, const_cast<char*>(reinterpret_cast<const char*>(
3725  query_str)), query_len,
3726  reinterpret_cast<char*>(str_ptr), str_len);
3727 
3728  ut_ad(*result_len < str_len);
3729 
3730  str_ptr[*result_len] = 0;
3731 
3732  /* If it is boolean mode, no need to check for '-/+' */
3733  if (!boolean_mode) {
3734  return(str_ptr);
3735  }
3736 
3737  /* Otherwise, we travese the string to find any '-/+' that are
3738  immediately proceeded and followed by valid search word.
3739  NOTE: we should not do so for CJK languages, this should
3740  be taken care of in our CJK implementation */
3741  while (cur_pos < *result_len) {
3742  fts_string_t str;
3743  ulint offset;
3744  ulint cur_len;
3745 
3746  cur_len = innobase_mysql_fts_get_token(
3747  charset, str_ptr + cur_pos, str_ptr + *result_len,
3748  &str, &offset);
3749 
3750  if (cur_len == 0) {
3751  break;
3752  }
3753 
3754  /* Check if we are in a phrase, if so, no need to do
3755  replacement of '-/+'. */
3756  for (byte* ptr = str_ptr + cur_pos; ptr < str.f_str; ptr++) {
3757  if ((char) (*ptr) == '"' ) {
3758  in_phrase = !in_phrase;
3759  }
3760  }
3761 
3762  /* Find those are not leading '-/+' and also not in a phrase */
3763  if (cur_pos > 0 && str.f_str - str_ptr - cur_pos == 1
3764  && !in_phrase) {
3765  char* last_op = reinterpret_cast<char*>(
3766  str_ptr + cur_pos);
3767 
3768  if (*last_op == '-' || *last_op == '+') {
3769  *last_op = ' ';
3770  }
3771  }
3772 
3773  cur_pos += cur_len;
3774  }
3775 
3776  return(str_ptr);
3777 }
3778 
/*******************************************************************/
/** Execute a full-text search query against the given FTS index.
The search runs inside a dedicated background transaction; the caller's
trx is only passed through for optional word-count diagnostics.
NOTE(review): several physical lines are missing from this excerpt
(extraction artifact); each gap is marked with a NOTE below.
@return DB_SUCCESS if all went well, else error code */
UNIV_INTERN
dberr_t
fts_query(
/*======*/
	trx_t*		trx,		/*!< in: caller transaction, used
					only for diagnostics here */
	dict_index_t*	index,		/*!< in: the FTS index to search */
	uint		flags,		/*!< in: query mode flags
					(FTS_BOOL, FTS_EXPAND, ...) */
	const byte*	query_str,	/*!< in: raw query string */
	ulint		query_len,	/*!< in: query string length in bytes */
	fts_result_t**	result)		/*!< out: allocated result set;
					owned by the caller */
{
	fts_query_t	query;
	dberr_t		error = DB_SUCCESS;
	byte*		lc_query_str;
	ulint		result_len;
	bool		boolean_mode;
	trx_t*		query_trx;
	CHARSET_INFO*	charset;
	ulint		start_time_ms;
	bool		will_be_ignored = false;

	boolean_mode = flags & FTS_BOOL;

	/* Initialize the out parameter and the query state before any
	early exit is possible. */
	*result = NULL;
	memset(&query, 0x0, sizeof(query));
	query_trx = trx_allocate_for_background();
	query_trx->op_info = "FTS query";

	start_time_ms = ut_time_ms();

	query.trx = query_trx;
	query.index = index;
	query.boolean_mode = boolean_mode;
	query.deleted = fts_doc_ids_create();
	query.cur_node = NULL;

	/* The common table descriptor is used to look up the DELETED
	and DELETED_CACHE auxiliary tables below. */
	query.fts_common_table.type = FTS_COMMON_TABLE;
	query.fts_common_table.table_id = index->table->id;
	query.fts_common_table.parent = index->table->name;

	charset = fts_index_get_charset(index);

	/* NOTE(review): a line is missing here in this excerpt,
	presumably setting query.fts_index_table.type — confirm against
	the original source. */
	query.fts_index_table.index_id = index->id;
	query.fts_index_table.table_id = index->table->id;
	query.fts_index_table.parent = index->table->name;
	query.fts_index_table.charset = charset;

	query.word_map = new word_map_t;
	query.word_vector = new word_vector_t;
	query.error = DB_SUCCESS;

	/* Setup the RB tree that will be used to collect per term
	statistics. */
	/* NOTE(review): the opening line of the rbt_create_arg_cmp()
	call that these arguments belong to (assigning query.word_freqs)
	is missing from this excerpt. */
	sizeof(fts_word_freq_t), innobase_fts_string_cmp, charset);

	query.total_size += SIZEOF_RBT_CREATE;

	/* Approximate document count, used for IDF calculation. */
	query.total_docs = dict_table_get_n_rows(index->table);

#ifdef FTS_DOC_STATS_DEBUG
	/* NOTE(review): this guard reads ft_enable_diag_print while the
	diagnostics block near the end reads fts_enable_diag_print —
	verify whether both symbols exist or this is a typo. */
	if (ft_enable_diag_print) {
		error = fts_get_total_word_count(
			trx, query.index, &query.total_words);

		if (error != DB_SUCCESS) {
			goto func_exit;
		}

		fprintf(stderr, "Total docs: " UINT64PF " Total words: %lu\n",
			query.total_docs, query.total_words);
	}
#endif /* FTS_DOC_STATS_DEBUG */

	query.fts_common_table.suffix = "DELETED";

	/* Read the deleted doc_ids, we need these for filtering. */
	error = fts_table_fetch_doc_ids(
		NULL, &query.fts_common_table, query.deleted);

	if (error != DB_SUCCESS) {
		goto func_exit;
	}

	query.fts_common_table.suffix = "DELETED_CACHE";

	error = fts_table_fetch_doc_ids(
		NULL, &query.fts_common_table, query.deleted);

	if (error != DB_SUCCESS) {
		goto func_exit;
	}

	/* Get the deleted doc ids that are in the cache. */
	/* NOTE(review): the opening line of this call (presumably
	fts_cache_append_deleted_doc_ids) is missing from this excerpt. */
	index->table->fts->cache, query.deleted->doc_ids);

	/* Sort the vector so that we can do a binary search over the ids. */
	ib_vector_sort(query.deleted->doc_ids, fts_update_doc_id_cmp);

#if 0
	/* Convert the query string to lower case before parsing. We own
	the ut_malloc'ed result and so remember to free it before return. */

	lc_query_str_len = query_len * charset->casedn_multiply + 1;
	lc_query_str = static_cast<byte*>(ut_malloc(lc_query_str_len));

	result_len = innobase_fts_casedn_str(
		charset, (char*) query_str, query_len,
		(char*) lc_query_str, lc_query_str_len);

	ut_ad(result_len < lc_query_str_len);

	lc_query_str[result_len] = 0;

#endif

	/* Normalize the query string (case folding, stray '-'/'+'
	cleanup); the returned buffer must be freed with ut_free(). */
	lc_query_str = fts_query_str_preprocess(
		query_str, query_len, &result_len, charset, boolean_mode);

	query.heap = mem_heap_create(128);

	/* Create the rb tree for the doc id (current) set. */
	/* NOTE(review): the argument line of this rbt_create() call is
	missing from this excerpt. */
	query.doc_ids = rbt_create(

	query.total_size += SIZEOF_RBT_CREATE;

	/* Parse the input query string. */
	if (fts_query_parse(&query, lc_query_str, result_len)) {
		fts_ast_node_t*	ast = query.root;

		/* Optimize query to check if it's a single term */
		fts_query_can_optimize(&query, flags);

		DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
				fts_result_cache_limit = 2048;
		);

		/* Traverse the Abstract Syntax Tree (AST) and execute
		the query. */
		query.error = fts_ast_visit(
			FTS_NONE, ast, fts_query_visitor,
			&query, &will_be_ignored);

		/* If query expansion is requested, extend the search
		with first search pass result */
		if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
			query.error = fts_expand_query(index, &query);
		}

		/* Calculate the inverse document frequency of the terms. */
		if (query.error == DB_SUCCESS) {
			fts_query_calculate_idf(&query);
		}

		/* Copy the result from the query state, so that we can
		return it to the caller. */
		if (query.error == DB_SUCCESS) {
			*result = fts_query_get_result(&query, *result);
		}

		error = query.error;
	} else {
		/* still return an empty result set */
		*result = static_cast<fts_result_t*>(
			ut_malloc(sizeof(**result)));
		memset(*result, 0, sizeof(**result));
	}

	ut_free(lc_query_str);

	if (fts_enable_diag_print && (*result)) {
		ulint	diff_time = ut_time_ms() - start_time_ms;
		fprintf(stderr, "FTS Search Processing time: %ld secs:"
				" %ld millisec: row(s) %d \n",
			diff_time / 1000, diff_time % 1000,
			(*result)->rankings_by_id
				? (int) rbt_size((*result)->rankings_by_id)
				: -1);

		/* Log memory consumption & result size */
		ib_logf(IB_LOG_LEVEL_INFO,
			"Full Search Memory: "
			"%lu (bytes), Row: %lu .",
			query.total_size,
			(*result)->rankings_by_id
				? rbt_size((*result)->rankings_by_id)
				: 0);
	}

func_exit:
	/* Release the query state and the background transaction in all
	paths, including early error exits. */
	fts_query_free(&query);

	trx_free_for_background(query_trx);

	return(error);
}
3982 
/*****************************************************************/
/** Free a query result returned by fts_query(): releases both ranking
rb-trees (if present) and the result struct itself.
NOTE(review): the function-name line is missing from this excerpt —
presumably fts_query_free_result; confirm against the original source. */
void
/*==================*/
	fts_result_t*	result)	/*!< in, own: result instance to free;
				may be NULL */
{
	if (result) {
		if (result->rankings_by_id != NULL) {
			rbt_free(result->rankings_by_id);
			result->rankings_by_id = NULL;
		}
		if (result->rankings_by_rank != NULL) {
			rbt_free(result->rankings_by_rank);
			result->rankings_by_rank = NULL;
		}

		ut_free(result);
		/* Only resets the local copy of the pointer; the
		caller's pointer is left dangling by design. */
		result = NULL;
	}
}
4005 
/*****************************************************************/
/** Build result->rankings_by_rank: a second rb-tree over the same
fts_ranking_t values, ordered by rank instead of doc id. Any previous
rank-ordered tree is freed first.
NOTE(review): the function-name line is missing from this excerpt —
presumably fts_query_sort_result_on_rank; confirm against the original. */
void
/*==========================*/
	fts_result_t*	result)	/*!< in/out: result whose rankings_by_id
				tree must already be populated */
{
	const ib_rbt_node_t*	node;
	ib_rbt_t*		ranked;

	ut_a(result->rankings_by_id != NULL);
	if (result->rankings_by_rank) {
		rbt_free(result->rankings_by_rank);
	}

	/* New tree keyed by rank via fts_query_compare_rank. */
	ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank);

	/* We need to free any instances of fts_doc_freq_t that we
	may have allocated. */
	for (node = rbt_first(result->rankings_by_id);
	     node;
	     node = rbt_next(result->rankings_by_id, node)) {

		fts_ranking_t*	ranking;

		ranking = rbt_value(fts_ranking_t, node);

		/* The per-ranking word list must already have been
		released before re-inserting into the rank tree. */
		ut_a(ranking->words == NULL);

		rbt_insert(ranked, ranking, ranking);
	}

	/* Reset the current node too. */
	result->current = NULL;
	result->rankings_by_rank = ranked;
}
4043 
#ifdef UNIV_DEBUG
/*******************************************************************/
/** Debug helper: dump every doc id in the query's current result set,
together with the matching words recorded for that document, to stderr. */
static
void
fts_print_doc_id(
/*=============*/
	fts_query_t*	query)	/*!< in: query whose doc_ids rb-tree
				is printed */
{
	const ib_rbt_node_t*	node;

	/* Iterate each member of the doc_id set */
	for (node = rbt_first(query->doc_ids);
	     node;
	     node = rbt_next(query->doc_ids, node)) {
		fts_ranking_t*	ranking;
		ranking = rbt_value(fts_ranking_t, node);

		/* Fix: the argument is cast to ulint (unsigned long),
		so the matching conversion is %lu; %ld with an unsigned
		argument is undefined behavior per the C standard. */
		fprintf(stderr, "doc_ids info, doc_id: %lu \n",
			(ulint) ranking->doc_id);

		ulint	pos = 0;
		byte*	value = NULL;
		while (fts_ranking_words_get_next(query, ranking, &pos, &value)) {
			/* %s expects char*; value is byte*
			(unsigned char*), so cast explicitly. */
			fprintf(stderr, "doc_ids info, value: %s \n",
				(char*) value);
		}
	}
}
#endif
4073 
/*************************************************************/
/** Query expansion (blind relevance feedback): fetch the documents
matched by the first search pass, collect the words they contain that
were not in the original query, and run a second, widened search
(union) over those words.
NOTE(review): several physical lines are missing from this excerpt
(extraction artifact); each gap is marked with a NOTE below.
@return DB_SUCCESS if all went well */
static __attribute__((nonnull, warn_unused_result))
dberr_t
fts_expand_query(
/*=============*/
	dict_index_t*	index,	/*!< in: FTS index to search */
	fts_query_t*	query)	/*!< in/out: query result, to be freed
				by the caller */
{
	const ib_rbt_node_t*	node;
	const ib_rbt_node_t*	token_node;
	fts_doc_t		result_doc;
	dberr_t			error = DB_SUCCESS;
	const fts_index_cache_t*index_cache;

	/* If no doc is found in first search pass, return */
	if (!rbt_size(query->doc_ids)) {
		return(error);
	}

	/* Init "result_doc", to hold words from the first search pass */
	fts_doc_init(&result_doc);

	/* The cache lock only guards the index-cache lookup; the
	charset taken from it is immutable afterwards. */
	rw_lock_x_lock(&index->table->fts->cache->lock);
	index_cache = fts_find_index_cache(index->table->fts->cache, index);
	rw_lock_x_unlock(&index->table->fts->cache->lock);

	ut_a(index_cache);

	/* NOTE(review): a line is missing inside this call in this
	excerpt — presumably the token size and compare-function
	arguments of rbt_create_arg_cmp(). */
	result_doc.tokens = rbt_create_arg_cmp(
		index_cache->charset);

	result_doc.charset = index_cache->charset;

	query->total_size += SIZEOF_RBT_CREATE;
#ifdef UNIV_DEBUG
	fts_print_doc_id(query);
#endif

	for (node = rbt_first(query->doc_ids);
	     node;
	     node = rbt_next(query->doc_ids, node)) {

		fts_ranking_t*	ranking;
		ulint		pos;
		byte*		word;
		ulint		prev_token_size;
		ulint		estimate_size;

		/* Token count before this doc is parsed; used below to
		estimate the memory added by this document alone. */
		prev_token_size = rbt_size(result_doc.tokens);

		ranking = rbt_value(fts_ranking_t, node);

		/* Fetch the documents with the doc_id from the
		result of first seach pass. Since we do not
		store document-to-word mapping, we need to
		fetch the original document and parse them.
		Future optimization could be done here if we
		support some forms of document-to-word mapping */
		/* NOTE(review): the callback/argument lines of this
		call are missing from this excerpt. */
		fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index,
					&result_doc);

		/* Remove words that have already been searched in the
		first pass */
		pos = 0;
		word = NULL;
		while (fts_ranking_words_get_next(query, ranking, &pos,
						  &word)) {
			fts_string_t	str;
			ibool		ret;

			/* FIXME: We are discarding a const qualifier here. */
			str.f_str = word;
			str.f_len = ut_strlen((const char*) str.f_str);
			ret = rbt_delete(result_doc.tokens, &str);

			/* The word must exist in the doc we found */
			if (!ret) {
				/* NOTE(review): %s is given a byte*
				and %ld a ulint cast — format/argument
				mismatches worth confirming upstream. */
				fprintf(stderr, " InnoDB: Error: Did not "
					"find word %s in doc %ld for query "
					"expansion search.\n", str.f_str,
					(ulint) ranking->doc_id);
			}
		}

		/* Estimate memory used, see fts_process_token and fts_token_t.
		We ignore token size here. */
		estimate_size = (rbt_size(result_doc.tokens) - prev_token_size)
			* (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t)
			+ sizeof(ib_vector_t) + sizeof(ulint) * 32);
		query->total_size += estimate_size;

		if (query->total_size > fts_result_cache_limit) {
			/* NOTE(review): the line assigning the
			cache-limit-exceeded error code before this
			goto is missing from this excerpt. */
			goto func_exit;
		}
	}

	/* Search the table the second time with expanded search list */
	for (token_node = rbt_first(result_doc.tokens);
	     token_node;
	     token_node = rbt_next(result_doc.tokens, token_node)) {
		fts_token_t*	mytoken;
		mytoken = rbt_value(fts_token_t, token_node);

		/* Register the new word for IDF stats, then OR its
		matches into the current result set. */
		fts_query_add_word_freq(query, mytoken->text.f_str);
		error = fts_query_union(query, &mytoken->text);

		if (error != DB_SUCCESS) {
			break;
		}
	}

func_exit:
	fts_doc_free(&result_doc);

	return(error);
}
/*************************************************************/
/** Find documents that contain every token, for phrase or proximity
search. Walks the per-token match lists (each sorted by doc id) looking
for doc ids common to all tokens; for proximity search, additionally
verifies the tokens' positions fall within the requested distance.
@return TRUE if at least one matching document was found */
static
ibool
fts_phrase_or_proximity_search(
/*===========================*/
	fts_query_t*	query,	/*!< in/out: query instance; its
				match_array holds one sorted match
				list per token */
	ib_vector_t*	tokens)	/*!< in: the tokens being searched for */
{
	ulint		n_matched;
	ulint		i;
	ibool		matched = FALSE;
	ulint		num_token = ib_vector_size(tokens);
	fts_match_t*	match[MAX_PROXIMITY_ITEM];
	ibool		end_list = FALSE;

	/* Number of matched documents for the first token */
	n_matched = ib_vector_size(query->match_array[0]);

	/* We have a set of match list for each word, we shall
	walk through the list and find common documents that
	contain all the matching words. */
	for (i = 0; i < n_matched; i++) {
		ulint		j;
		ulint		k = 0;
		fts_proximity_t	qualified_pos;
		/* One min/max slot per token; the two halves of this
		buffer back qualified_pos.min_pos / max_pos. */
		ulint		qualified_pos_buf[MAX_PROXIMITY_ITEM * 2];

		qualified_pos.min_pos = &qualified_pos_buf[0];
		qualified_pos.max_pos = &qualified_pos_buf[MAX_PROXIMITY_ITEM];

		/* Candidate doc: the i-th match of the first token. */
		match[0] = static_cast<fts_match_t*>(
			ib_vector_get(query->match_array[0], i));

		/* For remaining match list for the token(word), we
		try to see if there is a document with the same
		doc id */
		for (j = 1; j < num_token; j++) {
			match[j] = static_cast<fts_match_t*>(
				ib_vector_get(query->match_array[j], k));

			/* Advance in token j's list until we reach or
			pass the candidate doc id. */
			while (match[j]->doc_id < match[0]->doc_id
			       && k < ib_vector_size(query->match_array[j])) {
				match[j] = static_cast<fts_match_t*>(
					ib_vector_get(
						query->match_array[j], k));
				k++;
			}

			if (match[j]->doc_id > match[0]->doc_id) {
				/* no match */
				if (query->flags & FTS_PHRASE) {
					/* doc_id == 0 marks the
					candidate as rejected for
					phrase search. */
					match[0]->doc_id = 0;
				}
				break;
			}

			if (k == ib_vector_size(query->match_array[j])) {
				/* Token j's list is exhausted; no
				candidate after this one can match. */
				end_list = TRUE;

				if (match[j]->doc_id != match[0]->doc_id) {
					/* no match */
					if (query->flags & FTS_PHRASE) {
						ulint	s;

						match[0]->doc_id = 0;

						/* Reject all remaining
						candidates too. */
						for (s = i + 1; s < n_matched;
						     s++) {
							match[0] = static_cast<
							fts_match_t*>(
							ib_vector_get(
							query->match_array[0],
							s));
							match[0]->doc_id = 0;
						}
					}

					goto func_exit;
				}
			}

			/* FIXME: A better solution will be a counter array
			remember each run's last position. So we don't
			reset it here very time */
			k = 0;
		}

		/* Inner loop broke early: at least one token lacks
		this doc id, try the next candidate. */
		if (j != num_token) {
			continue;
		}

		/* For this matching doc, we need to further
		verify whether the words in the doc are close
		to each other, and within the distance specified
		in the proximity search */
		if (query->flags & FTS_PHRASE) {
			matched = TRUE;
		} else if (fts_proximity_get_positions(
			match, num_token, ULINT_MAX, &qualified_pos)) {

			/* Fetch the original documents and count the
			words in between matching words to see that is in
			specified distance */
			if (fts_query_is_in_proximity_range(
				query, match, &qualified_pos)) {
				/* If so, mark we find a matching doc */
				query->error = fts_query_process_doc_id(
					query, match[0]->doc_id, 0);
				if (query->error != DB_SUCCESS) {
					matched = FALSE;
					goto func_exit;
				}

				matched = TRUE;
				for (ulint z = 0; z < num_token; z++) {
					fts_string_t*	token;
					token = static_cast<fts_string_t*>(
						ib_vector_get(tokens, z));
					fts_query_add_word_to_document(
						query, match[0]->doc_id,
						token->f_str);
				}
			}
		}

		if (end_list) {
			break;
		}
	}

func_exit:
	return(matched);
}
4339 
/*************************************************************/
/** Scan the sorted position lists of all matched words in one document
and record every position window whose span (max - min) is within the
proximity distance. Works like the merge phase of a merge sort: at each
step the smallest current position is advanced.
@return true if a qualified window was found */
static
bool
fts_proximity_get_positions(
/*========================*/
	fts_match_t**		match,		/*!< in: matches of each
						word in the doc */
	ulint			num_match,	/*!< in: number of words
						(at most
						MAX_PROXIMITY_ITEM - 1) */
	ulint			distance,	/*!< in: maximum allowed
						word-position span */
	fts_proximity_t*	qualified_pos)	/*!< out: qualified
						min/max position windows */
{
	ulint	i;
	ulint	idx[MAX_PROXIMITY_ITEM];
	ulint	num_pos[MAX_PROXIMITY_ITEM];
	ulint	min_idx;

	qualified_pos->n_pos = 0;

	ut_a(num_match < MAX_PROXIMITY_ITEM);

	/* Each word could appear multiple times in a doc. So
	we need to walk through each word's position list, and find
	closest distance between different words to see if
	they are in the proximity distance. */

	/* Assume each word's position list is sorted, we
	will just do a walk through to all words' lists
	similar to a the merge phase of a merge sort */
	for (i = 0; i < num_match; i++) {
		/* idx is the current position we are checking
		for a particular word */
		idx[i] = 0;

		/* Number of positions for this word */
		num_pos[i] = ib_vector_size(match[i]->positions);
	}

	/* Start with the first word */
	min_idx = 0;

	while (idx[min_idx] < num_pos[min_idx]) {
		ulint	position[MAX_PROXIMITY_ITEM];
		ulint	min_pos = ULINT_MAX;
		ulint	max_pos = 0;

		/* Check positions in each word position list, and
		record the max/min position */
		for (i = 0; i < num_match; i++) {
			position[i] = *(ulint*) ib_vector_get_const(
				match[i]->positions, idx[i]);

			/* ULINT_UNDEFINED is the list terminator
			sentinel; this word has no more positions. */
			if (position[i] == ULINT_UNDEFINED) {
				break;
			}

			if (position[i] < min_pos) {
				min_pos = position[i];
				min_idx = i;
			}

			if (position[i] > max_pos) {
				max_pos = position[i];
			}
		}

		/* If max and min position are within range, we
		find a good match */
		/* The second operand rejects windows where the loop
		above broke out on the ULINT_UNDEFINED sentinel (then
		i < num_match and position[i] == ULINT_UNDEFINED). */
		if (max_pos - min_pos <= distance
		    && (i >= num_match || position[i] != ULINT_UNDEFINED)) {
			/* The charset has variable character
			length encoding, record the min_pos and
			max_pos, we will need to verify the actual
			number of characters */
			qualified_pos->min_pos[qualified_pos->n_pos] = min_pos;
			qualified_pos->max_pos[qualified_pos->n_pos] = max_pos;
			qualified_pos->n_pos++;
		}

		/* Otherwise, move to the next position is the
		list for the word with the smallest position */
		idx[min_idx]++;
	}

	ut_ad(qualified_pos->n_pos <= MAX_PROXIMITY_ITEM);

	return(qualified_pos->n_pos != 0);
}
4435 }