MySQL 5.6.14 source code documentation — annotated listing of row0sel.cc
(InnoDB select/read row operations).
1 /*****************************************************************************
2 
3 Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2008, Google Inc.
5 
6 Portions of this file contain modifications contributed and copyrighted by
7 Google, Inc. Those modifications are gratefully acknowledged and are described
8 briefly in the InnoDB documentation. The contributions by Google are
9 incorporated with their permission, and subject to the conditions contained in
10 the file COPYING.Google.
11 
12 This program is free software; you can redistribute it and/or modify it under
13 the terms of the GNU General Public License as published by the Free Software
14 Foundation; version 2 of the License.
15 
16 This program is distributed in the hope that it will be useful, but WITHOUT
17 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
19 
20 You should have received a copy of the GNU General Public License along with
21 this program; if not, write to the Free Software Foundation, Inc.,
22 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
23 
24 *****************************************************************************/
25 
26 /***************************************************/
33 #include "row0sel.h"
34 
35 #ifdef UNIV_NONINL
36 #include "row0sel.ic"
37 #endif
38 
39 #include "dict0dict.h"
40 #include "dict0boot.h"
41 #include "trx0undo.h"
42 #include "trx0trx.h"
43 #include "btr0btr.h"
44 #include "btr0cur.h"
45 #include "btr0sea.h"
46 #include "mach0data.h"
47 #include "que0que.h"
48 #include "row0upd.h"
49 #include "row0row.h"
50 #include "row0vers.h"
51 #include "rem0cmp.h"
52 #include "lock0lock.h"
53 #include "eval0eval.h"
54 #include "pars0sym.h"
55 #include "pars0pars.h"
56 #include "row0mysql.h"
57 #include "read0read.h"
58 #include "buf0lru.h"
59 #include "ha_prototypes.h"
60 #include "m_string.h" /* for my_sys.h */
61 #include "my_sys.h" /* DEBUG_SYNC_C */
62 
63 #include "my_compare.h" /* enum icp_result */
64 
/* Maximum number of rows to prefetch; MySQL interface has another parameter */
#define SEL_MAX_N_PREFETCH	16

/* Number of rows fetched, after which to start prefetching; MySQL interface
has another parameter */
#define SEL_PREFETCH_LIMIT	1

/* When a select has accessed about this many pages, it returns control back
to que_run_threads: this is to allow canceling runaway queries */

#define SEL_COST_LIMIT	100

/* Flags for search shortcut */
#define SEL_FOUND	0
#define SEL_EXHAUSTED	1
#define SEL_RETRY	2
81 
82 /********************************************************************/
89 static
90 ibool
91 row_sel_sec_rec_is_for_blob(
92 /*========================*/
93  ulint mtype,
94  ulint prtype,
95  ulint mbminmaxlen,
97  const byte* clust_field,
103  ulint clust_len,
104  const byte* sec_field,
105  ulint sec_len,
106  ulint prefix_len,
109 {
110  ulint len;
112  ulint zip_size = dict_tf_get_zip_size(table->flags);
113 
114  /* This function should never be invoked on an Antelope format
115  table, because they should always contain enough prefix in the
116  clustered index record. */
117  ut_ad(dict_table_get_format(table) >= UNIV_FORMAT_B);
118  ut_a(clust_len >= BTR_EXTERN_FIELD_REF_SIZE);
119  ut_ad(prefix_len >= sec_len);
120  ut_ad(prefix_len > 0);
121  ut_a(prefix_len <= sizeof buf);
122 
123  if (UNIV_UNLIKELY
124  (!memcmp(clust_field + clust_len - BTR_EXTERN_FIELD_REF_SIZE,
126  /* The externally stored field was not written yet.
127  This record should only be seen by
128  recv_recovery_rollback_active() or any
129  TRX_ISO_READ_UNCOMMITTED transactions. */
130  return(FALSE);
131  }
132 
133  len = btr_copy_externally_stored_field_prefix(buf, prefix_len,
134  zip_size,
135  clust_field, clust_len);
136 
137  if (UNIV_UNLIKELY(len == 0)) {
138  /* The BLOB was being deleted as the server crashed.
139  There should not be any secondary index records
140  referring to this clustered index record, because
141  btr_free_externally_stored_field() is called after all
142  secondary index entries of the row have been purged. */
143  return(FALSE);
144  }
145 
146  len = dtype_get_at_most_n_mbchars(prtype, mbminmaxlen,
147  prefix_len, len, (const char*) buf);
148 
149  return(!cmp_data_data(mtype, prtype, buf, len, sec_field, sec_len));
150 }
151 
152 /********************************************************************/
161 static
162 ibool
163 row_sel_sec_rec_is_for_clust_rec(
164 /*=============================*/
165  const rec_t* sec_rec,
166  dict_index_t* sec_index,
167  const rec_t* clust_rec,
171  dict_index_t* clust_index)
172 {
173  const byte* sec_field;
174  ulint sec_len;
175  const byte* clust_field;
176  ulint n;
177  ulint i;
178  mem_heap_t* heap = NULL;
179  ulint clust_offsets_[REC_OFFS_NORMAL_SIZE];
180  ulint sec_offsets_[REC_OFFS_SMALL_SIZE];
181  ulint* clust_offs = clust_offsets_;
182  ulint* sec_offs = sec_offsets_;
183  ibool is_equal = TRUE;
184 
185  rec_offs_init(clust_offsets_);
186  rec_offs_init(sec_offsets_);
187 
188  if (rec_get_deleted_flag(clust_rec,
189  dict_table_is_comp(clust_index->table))) {
190 
191  /* The clustered index record is delete-marked;
192  it is not visible in the read view. Besides,
193  if there are any externally stored columns,
194  some of them may have already been purged. */
195  return(FALSE);
196  }
197 
198  clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
199  ULINT_UNDEFINED, &heap);
200  sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
201  ULINT_UNDEFINED, &heap);
202 
204 
205  for (i = 0; i < n; i++) {
206  const dict_field_t* ifield;
207  const dict_col_t* col;
208  ulint clust_pos;
209  ulint clust_len;
210  ulint len;
211 
212  ifield = dict_index_get_nth_field(sec_index, i);
213  col = dict_field_get_col(ifield);
214  clust_pos = dict_col_get_clust_pos(col, clust_index);
215 
216  clust_field = rec_get_nth_field(
217  clust_rec, clust_offs, clust_pos, &clust_len);
218  sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);
219 
220  len = clust_len;
221 
222  if (ifield->prefix_len > 0 && len != UNIV_SQL_NULL
223  && sec_len != UNIV_SQL_NULL) {
224 
225  if (rec_offs_nth_extern(clust_offs, clust_pos)) {
227  }
228 
230  col->prtype, col->mbminmaxlen,
231  ifield->prefix_len, len, (char*) clust_field);
232 
233  if (rec_offs_nth_extern(clust_offs, clust_pos)
234  && len < sec_len) {
235  if (!row_sel_sec_rec_is_for_blob(
236  col->mtype, col->prtype,
237  col->mbminmaxlen,
238  clust_field, clust_len,
239  sec_field, sec_len,
240  ifield->prefix_len,
241  clust_index->table)) {
242  goto inequal;
243  }
244 
245  continue;
246  }
247  }
248 
249  if (0 != cmp_data_data(col->mtype, col->prtype,
250  clust_field, len,
251  sec_field, sec_len)) {
252 inequal:
253  is_equal = FALSE;
254  goto func_exit;
255  }
256  }
257 
258 func_exit:
259  if (UNIV_LIKELY_NULL(heap)) {
260  mem_heap_free(heap);
261  }
262  return(is_equal);
263 }
264 
265 /*********************************************************************/
268 UNIV_INTERN
269 sel_node_t*
271 /*============*/
272  mem_heap_t* heap)
273 {
274  sel_node_t* node;
275 
276  node = static_cast<sel_node_t*>(
277  mem_heap_alloc(heap, sizeof(sel_node_t)));
278 
279  node->common.type = QUE_NODE_SELECT;
280  node->state = SEL_NODE_OPEN;
281 
282  node->plans = NULL;
283 
284  return(node);
285 }
286 
287 /*********************************************************************/
290 UNIV_INTERN
291 void
293 /*==================*/
294  sel_node_t* node)
295 {
296  ulint i;
297  plan_t* plan;
298 
299  if (node->plans != NULL) {
300  for (i = 0; i < node->n_tables; i++) {
301  plan = sel_node_get_nth_plan(node, i);
302 
303  btr_pcur_close(&(plan->pcur));
304  btr_pcur_close(&(plan->clust_pcur));
305 
306  if (plan->old_vers_heap) {
308  }
309  }
310  }
311 }
312 
313 /*********************************************************************/
316 UNIV_INLINE
317 void
319 /*=================*/
320  sel_node_t* node)
321 {
322  que_node_t* exp;
323 
324  exp = node->select_list;
325 
326  while (exp) {
327  eval_exp(exp);
328 
329  exp = que_node_get_next(exp);
330  }
331 }
332 
333 /*********************************************************************/
336 UNIV_INLINE
337 void
339 /*=======================*/
340  sym_node_t* var,
342  sel_node_t* node)
343 {
344  que_node_t* exp;
345 
346  if (var == NULL) {
347 
348  return;
349  }
350 
351  for (exp = node->select_list;
352  var != 0;
353  var = static_cast<sym_node_t*>(que_node_get_next(var))) {
354 
355  ut_ad(exp);
356 
357  eval_node_copy_val(var->alias, exp);
358 
359  exp = que_node_get_next(exp);
360  }
361 }
362 
363 /*********************************************************************/
366 UNIV_INLINE
367 void
369 /*=====================*/
370  sel_node_t* node)
371 {
372  func_node_t* func_node;
373 
374  ut_ad(node->is_aggregate);
375 
376  for (func_node = static_cast<func_node_t*>(node->select_list);
377  func_node != 0;
378  func_node = static_cast<func_node_t*>(
379  que_node_get_next(func_node))) {
380 
381  eval_node_set_int_val(func_node, 0);
382  }
383 
384  node->aggregate_already_fetched = FALSE;
385 }
386 
387 /*********************************************************************/
389 UNIV_INLINE
390 void
392 /*=============================*/
393  sel_node_t* node)
394 {
395  sym_node_t* var;
396 
397  var = UT_LIST_GET_FIRST(node->copy_variables);
398 
399  while (var) {
400  eval_node_copy_val(var, var->alias);
401 
402  var->indirection = NULL;
403 
404  var = UT_LIST_GET_NEXT(col_var_list, var);
405  }
406 }
407 
408 /*********************************************************************/
410 static
411 void
412 row_sel_fetch_columns(
413 /*==================*/
415  const rec_t* rec,
417  const ulint* offsets,
418  sym_node_t* column)
420 {
421  dfield_t* val;
422  ulint index_type;
423  ulint field_no;
424  const byte* data;
425  ulint len;
426 
427  ut_ad(rec_offs_validate(rec, index, offsets));
428 
429  if (dict_index_is_clust(index)) {
430  index_type = SYM_CLUST_FIELD_NO;
431  } else {
432  index_type = SYM_SEC_FIELD_NO;
433  }
434 
435  while (column) {
436  mem_heap_t* heap = NULL;
437  ibool needs_copy;
438 
439  field_no = column->field_nos[index_type];
440 
441  if (field_no != ULINT_UNDEFINED) {
442 
443  if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
444  field_no))) {
445 
446  /* Copy an externally stored field to the
447  temporary heap, if possible. */
448 
449  heap = mem_heap_create(1);
450 
452  rec, offsets,
453  dict_table_zip_size(index->table),
454  field_no, &len, heap);
455 
456  /* data == NULL means that the
457  externally stored field was not
458  written yet. This record
459  should only be seen by
460  recv_recovery_rollback_active() or any
461  TRX_ISO_READ_UNCOMMITTED
462  transactions. The InnoDB SQL parser
463  (the sole caller of this function)
464  does not implement READ UNCOMMITTED,
465  and it is not involved during rollback. */
466  ut_a(data);
467  ut_a(len != UNIV_SQL_NULL);
468 
469  needs_copy = TRUE;
470  } else {
471  data = rec_get_nth_field(rec, offsets,
472  field_no, &len);
473 
474  needs_copy = column->copy_val;
475  }
476 
477  if (needs_copy) {
478  eval_node_copy_and_alloc_val(column, data,
479  len);
480  } else {
481  val = que_node_get_val(column);
482  dfield_set_data(val, data, len);
483  }
484 
485  if (UNIV_LIKELY_NULL(heap)) {
486  mem_heap_free(heap);
487  }
488  }
489 
490  column = UT_LIST_GET_NEXT(col_var_list, column);
491  }
492 }
493 
494 /*********************************************************************/
496 static
497 void
498 sel_col_prefetch_buf_alloc(
499 /*=======================*/
500  sym_node_t* column)
501 {
502  sel_buf_t* sel_buf;
503  ulint i;
504 
505  ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
506 
507  column->prefetch_buf = static_cast<sel_buf_t*>(
508  mem_alloc(SEL_MAX_N_PREFETCH * sizeof(sel_buf_t)));
509 
510  for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
511  sel_buf = column->prefetch_buf + i;
512 
513  sel_buf->data = NULL;
514  sel_buf->len = 0;
515  sel_buf->val_buf_size = 0;
516  }
517 }
518 
519 /*********************************************************************/
522 UNIV_INTERN
523 void
525 /*======================*/
526  sel_buf_t* prefetch_buf)
527 {
528  sel_buf_t* sel_buf;
529  ulint i;
530 
531  for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
532  sel_buf = prefetch_buf + i;
533 
534  if (sel_buf->val_buf_size > 0) {
535 
536  mem_free(sel_buf->data);
537  }
538  }
539 
540  mem_free(prefetch_buf);
541 }
542 
543 /*********************************************************************/
546 static
547 void
548 sel_dequeue_prefetched_row(
549 /*=======================*/
550  plan_t* plan)
551 {
552  sym_node_t* column;
553  sel_buf_t* sel_buf;
554  dfield_t* val;
555  byte* data;
556  ulint len;
557  ulint val_buf_size;
558 
559  ut_ad(plan->n_rows_prefetched > 0);
560 
561  column = UT_LIST_GET_FIRST(plan->columns);
562 
563  while (column) {
564  val = que_node_get_val(column);
565 
566  if (!column->copy_val) {
567  /* We did not really push any value for the
568  column */
569 
570  ut_ad(!column->prefetch_buf);
571  ut_ad(que_node_get_val_buf_size(column) == 0);
572  ut_d(dfield_set_null(val));
573 
574  goto next_col;
575  }
576 
577  ut_ad(column->prefetch_buf);
578  ut_ad(!dfield_is_ext(val));
579 
580  sel_buf = column->prefetch_buf + plan->first_prefetched;
581 
582  data = sel_buf->data;
583  len = sel_buf->len;
584  val_buf_size = sel_buf->val_buf_size;
585 
586  /* We must keep track of the allocated memory for
587  column values to be able to free it later: therefore
588  we swap the values for sel_buf and val */
589 
590  sel_buf->data = static_cast<byte*>(dfield_get_data(val));
591  sel_buf->len = dfield_get_len(val);
592  sel_buf->val_buf_size = que_node_get_val_buf_size(column);
593 
594  dfield_set_data(val, data, len);
595  que_node_set_val_buf_size(column, val_buf_size);
596 next_col:
597  column = UT_LIST_GET_NEXT(col_var_list, column);
598  }
599 
600  plan->n_rows_prefetched--;
601 
602  plan->first_prefetched++;
603 }
604 
605 /*********************************************************************/
608 UNIV_INLINE
609 void
611 /*=======================*/
612  plan_t* plan)
613 {
614  sym_node_t* column;
615  sel_buf_t* sel_buf;
616  dfield_t* val;
617  byte* data;
618  ulint len;
619  ulint pos;
620  ulint val_buf_size;
621 
622  if (plan->n_rows_prefetched == 0) {
623  pos = 0;
624  plan->first_prefetched = 0;
625  } else {
626  pos = plan->n_rows_prefetched;
627 
628  /* We have the convention that pushing new rows starts only
629  after the prefetch stack has been emptied: */
630 
631  ut_ad(plan->first_prefetched == 0);
632  }
633 
634  plan->n_rows_prefetched++;
635 
636  ut_ad(pos < SEL_MAX_N_PREFETCH);
637 
638  for (column = UT_LIST_GET_FIRST(plan->columns);
639  column != 0;
640  column = UT_LIST_GET_NEXT(col_var_list, column)) {
641 
642  if (!column->copy_val) {
643  /* There is no sense to push pointers to database
644  page fields when we do not keep latch on the page! */
645  continue;
646  }
647 
648  if (!column->prefetch_buf) {
649  /* Allocate a new prefetch buffer */
650 
651  sel_col_prefetch_buf_alloc(column);
652  }
653 
654  sel_buf = column->prefetch_buf + pos;
655 
656  val = que_node_get_val(column);
657 
658  data = static_cast<byte*>(dfield_get_data(val));
659  len = dfield_get_len(val);
660  val_buf_size = que_node_get_val_buf_size(column);
661 
662  /* We must keep track of the allocated memory for
663  column values to be able to free it later: therefore
664  we swap the values for sel_buf and val */
665 
666  dfield_set_data(val, sel_buf->data, sel_buf->len);
667  que_node_set_val_buf_size(column, sel_buf->val_buf_size);
668 
669  sel_buf->data = data;
670  sel_buf->len = len;
671  sel_buf->val_buf_size = val_buf_size;
672  }
673 }
674 
675 /*********************************************************************/
678 static __attribute__((nonnull, warn_unused_result))
679 dberr_t
680 row_sel_build_prev_vers(
681 /*====================*/
682  read_view_t* read_view,
683  dict_index_t* index,
684  rec_t* rec,
685  ulint** offsets,
690  rec_t** old_vers,
694  mtr_t* mtr)
695 {
696  dberr_t err;
697 
698  if (*old_vers_heap) {
699  mem_heap_empty(*old_vers_heap);
700  } else {
701  *old_vers_heap = mem_heap_create(512);
702  }
703 
705  rec, mtr, index, offsets, read_view, offset_heap,
706  *old_vers_heap, old_vers);
707  return(err);
708 }
709 
710 /*********************************************************************/
713 static __attribute__((nonnull))
714 void
715 row_sel_build_committed_vers_for_mysql(
716 /*===================================*/
717  dict_index_t* clust_index,
718  row_prebuilt_t* prebuilt,
719  const rec_t* rec,
720  ulint** offsets,
724  const rec_t** old_vers,
728  mtr_t* mtr)
729 {
730  if (prebuilt->old_vers_heap) {
731  mem_heap_empty(prebuilt->old_vers_heap);
732  } else {
733  prebuilt->old_vers_heap = mem_heap_create(
734  rec_offs_size(*offsets));
735  }
736 
738  rec, mtr, clust_index, offsets, offset_heap,
739  prebuilt->old_vers_heap, old_vers);
740 }
741 
742 /*********************************************************************/
746 UNIV_INLINE
747 ibool
748 row_sel_test_end_conds(
749 /*===================*/
750  plan_t* plan)
753 {
754  func_node_t* cond;
755 
756  /* All conditions in end_conds are comparisons of a column to an
757  expression */
758 
759  for (cond = UT_LIST_GET_FIRST(plan->end_conds);
760  cond != 0;
761  cond = UT_LIST_GET_NEXT(cond_list, cond)) {
762 
763  /* Evaluate the left side of the comparison, i.e., get the
764  column value if there is an indirection */
765 
766  eval_sym(static_cast<sym_node_t*>(cond->args));
767 
768  /* Do the comparison */
769 
770  if (!eval_cmp(cond)) {
771 
772  return(FALSE);
773  }
774  }
775 
776  return(TRUE);
777 }
778 
779 /*********************************************************************/
782 UNIV_INLINE
783 ibool
784 row_sel_test_other_conds(
785 /*=====================*/
786  plan_t* plan)
788 {
789  func_node_t* cond;
790 
791  cond = UT_LIST_GET_FIRST(plan->other_conds);
792 
793  while (cond) {
794  eval_exp(cond);
795 
796  if (!eval_node_get_ibool_val(cond)) {
797 
798  return(FALSE);
799  }
800 
801  cond = UT_LIST_GET_NEXT(cond_list, cond);
802  }
803 
804  return(TRUE);
805 }
806 
807 /*********************************************************************/
811 static __attribute__((nonnull, warn_unused_result))
812 dberr_t
813 row_sel_get_clust_rec(
814 /*==================*/
815  sel_node_t* node,
816  plan_t* plan,
817  rec_t* rec,
818  que_thr_t* thr,
819  rec_t** out_rec,
823  mtr_t* mtr)
826 {
828  rec_t* clust_rec;
829  rec_t* old_vers;
830  dberr_t err;
831  mem_heap_t* heap = NULL;
832  ulint offsets_[REC_OFFS_NORMAL_SIZE];
833  ulint* offsets = offsets_;
834  rec_offs_init(offsets_);
835 
836  *out_rec = NULL;
837 
838  offsets = rec_get_offsets(rec,
839  btr_pcur_get_btr_cur(&plan->pcur)->index,
840  offsets, ULINT_UNDEFINED, &heap);
841 
842  row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
843 
844  index = dict_table_get_first_index(plan->table);
845 
846  btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
847  BTR_SEARCH_LEAF, &plan->clust_pcur,
848  0, mtr);
849 
850  clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
851 
852  /* Note: only if the search ends up on a non-infimum record is the
853  low_match value the real match to the search tuple */
854 
855  if (!page_rec_is_user_rec(clust_rec)
856  || btr_pcur_get_low_match(&(plan->clust_pcur))
857  < dict_index_get_n_unique(index)) {
858 
860  dict_table_is_comp(plan->table)));
861  ut_a(node->read_view);
862 
863  /* In a rare case it is possible that no clust rec is found
864  for a delete-marked secondary index record: if in row0umod.cc
865  in row_undo_mod_remove_clust_low() we have already removed
866  the clust rec, while purge is still cleaning and removing
867  secondary index records associated with earlier versions of
868  the clustered index record. In that case we know that the
869  clustered index record did not exist in the read view of
870  trx. */
871 
872  goto func_exit;
873  }
874 
875  offsets = rec_get_offsets(clust_rec, index, offsets,
876  ULINT_UNDEFINED, &heap);
877 
878  if (!node->read_view) {
879  /* Try to place a lock on the index record */
880 
881  /* If innodb_locks_unsafe_for_binlog option is used
882  or this session is using READ COMMITTED isolation level
883  we lock only the record, i.e., next-key locking is
884  not used. */
885  ulint lock_type;
886  trx_t* trx;
887 
888  trx = thr_get_trx(thr);
889 
891  || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
892  lock_type = LOCK_REC_NOT_GAP;
893  } else {
894  lock_type = LOCK_ORDINARY;
895  }
896 
898  0, btr_pcur_get_block(&plan->clust_pcur),
899  clust_rec, index, offsets,
900  static_cast<enum lock_mode>(node->row_lock_mode),
901  lock_type,
902  thr);
903 
904  switch (err) {
905  case DB_SUCCESS:
907  /* Declare the variable uninitialized in Valgrind.
908  It should be set to DB_SUCCESS at func_exit. */
909  UNIV_MEM_INVALID(&err, sizeof err);
910  break;
911  default:
912  goto err_exit;
913  }
914  } else {
915  /* This is a non-locking consistent read: if necessary, fetch
916  a previous version of the record */
917 
918  old_vers = NULL;
919 
920  if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
921  node->read_view)) {
922 
923  err = row_sel_build_prev_vers(
924  node->read_view, index, clust_rec,
925  &offsets, &heap, &plan->old_vers_heap,
926  &old_vers, mtr);
927 
928  if (err != DB_SUCCESS) {
929 
930  goto err_exit;
931  }
932 
933  clust_rec = old_vers;
934 
935  if (clust_rec == NULL) {
936  goto func_exit;
937  }
938  }
939 
940  /* If we had to go to an earlier version of row or the
941  secondary index record is delete marked, then it may be that
942  the secondary index record corresponding to clust_rec
943  (or old_vers) is not rec; in that case we must ignore
944  such row because in our snapshot rec would not have existed.
945  Remember that from rec we cannot see directly which transaction
946  id corresponds to it: we have to go to the clustered index
947  record. A query where we want to fetch all rows where
948  the secondary index value is in some interval would return
949  a wrong result if we would not drop rows which we come to
950  visit through secondary index records that would not really
951  exist in our snapshot. */
952 
953  if ((old_vers
955  plan->table)))
956  && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index,
957  clust_rec, index)) {
958  goto func_exit;
959  }
960  }
961 
962  /* Fetch the columns needed in test conditions. The clustered
963  index record is protected by a page latch that was acquired
964  when plan->clust_pcur was positioned. The latch will not be
965  released until mtr_commit(mtr). */
966 
967  ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets)));
968  row_sel_fetch_columns(index, clust_rec, offsets,
969  UT_LIST_GET_FIRST(plan->columns));
970  *out_rec = clust_rec;
971 func_exit:
972  err = DB_SUCCESS;
973 err_exit:
974  if (UNIV_LIKELY_NULL(heap)) {
975  mem_heap_free(heap);
976  }
977  return(err);
978 }
979 
980 /*********************************************************************/
983 UNIV_INLINE
984 dberr_t
985 sel_set_rec_lock(
986 /*=============*/
987  const buf_block_t* block,
988  const rec_t* rec,
989  dict_index_t* index,
990  const ulint* offsets,
991  ulint mode,
992  ulint type,
994  que_thr_t* thr)
995 {
996  trx_t* trx;
997  dberr_t err;
998 
999  trx = thr_get_trx(thr);
1000 
1001  if (UT_LIST_GET_LEN(trx->lock.trx_locks) > 10000) {
1003 
1004  return(DB_LOCK_TABLE_FULL);
1005  }
1006  }
1007 
1008  if (dict_index_is_clust(index)) {
1010  0, block, rec, index, offsets,
1011  static_cast<enum lock_mode>(mode), type, thr);
1012  } else {
1014  0, block, rec, index, offsets,
1015  static_cast<enum lock_mode>(mode), type, thr);
1016  }
1017 
1018  return(err);
1019 }
1020 
1021 /*********************************************************************/
1023 static
1024 void
1025 row_sel_open_pcur(
1026 /*==============*/
1027  plan_t* plan,
1028  ibool search_latch_locked,
1032  mtr_t* mtr)
1033 {
1035  func_node_t* cond;
1036  que_node_t* exp;
1037  ulint n_fields;
1038  ulint has_search_latch = 0; /* RW_S_LATCH or 0 */
1039  ulint i;
1040 
1041  if (search_latch_locked) {
1042  has_search_latch = RW_S_LATCH;
1043  }
1044 
1045  index = plan->index;
1046 
1047  /* Calculate the value of the search tuple: the exact match columns
1048  get their expressions evaluated when we evaluate the right sides of
1049  end_conds */
1050 
1051  cond = UT_LIST_GET_FIRST(plan->end_conds);
1052 
1053  while (cond) {
1055 
1056  cond = UT_LIST_GET_NEXT(cond_list, cond);
1057  }
1058 
1059  if (plan->tuple) {
1060  n_fields = dtuple_get_n_fields(plan->tuple);
1061 
1062  if (plan->n_exact_match < n_fields) {
1063  /* There is a non-exact match field which must be
1064  evaluated separately */
1065 
1066  eval_exp(plan->tuple_exps[n_fields - 1]);
1067  }
1068 
1069  for (i = 0; i < n_fields; i++) {
1070  exp = plan->tuple_exps[i];
1071 
1072  dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
1073  que_node_get_val(exp));
1074  }
1075 
1076  /* Open pcur to the index */
1077 
1078  btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
1079  BTR_SEARCH_LEAF, &plan->pcur,
1080  has_search_latch, mtr);
1081  } else {
1082  /* Open the cursor to the start or the end of the index
1083  (FALSE: no init) */
1084 
1086  &(plan->pcur), false, 0, mtr);
1087  }
1088 
1089  ut_ad(plan->n_rows_prefetched == 0);
1090  ut_ad(plan->n_rows_fetched == 0);
1091  ut_ad(plan->cursor_at_end == FALSE);
1092 
1093  plan->pcur_is_open = TRUE;
1094 }
1095 
1096 /*********************************************************************/
1102 static
1103 ibool
1104 row_sel_restore_pcur_pos(
1105 /*=====================*/
1106  plan_t* plan,
1107  mtr_t* mtr)
1108 {
1109  ibool equal_position;
1110  ulint relative_position;
1111 
1112  ut_ad(!plan->cursor_at_end);
1113 
1114  relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
1115 
1116  equal_position = btr_pcur_restore_position(BTR_SEARCH_LEAF,
1117  &(plan->pcur), mtr);
1118 
1119  /* If the cursor is traveling upwards, and relative_position is
1120 
1121  (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
1122  yet on the successor of the page infimum;
1123  (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
1124  first record GREATER than the predecessor of a page supremum; we have
1125  not yet processed the cursor record: no need to move the cursor to the
1126  next record;
1127  (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
1128  last record LESS or EQUAL to the old stored user record; (a) if
1129  equal_position is FALSE, this means that the cursor is now on a record
1130  less than the old user record, and we must move to the next record;
1131  (b) if equal_position is TRUE, then if
1132  plan->stored_cursor_rec_processed is TRUE, we must move to the next
1133  record, else there is no need to move the cursor. */
1134 
1135  if (plan->asc) {
1136  if (relative_position == BTR_PCUR_ON) {
1137 
1138  if (equal_position) {
1139 
1140  return(plan->stored_cursor_rec_processed);
1141  }
1142 
1143  return(TRUE);
1144  }
1145 
1146  ut_ad(relative_position == BTR_PCUR_AFTER
1147  || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
1148 
1149  return(FALSE);
1150  }
1151 
1152  /* If the cursor is traveling downwards, and relative_position is
1153 
1154  (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
1155  the last record LESS than the successor of a page infimum; we have not
1156  processed the cursor record: no need to move the cursor;
1157  (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
1158  first record GREATER than the predecessor of a page supremum; we have
1159  processed the cursor record: we should move the cursor to the previous
1160  record;
1161  (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
1162  last record LESS or EQUAL to the old stored user record; (a) if
1163  equal_position is FALSE, this means that the cursor is now on a record
1164  less than the old user record, and we need not move to the previous
1165  record; (b) if equal_position is TRUE, then if
1166  plan->stored_cursor_rec_processed is TRUE, we must move to the previous
1167  record, else there is no need to move the cursor. */
1168 
1169  if (relative_position == BTR_PCUR_BEFORE
1170  || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
1171 
1172  return(FALSE);
1173  }
1174 
1175  if (relative_position == BTR_PCUR_ON) {
1176 
1177  if (equal_position) {
1178 
1179  return(plan->stored_cursor_rec_processed);
1180  }
1181 
1182  return(FALSE);
1183  }
1184 
1185  ut_ad(relative_position == BTR_PCUR_AFTER
1186  || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
1187 
1188  return(TRUE);
1189 }
1190 
1191 /*********************************************************************/
1193 UNIV_INLINE
1194 void
1195 plan_reset_cursor(
1196 /*==============*/
1197  plan_t* plan)
1198 {
1199  plan->pcur_is_open = FALSE;
1200  plan->cursor_at_end = FALSE;
1201  plan->n_rows_fetched = 0;
1202  plan->n_rows_prefetched = 0;
1203 }
1204 
1205 /*********************************************************************/
1209 static
1210 ulint
1211 row_sel_try_search_shortcut(
1212 /*========================*/
1213  sel_node_t* node,
1214  plan_t* plan,
1216  ibool search_latch_locked,
1219  mtr_t* mtr)
1220 {
1222  rec_t* rec;
1223  mem_heap_t* heap = NULL;
1224  ulint offsets_[REC_OFFS_NORMAL_SIZE];
1225  ulint* offsets = offsets_;
1226  ulint ret;
1227  rec_offs_init(offsets_);
1228 
1229  index = plan->index;
1230 
1231  ut_ad(node->read_view);
1232  ut_ad(plan->unique_search);
1233  ut_ad(!plan->must_get_clust);
1234 #ifdef UNIV_SYNC_DEBUG
1235  if (search_latch_locked) {
1236  ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
1237  }
1238 #endif /* UNIV_SYNC_DEBUG */
1239 
1240  row_sel_open_pcur(plan, search_latch_locked, mtr);
1241 
1242  rec = btr_pcur_get_rec(&(plan->pcur));
1243 
1244  if (!page_rec_is_user_rec(rec)) {
1245 
1246  return(SEL_RETRY);
1247  }
1248 
1249  ut_ad(plan->mode == PAGE_CUR_GE);
1250 
1251  /* As the cursor is now placed on a user record after a search with
1252  the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
1253  fields in the user record matched to the search tuple */
1254 
1255  if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
1256 
1257  return(SEL_EXHAUSTED);
1258  }
1259 
1260  /* This is a non-locking consistent read: if necessary, fetch
1261  a previous version of the record */
1262 
1263  offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
1264 
1265  if (dict_index_is_clust(index)) {
1266  if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
1267  node->read_view)) {
1268  ret = SEL_RETRY;
1269  goto func_exit;
1270  }
1271  } else if (!lock_sec_rec_cons_read_sees(rec, node->read_view)) {
1272 
1273  ret = SEL_RETRY;
1274  goto func_exit;
1275  }
1276 
1277  /* Test the deleted flag. */
1278 
1279  if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {
1280 
1281  ret = SEL_EXHAUSTED;
1282  goto func_exit;
1283  }
1284 
1285  /* Fetch the columns needed in test conditions. The index
1286  record is protected by a page latch that was acquired when
1287  plan->pcur was positioned. The latch will not be released
1288  until mtr_commit(mtr). */
1289 
1290  row_sel_fetch_columns(index, rec, offsets,
1291  UT_LIST_GET_FIRST(plan->columns));
1292 
1293  /* Test the rest of search conditions */
1294 
1295  if (!row_sel_test_other_conds(plan)) {
1296 
1297  ret = SEL_EXHAUSTED;
1298  goto func_exit;
1299  }
1300 
1302 
1303  plan->n_rows_fetched++;
1304  ret = SEL_FOUND;
1305 func_exit:
1306  if (UNIV_LIKELY_NULL(heap)) {
1307  mem_heap_free(heap);
1308  }
1309  return(ret);
1310 }
1311 
/*********************************************************************/
/** Performs a select step: runs the query plan (a possibly multi-table
join) until a qualifying row has been found and pushed/returned, the
result set is exhausted, or execution must pause (cost limit reached,
lock wait needed, or a clustered-index latch must be released to keep
the latching order).

NOTE(review): this copy was extracted from a documentation page and
several original source lines were lost in extraction; each spot is
flagged with a NOTE(review) comment below.  Confirm the flagged lines
against the original row0sel.cc before building.

@return DB_SUCCESS or error code */
static __attribute__((nonnull, warn_unused_result))
dberr_t
row_sel(
/*====*/
	sel_node_t*	node,	/*!< in: select node */
	que_thr_t*	thr)	/*!< in: query thread */
{
	/* NOTE(review): the declaration of a local
	`dict_index_t* index;` appears to have been lost in extraction
	here; `index` is assigned from plan->index and used below. */
	plan_t*		plan;
	mtr_t		mtr;
	ibool		moved;
	rec_t*		rec;
	rec_t*		old_vers;
	rec_t*		clust_rec;
	ibool		search_latch_locked;
	ibool		consistent_read;

	/* The following flag becomes TRUE when we are doing a
	consistent read from a non-clustered index and we must look
	at the clustered index to find out the previous delete mark
	state of the non-clustered record: */

	ibool		cons_read_requires_clust_rec	= FALSE;
	ulint		cost_counter			= 0;
	ibool		cursor_just_opened;
	ibool		must_go_to_next;
	ibool		mtr_has_extra_clust_latch	= FALSE;
	/* TRUE if the search was made using
	a non-clustered index, and we had to
	access the clustered record: now &mtr
	contains a clustered index latch, and
	&mtr must be committed before we move
	to the next non-clustered record */
	ulint		found_flag;
	dberr_t		err;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	ut_ad(thr->run_node == node);

	search_latch_locked = FALSE;

	if (node->read_view) {
		/* In consistent reads, we try to do with the hash index and
		not to use the buffer page get. This is to reduce memory bus
		load resulting from semaphore operations. The search latch
		will be s-locked when we access an index with a unique search
		condition, but not locked when we access an index with a
		less selective search condition. */

		consistent_read = TRUE;
	} else {
		consistent_read = FALSE;
	}

table_loop:
	/* TABLE LOOP
	----------
	This is the outer major loop in calculating a join. We come here when
	node->fetch_table changes, and after adding a row to aggregate totals
	and, of course, when this function is called. */

	ut_ad(mtr_has_extra_clust_latch == FALSE);

	plan = sel_node_get_nth_plan(node, node->fetch_table);
	index = plan->index;

	if (plan->n_rows_prefetched > 0) {
		sel_dequeue_prefetched_row(plan);

		goto next_table_no_mtr;
	}

	if (plan->cursor_at_end) {
		/* The cursor has already reached the result set end: no more
		rows to process for this table cursor, as also the prefetch
		stack was empty */

		ut_ad(plan->pcur_is_open);

		goto table_exhausted_no_mtr;
	}

	/* Open a cursor to index, or restore an open cursor position */

	mtr_start(&mtr);

	if (consistent_read && plan->unique_search && !plan->pcur_is_open
	    && !plan->must_get_clust
	    && !plan->table->big_rows) {
		if (!search_latch_locked) {
			/* NOTE(review): the statement that s-locks the
			adaptive hash index latch (likely
			rw_lock_s_lock(&btr_search_latch);) appears to have
			been lost in extraction here — confirm against the
			original source. */

			search_latch_locked = TRUE;
		} else if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_WAIT_EX) {

			/* There is an x-latch request waiting: release the
			s-latch for a moment; as an s-latch here is often
			kept for some 10 searches before being released,
			a waiting x-latch request would block other threads
			from acquiring an s-latch for a long time, lowering
			performance significantly in multiprocessors. */

			rw_lock_s_unlock(&btr_search_latch);
			/* NOTE(review): the matching re-acquisition (likely
			rw_lock_s_lock(&btr_search_latch);) appears to have
			been lost in extraction here. */
		}

		found_flag = row_sel_try_search_shortcut(node, plan,
							 search_latch_locked,
							 &mtr);

		if (found_flag == SEL_FOUND) {

			goto next_table;

		} else if (found_flag == SEL_EXHAUSTED) {

			goto table_exhausted;
		}

		ut_ad(found_flag == SEL_RETRY);

		/* Shortcut failed: fall back to the normal cursor path
		with a fresh mini-transaction. */
		plan_reset_cursor(plan);

		mtr_commit(&mtr);
		mtr_start(&mtr);
	}

	if (search_latch_locked) {
		rw_lock_s_unlock(&btr_search_latch);

		search_latch_locked = FALSE;
	}

	if (!plan->pcur_is_open) {
		/* Evaluate the expressions to build the search tuple and
		open the cursor */

		row_sel_open_pcur(plan, search_latch_locked, &mtr);

		cursor_just_opened = TRUE;

		/* A new search was made: increment the cost counter */
		cost_counter++;
	} else {
		/* Restore pcur position to the index */

		must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr);

		cursor_just_opened = FALSE;

		if (must_go_to_next) {
			/* We have already processed the cursor record: move
			to the next */

			goto next_rec;
		}
	}

rec_loop:
	/* RECORD LOOP
	-----------
	In this loop we use pcur and try to fetch a qualifying row, and
	also fill the prefetch buffer for this table if n_rows_fetched has
	exceeded a threshold. While we are inside this loop, the following
	holds:
	(1) &mtr is started,
	(2) pcur is positioned and open.

	NOTE that if cursor_just_opened is TRUE here, it means that we came
	to this point right after row_sel_open_pcur. */

	ut_ad(mtr_has_extra_clust_latch == FALSE);

	rec = btr_pcur_get_rec(&(plan->pcur));

	/* PHASE 1: Set a lock if specified */

	if (!node->asc && cursor_just_opened
	    && !page_rec_is_supremum(rec)) {

		/* When we open a cursor for a descending search, we must set
		a next-key lock on the successor record: otherwise it would
		be possible to insert new records next to the cursor position,
		and it might be that these new records should appear in the
		search result set, resulting in the phantom problem. */

		if (!consistent_read) {

			/* If innodb_locks_unsafe_for_binlog option is used
			or this session is using READ COMMITTED isolation
			level, we lock only the record, i.e., next-key
			locking is not used. */

			rec_t*	next_rec = page_rec_get_next(rec);
			ulint	lock_type;
			trx_t*	trx;

			trx = thr_get_trx(thr);

			offsets = rec_get_offsets(next_rec, index, offsets,
						  ULINT_UNDEFINED, &heap);

			/* NOTE(review): the first operand of this condition
			(likely `if (srv_locks_unsafe_for_binlog`) appears
			to have been lost in extraction here. */
			    || trx->isolation_level
			    <= TRX_ISO_READ_COMMITTED) {

				if (page_rec_is_supremum(next_rec)) {

					goto skip_lock;
				}

				lock_type = LOCK_REC_NOT_GAP;
			} else {
				lock_type = LOCK_ORDINARY;
			}

			err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
					       next_rec, index, offsets,
					       node->row_lock_mode,
					       lock_type, thr);

			switch (err) {
			case DB_SUCCESS_LOCKED_REC:
				err = DB_SUCCESS;
				/* fall through */
			case DB_SUCCESS:
				break;
			default:
				/* Note that in this case we will store in pcur
				the PREDECESSOR of the record we are waiting
				the lock for */
				goto lock_wait_or_error;
			}
		}
	}

skip_lock:
	if (page_rec_is_infimum(rec)) {

		/* The infimum record on a page cannot be in the result set,
		and neither can a record lock be placed on it: we skip such
		a record. We also increment the cost counter as we may have
		processed yet another page of index. */

		cost_counter++;

		goto next_rec;
	}

	if (!consistent_read) {
		/* Try to place a lock on the index record */

		/* If innodb_locks_unsafe_for_binlog option is used
		or this session is using READ COMMITTED isolation level,
		we lock only the record, i.e., next-key locking is
		not used. */

		ulint	lock_type;
		trx_t*	trx;

		offsets = rec_get_offsets(rec, index, offsets,
					  ULINT_UNDEFINED, &heap);

		trx = thr_get_trx(thr);

		/* NOTE(review): the first operand of this condition (likely
		`if (srv_locks_unsafe_for_binlog`) appears to have been lost
		in extraction here. */
		    || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {

			if (page_rec_is_supremum(rec)) {

				goto next_rec;
			}

			lock_type = LOCK_REC_NOT_GAP;
		} else {
			lock_type = LOCK_ORDINARY;
		}

		err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
				       rec, index, offsets,
				       node->row_lock_mode, lock_type, thr);

		switch (err) {
		case DB_SUCCESS_LOCKED_REC:
			err = DB_SUCCESS;
			/* fall through */
		case DB_SUCCESS:
			break;
		default:
			goto lock_wait_or_error;
		}
	}

	if (page_rec_is_supremum(rec)) {

		/* A page supremum record cannot be in the result set: skip
		it now when we have placed a possible lock on it */

		goto next_rec;
	}

	/* NOTE(review): a line appears to be missing here in this extracted
	copy (possibly an assertion that rec is a user record) — confirm
	against the original source. */

	if (cost_counter > SEL_COST_LIMIT) {

		/* Now that we have placed the necessary locks, we can stop
		for a while and store the cursor position; NOTE that if we
		would store the cursor position BEFORE placing a record lock,
		it might happen that the cursor would jump over some records
		that another transaction could meanwhile insert adjacent to
		the cursor: this would result in the phantom problem. */

		goto stop_for_a_while;
	}

	/* PHASE 2: Check a mixed index mix id if needed */

	if (plan->unique_search && cursor_just_opened) {

		ut_ad(plan->mode == PAGE_CUR_GE);

		/* As the cursor is now placed on a user record after a search
		with the mode PAGE_CUR_GE, the up_match field in the cursor
		tells how many fields in the user record matched to the search
		tuple */

		if (btr_pcur_get_up_match(&(plan->pcur))
		    < plan->n_exact_match) {
			goto table_exhausted;
		}

		/* Ok, no need to test end_conds or mix id */

	}

	/* We are ready to look at a possible new index entry in the result
	set: the cursor is now placed on a user record */

	/* PHASE 3: Get previous version in a consistent read */

	cons_read_requires_clust_rec = FALSE;
	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);

	if (consistent_read) {
		/* This is a non-locking consistent read: if necessary, fetch
		a previous version of the record */

		if (dict_index_is_clust(index)) {

			if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
							   node->read_view)) {

				err = row_sel_build_prev_vers(
					node->read_view, index, rec,
					&offsets, &heap, &plan->old_vers_heap,
					&old_vers, &mtr);

				if (err != DB_SUCCESS) {

					goto lock_wait_or_error;
				}

				if (old_vers == NULL) {
					/* The record does not exist
					in our read view. Skip it, but
					first attempt to determine
					whether the index segment we
					are searching through has been
					exhausted. */

					offsets = rec_get_offsets(
						rec, index, offsets,
						ULINT_UNDEFINED, &heap);

					/* Fetch the columns needed in
					test conditions. The clustered
					index record is protected by a
					page latch that was acquired
					by row_sel_open_pcur() or
					row_sel_restore_pcur_pos().
					The latch will not be released
					until mtr_commit(mtr). */

					row_sel_fetch_columns(
						index, rec, offsets,
						/* NOTE(review): a line
						(likely `UT_LIST_GET_FIRST(`)
						appears to have been lost in
						extraction here. */
						plan->columns));

					if (!row_sel_test_end_conds(plan)) {

						goto table_exhausted;
					}

					goto next_rec;
				}

				rec = old_vers;
			}
		} else if (!lock_sec_rec_cons_read_sees(rec,
							node->read_view)) {
			cons_read_requires_clust_rec = TRUE;
		}
	}

	/* PHASE 4: Test search end conditions and deleted flag */

	/* Fetch the columns needed in test conditions. The record is
	protected by a page latch that was acquired by
	row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch
	will not be released until mtr_commit(mtr). */

	row_sel_fetch_columns(index, rec, offsets,
			      UT_LIST_GET_FIRST(plan->columns));

	/* Test the selection end conditions: these can only contain columns
	which already are found in the index, even though the index might be
	non-clustered */

	if (plan->unique_search && cursor_just_opened) {

		/* No test necessary: the test was already made above */

	} else if (!row_sel_test_end_conds(plan)) {

		goto table_exhausted;
	}

	/* NOTE(review): the first line of this condition (likely
	`if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))`)
	appears to have been lost in extraction here. */
	    && !cons_read_requires_clust_rec) {

		/* The record is delete marked: we can skip it if this is
		not a consistent read which might see an earlier version
		of a non-clustered index record */

		if (plan->unique_search) {

			goto table_exhausted;
		}

		goto next_rec;
	}

	/* PHASE 5: Get the clustered index record, if needed and if we did
	not do the search using the clustered index */

	if (plan->must_get_clust || cons_read_requires_clust_rec) {

		/* It was a non-clustered index and we must fetch also the
		clustered index record */

		err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
					    &mtr);
		mtr_has_extra_clust_latch = TRUE;

		if (err != DB_SUCCESS) {

			goto lock_wait_or_error;
		}

		/* Retrieving the clustered record required a search:
		increment the cost counter */

		cost_counter++;

		if (clust_rec == NULL) {
			/* The record did not exist in the read view */
			ut_ad(consistent_read);

			goto next_rec;
		}

		if (rec_get_deleted_flag(clust_rec,
					 dict_table_is_comp(plan->table))) {

			/* The record is delete marked: we can skip it */

			goto next_rec;
		}

		if (node->can_get_updated) {

			btr_pcur_store_position(&(plan->clust_pcur), &mtr);
		}
	}

	/* PHASE 6: Test the rest of search conditions */

	if (!row_sel_test_other_conds(plan)) {

		if (plan->unique_search) {

			goto table_exhausted;
		}

		goto next_rec;
	}

	/* PHASE 7: We found a new qualifying row for the current table; push
	the row if prefetch is on, or move to the next table in the join */

	plan->n_rows_fetched++;

	/* NOTE(review): a line appears to be missing here in this extracted
	copy — confirm against the original source. */

	if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
	    || plan->unique_search || plan->no_prefetch
	    || plan->table->big_rows) {

		/* No prefetch in operation: go to the next table */

		goto next_table;
	}

	/* NOTE(review): the call that pushes the fetched row onto the
	prefetch buffer (likely sel_enqueue_prefetched_row(plan);) appears
	to have been lost in extraction here. */

	if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {

		/* The prefetch buffer is now full */

		sel_dequeue_prefetched_row(plan);

		goto next_table;
	}

next_rec:
	ut_ad(!search_latch_locked);

	if (mtr_has_extra_clust_latch) {

		/* We must commit &mtr if we are moving to the next
		non-clustered index record, because we could break the
		latching order if we would access a different clustered
		index page right away without releasing the previous. */

		goto commit_mtr_for_a_while;
	}

	if (node->asc) {
		moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
	} else {
		moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
	}

	if (!moved) {

		goto table_exhausted;
	}

	cursor_just_opened = FALSE;

	/* END OF RECORD LOOP
	------------------ */
	goto rec_loop;

next_table:
	/* We found a record which satisfies the conditions: we can move to
	the next table or return a row in the result set */

	/* NOTE(review): a line appears to be missing here in this extracted
	copy — confirm against the original source. */

	if (plan->unique_search && !node->can_get_updated) {

		plan->cursor_at_end = TRUE;
	} else {
		ut_ad(!search_latch_locked);

		plan->stored_cursor_rec_processed = TRUE;

		btr_pcur_store_position(&(plan->pcur), &mtr);
	}

	mtr_commit(&mtr);

	mtr_has_extra_clust_latch = FALSE;

next_table_no_mtr:
	/* If we use 'goto' to this label, it means that the row was popped
	from the prefetched rows stack, and &mtr is already committed */

	if (node->fetch_table + 1 == node->n_tables) {

		sel_eval_select_list(node);

		if (node->is_aggregate) {

			goto table_loop;
		}

		sel_assign_into_var_values(node->into_list, node);

		thr->run_node = que_node_get_parent(node);

		err = DB_SUCCESS;
		goto func_exit;
	}

	node->fetch_table++;

	/* When we move to the next table, we first reset the plan cursor:
	we do not care about resetting it when we backtrack from a table */

	plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));

	goto table_loop;

table_exhausted:
	/* The table cursor pcur reached the result set end: backtrack to the
	previous table in the join if we do not have cached prefetched rows */

	plan->cursor_at_end = TRUE;

	mtr_commit(&mtr);

	mtr_has_extra_clust_latch = FALSE;

	if (plan->n_rows_prefetched > 0) {
		/* The table became exhausted during a prefetch */

		sel_dequeue_prefetched_row(plan);

		goto next_table_no_mtr;
	}

table_exhausted_no_mtr:
	if (node->fetch_table == 0) {
		err = DB_SUCCESS;

		if (node->is_aggregate && !node->aggregate_already_fetched) {

			node->aggregate_already_fetched = TRUE;

			sel_assign_into_var_values(node->into_list, node);

			thr->run_node = que_node_get_parent(node);
		} else {
			node->state = SEL_NODE_NO_MORE_ROWS;

			thr->run_node = que_node_get_parent(node);
		}

		goto func_exit;
	}

	node->fetch_table--;

	goto table_loop;

stop_for_a_while:
	/* Return control for a while to que_run_threads, so that runaway
	queries can be canceled. NOTE that when we come here, we must, in a
	locking read, have placed the necessary (possibly waiting request)
	record lock on the cursor record or its successor: when we reposition
	the cursor, this record lock guarantees that nobody can meanwhile have
	inserted new records which should have appeared in the result set,
	which would result in the phantom problem. */

	ut_ad(!search_latch_locked);

	plan->stored_cursor_rec_processed = FALSE;
	btr_pcur_store_position(&(plan->pcur), &mtr);

	mtr_commit(&mtr);

#ifdef UNIV_SYNC_DEBUG
	ut_ad(sync_thread_levels_empty_except_dict());
#endif /* UNIV_SYNC_DEBUG */
	err = DB_SUCCESS;
	goto func_exit;

commit_mtr_for_a_while:
	/* Stores the cursor position and commits &mtr; this is used if
	&mtr may contain latches which would break the latching order if
	&mtr would not be committed and the latches released. */

	plan->stored_cursor_rec_processed = TRUE;

	ut_ad(!search_latch_locked);
	btr_pcur_store_position(&(plan->pcur), &mtr);

	mtr_commit(&mtr);

	mtr_has_extra_clust_latch = FALSE;

#ifdef UNIV_SYNC_DEBUG
	ut_ad(sync_thread_levels_empty_except_dict());
#endif /* UNIV_SYNC_DEBUG */

	goto table_loop;

lock_wait_or_error:
	/* See the note at stop_for_a_while: the same holds for this case */

	ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
	ut_ad(!search_latch_locked);

	plan->stored_cursor_rec_processed = FALSE;
	btr_pcur_store_position(&(plan->pcur), &mtr);

	mtr_commit(&mtr);

#ifdef UNIV_SYNC_DEBUG
	ut_ad(sync_thread_levels_empty_except_dict());
#endif /* UNIV_SYNC_DEBUG */

func_exit:
	/* Common exit: release the adaptive hash index latch and the
	offsets heap, if they are still held. */
	if (search_latch_locked) {
		rw_lock_s_unlock(&btr_search_latch);
	}
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
	return(err);
}
2029 
/**********************************************************************/
/** Performs a select step; this is the high-level entry of a select
node in an InnoDB SQL-interpreter query graph.  On the first execution
(state SEL_NODE_OPEN) it starts the transaction if needed, sets table
intention locks or assigns a read view, copies input variables, and
then calls row_sel() to fetch rows.

NOTE(review): this copy was extracted from a documentation page and a
few original source lines were lost in extraction; each spot is flagged
below.

@return query thread to run next or NULL on error */
UNIV_INTERN
que_thr_t*
row_sel_step(
/*=========*/
	que_thr_t*	thr)	/*!< in: query thread */
{
	sel_node_t*	node;

	ut_ad(thr);

	node = static_cast<sel_node_t*>(thr->run_node);

	ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);

	/* If this is a new time this node is executed (or when execution
	resumes after wait for a table intention lock), set intention locks
	on the tables, or assign a read view */

	if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {

		node->state = SEL_NODE_OPEN;
	}

	if (node->state == SEL_NODE_OPEN) {

		/* It may be that the current session has not yet started
		its transaction, or it has been committed: */

		trx_start_if_not_started_xa(thr_get_trx(thr));

		plan_reset_cursor(sel_node_get_nth_plan(node, 0));

		if (node->consistent_read) {
			/* Assign a read view for the query */
			/* NOTE(review): the start of this statement (likely
			`node->read_view = trx_assign_read_view(`) appears
			to have been lost in extraction here. */
				thr_get_trx(thr));
		} else {
			sym_node_t*	table_node;
			enum lock_mode	i_lock_mode;

			/* Locking read: take IX or IS intention locks on
			every table in the FROM list. */
			if (node->set_x_locks) {
				i_lock_mode = LOCK_IX;
			} else {
				i_lock_mode = LOCK_IS;
			}

			for (table_node = node->table_list;
			     table_node != 0;
			     table_node = static_cast<sym_node_t*>(
					que_node_get_next(table_node))) {

				dberr_t	err = lock_table(
					0, table_node->table, i_lock_mode,
					thr);

				if (err != DB_SUCCESS) {
					trx_t*	trx;

					trx = thr_get_trx(thr);
					trx->error_state = err;

					return(NULL);
				}
			}
		}

		/* If this is an explicit cursor, copy stored procedure
		variable values, so that the values cannot change between
		fetches (currently, we copy them also for non-explicit
		cursors) */

		if (node->explicit_cursor
		    && UT_LIST_GET_FIRST(node->copy_variables)) {

			/* NOTE(review): the call that copies the input
			variable values (likely
			row_sel_copy_input_variable_vals(node);) appears to
			have been lost in extraction here. */
		}

		node->state = SEL_NODE_FETCH;
		node->fetch_table = 0;

		if (node->is_aggregate) {
			/* Reset the aggregate total values */
			/* NOTE(review): the call that resets the aggregate
			values (likely sel_reset_aggregate_vals(node);)
			appears to have been lost in extraction here. */
		}
	}

	dberr_t	err = row_sel(node, thr);

	/* NOTE! if queries are parallelized, the following assignment may
	have problems; the assignment should be made only if thr is the
	only top-level thr in the graph: */

	thr->graph->last_sel_node = node;

	if (err != DB_SUCCESS) {
		thr_get_trx(thr)->error_state = err;

		return(NULL);
	}

	return(thr);
}
2136 
/**********************************************************************/
/** Performs a fetch for a cursor: either assigns the fetched row to
the INTO variable list, or passes it to a user callback function.  On
the way back up (prev_node != parent) it consumes the fetched row; on
the way down it redirects execution into the cursor definition.

NOTE(review): this copy was extracted from a documentation page and two
original source lines were lost in extraction; the spots are flagged
below.

@return query thread to run next or NULL on error */
UNIV_INTERN
que_thr_t*
fetch_step(
/*=======*/
	que_thr_t*	thr)	/*!< in: query thread */
{
	sel_node_t*	sel_node;
	fetch_node_t*	node;

	ut_ad(thr);

	node = static_cast<fetch_node_t*>(thr->run_node);
	sel_node = node->cursor_def;

	ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);

	if (thr->prev_node != que_node_get_parent(node)) {

		/* Coming back from the select node: consume the row, if
		one was produced. */
		if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {

			if (node->into_list) {
				/* NOTE(review): the start of this call
				(likely `sel_assign_into_var_values(
				node->into_list,`) appears to have been lost
				in extraction here. */
					sel_node);
			} else {
				/* A user callback decides whether fetching
				continues; FALSE closes the cursor. */
				ibool ret = (*node->func->func)(
					sel_node, node->func->arg);

				if (!ret) {
					sel_node->state
					/* NOTE(review): the rest of this
					statement (likely
					`= SEL_NODE_CLOSED;`) appears to
					have been lost in extraction here. */
				}
			}
		}

		thr->run_node = que_node_get_parent(node);

		return(thr);
	}

	/* Make the fetch node the parent of the cursor definition for
	the time of the fetch, so that execution knows to return to this
	fetch node after a row has been selected or we know that there is
	no row left */

	sel_node->common.parent = node;

	if (sel_node->state == SEL_NODE_CLOSED) {
		fprintf(stderr,
			"InnoDB: Error: fetch called on a closed cursor\n");

		thr_get_trx(thr)->error_state = DB_ERROR;

		return(NULL);
	}

	thr->run_node = sel_node;

	return(thr);
}
2199 
/****************************************************************/
/** Prints a row in a select result to stderr; usable as the fetch
callback function of a cursor (see fetch_step's node->func->func).
For each expression in the select list it prints the column index,
its type, and the value bytes (or <NULL>).

NOTE(review): the line carrying the function name (row_fetch_print,
per the fprintf below) appears to have been lost in extraction here;
confirm against the original source.

@return always returns a non-NULL value, i.e. "fetch the next row" */
UNIV_INTERN
void*
/*============*/
	void*	row,		/*!< in: sel_node_t* containing the row */
	void*	user_arg)	/*!< in: not used */
{
	que_node_t*	exp;
	ulint		i = 0;
	sel_node_t*	node = static_cast<sel_node_t*>(row);

	UT_NOT_USED(user_arg);

	fprintf(stderr, "row_fetch_print: row %p\n", row);

	for (exp = node->select_list;
	     exp != 0;
	     exp = que_node_get_next(exp), i++) {

		dfield_t*	dfield = que_node_get_val(exp);
		const dtype_t*	type = dfield_get_type(dfield);

		fprintf(stderr, " column %lu:\n", (ulong) i);

		dtype_print(type);
		putc('\n', stderr);

		if (dfield_get_len(dfield) != UNIV_SQL_NULL) {
			ut_print_buf(stderr, dfield_get_data(dfield),
				     dfield_get_len(dfield));
			putc('\n', stderr);
		} else {
			fputs(" <NULL>;\n", stderr);
		}
	}

	/* Any non-NULL return keeps the cursor fetching (cf. the
	callback check in fetch_step). */
	return((void*)42);
}
2241 
/***********************************************************/
/** Prints a row in a select result to stderr: this is the step
function of a ROW_PRINTF query-graph node.  It alternates between
redirecting execution to the select node (to fetch the next row) and
printing the fetched select-list values.

NOTE(review): the line carrying the function name (by convention
row_printf_step) appears to have been lost in extraction here; confirm
against the original source.

@return query thread to run next or NULL */
UNIV_INTERN
que_thr_t*
/*============*/
	que_thr_t*	thr)	/*!< in: query thread */
{
	row_printf_node_t*	node;
	sel_node_t*		sel_node;
	que_node_t*		arg;

	ut_ad(thr);

	node = static_cast<row_printf_node_t*>(thr->run_node);

	sel_node = node->sel_node;

	ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);

	if (thr->prev_node == que_node_get_parent(node)) {

		/* Reset the cursor */
		sel_node->state = SEL_NODE_OPEN;

		/* Fetch next row to print */

		thr->run_node = sel_node;

		return(thr);
	}

	if (sel_node->state != SEL_NODE_FETCH) {

		ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);

		/* No more rows to print */

		thr->run_node = que_node_get_parent(node);

		return(thr);
	}

	arg = sel_node->select_list;

	while (arg) {
		/* NOTE(review): the call that prints the value of arg
		(likely dfield_print_also_hex(que_node_get_val(arg));)
		appears to have been lost in extraction here. */

		fputs(" ::: ", stderr);

		arg = que_node_get_next(arg);
	}

	putc('\n', stderr);

	/* Fetch next row to print */

	thr->run_node = sel_node;

	return(thr);
}
2304 
/****************************************************************/
/** Converts a key value stored in MySQL's key-value format to an
Innobase dtuple, so that it can be used as a search tuple against an
InnoDB index.  Handles the SQL NULL marker byte, BLOB/TEXT column
prefixes with their 2-byte little-endian length, fixed-length column
prefixes, and true VARCHARs with a 2-byte length prefix.  The last
field of the key value may be only a prefix of the full key field; such
partial-field prefixes are not supported and trigger a warning.

NOTE(review): the line carrying the function name (by convention
row_sel_convert_mysql_key_to_innobase) appears to have been lost in
extraction here; confirm against the original source. */
UNIV_INTERN
void
/*==================================*/
	dtuple_t*	tuple,		/*!< in/out: tuple where built;
					assumes the type info in the tuple
					already matches the index —
					TODO confirm against callers */
	byte*		buf,		/*!< in: buffer for converted
					column data */
	ulint		buf_len,	/*!< in: buffer length */
	dict_index_t*	index,		/*!< in: index of the key value */
	const byte*	key_ptr,	/*!< in: MySQL-format key value */
	ulint		key_len,	/*!< in: MySQL key value length */
	trx_t*		trx)		/*!< in: transaction (used only for
					the warning printout) */
{
	byte*		original_buf	= buf;
	const byte*	original_key_ptr = key_ptr;
	dict_field_t*	field;
	dfield_t*	dfield;
	ulint		data_offset;
	ulint		data_len;
	ulint		data_field_len;
	ibool		is_null;
	const byte*	key_end;
	ulint		n_fields = 0;

	/* For documentation of the key value storage format in MySQL, see
	ha_innobase::store_key_val_for_row() in ha_innodb.cc. */

	key_end = key_ptr + key_len;

	/* Permit us to access any field in the tuple (ULINT_MAX): */

	dtuple_set_n_fields(tuple, ULINT_MAX);

	dfield = dtuple_get_nth_field(tuple, 0);
	field = dict_index_get_nth_field(index, 0);

	if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) {
		/* A special case: we are looking for a position in the
		generated clustered index which InnoDB automatically added
		to a table with no primary key: the first and the only
		ordering column is ROW_ID which InnoDB stored to the key_ptr
		buffer. */

		ut_a(key_len == DATA_ROW_ID_LEN);

		dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);

		dtuple_set_n_fields(tuple, 1);

		return;
	}

	while (key_ptr < key_end) {

		ulint	type = dfield_get_type(dfield)->mtype;
		ut_a(field->col->mtype == type);

		data_offset = 0;
		is_null = FALSE;

		if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
			/* The first byte in the field tells if this is
			an SQL NULL value */

			data_offset = 1;

			if (*key_ptr != 0) {
				dfield_set_null(dfield);

				is_null = TRUE;
			}
		}

		/* Calculate data length and data field total length */

		if (type == DATA_BLOB) {
			/* The key field is a column prefix of a BLOB or
			TEXT */

			ut_a(field->prefix_len > 0);

			/* MySQL stores the actual data length to the first 2
			bytes after the optional SQL NULL marker byte. The
			storage format is little-endian, that is, the most
			significant byte at a higher address. In UTF-8, MySQL
			seems to reserve field->prefix_len bytes for
			storing this field in the key value buffer, even
			though the actual value only takes data_len bytes
			from the start. */

			data_len = key_ptr[data_offset]
				+ 256 * key_ptr[data_offset + 1];
			data_field_len = data_offset + 2 + field->prefix_len;

			data_offset += 2;

			/* Now that we know the length, we store the column
			value like it would be a fixed char field */

		} else if (field->prefix_len > 0) {
			/* Looks like MySQL pads unused end bytes in the
			prefix with space. Therefore, also in UTF-8, it is ok
			to compare with a prefix containing full prefix_len
			bytes, and no need to take at most prefix_len / 3
			UTF-8 characters from the start.
			If the prefix is used as the upper end of a LIKE
			'abc%' query, then MySQL pads the end with chars
			0xff. TODO: in that case does it any harm to compare
			with the full prefix_len bytes. How do characters
			0xff in UTF-8 behave? */

			data_len = field->prefix_len;
			data_field_len = data_offset + data_len;
		} else {
			data_len = dfield_get_type(dfield)->len;
			data_field_len = data_offset + data_len;
		}

		if (UNIV_UNLIKELY
		    (dtype_get_mysql_type(dfield_get_type(dfield))
		     == DATA_MYSQL_TRUE_VARCHAR)
		    && UNIV_LIKELY(type != DATA_INT)) {
			/* In a MySQL key value format, a true VARCHAR is
			always preceded by 2 bytes of a length field.
			dfield_get_type(dfield)->len returns the maximum
			'payload' len in bytes. That does not include the
			2 bytes that tell the actual data length.

			We added the check != DATA_INT to make sure we do
			not treat MySQL ENUM or SET as a true VARCHAR! */

			data_len += 2;
			data_field_len += 2;
		}

		/* Storing may use at most data_len bytes of buf */

		if (UNIV_LIKELY(!is_null)) {
			ut_a(buf + data_len <= original_buf + buf_len);
			/* NOTE(review): the name of the call below (likely
			`row_mysql_store_col_in_innobase_format(`) appears
			to have been lost in extraction here. */
				dfield, buf,
				FALSE, /* MySQL key value format col */
				key_ptr + data_offset, data_len,
				dict_table_is_comp(index->table));
			buf += data_len;
		}

		key_ptr += data_field_len;

		if (UNIV_UNLIKELY(key_ptr > key_end)) {
			/* The last field in key was not a complete key field
			but a prefix of it.

			Print a warning about this! HA_READ_PREFIX_LAST does
			not currently work in InnoDB with partial-field key
			value prefixes. Since MySQL currently uses a padding
			trick to calculate LIKE 'abc%' type queries there
			should never be partial-field prefixes in searches. */

			ut_print_timestamp(stderr);

			fputs(" InnoDB: Warning: using a partial-field"
			      " key prefix in search.\n"
			      "InnoDB: ", stderr);
			dict_index_name_print(stderr, trx, index);
			fprintf(stderr, ". Last data field length %lu bytes,\n"
				"InnoDB: key ptr now exceeds"
				" key end by %lu bytes.\n"
				"InnoDB: Key value in the MySQL format:\n",
				(ulong) data_field_len,
				(ulong) (key_ptr - key_end));
			fflush(stderr);
			ut_print_buf(stderr, original_key_ptr, key_len);
			putc('\n', stderr);

			if (!is_null) {
				/* Truncate the dfield to the bytes that
				really were inside the key buffer. */
				ulint	len = dfield_get_len(dfield);
				dfield_set_len(dfield, len
					       - (ulint) (key_ptr - key_end));
			}
			ut_ad(0);
		}

		n_fields++;
		field++;
		dfield++;
	}

	DBUG_EXECUTE_IF("innodb_srch_key_buffer_full",
			ut_a(buf == (original_buf + buf_len)););

	ut_a(buf <= original_buf + buf_len);

	/* We set the length of tuple to n_fields: we assume that the memory
	area allocated for it is big enough (usually bigger than n_fields). */

	dtuple_set_n_fields(tuple, n_fields);
}
2518 
2519 /**************************************************************/
2521 static
2522 void
2523 row_sel_store_row_id_to_prebuilt(
2524 /*=============================*/
2525  row_prebuilt_t* prebuilt,
2526  const rec_t* index_rec,
2527  const dict_index_t* index,
2528  const ulint* offsets)
2530 {
2531  const byte* data;
2532  ulint len;
2533 
2534  ut_ad(rec_offs_validate(index_rec, index, offsets));
2535 
2536  data = rec_get_nth_field(
2537  index_rec, offsets,
2538  dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
2539 
2540  if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) {
2541  fprintf(stderr,
2542  "InnoDB: Error: Row id field is"
2543  " wrong length %lu in ", (ulong) len);
2544  dict_index_name_print(stderr, prebuilt->trx, index);
2545  fprintf(stderr, "\n"
2546  "InnoDB: Field number %lu, record:\n",
2547  (ulong) dict_index_get_sys_col_pos(index,
2548  DATA_ROW_ID));
2549  rec_print_new(stderr, index_rec, offsets);
2550  putc('\n', stderr);
2551  ut_error;
2552  }
2553 
2554  ut_memcpy(prebuilt->row_id, data, len);
2555 }
2556 
/* Convenience wrapper: in debug builds the index and field number are
passed through to the _func() implementation so that the column
metadata can be cross-checked by assertions there; in release builds
those two arguments are compiled out entirely. */
2557 #ifdef UNIV_DEBUG
2558 
2559 # define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \
2560  row_sel_field_store_in_mysql_format_func(dest,templ,idx,field,src,len)
2561 #else /* UNIV_DEBUG */
2562 
2563 # define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \
2564  row_sel_field_store_in_mysql_format_func(dest,templ,src,len)
2565 #endif /* UNIV_DEBUG */
2566 
2567 /**************************************************************/
/* Converts one InnoDB column value to the MySQL row format and
writes it to 'dest'.  'len' is the length of 'data' in bytes and is
never UNIV_SQL_NULL here: SQL NULL columns are handled by the caller.
In debug builds 'index' and 'field_no' identify the column so its
dictionary metadata can be asserted against. */
2570 static __attribute__((nonnull))
2571 void
2572 row_sel_field_store_in_mysql_format_func(
2573 /*=====================================*/
2574  byte* dest,
2580  const mysql_row_templ_t* templ,
2585 #ifdef UNIV_DEBUG
2586  const dict_index_t* index,
2588  ulint field_no,
2592 #endif /* UNIV_DEBUG */
2593  const byte* data,
2594  ulint len)
2595 {
2596  byte* ptr;
2597 #ifdef UNIV_DEBUG
2598  const dict_field_t* field
2599  = dict_index_get_nth_field(index, field_no);
2600 #endif /* UNIV_DEBUG */
2601 
2602  ut_ad(len != UNIV_SQL_NULL);
2603  UNIV_MEM_ASSERT_RW(data, len);
2604  UNIV_MEM_ASSERT_W(dest, templ->mysql_col_len);
2605  UNIV_MEM_INVALID(dest, templ->mysql_col_len);
2606 
2607  switch (templ->type) {
2608  const byte* field_end;
2609  byte* pad;
2610  case DATA_INT:
2611  /* Convert integer data from Innobase to a little-endian
2612  format, sign bit restored to normal */
2613 
2614  ptr = dest + len;
2615 
	/* Copy the bytes in reverse order: the loop below walks 'ptr'
	backwards from dest+len while advancing 'data' forwards. */
2616  for (;;) {
2617  ptr--;
2618  *ptr = *data;
2619  if (ptr == dest) {
2620  break;
2621  }
2622  data++;
2623  }
2624 
2625  if (!templ->is_unsigned) {
2626  dest[len - 1] = (byte) (dest[len - 1] ^ 128);
2627  }
2628 
2629  ut_ad(templ->mysql_col_len == len);
2630  break;
2631 
2632  case DATA_VARCHAR:
2633  case DATA_VARMYSQL:
2634  case DATA_BINARY:
2635  field_end = dest + templ->mysql_col_len;
2636 
2637  if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
2638  /* This is a >= 5.0.3 type true VARCHAR. Store the
2639  length of the data to the first byte or the first
2640  two bytes of dest. */
2641 
	/* NOTE(review): the call line that writes the length prefix
	(presumably row_mysql_store_true_var_len()) was lost in this
	rendering of the source; the argument list on the next line
	belongs to it -- verify against the original file. */
2643  dest, len, templ->mysql_length_bytes);
2644  /* Copy the actual data. Leave the rest of the
2645  buffer uninitialized. */
2646  memcpy(dest, data, len);
2647  break;
2648  }
2649 
2650  /* Copy the actual data */
2651  ut_memcpy(dest, data, len);
2652 
2653  /* Pad with trailing spaces. */
2654 
2655  pad = dest + len;
2656 
2657  ut_ad(templ->mbminlen <= templ->mbmaxlen);
2658 
2659  /* We treat some Unicode charset strings specially. */
2660  switch (templ->mbminlen) {
2661  case 4:
2662  /* InnoDB should never have stripped partial
2663  UTF-32 characters. */
2664  ut_a(!(len & 3));
2665  break;
2666  case 2:
2667  /* A space char is two bytes,
2668  0x0020 in UCS2 and UTF-16 */
2669 
2670  if (UNIV_UNLIKELY(len & 1)) {
2671  /* A 0x20 has been stripped from the column.
2672  Pad it back. */
2673 
2674  if (pad < field_end) {
2675  *pad++ = 0x20;
2676  }
2677  }
2678  }
2679 
2680  row_mysql_pad_col(templ->mbminlen, pad, field_end - pad);
2681  break;
2682 
2683  case DATA_BLOB:
2684  /* Store a pointer to the BLOB buffer to dest: the BLOB was
2685  already copied to the buffer in row_sel_store_mysql_rec */
2686 
2687  row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
2688  len);
2689  break;
2690 
2691  case DATA_MYSQL:
2692  memcpy(dest, data, len);
2693 
2694  ut_ad(templ->mysql_col_len >= len);
2695  ut_ad(templ->mbmaxlen >= templ->mbminlen);
2696 
2697  /* If field_no equals to templ->icp_rec_field_no,
2698  we are examining a row pointed by "icp_rec_field_no".
2699  There is possibility that icp_rec_field_no refers to
2700  a field in a secondary index while templ->rec_field_no
2701  points to field in a primary index. The length
2702  should still be equal, unless the field pointed
2703  by icp_rec_field_no has a prefix */
2704  ut_ad(templ->mbmaxlen > templ->mbminlen
2705  || templ->mysql_col_len == len
2706  || (field_no == templ->icp_rec_field_no
2707  && field->prefix_len > 0));
2708 
2709  /* The following assertion would fail for old tables
2710  containing UTF-8 ENUM columns due to Bug #9526. */
2711  ut_ad(!templ->mbmaxlen
2712  || !(templ->mysql_col_len % templ->mbmaxlen));
2713  ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len
2714  || (field_no == templ->icp_rec_field_no
2715  && field->prefix_len > 0));
2716  ut_ad(!(field->prefix_len % templ->mbmaxlen));
2717 
2718  if (templ->mbminlen == 1 && templ->mbmaxlen != 1) {
2719  /* Pad with spaces. This undoes the stripping
2720  done in row0mysql.cc, function
2721  row_mysql_store_col_in_innobase_format(). */
2722 
2723  memset(dest + len, 0x20, templ->mysql_col_len - len);
2724  }
2725  break;
2726 
2727  default:
2728 #ifdef UNIV_DEBUG
2729  case DATA_SYS_CHILD:
2730  case DATA_SYS:
2731  /* These column types should never be shipped to MySQL. */
2732  ut_ad(0);
2733 
2734  case DATA_CHAR:
2735  case DATA_FIXBINARY:
2736  case DATA_FLOAT:
2737  case DATA_DOUBLE:
2738  case DATA_DECIMAL:
2739  /* Above are the valid column types for MySQL data. */
2740 #endif /* UNIV_DEBUG */
2741  ut_ad(field->prefix_len
2742  ? field->prefix_len == len
2743  : templ->mysql_col_len == len);
2744  memcpy(dest, data, len);
2745  }
2746 }
2747 
/* Convenience wrapper: in debug builds the index argument 'i' is
forwarded to the _func() implementation for offset validation; in
release builds it is compiled out. */
2748 #ifdef UNIV_DEBUG
2749 
2750 # define row_sel_store_mysql_field(m,p,r,i,o,f,t) \
2751  row_sel_store_mysql_field_func(m,p,r,i,o,f,t)
2752 #else /* UNIV_DEBUG */
2753 
2754 # define row_sel_store_mysql_field(m,p,r,i,o,f,t) \
2755  row_sel_store_mysql_field_func(m,p,r,o,f,t)
2756 #endif /* UNIV_DEBUG */
2757 /**************************************************************/
/* Copies one column of an index record to the MySQL row buffer
'mysql_rec', converting it to the MySQL format.  Handles externally
stored (off-page) columns, locally stored BLOBs (duplicated into
prebuilt->blob_heap so the data outlives the page latch), and SQL
NULL values (filled in from prebuilt->default_rec).
Returns TRUE on success; FALSE only when an externally stored column
has not been written yet, which can be seen only during recovery
rollback or by READ UNCOMMITTED transactions. */
2759 static __attribute__((warn_unused_result))
2760 ibool
2761 row_sel_store_mysql_field_func(
2762 /*===========================*/
2763  byte* mysql_rec,
2765  row_prebuilt_t* prebuilt,
2766  const rec_t* rec,
2769 #ifdef UNIV_DEBUG
2770  const dict_index_t* index,
2771 #endif
2772  const ulint* offsets,
2774  ulint field_no,
2777  const mysql_row_templ_t*templ)
2778 {
2779  const byte* data;
2780  ulint len;
2781 
2782  ut_ad(prebuilt->default_rec);
2783  ut_ad(templ);
2784  ut_ad(templ >= prebuilt->mysql_template);
2785  ut_ad(templ < &prebuilt->mysql_template[prebuilt->n_template]);
2786  ut_ad(field_no == templ->clust_rec_field_no
2787  || field_no == templ->rec_field_no
2788  || field_no == templ->icp_rec_field_no);
2789  ut_ad(rec_offs_validate(rec, index, offsets));
2790 
2791  if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no))) {
2792 
2793  mem_heap_t* heap;
2794  /* Copy an externally stored field to a temporary heap */
2795 
2796  ut_a(!prebuilt->trx->has_search_latch);
2797  ut_ad(field_no == templ->clust_rec_field_no);
2798 
	/* BLOB columns go to prebuilt->blob_heap (created lazily),
	because MySQL keeps only a pointer to the data; other column
	types use a throwaway heap freed below. */
2799  if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
2800  if (prebuilt->blob_heap == NULL) {
2801  prebuilt->blob_heap = mem_heap_create(
2802  UNIV_PAGE_SIZE);
2803  }
2804 
2805  heap = prebuilt->blob_heap;
2806  } else {
2807  heap = mem_heap_create(UNIV_PAGE_SIZE);
2808  }
2809 
2810  /* NOTE: if we are retrieving a big BLOB, we may
2811  already run out of memory in the next call, which
2812  causes an assert */
2813 
	/* NOTE(review): the call line assigning 'data' here
	(presumably btr_rec_copy_externally_stored_field()) was lost
	in this rendering of the source; the argument list below
	belongs to it -- verify against the original file. */
2815  rec, offsets,
2816  dict_table_zip_size(prebuilt->table),
2817  field_no, &len, heap);
2818 
2819  if (UNIV_UNLIKELY(!data)) {
2820  /* The externally stored field was not written
2821  yet. This record should only be seen by
2822  recv_recovery_rollback_active() or any
2823  TRX_ISO_READ_UNCOMMITTED transactions. */
2824 
2825  if (heap != prebuilt->blob_heap) {
2826  mem_heap_free(heap);
2827  }
2828 
2829  ut_a(prebuilt->trx->isolation_level
2830  == TRX_ISO_READ_UNCOMMITTED);
2831  return(FALSE);
2832  }
2833 
2834  ut_a(len != UNIV_SQL_NULL);
2835 
2836  row_sel_field_store_in_mysql_format(
2837  mysql_rec + templ->mysql_col_offset,
2838  templ, index, field_no, data, len);
2839 
2840  if (heap != prebuilt->blob_heap) {
2841  mem_heap_free(heap);
2842  }
2843  } else {
2844  /* Field is stored in the row. */
2845 
2846  data = rec_get_nth_field(rec, offsets, field_no, &len);
2847 
2848  if (len == UNIV_SQL_NULL) {
2849  /* MySQL assumes that the field for an SQL
2850  NULL value is set to the default value. */
2851  ut_ad(templ->mysql_null_bit_mask);
2852 
2853  UNIV_MEM_ASSERT_RW(prebuilt->default_rec
2854  + templ->mysql_col_offset,
2855  templ->mysql_col_len);
	/* Set the NULL bit and copy the default value in. */
2856  mysql_rec[templ->mysql_null_byte_offset]
2857  |= (byte) templ->mysql_null_bit_mask;
2858  memcpy(mysql_rec + templ->mysql_col_offset,
2859  (const byte*) prebuilt->default_rec
2860  + templ->mysql_col_offset,
2861  templ->mysql_col_len);
2862  return(TRUE);
2863  }
2864 
2865  if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
2866 
2867  /* It is a BLOB field locally stored in the
2868  InnoDB record: we MUST copy its contents to
2869  prebuilt->blob_heap here because
2870  row_sel_field_store_in_mysql_format() stores a
2871  pointer to the data, and the data passed to us
2872  will be invalid as soon as the
2873  mini-transaction is committed and the page
2874  latch on the clustered index page is
2875  released. */
2876 
2877  if (prebuilt->blob_heap == NULL) {
2878  prebuilt->blob_heap = mem_heap_create(
2879  UNIV_PAGE_SIZE);
2880  }
2881 
2882  data = static_cast<byte*>(
2883  mem_heap_dup(prebuilt->blob_heap, data, len));
2884  }
2885 
2886  row_sel_field_store_in_mysql_format(
2887  mysql_rec + templ->mysql_col_offset,
2888  templ, index, field_no, data, len);
2889  }
2890 
2891  ut_ad(len != UNIV_SQL_NULL);
2892 
2893  if (templ->mysql_null_bit_mask) {
2894  /* It is a nullable column with a non-NULL
2895  value */
2896  mysql_rec[templ->mysql_null_byte_offset]
2897  &= ~(byte) templ->mysql_null_bit_mask;
2898  }
2899 
2900  return(TRUE);
2901 }
2902 
2903 /**************************************************************/
2909 static __attribute__((warn_unused_result))
2910 ibool
2911 row_sel_store_mysql_rec(
2912 /*====================*/
2913  byte* mysql_rec,
2914  row_prebuilt_t* prebuilt,
2915  const rec_t* rec,
2919  ibool rec_clust,
2922  const dict_index_t* index,
2923  const ulint* offsets)
2925 {
2926  ulint i;
2927 
2928  ut_ad(rec_clust || index == prebuilt->index);
2929  ut_ad(!rec_clust || dict_index_is_clust(index));
2930 
2931  if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
2932  mem_heap_free(prebuilt->blob_heap);
2933  prebuilt->blob_heap = NULL;
2934  }
2935 
2936  for (i = 0; i < prebuilt->n_template; i++) {
2937  const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
2938  const ulint field_no
2939  = rec_clust
2940  ? templ->clust_rec_field_no
2941  : templ->rec_field_no;
2942  /* We should never deliver column prefixes to MySQL,
2943  except for evaluating innobase_index_cond(). */
2944  ut_ad(dict_index_get_nth_field(index, field_no)->prefix_len
2945  == 0);
2946 
2947  if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
2948  rec, index, offsets,
2949  field_no, templ)) {
2950  return(FALSE);
2951  }
2952  }
2953 
2954  /* FIXME: We only need to read the doc_id if an FTS indexed
2955  column is being updated.
2956  NOTE, the record must be cluster index record. Secondary index
2957  might not have the Doc ID */
2958  if (dict_table_has_fts_index(prebuilt->table)
2959  && dict_index_is_clust(index)) {
2960 
2961  prebuilt->fts_doc_id = fts_get_doc_id_from_rec(
2962  prebuilt->table, rec, NULL);
2963  }
2964 
2965  return(TRUE);
2966 }
2967 
2968 /*********************************************************************/
/* Builds the committed version of a clustered index record as seen
by the given consistent read view, for a MySQL non-locking read.
The old version is built into prebuilt->old_vers_heap, which is
emptied (or created on first use) here.  On return *old_vers points
to the visible version, or NULL if none is visible; *offsets are
updated to correspond to *old_vers. */
2971 static __attribute__((nonnull, warn_unused_result))
2972 dberr_t
2973 row_sel_build_prev_vers_for_mysql(
2974 /*==============================*/
2975  read_view_t* read_view,
2976  dict_index_t* clust_index,
2977  row_prebuilt_t* prebuilt,
2978  const rec_t* rec,
2979  ulint** offsets,
2981  mem_heap_t** offset_heap,
2983  rec_t** old_vers,
2987  mtr_t* mtr)
2988 {
2989  dberr_t err;
2990 
	/* Reuse the heap from the previous row if it exists. */
2991  if (prebuilt->old_vers_heap) {
2992  mem_heap_empty(prebuilt->old_vers_heap);
2993  } else {
2994  prebuilt->old_vers_heap = mem_heap_create(200);
2995  }
2996 
	/* NOTE(review): the call line assigning 'err' here (presumably
	row_vers_build_for_consistent_read()) was lost in this
	rendering of the source; the argument list below belongs to
	it -- verify against the original file. */
2998  rec, mtr, clust_index, offsets, read_view, offset_heap,
2999  prebuilt->old_vers_heap, old_vers);
3000  return(err);
3001 }
3002 
3003 /*********************************************************************/
/* Retrieves the clustered index record corresponding to a record in
a non-clustered (secondary) index.  For a locking read it places a
LOCK_REC_NOT_GAP lock on the clustered record; for a consistent read
it fetches a previous committed version when necessary and verifies
that the secondary index record really corresponds to the visible
clustered record.  On success *out_rec points to the clustered index
record, or is NULL when the row must be ignored in this read view;
*offsets then correspond to *out_rec. */
3008 static __attribute__((nonnull, warn_unused_result))
3009 dberr_t
3010 row_sel_get_clust_rec_for_mysql(
3011 /*============================*/
3012  row_prebuilt_t* prebuilt,
3013  dict_index_t* sec_index,
3014  const rec_t* rec,
3018  que_thr_t* thr,
3019  const rec_t** out_rec,
3023  ulint** offsets,
3027  mem_heap_t** offset_heap,
3029  mtr_t* mtr)
3032 {
3033  dict_index_t* clust_index;
3034  const rec_t* clust_rec;
3035  rec_t* old_vers;
3036  dberr_t err;
3037  trx_t* trx;
3038 
3039  *out_rec = NULL;
3040  trx = thr_get_trx(thr);
3041 
	/* Build the clustered index search tuple from the secondary
	index record. */
3042  row_build_row_ref_in_tuple(prebuilt->clust_ref, rec,
3043  sec_index, *offsets, trx);
3044 
3045  clust_index = dict_table_get_first_index(sec_index->table);
3046 
3047  btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
3048  PAGE_CUR_LE, BTR_SEARCH_LEAF,
3049  &prebuilt->clust_pcur, 0, mtr);
3050 
3051  clust_rec = btr_pcur_get_rec(&prebuilt->clust_pcur);
3052 
3053  prebuilt->clust_pcur.trx_if_known = trx;
3054 
3055  /* Note: only if the search ends up on a non-infimum record is the
3056  low_match value the real match to the search tuple */
3057 
3058  if (!page_rec_is_user_rec(clust_rec)
3059  || btr_pcur_get_low_match(&prebuilt->clust_pcur)
3060  < dict_index_get_n_unique(clust_index)) {
3061 
3062  /* In a rare case it is possible that no clust rec is found
3063  for a delete-marked secondary index record: if in row0umod.cc
3064  in row_undo_mod_remove_clust_low() we have already removed
3065  the clust rec, while purge is still cleaning and removing
3066  secondary index records associated with earlier versions of
3067  the clustered index record. In that case we know that the
3068  clustered index record did not exist in the read view of
3069  trx. */
3070 
3071  if (!rec_get_deleted_flag(rec,
3072  dict_table_is_comp(sec_index->table))
3073  || prebuilt->select_lock_type != LOCK_NONE) {
3074  ut_print_timestamp(stderr);
3075  fputs(" InnoDB: error clustered record"
3076  " for sec rec not found\n"
3077  "InnoDB: ", stderr);
3078  dict_index_name_print(stderr, trx, sec_index);
3079  fputs("\n"
3080  "InnoDB: sec index record ", stderr);
3081  rec_print(stderr, rec, sec_index);
3082  fputs("\n"
3083  "InnoDB: clust index record ", stderr);
3084  rec_print(stderr, clust_rec, clust_index);
3085  putc('\n', stderr);
3086  trx_print(stderr, trx, 600);
3087  fputs("\n"
3088  "InnoDB: Submit a detailed bug report"
3089  " to http://bugs.mysql.com\n", stderr);
3090  ut_ad(0);
3091  }
3092 
3093  clust_rec = NULL;
3094 
3095  err = DB_SUCCESS;
3096  goto func_exit;
3097  }
3098 
3099  *offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
3100  ULINT_UNDEFINED, offset_heap);
3101 
3102  if (prebuilt->select_lock_type != LOCK_NONE) {
3103  /* Try to place a lock on the index record; we are searching
3104  the clust rec with a unique condition, hence
3105  we set a LOCK_REC_NOT_GAP type lock */
3106 
	/* NOTE(review): the call line assigning 'err' here (presumably
	lock_clust_rec_read_check_and_lock(), with a LOCK_REC_NOT_GAP
	argument on a line also lost) was dropped in this rendering of
	the source; the argument list below belongs to it. */
3108  0, btr_pcur_get_block(&prebuilt->clust_pcur),
3109  clust_rec, clust_index, *offsets,
3110  static_cast<enum lock_mode>(prebuilt->select_lock_type),
3112  thr);
3113 
3114  switch (err) {
3115  case DB_SUCCESS:
3116  case DB_SUCCESS_LOCKED_REC:
3117  break;
3118  default:
3119  goto err_exit;
3120  }
3121  } else {
3122  /* This is a non-locking consistent read: if necessary, fetch
3123  a previous version of the record */
3124 
3125  old_vers = NULL;
3126 
3127  /* If the isolation level allows reading of uncommitted data,
3128  then we never look for an earlier version */
3129 
3130  if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
	/* NOTE(review): a condition line (presumably
	"&& !lock_clust_rec_cons_read_sees(") was lost here in this
	rendering of the source. */
3132  clust_rec, clust_index, *offsets,
3133  trx->read_view)) {
3134 
3135  /* The following call returns 'offsets' associated with
3136  'old_vers' */
3137  err = row_sel_build_prev_vers_for_mysql(
3138  trx->read_view, clust_index, prebuilt,
3139  clust_rec, offsets, offset_heap, &old_vers,
3140  mtr);
3141 
3142  if (err != DB_SUCCESS || old_vers == NULL) {
3143 
3144  goto err_exit;
3145  }
3146 
3147  clust_rec = old_vers;
3148  }
3149 
3150  /* If we had to go to an earlier version of row or the
3151  secondary index record is delete marked, then it may be that
3152  the secondary index record corresponding to clust_rec
3153  (or old_vers) is not rec; in that case we must ignore
3154  such row because in our snapshot rec would not have existed.
3155  Remember that from rec we cannot see directly which transaction
3156  id corresponds to it: we have to go to the clustered index
3157  record. A query where we want to fetch all rows where
3158  the secondary index value is in some interval would return
3159  a wrong result if we would not drop rows which we come to
3160  visit through secondary index records that would not really
3161  exist in our snapshot. */
3162 
3163  if (clust_rec
3164  && (old_vers
3165  || trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
	/* NOTE(review): a condition line (presumably
	"|| rec_get_deleted_flag(rec, dict_table_is_comp(") was lost
	here in this rendering of the source. */
3167  sec_index->table)))
3168  && !row_sel_sec_rec_is_for_clust_rec(
3169  rec, sec_index, clust_rec, clust_index)) {
3170  clust_rec = NULL;
3171 #ifdef UNIV_SEARCH_DEBUG
3172  } else {
3173  ut_a(clust_rec == NULL
3174  || row_sel_sec_rec_is_for_clust_rec(
3175  rec, sec_index, clust_rec, clust_index));
3176 #endif
3177  }
3178 
3179  err = DB_SUCCESS;
3180  }
3181 
3182 func_exit:
3183  *out_rec = clust_rec;
3184 
3185  /* Store the current position if select_lock_type is not
3186  LOCK_NONE or if we are scanning using InnoDB APIs */
3187  if (prebuilt->select_lock_type != LOCK_NONE
3188  || prebuilt->innodb_api) {
3189  /* We may use the cursor in update or in unlock_row():
3190  store its position */
3191 
3192  btr_pcur_store_position(&prebuilt->clust_pcur, mtr);
3193  }
3194 
3195 err_exit:
3196  return(err);
3197 }
3198 
3199 /********************************************************************/
3205 static
3206 ibool
3207 sel_restore_position_for_mysql(
3208 /*===========================*/
3209  ibool* same_user_rec,
3213  ulint latch_mode,
3215  btr_pcur_t* pcur,
3217  ibool moves_up,
3219  mtr_t* mtr)
3221 {
3222  ibool success;
3223  ulint relative_position;
3224 
3225  relative_position = pcur->rel_pos;
3226 
3227  success = btr_pcur_restore_position(latch_mode, pcur, mtr);
3228 
3229  *same_user_rec = success;
3230 
3231  if (relative_position == BTR_PCUR_ON) {
3232  if (success) {
3233  return(FALSE);
3234  }
3235 
3236  if (moves_up) {
3237  btr_pcur_move_to_next(pcur, mtr);
3238  }
3239 
3240  return(TRUE);
3241  }
3242 
3243  if (relative_position == BTR_PCUR_AFTER
3244  || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE) {
3245 
3246  if (moves_up) {
3247  return(TRUE);
3248  }
3249 
3250  if (btr_pcur_is_on_user_rec(pcur)) {
3251  btr_pcur_move_to_prev(pcur, mtr);
3252  }
3253 
3254  return(TRUE);
3255  }
3256 
3257  ut_ad(relative_position == BTR_PCUR_BEFORE
3258  || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE);
3259 
3260  if (moves_up && btr_pcur_is_on_user_rec(pcur)) {
3261  btr_pcur_move_to_next(pcur, mtr);
3262  }
3263 
3264  return(TRUE);
3265 }
3266 
3267 /********************************************************************/
/* Copies one cached MySQL-format column from a fetch-cache record
to the MySQL row buffer 'buf'.  For a true VARCHAR column only the
bytes actually in use (length prefix + payload) are copied; for all
other column types the full column width is copied. */
3269 static
3270 void
3271 row_sel_copy_cached_field_for_mysql(
3272 /*================================*/
3273  byte* buf,
3274  const byte* cache,
3275  const mysql_row_templ_t*templ)
3276 {
3277  ulint len;
3278 
3279  buf += templ->mysql_col_offset;
3280  cache += templ->mysql_col_offset;
3281 
3282  UNIV_MEM_ASSERT_W(buf, templ->mysql_col_len);
3283 
3284  if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR
3285  && templ->type != DATA_INT) {
3286  /* Check for != DATA_INT to make sure we do
3287  not treat MySQL ENUM or SET as a true VARCHAR!
3288  Find the actual length of the true VARCHAR field. */
	/* NOTE(review): the call line reading the length here
	(presumably row_mysql_read_true_varchar()) was lost in this
	rendering of the source; the argument list below belongs to
	it -- verify against the original file. */
3290  &len, cache, templ->mysql_length_bytes);
3291  len += templ->mysql_length_bytes;
3292  UNIV_MEM_INVALID(buf, templ->mysql_col_len);
3293  } else {
3294  len = templ->mysql_col_len;
3295  }
3296 
3297  ut_memcpy(buf, cache, len);
3298 }
3299 
3300 /********************************************************************/
3302 UNIV_INLINE
3303 void
3304 row_sel_dequeue_cached_row_for_mysql(
3305 /*=================================*/
3306  byte* buf,
3308  row_prebuilt_t* prebuilt)
3309 {
3310  ulint i;
3311  const mysql_row_templ_t*templ;
3312  const byte* cached_rec;
3313  ut_ad(prebuilt->n_fetch_cached > 0);
3314  ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
3315 
3316  UNIV_MEM_ASSERT_W(buf, prebuilt->mysql_row_len);
3317 
3318  cached_rec = prebuilt->fetch_cache[prebuilt->fetch_cache_first];
3319 
3320  if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) {
3321  /* Copy cache record field by field, don't touch fields that
3322  are not covered by current key */
3323 
3324  for (i = 0; i < prebuilt->n_template; i++) {
3325  templ = prebuilt->mysql_template + i;
3326  row_sel_copy_cached_field_for_mysql(
3327  buf, cached_rec, templ);
3328  /* Copy NULL bit of the current field from cached_rec
3329  to buf */
3330  if (templ->mysql_null_bit_mask) {
3331  buf[templ->mysql_null_byte_offset]
3332  ^= (buf[templ->mysql_null_byte_offset]
3333  ^ cached_rec[templ->mysql_null_byte_offset])
3334  & (byte) templ->mysql_null_bit_mask;
3335  }
3336  }
3337  } else if (prebuilt->mysql_prefix_len > 63) {
3338  /* The record is long. Copy it field by field, in case
3339  there are some long VARCHAR column of which only a
3340  small length is being used. */
3341  UNIV_MEM_INVALID(buf, prebuilt->mysql_prefix_len);
3342 
3343  /* First copy the NULL bits. */
3344  ut_memcpy(buf, cached_rec, prebuilt->null_bitmap_len);
3345  /* Then copy the requested fields. */
3346 
3347  for (i = 0; i < prebuilt->n_template; i++) {
3348  row_sel_copy_cached_field_for_mysql(
3349  buf, cached_rec, prebuilt->mysql_template + i);
3350  }
3351  } else {
3352  ut_memcpy(buf, cached_rec, prebuilt->mysql_prefix_len);
3353  }
3354 
3355  prebuilt->n_fetch_cached--;
3356  prebuilt->fetch_cache_first++;
3357 
3358  if (prebuilt->n_fetch_cached == 0) {
3359  prebuilt->fetch_cache_first = 0;
3360  }
3361 }
3362 
3363 /********************************************************************/
3365 UNIV_INLINE
3366 void
3367 row_sel_prefetch_cache_init(
3368 /*========================*/
3369  row_prebuilt_t* prebuilt)
3370 {
3371  ulint i;
3372  ulint sz;
3373  byte* ptr;
3374 
3375  /* Reserve space for the magic number. */
3376  sz = UT_ARR_SIZE(prebuilt->fetch_cache) * (prebuilt->mysql_row_len + 8);
3377  ptr = static_cast<byte*>(mem_alloc(sz));
3378 
3379  for (i = 0; i < UT_ARR_SIZE(prebuilt->fetch_cache); i++) {
3380 
3381  /* A user has reported memory corruption in these
3382  buffers in Linux. Put magic numbers there to help
3383  to track a possible bug. */
3384 
3385  mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
3386  ptr += 4;
3387 
3388  prebuilt->fetch_cache[i] = ptr;
3389  ptr += prebuilt->mysql_row_len;
3390 
3391  mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
3392  ptr += 4;
3393  }
3394 }
3395 
3396 /********************************************************************/
3399 UNIV_INLINE
3400 byte*
3401 row_sel_fetch_last_buf(
3402 /*===================*/
3403  row_prebuilt_t* prebuilt)
3404 {
3405  ut_ad(!prebuilt->templ_contains_blob);
3406  ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
3407 
3408  if (prebuilt->fetch_cache[0] == NULL) {
3409  /* Allocate memory for the fetch cache */
3410  ut_ad(prebuilt->n_fetch_cached == 0);
3411 
3412  row_sel_prefetch_cache_init(prebuilt);
3413  }
3414 
3415  ut_ad(prebuilt->fetch_cache_first == 0);
3416  UNIV_MEM_INVALID(prebuilt->fetch_cache[prebuilt->n_fetch_cached],
3417  prebuilt->mysql_row_len);
3418 
3419  return(prebuilt->fetch_cache[prebuilt->n_fetch_cached]);
3420 }
3421 
3422 /********************************************************************/
3424 UNIV_INLINE
3425 void
3426 row_sel_enqueue_cache_row_for_mysql(
3427 /*================================*/
3428  byte* mysql_rec,
3429  row_prebuilt_t* prebuilt)
3430 {
3431  /* For non ICP code path the row should already exist in the
3432  next fetch cache slot. */
3433 
3434  if (prebuilt->idx_cond != NULL) {
3435  byte* dest = row_sel_fetch_last_buf(prebuilt);
3436 
3437  ut_memcpy(dest, mysql_rec, prebuilt->mysql_row_len);
3438  }
3439 
3440  ++prebuilt->n_fetch_cached;
3441 }
3442 
3443 /*********************************************************************/
/* Tries to fetch a row with an adaptive-hash-index assisted
shortcut search on the clustered index, without acquiring locks or
storing the cursor position.  Only usable for a non-locking
consistent read with an exact-match unique search condition.
Returns SEL_FOUND (row in *out_rec, *offsets valid), SEL_EXHAUSTED
(no visible matching row) or SEL_RETRY (shortcut not possible; the
caller must do a normal search). */
3449 static
3450 ulint
3451 row_sel_try_search_shortcut_for_mysql(
3452 /*==================================*/
3453  const rec_t** out_rec,
3454  row_prebuilt_t* prebuilt,
3455  ulint** offsets,
3456  mem_heap_t** heap,
3457  mtr_t* mtr)
3458 {
3459  dict_index_t* index = prebuilt->index;
3460  const dtuple_t* search_tuple = prebuilt->search_tuple;
3461  btr_pcur_t* pcur = &prebuilt->pcur;
3462  trx_t* trx = prebuilt->trx;
3463  const rec_t* rec;
3464 
3465  ut_ad(dict_index_is_clust(index));
3466  ut_ad(!prebuilt->templ_contains_blob);
3467 
	/* If the trx already holds the AHI search latch, tell the
	cursor open not to re-acquire it. */
3468 #ifndef UNIV_SEARCH_DEBUG
3469  btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
3470  BTR_SEARCH_LEAF, pcur,
3471  (trx->has_search_latch)
3472  ? RW_S_LATCH
3473  : 0,
3474  mtr);
3475 #else /* UNIV_SEARCH_DEBUG */
3476  btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
3477  BTR_SEARCH_LEAF, pcur,
3478  0,
3479  mtr);
3480 #endif /* UNIV_SEARCH_DEBUG */
3481  rec = btr_pcur_get_rec(pcur);
3482 
3483  if (!page_rec_is_user_rec(rec)) {
3484 
3485  return(SEL_RETRY);
3486  }
3487 
3488  /* As the cursor is now placed on a user record after a search with
3489  the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
3490  fields in the user record matched to the search tuple */
3491 
3492  if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) {
3493 
3494  return(SEL_EXHAUSTED);
3495  }
3496 
3497  /* This is a non-locking consistent read: if necessary, fetch
3498  a previous version of the record */
3499 
3500  *offsets = rec_get_offsets(rec, index, *offsets,
3501  ULINT_UNDEFINED, heap);
3502 
	/* The shortcut cannot build an old version: if the newest
	version is not visible in this read view, fall back. */
3503  if (!lock_clust_rec_cons_read_sees(rec, index,
3504  *offsets, trx->read_view)) {
3505 
3506  return(SEL_RETRY);
3507  }
3508 
3509  if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) {
3510 
3511  return(SEL_EXHAUSTED);
3512  }
3513 
3514  *out_rec = rec;
3515 
3516  return(SEL_FOUND);
3517 }
3518 
3519 /*********************************************************************/
/* Evaluates the pushed-down index condition (ICP) on the current
index record.  Converts to the MySQL format only the columns needed
by the condition, then calls innobase_index_cond(); on a match, all
remaining columns are converted too, unless that must wait until the
clustered index record has been fetched.  Returns ICP_MATCH,
ICP_NO_MATCH or ICP_OUT_OF_RANGE; when no condition is pushed down,
returns ICP_MATCH without converting anything. */
3522 static
3523 enum icp_result
3524 row_search_idx_cond_check(
3525 /*======================*/
3526  byte* mysql_rec,
3530  row_prebuilt_t* prebuilt,
3532  const rec_t* rec,
3533  const ulint* offsets)
3534 {
3535  enum icp_result result;
3536  ulint i;
3537 
3538  ut_ad(rec_offs_validate(rec, prebuilt->index, offsets));
3539 
3540  if (!prebuilt->idx_cond) {
3541  return(ICP_MATCH);
3542  }
3543 
3544  MONITOR_INC(MONITOR_ICP_ATTEMPTS);
3545 
3546  /* Convert to MySQL format those fields that are needed for
3547  evaluating the index condition. */
3548 
3549  if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
3550  mem_heap_empty(prebuilt->blob_heap);
3551  }
3552 
	/* The first idx_cond_n_cols templates are exactly the columns
	referenced by the pushed-down condition. */
3553  for (i = 0; i < prebuilt->idx_cond_n_cols; i++) {
3554  const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
3555 
3556  if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
3557  rec, prebuilt->index, offsets,
3558  templ->icp_rec_field_no,
3559  templ)) {
3560  return(ICP_NO_MATCH);
3561  }
3562  }
3563 
3564  /* We assume that the index conditions on
3565  case-insensitive columns are case-insensitive. The
3566  case of such columns may be wrong in a secondary
3567  index, if the case of the column has been updated in
3568  the past, or a record has been deleted and a record
3569  inserted in a different case. */
3570  result = innobase_index_cond(prebuilt->idx_cond);
3571  switch (result) {
3572  case ICP_MATCH:
3573  /* Convert the remaining fields to MySQL format.
3574  If this is a secondary index record, we must defer
3575  this until we have fetched the clustered index record. */
3576  if (!prebuilt->need_to_access_clustered
3577  || dict_index_is_clust(prebuilt->index)) {
3578  if (!row_sel_store_mysql_rec(
3579  mysql_rec, prebuilt, rec, FALSE,
3580  prebuilt->index, offsets)) {
3581  ut_ad(dict_index_is_clust(prebuilt->index));
3582  return(ICP_NO_MATCH);
3583  }
3584  }
3585  MONITOR_INC(MONITOR_ICP_MATCH);
3586  return(result);
3587  case ICP_NO_MATCH:
3588  MONITOR_INC(MONITOR_ICP_NO_MATCH);
3589  return(result);
3590  case ICP_OUT_OF_RANGE:
3591  MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE);
3592  return(result);
3593  }
3594 
	/* innobase_index_cond() returned an unexpected value. */
3595  ut_error;
3596  return(result);
3597 }
3598 
3599 /********************************************************************/
3607 UNIV_INTERN
3608 dberr_t
3610 /*=================*/
3611  byte* buf,
3613  ulint mode,
3614  row_prebuilt_t* prebuilt,
3621  ulint match_mode,
3623  ulint direction)
3628 {
3629  dict_index_t* index = prebuilt->index;
3630  ibool comp = dict_table_is_comp(index->table);
3631  const dtuple_t* search_tuple = prebuilt->search_tuple;
3632  btr_pcur_t* pcur = &prebuilt->pcur;
3633  trx_t* trx = prebuilt->trx;
3634  dict_index_t* clust_index;
3635  que_thr_t* thr;
3636  const rec_t* rec;
3637  const rec_t* result_rec = NULL;
3638  const rec_t* clust_rec;
3639  dberr_t err = DB_SUCCESS;
3640  ibool unique_search = FALSE;
3641  ibool mtr_has_extra_clust_latch = FALSE;
3642  ibool moves_up = FALSE;
3643  ibool set_also_gap_locks = TRUE;
3644  /* if the query is a plain locking SELECT, and the isolation level
3645  is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */
3646  ibool did_semi_consistent_read = FALSE;
3647  /* if the returned record was locked and we did a semi-consistent
3648  read (fetch the newest committed version), then this is set to
3649  TRUE */
3650 #ifdef UNIV_SEARCH_DEBUG
3651  ulint cnt = 0;
3652 #endif /* UNIV_SEARCH_DEBUG */
3653  ulint next_offs;
3654  ibool same_user_rec;
3655  mtr_t mtr;
3656  mem_heap_t* heap = NULL;
3657  ulint offsets_[REC_OFFS_NORMAL_SIZE];
3658  ulint* offsets = offsets_;
3659  ibool table_lock_waited = FALSE;
3660  byte* next_buf = 0;
3661 
3662  rec_offs_init(offsets_);
3663 
3664  ut_ad(index && pcur && search_tuple);
3665 
3666  /* We don't support FTS queries from the HANDLER interfaces, because
3667  we implemented FTS as reversed inverted index with auxiliary tables.
3668  So anything related to traditional index query would not apply to
3669  it. */
3670  if (index->type & DICT_FTS) {
3671  return(DB_END_OF_INDEX);
3672  }
3673 
3674 #ifdef UNIV_SYNC_DEBUG
3675  ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
3676 #endif /* UNIV_SYNC_DEBUG */
3677 
3678  if (dict_table_is_discarded(prebuilt->table)) {
3679 
3680  return(DB_TABLESPACE_DELETED);
3681 
3682  } else if (prebuilt->table->ibd_file_missing) {
3683 
3684  return(DB_TABLESPACE_NOT_FOUND);
3685 
3686  } else if (!prebuilt->index_usable) {
3687 
3688  return(DB_MISSING_HISTORY);
3689 
3690  } else if (dict_index_is_corrupted(index)) {
3691 
3692  return(DB_CORRUPTION);
3693 
3694  } else if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
3695  fprintf(stderr,
3696  "InnoDB: Error: trying to free a corrupt\n"
3697  "InnoDB: table handle. Magic n %lu, table name ",
3698  (ulong) prebuilt->magic_n);
3699  ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
3700  putc('\n', stderr);
3701 
3702  mem_analyze_corruption(prebuilt);
3703 
3704  ut_error;
3705  }
3706 
3707 #if 0
3708  /* August 19, 2005 by Heikki: temporarily disable this error
3709  print until the cursor lock count is done correctly.
3710  See bugs #12263 and #12456!*/
3711 
3712  if (trx->n_mysql_tables_in_use == 0
3713  && UNIV_UNLIKELY(prebuilt->select_lock_type == LOCK_NONE)) {
3714  /* Note that if MySQL uses an InnoDB temp table that it
3715  created inside LOCK TABLES, then n_mysql_tables_in_use can
3716  be zero; in that case select_lock_type is set to LOCK_X in
3717  ::start_stmt. */
3718 
3719  fputs("InnoDB: Error: MySQL is trying to perform a SELECT\n"
3720  "InnoDB: but it has not locked"
3721  " any tables in ::external_lock()!\n",
3722  stderr);
3723  trx_print(stderr, trx, 600);
3724  fputc('\n', stderr);
3725  }
3726 #endif
3727 
3728 #if 0
3729  fprintf(stderr, "Match mode %lu\n search tuple ",
3730  (ulong) match_mode);
3731  dtuple_print(search_tuple);
3732  fprintf(stderr, "N tables locked %lu\n",
3733  (ulong) trx->mysql_n_tables_locked);
3734 #endif
3735  /*-------------------------------------------------------------*/
3736  /* PHASE 0: Release a possible s-latch we are holding on the
3737  adaptive hash index latch if there is someone waiting behind */
3738 
3739  if (UNIV_UNLIKELY(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_NOT_LOCKED)
3740  && trx->has_search_latch) {
3741 
3742  /* There is an x-latch request on the adaptive hash index:
3743  release the s-latch to reduce starvation and wait for
3744  BTR_SEA_TIMEOUT rounds before trying to keep it again over
3745  calls from MySQL */
3746 
3747  rw_lock_s_unlock(&btr_search_latch);
3748  trx->has_search_latch = FALSE;
3749 
3751  }
3752 
3753  /* Reset the new record lock info if srv_locks_unsafe_for_binlog
 3754  is set or session is using a READ COMMITTED isolation level. Then
3755  we are able to remove the record locks set here on an individual
3756  row. */
3757  prebuilt->new_rec_locks = 0;
3758 
3759  /*-------------------------------------------------------------*/
3760  /* PHASE 1: Try to pop the row from the prefetch cache */
3761 
3762  if (UNIV_UNLIKELY(direction == 0)) {
3763  trx->op_info = "starting index read";
3764 
3765  prebuilt->n_rows_fetched = 0;
3766  prebuilt->n_fetch_cached = 0;
3767  prebuilt->fetch_cache_first = 0;
3768 
3769  if (prebuilt->sel_graph == NULL) {
3770  /* Build a dummy select query graph */
3771  row_prebuild_sel_graph(prebuilt);
3772  }
3773  } else {
3774  trx->op_info = "fetching rows";
3775 
3776  if (prebuilt->n_rows_fetched == 0) {
3777  prebuilt->fetch_direction = direction;
3778  }
3779 
3780  if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) {
3781  if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) {
3782  ut_error;
3783  /* TODO: scrollable cursor: restore cursor to
3784  the place of the latest returned row,
3785  or better: prevent caching for a scroll
3786  cursor! */
3787  }
3788 
3789  prebuilt->n_rows_fetched = 0;
3790  prebuilt->n_fetch_cached = 0;
3791  prebuilt->fetch_cache_first = 0;
3792 
3793  } else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
3794  row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
3795 
3796  prebuilt->n_rows_fetched++;
3797 
3798  err = DB_SUCCESS;
3799  goto func_exit;
3800  }
3801 
3802  if (prebuilt->fetch_cache_first > 0
3803  && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
3804 
3805  /* The previous returned row was popped from the fetch
3806  cache, but the cache was not full at the time of the
3807  popping: no more rows can exist in the result set */
3808 
3809  err = DB_RECORD_NOT_FOUND;
3810  goto func_exit;
3811  }
3812 
3813  prebuilt->n_rows_fetched++;
3814 
3815  if (prebuilt->n_rows_fetched > 1000000000) {
3816  /* Prevent wrap-over */
3817  prebuilt->n_rows_fetched = 500000000;
3818  }
3819 
3820  mode = pcur->search_mode;
3821  }
3822 
3823  /* In a search where at most one record in the index may match, we
3824  can use a LOCK_REC_NOT_GAP type record lock when locking a
3825  non-delete-marked matching record.
3826 
3827  Note that in a unique secondary index there may be different
3828  delete-marked versions of a record where only the primary key
3829  values differ: thus in a secondary index we must use next-key
3830  locks when locking delete-marked records. */
3831 
3832  if (match_mode == ROW_SEL_EXACT
3833  && dict_index_is_unique(index)
3834  && dtuple_get_n_fields(search_tuple)
3835  == dict_index_get_n_unique(index)
3836  && (dict_index_is_clust(index)
3837  || !dtuple_contains_null(search_tuple))) {
3838 
3839  /* Note above that a UNIQUE secondary index can contain many
3840  rows with the same key value if one of the columns is the SQL
3841  null. A clustered index under MySQL can never contain null
3842  columns because we demand that all the columns in primary key
3843  are non-null. */
3844 
3845  unique_search = TRUE;
3846 
3847  /* Even if the condition is unique, MySQL seems to try to
3848  retrieve also a second row if a primary key contains more than
3849  1 column. Return immediately if this is not a HANDLER
3850  command. */
3851 
3852  if (UNIV_UNLIKELY(direction != 0
3853  && !prebuilt->used_in_HANDLER)) {
3854 
3855  err = DB_RECORD_NOT_FOUND;
3856  goto func_exit;
3857  }
3858  }
3859 
3860  mtr_start(&mtr);
3861 
3862  /*-------------------------------------------------------------*/
3863  /* PHASE 2: Try fast adaptive hash index search if possible */
3864 
3865  /* Next test if this is the special case where we can use the fast
3866  adaptive hash index to try the search. Since we must release the
3867  search system latch when we retrieve an externally stored field, we
3868  cannot use the adaptive hash index in a search in the case the row
3869  may be long and there may be externally stored fields */
3870 
3871  if (UNIV_UNLIKELY(direction == 0)
3872  && unique_search
3873  && dict_index_is_clust(index)
3874  && !prebuilt->templ_contains_blob
3875  && !prebuilt->used_in_HANDLER
3876  && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)
3877  && !prebuilt->innodb_api) {
3878 
3879  mode = PAGE_CUR_GE;
3880 
3881  if (trx->mysql_n_tables_locked == 0
3882  && prebuilt->select_lock_type == LOCK_NONE
3883  && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
3884  && trx->read_view) {
3885 
3886  /* This is a SELECT query done as a consistent read,
3887  and the read view has already been allocated:
3888  let us try a search shortcut through the hash
3889  index.
3890  NOTE that we must also test that
3891  mysql_n_tables_locked == 0, because this might
3892  also be INSERT INTO ... SELECT ... or
3893  CREATE TABLE ... SELECT ... . Our algorithm is
3894  NOT prepared to inserts interleaved with the SELECT,
3895  and if we try that, we can deadlock on the adaptive
3896  hash index semaphore! */
3897 
3898 #ifndef UNIV_SEARCH_DEBUG
3899  if (!trx->has_search_latch) {
3901  trx->has_search_latch = TRUE;
3902  }
3903 #endif
3904  switch (row_sel_try_search_shortcut_for_mysql(
3905  &rec, prebuilt, &offsets, &heap,
3906  &mtr)) {
3907  case SEL_FOUND:
3908 #ifdef UNIV_SEARCH_DEBUG
3909  ut_a(0 == cmp_dtuple_rec(search_tuple,
3910  rec, offsets));
3911 #endif
3912  /* At this point, rec is protected by
3913  a page latch that was acquired by
3914  row_sel_try_search_shortcut_for_mysql().
3915  The latch will not be released until
3916  mtr_commit(&mtr). */
3917  ut_ad(!rec_get_deleted_flag(rec, comp));
3918 
3919  if (prebuilt->idx_cond) {
3920  switch (row_search_idx_cond_check(
3921  buf, prebuilt,
3922  rec, offsets)) {
3923  case ICP_NO_MATCH:
3924  case ICP_OUT_OF_RANGE:
3925  goto shortcut_mismatch;
3926  case ICP_MATCH:
3927  goto shortcut_match;
3928  }
3929  }
3930 
3931  if (!row_sel_store_mysql_rec(
3932  buf, prebuilt,
3933  rec, FALSE, index, offsets)) {
3934  /* Only fresh inserts may contain
3935  incomplete externally stored
3936  columns. Pretend that such
3937  records do not exist. Such
3938  records may only be accessed
3939  at the READ UNCOMMITTED
3940  isolation level or when
3941  rolling back a recovered
3942  transaction. Rollback happens
3943  at a lower level, not here. */
3944 
3945  /* Proceed as in case SEL_RETRY. */
3946  break;
3947  }
3948 
3949  shortcut_match:
3950  mtr_commit(&mtr);
3951 
3952  /* ut_print_name(stderr, index->name);
3953  fputs(" shortcut\n", stderr); */
3954 
3955  err = DB_SUCCESS;
3956  goto release_search_latch_if_needed;
3957 
3958  case SEL_EXHAUSTED:
3959  shortcut_mismatch:
3960  mtr_commit(&mtr);
3961 
3962  /* ut_print_name(stderr, index->name);
3963  fputs(" record not found 2\n", stderr); */
3964 
3965  err = DB_RECORD_NOT_FOUND;
3966 release_search_latch_if_needed:
3967  if (trx->search_latch_timeout > 0
3968  && trx->has_search_latch) {
3969 
3970  trx->search_latch_timeout--;
3971 
3972  rw_lock_s_unlock(&btr_search_latch);
3973  trx->has_search_latch = FALSE;
3974  }
3975 
3976  /* NOTE that we do NOT store the cursor
3977  position */
3978  goto func_exit;
3979 
3980  case SEL_RETRY:
3981  break;
3982 
3983  default:
3984  ut_ad(0);
3985  }
3986 
3987  mtr_commit(&mtr);
3988  mtr_start(&mtr);
3989  }
3990  }
3991 
3992  /*-------------------------------------------------------------*/
3993  /* PHASE 3: Open or restore index cursor position */
3994 
3995  if (trx->has_search_latch) {
3996  rw_lock_s_unlock(&btr_search_latch);
3997  trx->has_search_latch = FALSE;
3998  }
3999 
4000  /* The state of a running trx can only be changed by the
4001  thread that is currently serving the transaction. Because we
4002  are that thread, we can read trx->state without holding any
4003  mutex. */
4004  ut_ad(prebuilt->sql_stat_start || trx->state == TRX_STATE_ACTIVE);
4005 
4006  ut_ad(trx->state == TRX_STATE_NOT_STARTED
4007  || trx->state == TRX_STATE_ACTIVE);
4008 
4009  ut_ad(prebuilt->sql_stat_start
4010  || prebuilt->select_lock_type != LOCK_NONE
4011  || trx->read_view);
4012 
4013  trx_start_if_not_started(trx);
4014 
4015  if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
4016  && prebuilt->select_lock_type != LOCK_NONE
4017  && trx->mysql_thd != NULL
4018  && thd_is_select(trx->mysql_thd)) {
4019  /* It is a plain locking SELECT and the isolation
4020  level is low: do not lock gaps */
4021 
4022  set_also_gap_locks = FALSE;
4023  }
4024 
4025  /* Note that if the search mode was GE or G, then the cursor
4026  naturally moves upward (in fetch next) in alphabetical order,
4027  otherwise downward */
4028 
4029  if (UNIV_UNLIKELY(direction == 0)) {
4030  if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
4031  moves_up = TRUE;
4032  }
4033  } else if (direction == ROW_SEL_NEXT) {
4034  moves_up = TRUE;
4035  }
4036 
4037  thr = que_fork_get_first_thr(prebuilt->sel_graph);
4038 
4040 
4041  clust_index = dict_table_get_first_index(index->table);
4042 
4043  /* Do some start-of-statement preparations */
4044 
4045  if (!prebuilt->sql_stat_start) {
4046  /* No need to set an intention lock or assign a read view */
4047 
4048  if (UNIV_UNLIKELY
4049  (trx->read_view == NULL
4050  && prebuilt->select_lock_type == LOCK_NONE)) {
4051 
4052  fputs("InnoDB: Error: MySQL is trying to"
4053  " perform a consistent read\n"
4054  "InnoDB: but the read view is not assigned!\n",
4055  stderr);
4056  trx_print(stderr, trx, 600);
4057  fputc('\n', stderr);
4058  ut_error;
4059  }
4060  } else if (prebuilt->select_lock_type == LOCK_NONE) {
4061  /* This is a consistent read */
4062  /* Assign a read view for the query */
4063 
4064  trx_assign_read_view(trx);
4065  prebuilt->sql_stat_start = FALSE;
4066  } else {
4067 wait_table_again:
4068  err = lock_table(0, index->table,
4069  prebuilt->select_lock_type == LOCK_S
4070  ? LOCK_IS : LOCK_IX, thr);
4071 
4072  if (err != DB_SUCCESS) {
4073 
4074  table_lock_waited = TRUE;
4075  goto lock_table_wait;
4076  }
4077  prebuilt->sql_stat_start = FALSE;
4078  }
4079 
4080  /* Open or restore index cursor position */
4081 
4082  if (UNIV_LIKELY(direction != 0)) {
4083  ibool need_to_process = sel_restore_position_for_mysql(
4084  &same_user_rec, BTR_SEARCH_LEAF,
4085  pcur, moves_up, &mtr);
4086 
4087  if (UNIV_UNLIKELY(need_to_process)) {
4088  if (UNIV_UNLIKELY(prebuilt->row_read_type
4089  == ROW_READ_DID_SEMI_CONSISTENT)) {
4090  /* We did a semi-consistent read,
4091  but the record was removed in
4092  the meantime. */
4093  prebuilt->row_read_type
4094  = ROW_READ_TRY_SEMI_CONSISTENT;
4095  }
4096  } else if (UNIV_LIKELY(prebuilt->row_read_type
4097  != ROW_READ_DID_SEMI_CONSISTENT)) {
4098 
4099  /* The cursor was positioned on the record
4100  that we returned previously. If we need
4101  to repeat a semi-consistent read as a
4102  pessimistic locking read, the record
4103  cannot be skipped. */
4104 
4105  goto next_rec;
4106  }
4107 
4108  } else if (dtuple_get_n_fields(search_tuple) > 0) {
4109 
4110  btr_pcur_open_with_no_init(index, search_tuple, mode,
4112  pcur, 0, &mtr);
4113 
4114  pcur->trx_if_known = trx;
4115 
4116  rec = btr_pcur_get_rec(pcur);
4117 
4118  if (!moves_up
4119  && !page_rec_is_supremum(rec)
4120  && set_also_gap_locks
4122  || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4123  && prebuilt->select_lock_type != LOCK_NONE) {
4124 
4125  /* Try to place a gap lock on the next index record
4126  to prevent phantoms in ORDER BY ... DESC queries */
4127  const rec_t* next_rec = page_rec_get_next_const(rec);
4128 
4129  offsets = rec_get_offsets(next_rec, index, offsets,
4130  ULINT_UNDEFINED, &heap);
4131  err = sel_set_rec_lock(btr_pcur_get_block(pcur),
4132  next_rec, index, offsets,
4133  prebuilt->select_lock_type,
4134  LOCK_GAP, thr);
4135 
4136  switch (err) {
4137  case DB_SUCCESS_LOCKED_REC:
4138  err = DB_SUCCESS;
4139  case DB_SUCCESS:
4140  break;
4141  default:
4142  goto lock_wait_or_error;
4143  }
4144  }
4145  } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_L) {
4147  mode == PAGE_CUR_G, index, BTR_SEARCH_LEAF,
4148  pcur, false, 0, &mtr);
4149  }
4150 
4151 rec_loop:
4152  DEBUG_SYNC_C("row_search_rec_loop");
4153  if (trx_is_interrupted(trx)) {
4154  btr_pcur_store_position(pcur, &mtr);
4155  err = DB_INTERRUPTED;
4156  goto normal_return;
4157  }
4158 
4159  /*-------------------------------------------------------------*/
4160  /* PHASE 4: Look for matching records in a loop */
4161 
4162  rec = btr_pcur_get_rec(pcur);
4163  ut_ad(!!page_rec_is_comp(rec) == comp);
4164 #ifdef UNIV_SEARCH_DEBUG
4165  /*
4166  fputs("Using ", stderr);
4167  dict_index_name_print(stderr, trx, index);
4168  fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
4169  page_get_page_no(page_align(rec)));
4170  rec_print(stderr, rec, index);
4171  printf("delete-mark: %lu\n",
4172  rec_get_deleted_flag(rec, page_rec_is_comp(rec)));
4173  */
4174 #endif /* UNIV_SEARCH_DEBUG */
4175 
4176  if (page_rec_is_infimum(rec)) {
4177 
4178  /* The infimum record on a page cannot be in the result set,
4179  and neither can a record lock be placed on it: we skip such
4180  a record. */
4181 
4182  goto next_rec;
4183  }
4184 
4185  if (page_rec_is_supremum(rec)) {
4186 
4187  if (set_also_gap_locks
4189  || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4190  && prebuilt->select_lock_type != LOCK_NONE) {
4191 
4192  /* Try to place a lock on the index record */
4193 
4194  /* If innodb_locks_unsafe_for_binlog option is used
4195  or this session is using a READ COMMITTED isolation
4196  level we do not lock gaps. Supremum record is really
4197  a gap and therefore we do not set locks there. */
4198 
4199  offsets = rec_get_offsets(rec, index, offsets,
4200  ULINT_UNDEFINED, &heap);
4201  err = sel_set_rec_lock(btr_pcur_get_block(pcur),
4202  rec, index, offsets,
4203  prebuilt->select_lock_type,
4204  LOCK_ORDINARY, thr);
4205 
4206  switch (err) {
4207  case DB_SUCCESS_LOCKED_REC:
4208  err = DB_SUCCESS;
4209  case DB_SUCCESS:
4210  break;
4211  default:
4212  goto lock_wait_or_error;
4213  }
4214  }
4215  /* A page supremum record cannot be in the result set: skip
4216  it now that we have placed a possible lock on it */
4217 
4218  goto next_rec;
4219  }
4220 
4221  /*-------------------------------------------------------------*/
4222  /* Do sanity checks in case our cursor has bumped into page
4223  corruption */
4224 
4225  if (comp) {
4226  next_offs = rec_get_next_offs(rec, TRUE);
4227  if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) {
4228 
4229  goto wrong_offs;
4230  }
4231  } else {
4232  next_offs = rec_get_next_offs(rec, FALSE);
4233  if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) {
4234 
4235  goto wrong_offs;
4236  }
4237  }
4238 
4239  if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) {
4240 
4241 wrong_offs:
4242  if (srv_force_recovery == 0 || moves_up == FALSE) {
4243  ut_print_timestamp(stderr);
4244  buf_page_print(page_align(rec), 0,
4246  fprintf(stderr,
4247  "\nInnoDB: rec address %p,"
4248  " buf block fix count %lu\n",
4249  (void*) rec, (ulong)
4250  btr_cur_get_block(btr_pcur_get_btr_cur(pcur))
4251  ->page.buf_fix_count);
4252  fprintf(stderr,
4253  "InnoDB: Index corruption: rec offs %lu"
4254  " next offs %lu, page no %lu,\n"
4255  "InnoDB: ",
4256  (ulong) page_offset(rec),
4257  (ulong) next_offs,
4258  (ulong) page_get_page_no(page_align(rec)));
4259  dict_index_name_print(stderr, trx, index);
4260  fputs(". Run CHECK TABLE. You may need to\n"
4261  "InnoDB: restore from a backup, or"
4262  " dump + drop + reimport the table.\n",
4263  stderr);
4264  ut_ad(0);
4265  err = DB_CORRUPTION;
4266 
4267  goto lock_wait_or_error;
4268  } else {
4269  /* The user may be dumping a corrupt table. Jump
4270  over the corruption to recover as much as possible. */
4271 
4272  fprintf(stderr,
4273  "InnoDB: Index corruption: rec offs %lu"
4274  " next offs %lu, page no %lu,\n"
4275  "InnoDB: ",
4276  (ulong) page_offset(rec),
4277  (ulong) next_offs,
4278  (ulong) page_get_page_no(page_align(rec)));
4279  dict_index_name_print(stderr, trx, index);
4280  fputs(". We try to skip the rest of the page.\n",
4281  stderr);
4282 
4283  btr_pcur_move_to_last_on_page(pcur, &mtr);
4284 
4285  goto next_rec;
4286  }
4287  }
4288  /*-------------------------------------------------------------*/
4289 
4290  /* Calculate the 'offsets' associated with 'rec' */
4291 
4292  ut_ad(fil_page_get_type(btr_pcur_get_page(pcur)) == FIL_PAGE_INDEX);
4293  ut_ad(btr_page_get_index_id(btr_pcur_get_page(pcur)) == index->id);
4294 
4295  offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
4296 
4297  if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
4298  if (!rec_validate(rec, offsets)
4299  || !btr_index_rec_validate(rec, index, FALSE)) {
4300  fprintf(stderr,
4301  "InnoDB: Index corruption: rec offs %lu"
4302  " next offs %lu, page no %lu,\n"
4303  "InnoDB: ",
4304  (ulong) page_offset(rec),
4305  (ulong) next_offs,
4306  (ulong) page_get_page_no(page_align(rec)));
4307  dict_index_name_print(stderr, trx, index);
4308  fputs(". We try to skip the record.\n",
4309  stderr);
4310 
4311  goto next_rec;
4312  }
4313  }
4314 
4315  /* Note that we cannot trust the up_match value in the cursor at this
4316  place because we can arrive here after moving the cursor! Thus
4317  we have to recompare rec and search_tuple to determine if they
4318  match enough. */
4319 
4320  if (match_mode == ROW_SEL_EXACT) {
4321  /* Test if the index record matches completely to search_tuple
4322  in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
4323 
4324  /* fputs("Comparing rec and search tuple\n", stderr); */
4325 
4326  if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
4327 
4328  if (set_also_gap_locks
4330  || trx->isolation_level
4331  <= TRX_ISO_READ_COMMITTED)
4332  && prebuilt->select_lock_type != LOCK_NONE) {
4333 
4334  /* Try to place a gap lock on the index
4335  record only if innodb_locks_unsafe_for_binlog
4336  option is not set or this session is not
4337  using a READ COMMITTED isolation level. */
4338 
4339  err = sel_set_rec_lock(
4340  btr_pcur_get_block(pcur),
4341  rec, index, offsets,
4342  prebuilt->select_lock_type, LOCK_GAP,
4343  thr);
4344 
4345  switch (err) {
4346  case DB_SUCCESS_LOCKED_REC:
4347  case DB_SUCCESS:
4348  break;
4349  default:
4350  goto lock_wait_or_error;
4351  }
4352  }
4353 
4354  btr_pcur_store_position(pcur, &mtr);
4355 
4356  err = DB_RECORD_NOT_FOUND;
4357 #if 0
4358  ut_print_name(stderr, trx, FALSE, index->name);
4359  fputs(" record not found 3\n", stderr);
4360 #endif
4361 
4362  goto normal_return;
4363  }
4364 
4365  } else if (match_mode == ROW_SEL_EXACT_PREFIX) {
4366 
4367  if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
4368 
4369  if (set_also_gap_locks
4371  || trx->isolation_level
4372  <= TRX_ISO_READ_COMMITTED)
4373  && prebuilt->select_lock_type != LOCK_NONE) {
4374 
4375  /* Try to place a gap lock on the index
4376  record only if innodb_locks_unsafe_for_binlog
4377  option is not set or this session is not
4378  using a READ COMMITTED isolation level. */
4379 
4380  err = sel_set_rec_lock(
4381  btr_pcur_get_block(pcur),
4382  rec, index, offsets,
4383  prebuilt->select_lock_type, LOCK_GAP,
4384  thr);
4385 
4386  switch (err) {
4387  case DB_SUCCESS_LOCKED_REC:
4388  case DB_SUCCESS:
4389  break;
4390  default:
4391  goto lock_wait_or_error;
4392  }
4393  }
4394 
4395  btr_pcur_store_position(pcur, &mtr);
4396 
4397  err = DB_RECORD_NOT_FOUND;
4398 #if 0
4399  ut_print_name(stderr, trx, FALSE, index->name);
4400  fputs(" record not found 4\n", stderr);
4401 #endif
4402 
4403  goto normal_return;
4404  }
4405  }
4406 
4407  /* We are ready to look at a possible new index entry in the result
4408  set: the cursor is now placed on a user record */
4409 
4410  if (prebuilt->select_lock_type != LOCK_NONE) {
4411  /* Try to place a lock on the index record; note that delete
4412  marked records are a special case in a unique search. If there
4413  is a non-delete marked record, then it is enough to lock its
4414  existence with LOCK_REC_NOT_GAP. */
4415 
4416  /* If innodb_locks_unsafe_for_binlog option is used
 4417  or this session is using a READ COMMITTED isolation
4418  level we lock only the record, i.e., next-key locking is
4419  not used. */
4420 
4421  ulint lock_type;
4422 
4423  if (!set_also_gap_locks
4425  || trx->isolation_level <= TRX_ISO_READ_COMMITTED
4426  || (unique_search && !rec_get_deleted_flag(rec, comp))) {
4427 
4428  goto no_gap_lock;
4429  } else {
4430  lock_type = LOCK_ORDINARY;
4431  }
4432 
4433  /* If we are doing a 'greater or equal than a primary key
4434  value' search from a clustered index, and we find a record
4435  that has that exact primary key value, then there is no need
4436  to lock the gap before the record, because no insert in the
4437  gap can be in our search range. That is, no phantom row can
4438  appear that way.
4439 
4440  An example: if col1 is the primary key, the search is WHERE
4441  col1 >= 100, and we find a record where col1 = 100, then no
4442  need to lock the gap before that record. */
4443 
4444  if (index == clust_index
4445  && mode == PAGE_CUR_GE
4446  && direction == 0
4447  && dtuple_get_n_fields_cmp(search_tuple)
4448  == dict_index_get_n_unique(index)
4449  && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
4450 no_gap_lock:
4451  lock_type = LOCK_REC_NOT_GAP;
4452  }
4453 
4454  err = sel_set_rec_lock(btr_pcur_get_block(pcur),
4455  rec, index, offsets,
4456  prebuilt->select_lock_type,
4457  lock_type, thr);
4458 
4459  switch (err) {
4460  const rec_t* old_vers;
4461  case DB_SUCCESS_LOCKED_REC:
4463  || trx->isolation_level
4464  <= TRX_ISO_READ_COMMITTED) {
4465  /* Note that a record of
4466  prebuilt->index was locked. */
4467  prebuilt->new_rec_locks = 1;
4468  }
4469  err = DB_SUCCESS;
4470  case DB_SUCCESS:
4471  break;
4472  case DB_LOCK_WAIT:
4473  /* Never unlock rows that were part of a conflict. */
4474  prebuilt->new_rec_locks = 0;
4475 
4476  if (UNIV_LIKELY(prebuilt->row_read_type
4477  != ROW_READ_TRY_SEMI_CONSISTENT)
4478  || unique_search
4479  || index != clust_index) {
4480 
4481  goto lock_wait_or_error;
4482  }
4483 
4484  /* The following call returns 'offsets'
4485  associated with 'old_vers' */
4486  row_sel_build_committed_vers_for_mysql(
4487  clust_index, prebuilt, rec,
4488  &offsets, &heap, &old_vers, &mtr);
4489 
4490  /* Check whether it was a deadlock or not, if not
4491  a deadlock and the transaction had to wait then
4492  release the lock it is waiting on. */
4493 
4494  err = lock_trx_handle_wait(trx);
4495 
4496  switch (err) {
4497  case DB_SUCCESS:
4498  /* The lock was granted while we were
4499  searching for the last committed version.
4500  Do a normal locking read. */
4501 
4502  offsets = rec_get_offsets(
4503  rec, index, offsets, ULINT_UNDEFINED,
4504  &heap);
4505  goto locks_ok;
4506  case DB_DEADLOCK:
4507  goto lock_wait_or_error;
4508  case DB_LOCK_WAIT:
4509  err = DB_SUCCESS;
4510  break;
4511  default:
4512  ut_error;
4513  }
4514 
4515  if (old_vers == NULL) {
4516  /* The row was not yet committed */
4517 
4518  goto next_rec;
4519  }
4520 
4521  did_semi_consistent_read = TRUE;
4522  rec = old_vers;
4523  break;
4524  default:
4525 
4526  goto lock_wait_or_error;
4527  }
4528  } else {
4529  /* This is a non-locking consistent read: if necessary, fetch
4530  a previous version of the record */
4531 
4532  if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
4533 
4534  /* Do nothing: we let a non-locking SELECT read the
4535  latest version of the record */
4536 
4537  } else if (index == clust_index) {
4538 
4539  /* Fetch a previous version of the row if the current
4540  one is not visible in the snapshot; if we have a very
4541  high force recovery level set, we try to avoid crashes
4542  by skipping this lookup */
4543 
4544  if (UNIV_LIKELY(srv_force_recovery < 5)
4546  rec, index, offsets, trx->read_view)) {
4547 
4548  rec_t* old_vers;
4549  /* The following call returns 'offsets'
4550  associated with 'old_vers' */
4551  err = row_sel_build_prev_vers_for_mysql(
4552  trx->read_view, clust_index,
4553  prebuilt, rec, &offsets, &heap,
4554  &old_vers, &mtr);
4555 
4556  if (err != DB_SUCCESS) {
4557 
4558  goto lock_wait_or_error;
4559  }
4560 
4561  if (old_vers == NULL) {
4562  /* The row did not exist yet in
4563  the read view */
4564 
4565  goto next_rec;
4566  }
4567 
4568  rec = old_vers;
4569  }
4570  } else {
4571  /* We are looking into a non-clustered index,
4572  and to get the right version of the record we
4573  have to look also into the clustered index: this
4574  is necessary, because we can only get the undo
4575  information via the clustered index record. */
4576 
4577  ut_ad(!dict_index_is_clust(index));
4578 
4580  rec, trx->read_view)) {
4581  /* We should look at the clustered index.
4582  However, as this is a non-locking read,
4583  we can skip the clustered index lookup if
4584  the condition does not match the secondary
4585  index entry. */
4586  switch (row_search_idx_cond_check(
4587  buf, prebuilt, rec, offsets)) {
4588  case ICP_NO_MATCH:
4589  goto next_rec;
4590  case ICP_OUT_OF_RANGE:
4591  err = DB_RECORD_NOT_FOUND;
4592  goto idx_cond_failed;
4593  case ICP_MATCH:
4594  goto requires_clust_rec;
4595  }
4596 
4597  ut_error;
4598  }
4599  }
4600  }
4601 
4602 locks_ok:
4603  /* NOTE that at this point rec can be an old version of a clustered
4604  index record built for a consistent read. We cannot assume after this
4605  point that rec is on a buffer pool page. Functions like
4606  page_rec_is_comp() cannot be used! */
4607 
4608  if (rec_get_deleted_flag(rec, comp)) {
4609 
4610  /* The record is delete-marked: we can skip it */
4611 
4613  || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4614  && prebuilt->select_lock_type != LOCK_NONE
4615  && !did_semi_consistent_read) {
4616 
4617  /* No need to keep a lock on a delete-marked record
4618  if we do not want to use next-key locking. */
4619 
4620  row_unlock_for_mysql(prebuilt, TRUE);
4621  }
4622 
4623  /* This is an optimization to skip setting the next key lock
4624  on the record that follows this delete-marked record. This
4625  optimization works because of the unique search criteria
4626  which precludes the presence of a range lock between this
4627  delete marked record and the record following it.
4628 
4629  For now this is applicable only to clustered indexes while
4630  doing a unique search except for HANDLER queries because
4631  HANDLER allows NEXT and PREV even in unique search on
4632  clustered index. There is scope for further optimization
4633  applicable to unique secondary indexes. Current behaviour is
4634  to widen the scope of a lock on an already delete marked record
4635  if the same record is deleted twice by the same transaction */
4636  if (index == clust_index && unique_search
4637  && !prebuilt->used_in_HANDLER) {
4638 
4639  err = DB_RECORD_NOT_FOUND;
4640 
4641  goto normal_return;
4642  }
4643 
4644  goto next_rec;
4645  }
4646 
4647  /* Check if the record matches the index condition. */
4648  switch (row_search_idx_cond_check(buf, prebuilt, rec, offsets)) {
4649  case ICP_NO_MATCH:
4650  if (did_semi_consistent_read) {
4651  row_unlock_for_mysql(prebuilt, TRUE);
4652  }
4653  goto next_rec;
4654  case ICP_OUT_OF_RANGE:
4655  err = DB_RECORD_NOT_FOUND;
4656  goto idx_cond_failed;
4657  case ICP_MATCH:
4658  break;
4659  }
4660 
4661  /* Get the clustered index record if needed, if we did not do the
4662  search using the clustered index. */
4663 
4664  if (index != clust_index && prebuilt->need_to_access_clustered) {
4665 
4666 requires_clust_rec:
4667  ut_ad(index != clust_index);
4668  /* We use a 'goto' to the preceding label if a consistent
4669  read of a secondary index record requires us to look up old
4670  versions of the associated clustered index record. */
4671 
4672  ut_ad(rec_offs_validate(rec, index, offsets));
4673 
4674  /* It was a non-clustered index and we must fetch also the
4675  clustered index record */
4676 
4677  mtr_has_extra_clust_latch = TRUE;
4678 
4679  /* The following call returns 'offsets' associated with
4680  'clust_rec'. Note that 'clust_rec' can be an old version
4681  built for a consistent read. */
4682 
4683  err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
4684  thr, &clust_rec,
4685  &offsets, &heap, &mtr);
4686  switch (err) {
4687  case DB_SUCCESS:
4688  if (clust_rec == NULL) {
4689  /* The record did not exist in the read view */
4690  ut_ad(prebuilt->select_lock_type == LOCK_NONE);
4691 
4692  goto next_rec;
4693  }
4694  break;
4695  case DB_SUCCESS_LOCKED_REC:
4696  ut_a(clust_rec != NULL);
4698  || trx->isolation_level
4699  <= TRX_ISO_READ_COMMITTED) {
4700  /* Note that the clustered index record
4701  was locked. */
4702  prebuilt->new_rec_locks = 2;
4703  }
4704  err = DB_SUCCESS;
4705  break;
4706  default:
4707  goto lock_wait_or_error;
4708  }
4709 
4710  if (rec_get_deleted_flag(clust_rec, comp)) {
4711 
4712  /* The record is delete marked: we can skip it */
4713 
4715  || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4716  && prebuilt->select_lock_type != LOCK_NONE) {
4717 
4718  /* No need to keep a lock on a delete-marked
4719  record if we do not want to use next-key
4720  locking. */
4721 
4722  row_unlock_for_mysql(prebuilt, TRUE);
4723  }
4724 
4725  goto next_rec;
4726  }
4727 
4728  result_rec = clust_rec;
4729  ut_ad(rec_offs_validate(result_rec, clust_index, offsets));
4730 
4731  if (prebuilt->idx_cond) {
4732  /* Convert the record to MySQL format. We were
4733  unable to do this in row_search_idx_cond_check(),
4734  because the condition is on the secondary index
4735  and the requested column is in the clustered index.
4736  We convert all fields, including those that
4737  may have been used in ICP, because the
4738  secondary index may contain a column prefix
4739  rather than the full column. Also, as noted
4740  in Bug #56680, the column in the secondary
4741  index may be in the wrong case, and the
4742  authoritative case is in result_rec, the
4743  appropriate version of the clustered index record. */
4744  if (!row_sel_store_mysql_rec(
4745  buf, prebuilt, result_rec,
4746  TRUE, clust_index, offsets)) {
4747  goto next_rec;
4748  }
4749  }
4750  } else {
4751  result_rec = rec;
4752  }
4753 
4754  /* We found a qualifying record 'result_rec'. At this point,
4755  'offsets' are associated with 'result_rec'. */
4756 
4757  ut_ad(rec_offs_validate(result_rec,
4758  result_rec != rec ? clust_index : index,
4759  offsets));
4760  ut_ad(!rec_get_deleted_flag(result_rec, comp));
4761 
4762  /* At this point, the clustered index record is protected
4763  by a page latch that was acquired when pcur was positioned.
4764  The latch will not be released until mtr_commit(&mtr). */
4765 
4766  if ((match_mode == ROW_SEL_EXACT
4767  || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD)
4768  && prebuilt->select_lock_type == LOCK_NONE
4769  && !prebuilt->templ_contains_blob
4770  && !prebuilt->clust_index_was_generated
4771  && !prebuilt->used_in_HANDLER
4772  && !prebuilt->innodb_api
4773  && prebuilt->template_type
4774  != ROW_MYSQL_DUMMY_TEMPLATE
4775  && !prebuilt->in_fts_query) {
4776 
4777  /* Inside an update, for example, we do not cache rows,
4778  since we may use the cursor position to do the actual
4779  update, that is why we require ...lock_type == LOCK_NONE.
4780  Since we keep space in prebuilt only for the BLOBs of
4781  a single row, we cannot cache rows in the case there
4782  are BLOBs in the fields to be fetched. In HANDLER we do
4783  not cache rows because there the cursor is a scrollable
4784  cursor. */
4785 
4786  ut_a(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
4787 
4788  /* We only convert from InnoDB row format to MySQL row
4789  format when ICP is disabled. */
4790 
4791  if (!prebuilt->idx_cond) {
4792 
4793  /* We use next_buf to track the allocation of buffers
4794  where we store and enqueue the buffers for our
4795  pre-fetch optimisation.
4796 
4797  If next_buf == 0 then we store the converted record
4798  directly into the MySQL record buffer (buf). If it is
4799  != 0 then we allocate a pre-fetch buffer and store the
4800  converted record there.
4801 
4802  If the conversion fails and the MySQL record buffer
4803  was not written to then we reset next_buf so that
4804  we can re-use the MySQL record buffer in the next
4805  iteration. */
4806 
4807  next_buf = next_buf
4808  ? row_sel_fetch_last_buf(prebuilt) : buf;
4809 
4810  if (!row_sel_store_mysql_rec(
4811  next_buf, prebuilt, result_rec,
4812  result_rec != rec,
4813  result_rec != rec ? clust_index : index,
4814  offsets)) {
4815 
4816  if (next_buf == buf) {
4817  ut_a(prebuilt->n_fetch_cached == 0);
4818  next_buf = 0;
4819  }
4820 
4821  /* Only fresh inserts may contain incomplete
4822  externally stored columns. Pretend that such
4823  records do not exist. Such records may only be
4824  accessed at the READ UNCOMMITTED isolation
4825  level or when rolling back a recovered
4826  transaction. Rollback happens at a lower
4827  level, not here. */
4828  goto next_rec;
4829  }
4830 
4831  if (next_buf != buf) {
4832  row_sel_enqueue_cache_row_for_mysql(
4833  next_buf, prebuilt);
4834  }
4835  } else {
4836  row_sel_enqueue_cache_row_for_mysql(buf, prebuilt);
4837  }
4838 
4839  if (prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE) {
4840  goto next_rec;
4841  }
4842 
4843  } else {
4844  if (UNIV_UNLIKELY
4845  (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE)) {
4846  /* CHECK TABLE: fetch the row */
4847 
4848  if (result_rec != rec
4849  && !prebuilt->need_to_access_clustered) {
4850  /* We used 'offsets' for the clust
4851  rec, recalculate them for 'rec' */
4852  offsets = rec_get_offsets(rec, index, offsets,
4853  ULINT_UNDEFINED,
4854  &heap);
4855  result_rec = rec;
4856  }
4857 
4858  memcpy(buf + 4, result_rec
4859  - rec_offs_extra_size(offsets),
4860  rec_offs_size(offsets));
4861  mach_write_to_4(buf,
4862  rec_offs_extra_size(offsets) + 4);
4863  } else if (!prebuilt->idx_cond && !prebuilt->innodb_api) {
4864  /* The record was not yet converted to MySQL format. */
4865  if (!row_sel_store_mysql_rec(
4866  buf, prebuilt, result_rec,
4867  result_rec != rec,
4868  result_rec != rec ? clust_index : index,
4869  offsets)) {
4870  /* Only fresh inserts may contain
4871  incomplete externally stored
4872  columns. Pretend that such records do
4873  not exist. Such records may only be
4874  accessed at the READ UNCOMMITTED
4875  isolation level or when rolling back a
4876  recovered transaction. Rollback
4877  happens at a lower level, not here. */
4878  goto next_rec;
4879  }
4880  }
4881 
4882  if (prebuilt->clust_index_was_generated) {
4883  row_sel_store_row_id_to_prebuilt(
4884  prebuilt, result_rec,
4885  result_rec == rec ? index : clust_index,
4886  offsets);
4887  }
4888  }
4889 
4890  /* From this point on, 'offsets' are invalid. */
4891 
4892  /* We have an optimization to save CPU time: if this is a consistent
4893  read on a unique condition on the clustered index, then we do not
4894  store the pcur position, because any fetch next or prev will anyway
4895  return 'end of file'. Exceptions are locking reads and the MySQL
4896  HANDLER command where the user can move the cursor with PREV or NEXT
4897  even after a unique search. */
4898 
4899  err = DB_SUCCESS;
4900 
4901 idx_cond_failed:
4902  if (!unique_search
4903  || !dict_index_is_clust(index)
4904  || direction != 0
4905  || prebuilt->select_lock_type != LOCK_NONE
4906  || prebuilt->used_in_HANDLER
4907  || prebuilt->innodb_api) {
4908 
4909  /* Inside an update always store the cursor position */
4910 
4911  btr_pcur_store_position(pcur, &mtr);
4912 
4913  if (prebuilt->innodb_api) {
4914  prebuilt->innodb_api_rec = result_rec;
4915  }
4916  }
4917 
4918  goto normal_return;
4919 
4920 next_rec:
4921  /* Reset the old and new "did semi-consistent read" flags. */
4922  if (UNIV_UNLIKELY(prebuilt->row_read_type
4923  == ROW_READ_DID_SEMI_CONSISTENT)) {
4924  prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
4925  }
4926  did_semi_consistent_read = FALSE;
4927  prebuilt->new_rec_locks = 0;
4928 
4929  /*-------------------------------------------------------------*/
4930  /* PHASE 5: Move the cursor to the next index record */
4931 
4932  /* NOTE: For moves_up==FALSE, the mini-transaction will be
4933  committed and restarted every time when switching b-tree
4934  pages. For moves_up==TRUE in index condition pushdown, we can
4935  scan an entire secondary index tree within a single
4936  mini-transaction. As long as the prebuilt->idx_cond does not
4937  match, we do not need to consult the clustered index or
4938  return records to MySQL, and thus we can avoid repositioning
4939  the cursor. What prevents us from buffer-fixing all leaf pages
4940  within the mini-transaction is the btr_leaf_page_release()
4941  call in btr_pcur_move_to_next_page(). Only the leaf page where
4942  the cursor is positioned will remain buffer-fixed. */
4943 
4944  if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) {
4945  /* We must commit mtr if we are moving to the next
4946  non-clustered index record, because we could break the
4947  latching order if we would access a different clustered
4948  index page right away without releasing the previous. */
4949 
4950  btr_pcur_store_position(pcur, &mtr);
4951 
4952  mtr_commit(&mtr);
4953  mtr_has_extra_clust_latch = FALSE;
4954 
4955  mtr_start(&mtr);
4956  if (sel_restore_position_for_mysql(&same_user_rec,
4958  pcur, moves_up, &mtr)) {
4959 #ifdef UNIV_SEARCH_DEBUG
4960  cnt++;
4961 #endif /* UNIV_SEARCH_DEBUG */
4962 
4963  goto rec_loop;
4964  }
4965  }
4966 
4967  if (moves_up) {
4968  if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) {
4969 not_moved:
4970  btr_pcur_store_position(pcur, &mtr);
4971 
4972  if (match_mode != 0) {
4973  err = DB_RECORD_NOT_FOUND;
4974  } else {
4975  err = DB_END_OF_INDEX;
4976  }
4977 
4978  goto normal_return;
4979  }
4980  } else {
4981  if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) {
4982  goto not_moved;
4983  }
4984  }
4985 
4986 #ifdef UNIV_SEARCH_DEBUG
4987  cnt++;
4988 #endif /* UNIV_SEARCH_DEBUG */
4989 
4990  goto rec_loop;
4991 
4992 lock_wait_or_error:
4993  /* Reset the old and new "did semi-consistent read" flags. */
4994  if (UNIV_UNLIKELY(prebuilt->row_read_type
4995  == ROW_READ_DID_SEMI_CONSISTENT)) {
4996  prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
4997  }
4998  did_semi_consistent_read = FALSE;
4999 
5000  /*-------------------------------------------------------------*/
5001 
5002  btr_pcur_store_position(pcur, &mtr);
5003 
5004 lock_table_wait:
5005  mtr_commit(&mtr);
5006  mtr_has_extra_clust_latch = FALSE;
5007 
5008  trx->error_state = err;
5009 
5010  /* The following is a patch for MySQL */
5011 
5013 
5014  thr->lock_state = QUE_THR_LOCK_ROW;
5015 
5016  if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
5017  /* It was a lock wait, and it ended */
5018 
5019  thr->lock_state = QUE_THR_LOCK_NOLOCK;
5020  mtr_start(&mtr);
5021 
5022  /* Table lock waited, go try to obtain table lock
5023  again */
5024  if (table_lock_waited) {
5025  table_lock_waited = FALSE;
5026 
5027  goto wait_table_again;
5028  }
5029 
5030  sel_restore_position_for_mysql(&same_user_rec,
5031  BTR_SEARCH_LEAF, pcur,
5032  moves_up, &mtr);
5033 
5035  || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
5036  && !same_user_rec) {
5037 
5038  /* Since we were not able to restore the cursor
5039  on the same user record, we cannot use
5040  row_unlock_for_mysql() to unlock any records, and
5041  we must thus reset the new rec lock info. Since
5042  in lock0lock.cc we have blocked the inheriting of gap
5043  X-locks, we actually do not have any new record locks
5044  set in this case.
5045 
5046  Note that if we were able to restore on the 'same'
5047  user record, it is still possible that we were actually
5048  waiting on a delete-marked record, and meanwhile
5049  it was removed by purge and inserted again by some
5050  other user. But that is no problem, because in
5051  rec_loop we will again try to set a lock, and
5052  new_rec_lock_info in trx will be right at the end. */
5053 
5054  prebuilt->new_rec_locks = 0;
5055  }
5056 
5057  mode = pcur->search_mode;
5058 
5059  goto rec_loop;
5060  }
5061 
5062  thr->lock_state = QUE_THR_LOCK_NOLOCK;
5063 
5064 #ifdef UNIV_SEARCH_DEBUG
5065  /* fputs("Using ", stderr);
5066  dict_index_name_print(stderr, index);
5067  fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
5068 #endif /* UNIV_SEARCH_DEBUG */
5069  goto func_exit;
5070 
5071 normal_return:
5072  /*-------------------------------------------------------------*/
5074 
5075  mtr_commit(&mtr);
5076 
5077  if (prebuilt->idx_cond != 0) {
5078 
5079  /* When ICP is active we don't write to the MySQL buffer
5080  directly, only to buffers that are enqueued in the pre-fetch
5081  queue. We need to dequeue the first buffer and copy the contents
5082  to the record buffer that was passed in by MySQL. */
5083 
5084  if (prebuilt->n_fetch_cached > 0) {
5085  row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
5086  err = DB_SUCCESS;
5087  }
5088 
5089  } else if (next_buf != 0) {
5090 
5091  /* We may or may not have enqueued some buffers to the
5092  pre-fetch queue, but we definitely wrote to the record
5093  buffer passed to use by MySQL. */
5094 
5095  err = DB_SUCCESS;
5096  }
5097 
5098 #ifdef UNIV_SEARCH_DEBUG
5099  /* fputs("Using ", stderr);
5100  dict_index_name_print(stderr, index);
5101  fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
5102 #endif /* UNIV_SEARCH_DEBUG */
5103 
5104 func_exit:
5105  trx->op_info = "";
5106  if (UNIV_LIKELY_NULL(heap)) {
5107  mem_heap_free(heap);
5108  }
5109 
5110  /* Set or reset the "did semi-consistent read" flag on return.
5111  The flag did_semi_consistent_read is set if and only if
5112  the record being returned was fetched with a semi-consistent read. */
5113  ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS
5114  || !did_semi_consistent_read);
5115 
5116  if (UNIV_UNLIKELY(prebuilt->row_read_type != ROW_READ_WITH_LOCKS)) {
5117  if (UNIV_UNLIKELY(did_semi_consistent_read)) {
5118  prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
5119  } else {
5120  prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
5121  }
5122  }
5123 
5124 #ifdef UNIV_SYNC_DEBUG
5125  ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
5126 #endif /* UNIV_SYNC_DEBUG */
5127 
5128  DEBUG_SYNC_C("innodb_row_search_for_mysql_exit");
5129 
5130  return(err);
5131 }
5132 
5133 /*******************************************************************/
5137 UNIV_INTERN
5138 ibool
5140 /*======================================*/
5141  trx_t* trx,
5142  const char* norm_name)
5144 {
5146  ibool ret = FALSE;
5147 
5148  /* Disable query cache altogether for all tables if recovered XA
5149  transactions in prepared state exist. This is because we do not
5150  restore the table locks for those transactions and we may wrongly
5151  set ret=TRUE above if "lock_table_get_n_locks(table) == 0". See
5152  "Bug#14658648 XA ROLLBACK (DISTRIBUTED DATABASE) NOT WORKING WITH
5153  QUERY CACHE ENABLED".
5154  Read trx_sys->n_prepared_recovered_trx without mutex protection,
5155  not possible to end up with a torn read since n_prepared_recovered_trx
5156  is word size. */
5157  if (trx_sys->n_prepared_recovered_trx > 0) {
5158 
5159  return(FALSE);
5160  }
5161 
5162  table = dict_table_open_on_name(norm_name, FALSE, FALSE,
5164 
5165  if (table == NULL) {
5166 
5167  return(FALSE);
5168  }
5169 
5170  /* Start the transaction if it is not started yet */
5171 
5172  trx_start_if_not_started(trx);
5173 
5174  /* If there are locks on the table or some trx has invalidated the
5175  cache up to our trx id, then ret = FALSE.
5176  We do not check what type locks there are on the table, though only
5177  IX type locks actually would require ret = FALSE. */
5178 
5179  if (lock_table_get_n_locks(table) == 0
5180  && trx->id >= table->query_cache_inv_trx_id) {
5181 
5182  ret = TRUE;
5183 
5184  /* If the isolation level is high, assign a read view for the
5185  transaction if it does not yet have one */
5186 
5187  if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
5188  && !trx->read_view) {
5189 
5191  trx->id, trx->global_read_view_heap);
5192 
5193  trx->global_read_view = trx->read_view;
5194  }
5195  }
5196 
5197  dict_table_close(table, FALSE, FALSE);
5198 
5199  return(ret);
5200 }
5201 
5202 /*******************************************************************/
5206 static
5207 ib_uint64_t
5208 row_search_autoinc_read_column(
5209 /*===========================*/
5210  dict_index_t* index,
5211  const rec_t* rec,
5212  ulint col_no,
5213  ulint mtype,
5214  ibool unsigned_type)
5215 {
5216  ulint len;
5217  const byte* data;
5218  ib_uint64_t value;
5219  mem_heap_t* heap = NULL;
5220  ulint offsets_[REC_OFFS_NORMAL_SIZE];
5221  ulint* offsets = offsets_;
5222 
5223  rec_offs_init(offsets_);
5224 
5225  offsets = rec_get_offsets(rec, index, offsets, col_no + 1, &heap);
5226 
5227  if (rec_offs_nth_sql_null(offsets, col_no)) {
5228  /* There is no non-NULL value in the auto-increment column. */
5229  value = 0;
5230  goto func_exit;
5231  }
5232 
5233  data = rec_get_nth_field(rec, offsets, col_no, &len);
5234 
5235  switch (mtype) {
5236  case DATA_INT:
5237  ut_a(len <= sizeof value);
5238  value = mach_read_int_type(data, len, unsigned_type);
5239  break;
5240 
5241  case DATA_FLOAT:
5242  ut_a(len == sizeof(float));
5243  value = (ib_uint64_t) mach_float_read(data);
5244  break;
5245 
5246  case DATA_DOUBLE:
5247  ut_a(len == sizeof(double));
5248  value = (ib_uint64_t) mach_double_read(data);
5249  break;
5250 
5251  default:
5252  ut_error;
5253  }
5254 
5255  if (!unsigned_type && (ib_int64_t) value < 0) {
5256  value = 0;
5257  }
5258 
5259 func_exit:
5260  if (UNIV_LIKELY_NULL(heap)) {
5261  mem_heap_free(heap);
5262  }
5263 
5264  return(value);
5265 }
5266 
5267 /*******************************************************************/
5270 static
5271 const rec_t*
5272 row_search_autoinc_get_rec(
5273 /*=======================*/
5274  btr_pcur_t* pcur,
5275  mtr_t* mtr)
5276 {
5277  do {
5278  const rec_t* rec = btr_pcur_get_rec(pcur);
5279 
5280  if (page_rec_is_user_rec(rec)) {
5281  return(rec);
5282  }
5283  } while (btr_pcur_move_to_prev(pcur, mtr));
5284 
5285  return(NULL);
5286 }
5287 
5288 /*******************************************************************/
5292 UNIV_INTERN
5293 dberr_t
5295 /*===================*/
5296  dict_index_t* index,
5297  const char* col_name,
5298  ib_uint64_t* value)
5299 {
5300  ulint i;
5301  ulint n_cols;
5302  dict_field_t* dfield = NULL;
5303  dberr_t error = DB_SUCCESS;
5304 
5306 
5307  /* Search the index for the AUTOINC column name */
5308  for (i = 0; i < n_cols; ++i) {
5309  dfield = dict_index_get_nth_field(index, i);
5310 
5311  if (strcmp(col_name, dfield->name) == 0) {
5312  break;
5313  }
5314  }
5315 
5316  *value = 0;
5317 
5318  /* Must find the AUTOINC column name */
5319  if (i < n_cols && dfield) {
5320  mtr_t mtr;
5321  btr_pcur_t pcur;
5322 
5323  mtr_start(&mtr);
5324 
5325  /* Open at the high/right end (false), and init cursor */
5327  false, index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
5328 
5329  if (!page_is_empty(btr_pcur_get_page(&pcur))) {
5330  const rec_t* rec;
5331 
5332  rec = row_search_autoinc_get_rec(&pcur, &mtr);
5333 
5334  if (rec != NULL) {
5335  ibool unsigned_type = (
5336  dfield->col->prtype & DATA_UNSIGNED);
5337 
5338  *value = row_search_autoinc_read_column(
5339  index, rec, i,
5340  dfield->col->mtype, unsigned_type);
5341  }
5342  }
5343 
5344  btr_pcur_close(&pcur);
5345 
5346  mtr_commit(&mtr);
5347  } else {
5348  error = DB_RECORD_NOT_FOUND;
5349  }
5350 
5351  return(error);
5352 }