MySQL 5.6.14 Source Code Document
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
btr0cur.cc
Go to the documentation of this file.
1 /*****************************************************************************
2 
3 Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2008, Google Inc.
5 Copyright (c) 2012, Facebook Inc.
6 
7 Portions of this file contain modifications contributed and copyrighted by
8 Google, Inc. Those modifications are gratefully acknowledged and are described
9 briefly in the InnoDB documentation. The contributions by Google are
10 incorporated with their permission, and subject to the conditions contained in
11 the file COPYING.Google.
12 
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; version 2 of the License.
16 
17 This program is distributed in the hope that it will be useful, but WITHOUT
18 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /**************************************************/
45 #include "btr0cur.h"
46 
47 #ifdef UNIV_NONINL
48 #include "btr0cur.ic"
49 #endif
50 
51 #include "row0upd.h"
52 #ifndef UNIV_HOTBACKUP
53 #include "mtr0log.h"
54 #include "page0page.h"
55 #include "page0zip.h"
56 #include "rem0rec.h"
57 #include "rem0cmp.h"
58 #include "buf0lru.h"
59 #include "btr0btr.h"
60 #include "btr0sea.h"
61 #include "row0log.h"
62 #include "row0purge.h"
63 #include "row0upd.h"
64 #include "trx0rec.h"
65 #include "trx0roll.h" /* trx_is_recv() */
66 #include "que0que.h"
67 #include "row0row.h"
68 #include "srv0srv.h"
69 #include "ibuf0ibuf.h"
70 #include "lock0lock.h"
71 #include "zlib.h"
72 
74 enum btr_op_t {
75  BTR_NO_OP = 0,
80 };
81 
82 #ifdef UNIV_DEBUG
83 
85 UNIV_INTERN ibool btr_cur_print_record_ops = FALSE;
86 #endif /* UNIV_DEBUG */
87 
89 UNIV_INTERN ulint btr_cur_n_non_sea = 0;
92 UNIV_INTERN ulint btr_cur_n_sea = 0;
96 UNIV_INTERN ulint btr_cur_n_non_sea_old = 0;
100 UNIV_INTERN ulint btr_cur_n_sea_old = 0;
101 
102 #ifdef UNIV_DEBUG
103 /* Flag to limit optimistic insert records */
104 UNIV_INTERN uint btr_cur_limit_optimistic_insert_debug = 0;
105 #endif /* UNIV_DEBUG */
106 
109 #define BTR_CUR_PAGE_REORGANIZE_LIMIT (UNIV_PAGE_SIZE / 32)
110 
112 /* @{ */
113 /*--------------------------------------*/
114 #define BTR_BLOB_HDR_PART_LEN 0
116 #define BTR_BLOB_HDR_NEXT_PAGE_NO 4
118 /*--------------------------------------*/
119 #define BTR_BLOB_HDR_SIZE 8
129 #define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty)\
130  (((value) * (ib_int64_t) index->stat_n_leaf_pages \
131  + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
132 
133 /* @} */
134 #endif /* !UNIV_HOTBACKUP */
135 
140  0, 0, 0, 0, 0,
141  0, 0, 0, 0, 0,
142  0, 0, 0, 0, 0,
143  0, 0, 0, 0, 0,
144 };
145 
146 #ifndef UNIV_HOTBACKUP
147 /*******************************************************************/
151 static
152 void
153 btr_cur_unmark_extern_fields(
154 /*=========================*/
155  page_zip_des_t* page_zip,
157  rec_t* rec,
159  const ulint* offsets,
160  mtr_t* mtr);
161 /*******************************************************************/
164 static
165 void
166 btr_cur_add_path_info(
167 /*==================*/
168  btr_cur_t* cursor,
169  ulint height,
171  ulint root_height);
172 /***********************************************************/
175 static
176 void
177 btr_rec_free_updated_extern_fields(
178 /*===============================*/
181  rec_t* rec,
182  page_zip_des_t* page_zip,
184  const ulint* offsets,
185  const upd_t* update,
186  enum trx_rb_ctx rb_ctx,
187  mtr_t* mtr);
189 /***********************************************************/
191 static
192 void
193 btr_rec_free_externally_stored_fields(
194 /*==================================*/
197  rec_t* rec,
198  const ulint* offsets,
199  page_zip_des_t* page_zip,
201  enum trx_rb_ctx rb_ctx,
202  mtr_t* mtr);
205 /***********************************************************/
208 static
209 ulint
210 btr_rec_get_externally_stored_len(
211 /*==============================*/
212  const rec_t* rec,
213  const ulint* offsets);
214 #endif /* !UNIV_HOTBACKUP */
215 
216 /******************************************************/
218 UNIV_INLINE
219 void
221 /*=====================*/
222  rec_t* rec,
223  page_zip_des_t* page_zip,
224  ulint flag)
225 {
226  if (page_rec_is_comp(rec)) {
227  rec_set_deleted_flag_new(rec, page_zip, flag);
228  } else {
229  ut_ad(!page_zip);
230  rec_set_deleted_flag_old(rec, flag);
231  }
232 }
233 
234 #ifndef UNIV_HOTBACKUP
235 /*==================== B-TREE SEARCH =========================*/
236 
237 /********************************************************************/
239 static
240 void
241 btr_cur_latch_leaves(
242 /*=================*/
243  page_t* page,
245  ulint space,
246  ulint zip_size,
248  ulint page_no,
249  ulint latch_mode,
250  btr_cur_t* cursor,
251  mtr_t* mtr)
252 {
253  ulint mode;
254  ulint left_page_no;
255  ulint right_page_no;
256  buf_block_t* get_block;
257 
258  ut_ad(page && mtr);
259 
260  switch (latch_mode) {
261  case BTR_SEARCH_LEAF:
262  case BTR_MODIFY_LEAF:
263  mode = latch_mode == BTR_SEARCH_LEAF ? RW_S_LATCH : RW_X_LATCH;
264  get_block = btr_block_get(
265  space, zip_size, page_no, mode, cursor->index, mtr);
266 #ifdef UNIV_BTR_DEBUG
267  ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
268 #endif /* UNIV_BTR_DEBUG */
269  get_block->check_index_page_at_flush = TRUE;
270  return;
271  case BTR_MODIFY_TREE:
272  /* x-latch also brothers from left to right */
273  left_page_no = btr_page_get_prev(page, mtr);
274 
275  if (left_page_no != FIL_NULL) {
276  get_block = btr_block_get(
277  space, zip_size, left_page_no,
278  RW_X_LATCH, cursor->index, mtr);
279 #ifdef UNIV_BTR_DEBUG
280  ut_a(page_is_comp(get_block->frame)
281  == page_is_comp(page));
282  ut_a(btr_page_get_next(get_block->frame, mtr)
283  == page_get_page_no(page));
284 #endif /* UNIV_BTR_DEBUG */
285  get_block->check_index_page_at_flush = TRUE;
286  }
287 
288  get_block = btr_block_get(
289  space, zip_size, page_no,
290  RW_X_LATCH, cursor->index, mtr);
291 #ifdef UNIV_BTR_DEBUG
292  ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
293 #endif /* UNIV_BTR_DEBUG */
294  get_block->check_index_page_at_flush = TRUE;
295 
296  right_page_no = btr_page_get_next(page, mtr);
297 
298  if (right_page_no != FIL_NULL) {
299  get_block = btr_block_get(
300  space, zip_size, right_page_no,
301  RW_X_LATCH, cursor->index, mtr);
302 #ifdef UNIV_BTR_DEBUG
303  ut_a(page_is_comp(get_block->frame)
304  == page_is_comp(page));
305  ut_a(btr_page_get_prev(get_block->frame, mtr)
306  == page_get_page_no(page));
307 #endif /* UNIV_BTR_DEBUG */
308  get_block->check_index_page_at_flush = TRUE;
309  }
310 
311  return;
312 
313  case BTR_SEARCH_PREV:
314  case BTR_MODIFY_PREV:
315  mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
316  /* latch also left brother */
317  left_page_no = btr_page_get_prev(page, mtr);
318 
319  if (left_page_no != FIL_NULL) {
320  get_block = btr_block_get(
321  space, zip_size,
322  left_page_no, mode, cursor->index, mtr);
323  cursor->left_block = get_block;
324 #ifdef UNIV_BTR_DEBUG
325  ut_a(page_is_comp(get_block->frame)
326  == page_is_comp(page));
327  ut_a(btr_page_get_next(get_block->frame, mtr)
328  == page_get_page_no(page));
329 #endif /* UNIV_BTR_DEBUG */
330  get_block->check_index_page_at_flush = TRUE;
331  }
332 
333  get_block = btr_block_get(
334  space, zip_size, page_no, mode, cursor->index, mtr);
335 #ifdef UNIV_BTR_DEBUG
336  ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
337 #endif /* UNIV_BTR_DEBUG */
338  get_block->check_index_page_at_flush = TRUE;
339  return;
340  }
341 
342  ut_error;
343 }
344 
345 /********************************************************************/
357 UNIV_INTERN
358 void
360 /*========================*/
362  ulint level,
363  const dtuple_t* tuple,
366  ulint mode,
369  ulint latch_mode,
380  btr_cur_t* cursor,
382  ulint has_search_latch,
385  const char* file,
386  ulint line,
387  mtr_t* mtr)
388 {
389  page_t* page;
391  ulint space;
392  buf_block_t* guess;
393  ulint height;
394  ulint page_no;
395  ulint up_match;
396  ulint up_bytes;
397  ulint low_match;
398  ulint low_bytes;
399  ulint savepoint;
400  ulint rw_latch;
401  ulint page_mode;
402  ulint buf_mode;
403  ulint estimate;
404  ulint zip_size;
405  page_cur_t* page_cursor;
406  btr_op_t btr_op;
407  ulint root_height = 0; /* remove warning */
408 
409 #ifdef BTR_CUR_ADAPT
410  btr_search_t* info;
411 #endif
412  mem_heap_t* heap = NULL;
413  ulint offsets_[REC_OFFS_NORMAL_SIZE];
414  ulint* offsets = offsets_;
415  rec_offs_init(offsets_);
416  /* Currently, PAGE_CUR_LE is the only search mode used for searches
417  ending to upper levels */
418 
419  ut_ad(level == 0 || mode == PAGE_CUR_LE);
420  ut_ad(dict_index_check_search_tuple(index, tuple));
421  ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
422  ut_ad(dtuple_check_typed(tuple));
423  ut_ad(!(index->type & DICT_FTS));
424  ut_ad(index->page != FIL_NULL);
425 
426  UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
427  UNIV_MEM_INVALID(&cursor->up_bytes, sizeof cursor->up_bytes);
428  UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
429  UNIV_MEM_INVALID(&cursor->low_bytes, sizeof cursor->low_bytes);
430 #ifdef UNIV_DEBUG
431  cursor->up_match = ULINT_UNDEFINED;
432  cursor->low_match = ULINT_UNDEFINED;
433 #endif
434 
435  ibool s_latch_by_caller;
436 
437  s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
438 
439  ut_ad(!s_latch_by_caller
440  || mtr_memo_contains(mtr, dict_index_get_lock(index),
441  MTR_MEMO_S_LOCK));
442 
443  /* These flags are mutually exclusive, they are lumped together
444  with the latch mode for historical reasons. It's possible for
445  none of the flags to be set. */
446  switch (UNIV_EXPECT(latch_mode
448  0)) {
449  case 0:
450  btr_op = BTR_NO_OP;
451  break;
452  case BTR_INSERT:
453  btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
455  : BTR_INSERT_OP;
456  break;
457  case BTR_DELETE:
458  btr_op = BTR_DELETE_OP;
459  ut_a(cursor->purge_node);
460  break;
461  case BTR_DELETE_MARK:
462  btr_op = BTR_DELMARK_OP;
463  break;
464  default:
465  /* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
466  should be specified at a time */
467  ut_error;
468  }
469 
470  /* Operations on the insert buffer tree cannot be buffered. */
471  ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
472  /* Operations on the clustered index cannot be buffered. */
473  ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));
474 
475  estimate = latch_mode & BTR_ESTIMATE;
476 
477  /* Turn the flags unrelated to the latch mode off. */
478  latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
479 
480  ut_ad(!s_latch_by_caller
481  || latch_mode == BTR_SEARCH_LEAF
482  || latch_mode == BTR_MODIFY_LEAF);
483 
484  cursor->flag = BTR_CUR_BINARY;
485  cursor->index = index;
486 
487 #ifndef BTR_CUR_ADAPT
488  guess = NULL;
489 #else
490  info = btr_search_get_info(index);
491 
492  guess = info->root_guess;
493 
494 #ifdef BTR_CUR_HASH_ADAPT
495 
496 # ifdef UNIV_SEARCH_PERF_STAT
497  info->n_searches++;
498 # endif
499  if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_NOT_LOCKED
500  && latch_mode <= BTR_MODIFY_LEAF
501  && info->last_hash_succ
502  && !estimate
503 # ifdef PAGE_CUR_LE_OR_EXTENDS
504  && mode != PAGE_CUR_LE_OR_EXTENDS
505 # endif /* PAGE_CUR_LE_OR_EXTENDS */
506  /* If !has_search_latch, we do a dirty read of
507  btr_search_enabled below, and btr_search_guess_on_hash()
508  will have to check it again. */
509  && UNIV_LIKELY(btr_search_enabled)
510  && btr_search_guess_on_hash(index, info, tuple, mode,
511  latch_mode, cursor,
512  has_search_latch, mtr)) {
513 
514  /* Search using the hash index succeeded */
515 
516  ut_ad(cursor->up_match != ULINT_UNDEFINED
517  || mode != PAGE_CUR_GE);
518  ut_ad(cursor->up_match != ULINT_UNDEFINED
519  || mode != PAGE_CUR_LE);
520  ut_ad(cursor->low_match != ULINT_UNDEFINED
521  || mode != PAGE_CUR_LE);
522  btr_cur_n_sea++;
523 
524  return;
525  }
526 # endif /* BTR_CUR_HASH_ADAPT */
527 #endif /* BTR_CUR_ADAPT */
529 
530  /* If the hash search did not succeed, do binary search down the
531  tree */
532 
533  if (has_search_latch) {
534  /* Release possible search latch to obey latching order */
535  rw_lock_s_unlock(&btr_search_latch);
536  }
537 
538  /* Store the position of the tree latch we push to mtr so that we
539  know how to release it when we have latched leaf node(s) */
540 
541  savepoint = mtr_set_savepoint(mtr);
542 
543  switch (latch_mode) {
544  case BTR_MODIFY_TREE:
545  mtr_x_lock(dict_index_get_lock(index), mtr);
546  break;
548  /* Do nothing */
549  ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
550  MTR_MEMO_X_LOCK));
551  break;
552  default:
553  if (!s_latch_by_caller) {
554  mtr_s_lock(dict_index_get_lock(index), mtr);
555  }
556  }
557 
558  page_cursor = btr_cur_get_page_cur(cursor);
559 
560  space = dict_index_get_space(index);
561  page_no = dict_index_get_page(index);
562 
563  up_match = 0;
564  up_bytes = 0;
565  low_match = 0;
566  low_bytes = 0;
567 
568  height = ULINT_UNDEFINED;
569 
570  /* We use these modified search modes on non-leaf levels of the
571  B-tree. These let us end up in the right B-tree leaf. In that leaf
572  we use the original search mode. */
573 
574  switch (mode) {
575  case PAGE_CUR_GE:
576  page_mode = PAGE_CUR_L;
577  break;
578  case PAGE_CUR_G:
579  page_mode = PAGE_CUR_LE;
580  break;
581  default:
582 #ifdef PAGE_CUR_LE_OR_EXTENDS
583  ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
584  || mode == PAGE_CUR_LE_OR_EXTENDS);
585 #else /* PAGE_CUR_LE_OR_EXTENDS */
586  ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
587 #endif /* PAGE_CUR_LE_OR_EXTENDS */
588  page_mode = mode;
589  break;
590  }
591 
592  /* Loop and search until we arrive at the desired level */
593 
594 search_loop:
595  buf_mode = BUF_GET;
596  rw_latch = RW_NO_LATCH;
597 
598  if (height != 0) {
599  /* We are about to fetch the root or a non-leaf page. */
600  } else if (latch_mode <= BTR_MODIFY_LEAF) {
601  rw_latch = latch_mode;
602 
603  if (btr_op != BTR_NO_OP
604  && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {
605 
606  /* Try to buffer the operation if the leaf
607  page is not in the buffer pool. */
608 
609  buf_mode = btr_op == BTR_DELETE_OP
612  }
613  }
614 
615  zip_size = dict_table_zip_size(index->table);
616 
617 retry_page_get:
618  block = buf_page_get_gen(
619  space, zip_size, page_no, rw_latch, guess, buf_mode,
620  file, line, mtr);
621 
622  if (block == NULL) {
623  /* This must be a search to perform an insert/delete
624  mark/ delete; try using the insert/delete buffer */
625 
626  ut_ad(height == 0);
627  ut_ad(cursor->thr);
628 
629  switch (btr_op) {
630  case BTR_INSERT_OP:
632  ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
633 
634  if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
635  space, zip_size, page_no,
636  cursor->thr)) {
637 
638  cursor->flag = BTR_CUR_INSERT_TO_IBUF;
639 
640  goto func_exit;
641  }
642  break;
643 
644  case BTR_DELMARK_OP:
645  ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
646 
647  if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
648  index, space, zip_size,
649  page_no, cursor->thr)) {
650 
651  cursor->flag = BTR_CUR_DEL_MARK_IBUF;
652 
653  goto func_exit;
654  }
655 
656  break;
657 
658  case BTR_DELETE_OP:
659  ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
660 
661  if (!row_purge_poss_sec(cursor->purge_node,
662  index, tuple)) {
663 
664  /* The record cannot be purged yet. */
665  cursor->flag = BTR_CUR_DELETE_REF;
666  } else if (ibuf_insert(IBUF_OP_DELETE, tuple,
667  index, space, zip_size,
668  page_no,
669  cursor->thr)) {
670 
671  /* The purge was buffered. */
672  cursor->flag = BTR_CUR_DELETE_IBUF;
673  } else {
674  /* The purge could not be buffered. */
675  buf_pool_watch_unset(space, page_no);
676  break;
677  }
678 
679  buf_pool_watch_unset(space, page_no);
680  goto func_exit;
681 
682  default:
683  ut_error;
684  }
685 
686  /* Insert to the insert/delete buffer did not succeed, we
687  must read the page from disk. */
688 
689  buf_mode = BUF_GET;
690 
691  goto retry_page_get;
692  }
693 
694  block->check_index_page_at_flush = TRUE;
695  page = buf_block_get_frame(block);
696 
697  if (rw_latch != RW_NO_LATCH) {
698 #ifdef UNIV_ZIP_DEBUG
699  const page_zip_des_t* page_zip
700  = buf_block_get_page_zip(block);
701  ut_a(!page_zip || page_zip_validate(page_zip, page, index));
702 #endif /* UNIV_ZIP_DEBUG */
703 
704  buf_block_dbg_add_level(
705  block, dict_index_is_ibuf(index)
706  ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
707  }
708 
710  ut_ad(index->id == btr_page_get_index_id(page));
711 
712  if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
713  /* We are in the root node */
714 
715  height = btr_page_get_level(page, mtr);
716  root_height = height;
717  cursor->tree_height = root_height + 1;
718 
719 #ifdef BTR_CUR_ADAPT
720  if (block != guess) {
721  info->root_guess = block;
722  }
723 #endif
724  }
725 
726  if (height == 0) {
727  if (rw_latch == RW_NO_LATCH) {
728 
729  btr_cur_latch_leaves(
730  page, space, zip_size, page_no, latch_mode,
731  cursor, mtr);
732  }
733 
734  switch (latch_mode) {
735  case BTR_MODIFY_TREE:
737  break;
738  default:
739  if (!s_latch_by_caller) {
740  /* Release the tree s-latch */
742  mtr, savepoint,
743  dict_index_get_lock(index));
744  }
745  }
746 
747  page_mode = mode;
748  }
749 
751  block, index, tuple, page_mode, &up_match, &up_bytes,
752  &low_match, &low_bytes, page_cursor);
753 
754  if (estimate) {
755  btr_cur_add_path_info(cursor, height, root_height);
756  }
757 
758  /* If this is the desired level, leave the loop */
759 
760  ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor),
761  mtr));
762 
763  if (level != height) {
764 
765  const rec_t* node_ptr;
766  ut_ad(height > 0);
767 
768  height--;
769  guess = NULL;
770 
771  node_ptr = page_cur_get_rec(page_cursor);
772 
773  offsets = rec_get_offsets(
774  node_ptr, index, offsets, ULINT_UNDEFINED, &heap);
775 
776  /* Go to the child node */
777  page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
778 
779  if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
780  /* We're doing a search on an ibuf tree and we're one
781  level above the leaf page. */
782 
783  ut_ad(level == 0);
784 
785  buf_mode = BUF_GET;
786  rw_latch = RW_NO_LATCH;
787  goto retry_page_get;
788  }
789 
790  goto search_loop;
791  }
792 
793  if (level != 0) {
794  /* x-latch the page */
795  buf_block_t* child_block = btr_block_get(
796  space, zip_size, page_no, RW_X_LATCH, index, mtr);
797 
798  page = buf_block_get_frame(child_block);
799  btr_assert_not_corrupted(child_block, index);
800  } else {
801  cursor->low_match = low_match;
802  cursor->low_bytes = low_bytes;
803  cursor->up_match = up_match;
804  cursor->up_bytes = up_bytes;
805 
806 #ifdef BTR_CUR_ADAPT
807  /* We do a dirty read of btr_search_enabled here. We
808  will properly check btr_search_enabled again in
809  btr_search_build_page_hash_index() before building a
810  page hash index, while holding btr_search_latch. */
811  if (btr_search_enabled) {
812  btr_search_info_update(index, cursor);
813  }
814 #endif
815  ut_ad(cursor->up_match != ULINT_UNDEFINED
816  || mode != PAGE_CUR_GE);
817  ut_ad(cursor->up_match != ULINT_UNDEFINED
818  || mode != PAGE_CUR_LE);
819  ut_ad(cursor->low_match != ULINT_UNDEFINED
820  || mode != PAGE_CUR_LE);
821  }
822 
823 func_exit:
824 
825  if (UNIV_LIKELY_NULL(heap)) {
826  mem_heap_free(heap);
827  }
828 
829  if (has_search_latch) {
830 
832  }
833 }
834 
835 /*****************************************************************/
837 UNIV_INTERN
838 void
840 /*============================*/
841  bool from_left,
844  ulint latch_mode,
845  btr_cur_t* cursor,
846  ulint level,
848  const char* file,
849  ulint line,
850  mtr_t* mtr)
851 {
852  page_cur_t* page_cursor;
853  ulint page_no;
854  ulint space;
855  ulint zip_size;
856  ulint height;
857  ulint root_height = 0; /* remove warning */
858  rec_t* node_ptr;
859  ulint estimate;
860  ulint savepoint;
861  mem_heap_t* heap = NULL;
862  ulint offsets_[REC_OFFS_NORMAL_SIZE];
863  ulint* offsets = offsets_;
864  rec_offs_init(offsets_);
865 
866  estimate = latch_mode & BTR_ESTIMATE;
867  latch_mode &= ~BTR_ESTIMATE;
868 
869  ut_ad(level != ULINT_UNDEFINED);
870 
871  /* Store the position of the tree latch we push to mtr so that we
872  know how to release it when we have latched the leaf node */
873 
874  savepoint = mtr_set_savepoint(mtr);
875 
876  switch (latch_mode) {
878  break;
879  case BTR_MODIFY_TREE:
880  mtr_x_lock(dict_index_get_lock(index), mtr);
881  break;
884  ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
885  MTR_MEMO_S_LOCK));
886  break;
887  default:
888  mtr_s_lock(dict_index_get_lock(index), mtr);
889  }
890 
891  page_cursor = btr_cur_get_page_cur(cursor);
892  cursor->index = index;
893 
894  space = dict_index_get_space(index);
895  zip_size = dict_table_zip_size(index->table);
896  page_no = dict_index_get_page(index);
897 
898  height = ULINT_UNDEFINED;
899 
900  for (;;) {
902  page_t* page;
903  block = buf_page_get_gen(space, zip_size, page_no,
904  RW_NO_LATCH, NULL, BUF_GET,
905  file, line, mtr);
906  page = buf_block_get_frame(block);
908  ut_ad(index->id == btr_page_get_index_id(page));
909 
910  block->check_index_page_at_flush = TRUE;
911 
912  if (height == ULINT_UNDEFINED) {
913  /* We are in the root node */
914 
915  height = btr_page_get_level(page, mtr);
916  root_height = height;
917  ut_a(height >= level);
918  } else {
919  /* TODO: flag the index corrupted if this fails */
920  ut_ad(height == btr_page_get_level(page, mtr));
921  }
922 
923  if (height == level) {
924  btr_cur_latch_leaves(
925  page, space, zip_size, page_no,
926  latch_mode & ~BTR_ALREADY_S_LATCHED,
927  cursor, mtr);
928 
929  if (height == 0) {
930  /* In versions <= 3.23.52 we had
931  forgotten to release the tree latch
932  here. If in an index scan we had to
933  scan far to find a record visible to
934  the current transaction, that could
935  starve others waiting for the tree
936  latch. */
937 
938  switch (latch_mode) {
939  case BTR_MODIFY_TREE:
943  break;
944  default:
945  /* Release the tree s-latch */
946 
948  mtr, savepoint,
949  dict_index_get_lock(index));
950  }
951  }
952  }
953 
954  if (from_left) {
955  page_cur_set_before_first(block, page_cursor);
956  } else {
957  page_cur_set_after_last(block, page_cursor);
958  }
959 
960  if (height == level) {
961  if (estimate) {
962  btr_cur_add_path_info(cursor, height,
963  root_height);
964  }
965 
966  break;
967  }
968 
969  ut_ad(height > 0);
970 
971  if (from_left) {
972  page_cur_move_to_next(page_cursor);
973  } else {
974  page_cur_move_to_prev(page_cursor);
975  }
976 
977  if (estimate) {
978  btr_cur_add_path_info(cursor, height, root_height);
979  }
980 
981  height--;
982 
983  node_ptr = page_cur_get_rec(page_cursor);
984  offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
985  ULINT_UNDEFINED, &heap);
986  /* Go to the child node */
987  page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
988  }
989 
990  if (heap) {
991  mem_heap_free(heap);
992  }
993 }
994 
995 /**********************************************************************/
997 UNIV_INTERN
998 void
1000 /*=========================*/
1001  dict_index_t* index,
1002  ulint latch_mode,
1003  btr_cur_t* cursor,
1004  const char* file,
1005  ulint line,
1006  mtr_t* mtr)
1007 {
1008  page_cur_t* page_cursor;
1009  ulint page_no;
1010  ulint space;
1011  ulint zip_size;
1012  ulint height;
1013  rec_t* node_ptr;
1014  mem_heap_t* heap = NULL;
1015  ulint offsets_[REC_OFFS_NORMAL_SIZE];
1016  ulint* offsets = offsets_;
1017  rec_offs_init(offsets_);
1018 
1019  switch (latch_mode) {
1020  case BTR_MODIFY_TREE:
1021  mtr_x_lock(dict_index_get_lock(index), mtr);
1022  break;
1023  default:
1024  ut_ad(latch_mode != BTR_CONT_MODIFY_TREE);
1025  mtr_s_lock(dict_index_get_lock(index), mtr);
1026  }
1027 
1028  page_cursor = btr_cur_get_page_cur(cursor);
1029  cursor->index = index;
1030 
1031  space = dict_index_get_space(index);
1032  zip_size = dict_table_zip_size(index->table);
1033  page_no = dict_index_get_page(index);
1034 
1035  height = ULINT_UNDEFINED;
1036 
1037  for (;;) {
1038  buf_block_t* block;
1039  page_t* page;
1040 
1041  block = buf_page_get_gen(space, zip_size, page_no,
1042  RW_NO_LATCH, NULL, BUF_GET,
1043  file, line, mtr);
1044  page = buf_block_get_frame(block);
1046  ut_ad(index->id == btr_page_get_index_id(page));
1047 
1048  if (height == ULINT_UNDEFINED) {
1049  /* We are in the root node */
1050 
1051  height = btr_page_get_level(page, mtr);
1052  }
1053 
1054  if (height == 0) {
1055  btr_cur_latch_leaves(page, space, zip_size, page_no,
1056  latch_mode, cursor, mtr);
1057  }
1058 
1059  page_cur_open_on_rnd_user_rec(block, page_cursor);
1060 
1061  if (height == 0) {
1062 
1063  break;
1064  }
1065 
1066  ut_ad(height > 0);
1067 
1068  height--;
1069 
1070  node_ptr = page_cur_get_rec(page_cursor);
1071  offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
1072  ULINT_UNDEFINED, &heap);
1073  /* Go to the child node */
1074  page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
1075  }
1076 
1077  if (UNIV_LIKELY_NULL(heap)) {
1078  mem_heap_free(heap);
1079  }
1080 }
1081 
1082 /*==================== B-TREE INSERT =========================*/
1083 
1084 /*************************************************************/
1096 static __attribute__((nonnull, warn_unused_result))
1097 rec_t*
1098 btr_cur_insert_if_possible(
1099 /*=======================*/
1100  btr_cur_t* cursor,
1102  const dtuple_t* tuple,
1104  ulint** offsets,
1105  mem_heap_t** heap,
1106  ulint n_ext,
1107  mtr_t* mtr)
1108 {
1109  page_cur_t* page_cursor;
1110  rec_t* rec;
1111 
1112  ut_ad(dtuple_check_typed(tuple));
1113 
1114  ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
1115  MTR_MEMO_PAGE_X_FIX));
1116  page_cursor = btr_cur_get_page_cur(cursor);
1117 
1118  /* Now, try the insert */
1119  rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
1120  offsets, heap, n_ext, mtr);
1121 
1122  /* If the record did not fit, reorganize.
1123  For compressed pages, page_cur_tuple_insert()
1124  attempted this already. */
1125  if (!rec && !page_cur_get_page_zip(page_cursor)
1126  && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
1127  rec = page_cur_tuple_insert(
1128  page_cursor, tuple, cursor->index,
1129  offsets, heap, n_ext, mtr);
1130  }
1131 
1132  ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
1133  return(rec);
1134 }
1135 
1136 /*************************************************************/
1139 UNIV_INLINE __attribute__((warn_unused_result, nonnull(2,3,5,6)))
1140 dberr_t
1141 btr_cur_ins_lock_and_undo(
1142 /*======================*/
1143  ulint flags,
1146  btr_cur_t* cursor,
1147  dtuple_t* entry,
1148  que_thr_t* thr,
1149  mtr_t* mtr,
1150  ibool* inherit)
1153 {
1155  dberr_t err;
1156  rec_t* rec;
1157  roll_ptr_t roll_ptr;
1158 
1159  /* Check if we have to wait for a lock: enqueue an explicit lock
1160  request if yes */
1161 
1162  rec = btr_cur_get_rec(cursor);
1163  index = cursor->index;
1164 
1166  || dict_index_is_clust(index)
1167  || (flags & BTR_CREATE_FLAG));
1168 
1169  err = lock_rec_insert_check_and_lock(flags, rec,
1170  btr_cur_get_block(cursor),
1171  index, thr, mtr, inherit);
1172 
1173  if (err != DB_SUCCESS
1174  || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {
1175 
1176  return(err);
1177  }
1178 
1179  err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
1180  thr, index, entry,
1181  NULL, 0, NULL, NULL,
1182  &roll_ptr);
1183  if (err != DB_SUCCESS) {
1184 
1185  return(err);
1186  }
1187 
1188  /* Now we can fill in the roll ptr field in entry */
1189 
1190  if (!(flags & BTR_KEEP_SYS_FLAG)) {
1191 
1192  row_upd_index_entry_sys_field(entry, index,
1193  DATA_ROLL_PTR, roll_ptr);
1194  }
1195 
1196  return(DB_SUCCESS);
1197 }
1198 
1199 #ifdef UNIV_DEBUG
1200 /*************************************************************/
1202 static
1203 void
1204 btr_cur_trx_report(
1205 /*===============*/
1206  trx_id_t trx_id,
1207  const dict_index_t* index,
1208  const char* op)
1209 {
1210  fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ", trx_id);
1211  fputs(op, stderr);
1212  dict_index_name_print(stderr, NULL, index);
1213  putc('\n', stderr);
1214 }
1215 #endif /* UNIV_DEBUG */
1216 
1217 /*************************************************************/
1224 UNIV_INTERN
1225 dberr_t
1227 /*======================*/
1228  ulint flags,
1231  btr_cur_t* cursor,
1233  ulint** offsets,
1234  mem_heap_t** heap,
1235  dtuple_t* entry,
1236  rec_t** rec,
1238  big_rec_t** big_rec,
1241  ulint n_ext,
1242  que_thr_t* thr,
1243  mtr_t* mtr)
1249 {
1250  big_rec_t* big_rec_vec = NULL;
1252  page_cur_t* page_cursor;
1253  buf_block_t* block;
1254  page_t* page;
1255  rec_t* dummy;
1256  ibool leaf;
1257  ibool reorg;
1258  ibool inherit;
1259  ulint zip_size;
1260  ulint rec_size;
1261  dberr_t err;
1262 
1263  *big_rec = NULL;
1264 
1265  block = btr_cur_get_block(cursor);
1266  page = buf_block_get_frame(block);
1267  index = cursor->index;
1268 
1269  ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
1271  || dict_index_is_clust(index)
1272  || (flags & BTR_CREATE_FLAG));
1273  ut_ad(dtuple_check_typed(entry));
1274 
1275  zip_size = buf_block_get_zip_size(block);
1276 #ifdef UNIV_DEBUG_VALGRIND
1277  if (zip_size) {
1278  UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
1279  UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
1280  }
1281 #endif /* UNIV_DEBUG_VALGRIND */
1282 
1283 #ifdef UNIV_DEBUG
1284  if (btr_cur_print_record_ops && thr) {
1285  btr_cur_trx_report(thr_get_trx(thr)->id, index, "insert ");
1286  dtuple_print(stderr, entry);
1287  }
1288 #endif /* UNIV_DEBUG */
1289 
1290  leaf = page_is_leaf(page);
1291 
1292  /* Calculate the record size when entry is converted to a record */
1293  rec_size = rec_get_converted_size(index, entry, n_ext);
1294 
1295  if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
1296  dtuple_get_n_fields(entry), zip_size)) {
1297 
1298  /* The record is so big that we have to store some fields
1299  externally on separate database pages */
1300  big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);
1301 
1302  if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
1303 
1304  return(DB_TOO_BIG_RECORD);
1305  }
1306 
1307  rec_size = rec_get_converted_size(index, entry, n_ext);
1308  }
1309 
1310  if (zip_size) {
1311  /* Estimate the free space of an empty compressed page.
1312  Subtract one byte for the encoded heap_no in the
1313  modification log. */
1314  ulint free_space_zip = page_zip_empty_size(
1315  cursor->index->n_fields, zip_size);
1316  ulint n_uniq = dict_index_get_n_unique_in_tree(index);
1317 
1318  ut_ad(dict_table_is_comp(index->table));
1319 
1320  if (free_space_zip == 0) {
1321 too_big:
1322  if (big_rec_vec) {
1324  index, entry, big_rec_vec);
1325  }
1326 
1327  return(DB_TOO_BIG_RECORD);
1328  }
1329 
1330  /* Subtract one byte for the encoded heap_no in the
1331  modification log. */
1332  free_space_zip--;
1333 
1334  /* There should be enough room for two node pointer
1335  records on an empty non-leaf page. This prevents
1336  infinite page splits. */
1337 
1338  if (entry->n_fields >= n_uniq
1339  && (REC_NODE_PTR_SIZE
1341  index, entry->fields, n_uniq, NULL)
1342  /* On a compressed page, there is
1343  a two-byte entry in the dense
1344  page directory for every record.
1345  But there is no record header. */
1346  - (REC_N_NEW_EXTRA_BYTES - 2)
1347  > free_space_zip / 2)) {
1348  goto too_big;
1349  }
1350  }
1351 
1352  LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
1353  goto fail);
1354 
1355  if (leaf && zip_size
1356  && (page_get_data_size(page) + rec_size
1358  /* If compression padding tells us that insertion will
1359  result in too packed up page i.e.: which is likely to
1360  cause compression failure then don't do an optimistic
1361  insertion. */
1362 fail:
1363  err = DB_FAIL;
1364 fail_err:
1365 
1366  if (big_rec_vec) {
1367  dtuple_convert_back_big_rec(index, entry, big_rec_vec);
1368  }
1369 
1370  return(err);
1371  }
1372 
1373  ulint max_size = page_get_max_insert_size_after_reorganize(page, 1);
1374 
1375  if (page_has_garbage(page)) {
1376  if ((max_size < rec_size
1377  || max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT)
1378  && page_get_n_recs(page) > 1
1379  && page_get_max_insert_size(page, 1) < rec_size) {
1380 
1381  goto fail;
1382  }
1383  } else if (max_size < rec_size) {
1384  goto fail;
1385  }
1386 
1387  /* If there have been many consecutive inserts to the
1388  clustered index leaf page of an uncompressed table, check if
1389  we have to split the page to reserve enough free space for
1390  future updates of records. */
1391 
1392  if (leaf && !zip_size && dict_index_is_clust(index)
1393  && page_get_n_recs(page) >= 2
1394  && dict_index_get_space_reserve() + rec_size > max_size
1395  && (btr_page_get_split_rec_to_right(cursor, &dummy)
1396  || btr_page_get_split_rec_to_left(cursor, &dummy))) {
1397  goto fail;
1398  }
1399 
1400  /* Check locks and write to the undo log, if specified */
1401  err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
1402  thr, mtr, &inherit);
1403 
1404  if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
1405 
1406  goto fail_err;
1407  }
1408 
1409  page_cursor = btr_cur_get_page_cur(cursor);
1410 
1411  /* Now, try the insert */
1412 
1413  {
1414  const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
1415  *rec = page_cur_tuple_insert(page_cursor, entry, index,
1416  offsets, heap, n_ext, mtr);
1417  reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
1418  }
1419 
1420  if (*rec) {
1421  } else if (zip_size) {
1422  /* Reset the IBUF_BITMAP_FREE bits, because
1423  page_cur_tuple_insert() will have attempted page
1424  reorganize before failing. */
1425  if (leaf && !dict_index_is_clust(index)) {
1426  ibuf_reset_free_bits(block);
1427  }
1428 
1429  goto fail;
1430  } else {
1431  ut_ad(!reorg);
1432 
1433  /* If the record did not fit, reorganize */
1434  if (!btr_page_reorganize(page_cursor, index, mtr)) {
1435  ut_ad(0);
1436  goto fail;
1437  }
1438 
1439  ut_ad(page_get_max_insert_size(page, 1) == max_size);
1440 
1441  reorg = TRUE;
1442 
1443  *rec = page_cur_tuple_insert(page_cursor, entry, index,
1444  offsets, heap, n_ext, mtr);
1445 
1446  if (UNIV_UNLIKELY(!*rec)) {
1447  fputs("InnoDB: Error: cannot insert tuple ", stderr);
1448  dtuple_print(stderr, entry);
1449  fputs(" into ", stderr);
1450  dict_index_name_print(stderr, thr_get_trx(thr), index);
1451  fprintf(stderr, "\nInnoDB: max insert size %lu\n",
1452  (ulong) max_size);
1453  ut_error;
1454  }
1455  }
1456 
1457 #ifdef BTR_CUR_HASH_ADAPT
1458  if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
1460  } else {
1462  }
1463 #endif
1464 
1465  if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
1466 
1467  lock_update_insert(block, *rec);
1468  }
1469 
1470  if (leaf && !dict_index_is_clust(index)) {
1471  /* Update the free bits of the B-tree page in the
1472  insert buffer bitmap. */
1473 
1474  /* The free bits in the insert buffer bitmap must
1475  never exceed the free space on a page. It is safe to
1476  decrement or reset the bits in the bitmap in a
1477  mini-transaction that is committed before the
1478  mini-transaction that affects the free space. */
1479 
1480  /* It is unsafe to increment the bits in a separately
1481  committed mini-transaction, because in crash recovery,
1482  the free bits could momentarily be set too high. */
1483 
1484  if (zip_size) {
1485  /* Update the bits in the same mini-transaction. */
1486  ibuf_update_free_bits_zip(block, mtr);
1487  } else {
1488  /* Decrement the bits in a separate
1489  mini-transaction. */
1491  block, max_size,
1492  rec_size + PAGE_DIR_SLOT_SIZE);
1493  }
1494  }
1495 
1496  *big_rec = big_rec_vec;
1497 
1498  return(DB_SUCCESS);
1499 }
1500 
1501 /*************************************************************/
1507 UNIV_INTERN
1508 dberr_t
1510 /*=======================*/
1511  ulint flags,
1517  btr_cur_t* cursor,
1519  ulint** offsets,
1520  mem_heap_t** heap,
1522  dtuple_t* entry,
1523  rec_t** rec,
1525  big_rec_t** big_rec,
1528  ulint n_ext,
1529  que_thr_t* thr,
1530  mtr_t* mtr)
1531 {
1532  dict_index_t* index = cursor->index;
1533  ulint zip_size = dict_table_zip_size(index->table);
1534  big_rec_t* big_rec_vec = NULL;
1535  dberr_t err;
1536  ibool dummy_inh;
1537  ibool success;
1538  ulint n_reserved = 0;
1539 
1540  ut_ad(dtuple_check_typed(entry));
1541 
1542  *big_rec = NULL;
1543 
1544  ut_ad(mtr_memo_contains(mtr,
1546  MTR_MEMO_X_LOCK));
1547  ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
1548  MTR_MEMO_PAGE_X_FIX));
1550  || dict_index_is_clust(index)
1551  || (flags & BTR_CREATE_FLAG));
1552 
1553  cursor->flag = BTR_CUR_BINARY;
1554 
1555  /* Check locks and write to undo log, if specified */
1556 
1557  err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
1558  thr, mtr, &dummy_inh);
1559 
1560  if (err != DB_SUCCESS) {
1561 
1562  return(err);
1563  }
1564 
1565  if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
1566  /* First reserve enough free space for the file segments
1567  of the index tree, so that the insert will not fail because
1568  of lack of space */
1569 
1570  ulint n_extents = cursor->tree_height / 16 + 3;
1571 
1572  success = fsp_reserve_free_extents(&n_reserved, index->space,
1573  n_extents, FSP_NORMAL, mtr);
1574  if (!success) {
1575  return(DB_OUT_OF_FILE_SPACE);
1576  }
1577  }
1578 
1579  if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
1580  dict_table_is_comp(index->table),
1581  dtuple_get_n_fields(entry),
1582  zip_size)) {
1583  /* The record is so big that we have to store some fields
1584  externally on separate database pages */
1585 
1586  if (UNIV_LIKELY_NULL(big_rec_vec)) {
1587  /* This should never happen, but we handle
1588  the situation in a robust manner. */
1589  ut_ad(0);
1590  dtuple_convert_back_big_rec(index, entry, big_rec_vec);
1591  }
1592 
1593  big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);
1594 
1595  if (big_rec_vec == NULL) {
1596 
1597  if (n_reserved > 0) {
1599  n_reserved);
1600  }
1601  return(DB_TOO_BIG_RECORD);
1602  }
1603  }
1604 
1605  if (dict_index_get_page(index)
1606  == buf_block_get_page_no(btr_cur_get_block(cursor))) {
1607 
1608  /* The page is the root page */
1610  flags, cursor, offsets, heap, entry, n_ext, mtr);
1611  } else {
1613  flags, cursor, offsets, heap, entry, n_ext, mtr);
1614  }
1615 
1616  ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec);
1617 
1618 #ifdef BTR_CUR_ADAPT
1620 #endif
1621  if (!(flags & BTR_NO_LOCKING_FLAG)) {
1622 
1623  lock_update_insert(btr_cur_get_block(cursor), *rec);
1624  }
1625 
1626  if (n_reserved > 0) {
1627  fil_space_release_free_extents(index->space, n_reserved);
1628  }
1629 
1630  *big_rec = big_rec_vec;
1631 
1632  return(DB_SUCCESS);
1633 }
1634 
1635 /*==================== B-TREE UPDATE =========================*/
1636 
1637 /*************************************************************/
1640 UNIV_INLINE __attribute__((warn_unused_result, nonnull(2,3,6,7)))
1641 dberr_t
1642 btr_cur_upd_lock_and_undo(
1643 /*======================*/
1644  ulint flags,
1645  btr_cur_t* cursor,
1646  const ulint* offsets,
1647  const upd_t* update,
1648  ulint cmpl_info,
1650  que_thr_t* thr,
1652  mtr_t* mtr,
1653  roll_ptr_t* roll_ptr)
1654 {
1656  const rec_t* rec;
1657  dberr_t err;
1658 
1659  ut_ad(thr || (flags & BTR_NO_LOCKING_FLAG));
1660 
1661  rec = btr_cur_get_rec(cursor);
1662  index = cursor->index;
1663 
1664  ut_ad(rec_offs_validate(rec, index, offsets));
1665 
1666  if (!dict_index_is_clust(index)) {
1668  == !!(flags & BTR_CREATE_FLAG));
1669 
1670  /* We do undo logging only when we update a clustered index
1671  record */
1673  flags, btr_cur_get_block(cursor), rec,
1674  index, thr, mtr));
1675  }
1676 
1677  /* Check if we have to wait for a lock: enqueue an explicit lock
1678  request if yes */
1679 
1680  if (!(flags & BTR_NO_LOCKING_FLAG)) {
1682  flags, btr_cur_get_block(cursor), rec, index,
1683  offsets, thr);
1684  if (err != DB_SUCCESS) {
1685  return(err);
1686  }
1687  }
1688 
1689  /* Append the info about the update in the undo log */
1690 
1692  flags, TRX_UNDO_MODIFY_OP, thr,
1693  index, NULL, update,
1694  cmpl_info, rec, offsets, roll_ptr));
1695 }
1696 
1697 /***********************************************************/
1699 UNIV_INLINE __attribute__((nonnull))
1700 void
1701 btr_cur_update_in_place_log(
1702 /*========================*/
1703  ulint flags,
1704  const rec_t* rec,
1705  dict_index_t* index,
1706  const upd_t* update,
1707  trx_id_t trx_id,
1708  roll_ptr_t roll_ptr,
1709  mtr_t* mtr)
1710 {
1711  byte* log_ptr;
1712  const page_t* page = page_align(rec);
1713  ut_ad(flags < 256);
1714  ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
1715 
1716  log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
1719  1 + DATA_ROLL_PTR_LEN + 14 + 2
1720  + MLOG_BUF_MARGIN);
1721 
1722  if (!log_ptr) {
1723  /* Logging in mtr is switched off during crash recovery */
1724  return;
1725  }
1726 
1727  /* The code below assumes index is a clustered index: change index to
1728  the clustered index if we are updating a secondary index record (or we
1729  could as well skip writing the sys col values to the log in this case
1730  because they are not needed for a secondary index record update) */
1731 
1732  index = dict_table_get_first_index(index->table);
1733 
1734  mach_write_to_1(log_ptr, flags);
1735  log_ptr++;
1736 
1738  index, trx_id, roll_ptr, log_ptr, mtr);
1739  mach_write_to_2(log_ptr, page_offset(rec));
1740  log_ptr += 2;
1741 
1742  row_upd_index_write_log(update, log_ptr, mtr);
1743 }
1744 #endif /* UNIV_HOTBACKUP */
1745 
1746 /***********************************************************/
1749 UNIV_INTERN
1750 byte*
1752 /*==========================*/
1753  byte* ptr,
1754  byte* end_ptr,
1755  page_t* page,
1756  page_zip_des_t* page_zip,
1757  dict_index_t* index)
1758 {
1759  ulint flags;
1760  rec_t* rec;
1761  upd_t* update;
1762  ulint pos;
1763  trx_id_t trx_id;
1764  roll_ptr_t roll_ptr;
1765  ulint rec_offset;
1766  mem_heap_t* heap;
1767  ulint* offsets;
1768 
1769  if (end_ptr < ptr + 1) {
1770 
1771  return(NULL);
1772  }
1773 
1774  flags = mach_read_from_1(ptr);
1775  ptr++;
1776 
1777  ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
1778 
1779  if (ptr == NULL) {
1780 
1781  return(NULL);
1782  }
1783 
1784  if (end_ptr < ptr + 2) {
1785 
1786  return(NULL);
1787  }
1788 
1789  rec_offset = mach_read_from_2(ptr);
1790  ptr += 2;
1791 
1792  ut_a(rec_offset <= UNIV_PAGE_SIZE);
1793 
1794  heap = mem_heap_create(256);
1795 
1796  ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);
1797 
1798  if (!ptr || !page) {
1799 
1800  goto func_exit;
1801  }
1802 
1803  ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
1804  rec = page + rec_offset;
1805 
1806  /* We do not need to reserve btr_search_latch, as the page is only
1807  being recovered, and there cannot be a hash index to it. */
1808 
1809  offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
1810 
1811  if (!(flags & BTR_KEEP_SYS_FLAG)) {
1812  row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
1813  pos, trx_id, roll_ptr);
1814  }
1815 
1816  row_upd_rec_in_place(rec, index, offsets, update, page_zip);
1817 
1818 func_exit:
1819  mem_heap_free(heap);
1820 
1821  return(ptr);
1822 }
1823 
1824 #ifndef UNIV_HOTBACKUP
1825 /*************************************************************/
1837 UNIV_INTERN
1838 bool
1840 /*==========================*/
1841  page_zip_des_t* page_zip,
1842  page_cur_t* cursor,
1843  dict_index_t* index,
1844 #ifdef UNIV_DEBUG
1845  ulint* offsets,
1846 #endif /* UNIV_DEBUG */
1847  ulint length,
1848  bool create,
1850  mtr_t* mtr)
1851 {
1852  const page_t* page = page_cur_get_page(cursor);
1853 
1854  ut_ad(page_zip == page_cur_get_page_zip(cursor));
1855  ut_ad(page_zip);
1856  ut_ad(!dict_index_is_ibuf(index));
1857  ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
1858 
1859  if (page_zip_available(page_zip, dict_index_is_clust(index),
1860  length, create)) {
1861  return(true);
1862  }
1863 
1864  if (!page_zip->m_nonempty && !page_has_garbage(page)) {
1865  /* The page has been freshly compressed, so
1866  reorganizing it will not help. */
1867  return(false);
1868  }
1869 
1870  if (create && page_is_leaf(page)
1871  && (length + page_get_data_size(page)
1873  return(false);
1874  }
1875 
1876  if (!btr_page_reorganize(cursor, index, mtr)) {
1877  goto out_of_space;
1878  }
1879 
1880  rec_offs_make_valid(page_cur_get_rec(cursor), index, offsets);
1881 
1882  /* After recompressing a page, we must make sure that the free
1883  bits in the insert buffer bitmap will not exceed the free
1884  space on the page. Because this function will not attempt
1885  recompression unless page_zip_available() fails above, it is
1886  safe to reset the free bits if page_zip_available() fails
1887  again, below. The free bits can safely be reset in a separate
1888  mini-transaction. If page_zip_available() succeeds below, we
1889  can be sure that the btr_page_reorganize() above did not reduce
1890  the free space available on the page. */
1891 
1892  if (page_zip_available(page_zip, dict_index_is_clust(index),
1893  length, create)) {
1894  return(true);
1895  }
1896 
1897 out_of_space:
1898  ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
1899 
1900  /* Out of space: reset the free bits. */
1901  if (!dict_index_is_clust(index) && page_is_leaf(page)) {
1902  ibuf_reset_free_bits(page_cur_get_block(cursor));
1903  }
1904 
1905  return(false);
1906 }
1907 
1908 /*************************************************************/
1915 UNIV_INTERN
1916 dberr_t
1918 /*====================*/
1919  ulint flags,
1920  btr_cur_t* cursor,
1923  ulint* offsets,
1924  const upd_t* update,
1925  ulint cmpl_info,
1927  que_thr_t* thr,
1928  trx_id_t trx_id,
1929  mtr_t* mtr)
1933 {
1935  buf_block_t* block;
1936  page_zip_des_t* page_zip;
1937  dberr_t err;
1938  rec_t* rec;
1939  roll_ptr_t roll_ptr = 0;
1940  ulint was_delete_marked;
1941  ibool is_hashed;
1942 
1943  rec = btr_cur_get_rec(cursor);
1944  index = cursor->index;
1945  ut_ad(rec_offs_validate(rec, index, offsets));
1946  ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
1947  /* The insert buffer tree should never be updated in place. */
1948  ut_ad(!dict_index_is_ibuf(index));
1949  ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
1950  || dict_index_is_clust(index));
1951  ut_ad(thr_get_trx(thr)->id == trx_id
1952  || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
1953  == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
1954  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
1956  ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);
1957 
1958 #ifdef UNIV_DEBUG
1959  if (btr_cur_print_record_ops) {
1960  btr_cur_trx_report(trx_id, index, "update ");
1961  rec_print_new(stderr, rec, offsets);
1962  }
1963 #endif /* UNIV_DEBUG */
1964 
1965  block = btr_cur_get_block(cursor);
1966  page_zip = buf_block_get_page_zip(block);
1967 
1968  /* Check that enough space is available on the compressed page. */
1969  if (page_zip) {
1970  if (!btr_cur_update_alloc_zip(
1971  page_zip, btr_cur_get_page_cur(cursor),
1972  index, offsets, rec_offs_size(offsets),
1973  false, mtr)) {
1974  return(DB_ZIP_OVERFLOW);
1975  }
1976 
1977  rec = btr_cur_get_rec(cursor);
1978  }
1979 
1980  /* Do lock checking and undo logging */
1981  err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
1982  update, cmpl_info,
1983  thr, mtr, &roll_ptr);
1984  if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
1985  /* We may need to update the IBUF_BITMAP_FREE
1986  bits after a reorganize that was done in
1987  btr_cur_update_alloc_zip(). */
1988  goto func_exit;
1989  }
1990 
1991  if (!(flags & BTR_KEEP_SYS_FLAG)) {
1992  row_upd_rec_sys_fields(rec, NULL, index, offsets,
1993  thr_get_trx(thr), roll_ptr);
1994  }
1995 
1996  was_delete_marked = rec_get_deleted_flag(
1997  rec, page_is_comp(buf_block_get_frame(block)));
1998 
1999  is_hashed = (block->index != NULL);
2000 
2001  if (is_hashed) {
2002  /* TO DO: Can we skip this if none of the fields
2003  index->search_info->curr_n_fields
2004  are being updated? */
2005 
2006  /* The function row_upd_changes_ord_field_binary works only
2007  if the update vector was built for a clustered index, we must
2008  NOT call it if index is secondary */
2009 
2010  if (!dict_index_is_clust(index)
2011  || row_upd_changes_ord_field_binary(index, update, thr,
2012  NULL, NULL)) {
2013 
2014  /* Remove possible hash index pointer to this record */
2016  }
2017 
2018  rw_lock_x_lock(&btr_search_latch);
2019  }
2020 
2021  row_upd_rec_in_place(rec, index, offsets, update, page_zip);
2022 
2023  if (is_hashed) {
2024  rw_lock_x_unlock(&btr_search_latch);
2025  }
2026 
2027  btr_cur_update_in_place_log(flags, rec, index, update,
2028  trx_id, roll_ptr, mtr);
2029 
2030  if (was_delete_marked
2032  rec, page_is_comp(buf_block_get_frame(block)))) {
2033  /* The new updated record owns its possible externally
2034  stored fields */
2035 
2036  btr_cur_unmark_extern_fields(page_zip,
2037  rec, index, offsets, mtr);
2038  }
2039 
2040  ut_ad(err == DB_SUCCESS);
2041 
2042 func_exit:
2043  if (page_zip
2044  && !(flags & BTR_KEEP_IBUF_BITMAP)
2045  && !dict_index_is_clust(index)
2046  && page_is_leaf(buf_block_get_frame(block))) {
2047  /* Update the free bits in the insert buffer. */
2048  ibuf_update_free_bits_zip(block, mtr);
2049  }
2050 
2051  return(err);
2052 }
2053 
2054 /*************************************************************/
2066 UNIV_INTERN
2067 dberr_t
2069 /*======================*/
2070  ulint flags,
2071  btr_cur_t* cursor,
2074  ulint** offsets,
2075  mem_heap_t** heap,
2076  const upd_t* update,
2078  ulint cmpl_info,
2080  que_thr_t* thr,
2082  trx_id_t trx_id,
2083  mtr_t* mtr)
2087 {
2089  page_cur_t* page_cursor;
2090  dberr_t err;
2091  buf_block_t* block;
2092  page_t* page;
2093  page_zip_des_t* page_zip;
2094  rec_t* rec;
2095  ulint max_size;
2096  ulint new_rec_size;
2097  ulint old_rec_size;
2098  dtuple_t* new_entry;
2099  roll_ptr_t roll_ptr;
2100  ulint i;
2101  ulint n_ext;
2102 
2103  block = btr_cur_get_block(cursor);
2104  page = buf_block_get_frame(block);
2105  rec = btr_cur_get_rec(cursor);
2106  index = cursor->index;
2107  ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
2108  ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
2109  /* The insert buffer tree should never be updated in place. */
2110  ut_ad(!dict_index_is_ibuf(index));
2111  ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
2112  || dict_index_is_clust(index));
2113  ut_ad(thr_get_trx(thr)->id == trx_id
2114  || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
2115  == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
2116  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
2118  ut_ad(btr_page_get_index_id(page) == index->id);
2119 
2120  *offsets = rec_get_offsets(rec, index, *offsets,
2121  ULINT_UNDEFINED, heap);
2122 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
2123  ut_a(!rec_offs_any_null_extern(rec, *offsets)
2124  || trx_is_recv(thr_get_trx(thr)));
2125 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
2126 
2127 #ifdef UNIV_DEBUG
2128  if (btr_cur_print_record_ops) {
2129  btr_cur_trx_report(trx_id, index, "update ");
2130  rec_print_new(stderr, rec, *offsets);
2131  }
2132 #endif /* UNIV_DEBUG */
2133 
2134  if (!row_upd_changes_field_size_or_external(index, *offsets, update)) {
2135 
2136  /* The simplest and the most common case: the update does not
2137  change the size of any field and none of the updated fields is
2138  externally stored in rec or update, and there is enough space
2139  on the compressed page to log the update. */
2140 
2141  return(btr_cur_update_in_place(
2142  flags, cursor, *offsets, update,
2143  cmpl_info, thr, trx_id, mtr));
2144  }
2145 
2146  if (rec_offs_any_extern(*offsets)) {
2147 any_extern:
2148  /* Externally stored fields are treated in pessimistic
2149  update */
2150 
2151  return(DB_OVERFLOW);
2152  }
2153 
2154  for (i = 0; i < upd_get_n_fields(update); i++) {
2155  if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
2156 
2157  goto any_extern;
2158  }
2159  }
2160 
2161  page_cursor = btr_cur_get_page_cur(cursor);
2162 
2163  if (!*heap) {
2164  *heap = mem_heap_create(
2165  rec_offs_size(*offsets)
2166  + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
2167  }
2168 
2169  new_entry = row_rec_to_index_entry(rec, index, *offsets,
2170  &n_ext, *heap);
2171  /* We checked above that there are no externally stored fields. */
2172  ut_a(!n_ext);
2173 
2174  /* The page containing the clustered index record
2175  corresponding to new_entry is latched in mtr.
2176  Thus the following call is safe. */
2177  row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
2178  FALSE, *heap);
2179  old_rec_size = rec_offs_size(*offsets);
2180  new_rec_size = rec_get_converted_size(index, new_entry, 0);
2181 
2182  page_zip = buf_block_get_page_zip(block);
2183 #ifdef UNIV_ZIP_DEBUG
2184  ut_a(!page_zip || page_zip_validate(page_zip, page, index));
2185 #endif /* UNIV_ZIP_DEBUG */
2186 
2187  if (page_zip) {
2188  if (!btr_cur_update_alloc_zip(
2189  page_zip, page_cursor, index, *offsets,
2190  new_rec_size, true, mtr)) {
2191  return(DB_ZIP_OVERFLOW);
2192  }
2193 
2194  rec = page_cur_get_rec(page_cursor);
2195  }
2196 
2197  if (UNIV_UNLIKELY(new_rec_size
2199  / 2))) {
2200  /* We may need to update the IBUF_BITMAP_FREE
2201  bits after a reorganize that was done in
2202  btr_cur_update_alloc_zip(). */
2203  err = DB_OVERFLOW;
2204  goto func_exit;
2205  }
2206 
2207  if (UNIV_UNLIKELY(page_get_data_size(page)
2208  - old_rec_size + new_rec_size
2210  /* We may need to update the IBUF_BITMAP_FREE
2211  bits after a reorganize that was done in
2212  btr_cur_update_alloc_zip(). */
2213 
2214  /* The page would become too empty */
2215  err = DB_UNDERFLOW;
2216  goto func_exit;
2217  }
2218 
2219  /* We do not attempt to reorganize if the page is compressed.
2220  This is because the page may fail to compress after reorganization. */
2221  max_size = page_zip
2222  ? page_get_max_insert_size(page, 1)
2223  : (old_rec_size
2225 
2226  if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
2227  && (max_size >= new_rec_size))
2228  || (page_get_n_recs(page) <= 1))) {
2229 
2230  /* We may need to update the IBUF_BITMAP_FREE
2231  bits after a reorganize that was done in
2232  btr_cur_update_alloc_zip(). */
2233 
2234  /* There was not enough space, or it did not pay to
2235  reorganize: for simplicity, we decide what to do assuming a
2236  reorganization is needed, though it might not be necessary */
2237 
2238  err = DB_OVERFLOW;
2239  goto func_exit;
2240  }
2241 
2242  /* Do lock checking and undo logging */
2243  err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
2244  update, cmpl_info,
2245  thr, mtr, &roll_ptr);
2246  if (err != DB_SUCCESS) {
2247  /* We may need to update the IBUF_BITMAP_FREE
2248  bits after a reorganize that was done in
2249  btr_cur_update_alloc_zip(). */
2250  goto func_exit;
2251  }
2252 
2253  /* Ok, we may do the replacement. Store on the page infimum the
2254  explicit locks on rec, before deleting rec (see the comment in
2255  btr_cur_pessimistic_update). */
2256 
2257  lock_rec_store_on_page_infimum(block, rec);
2258 
2260 
2261  page_cur_delete_rec(page_cursor, index, *offsets, mtr);
2262 
2263  page_cur_move_to_prev(page_cursor);
2264 
2265  if (!(flags & BTR_KEEP_SYS_FLAG)) {
2266  row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
2267  roll_ptr);
2268  row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
2269  trx_id);
2270  }
2271 
2272  /* There are no externally stored columns in new_entry */
2273  rec = btr_cur_insert_if_possible(
2274  cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
2275  ut_a(rec); /* <- We calculated above the insert would fit */
2276 
2277  /* Restore the old explicit lock state on the record */
2278 
2279  lock_rec_restore_from_page_infimum(block, rec, block);
2280 
2281  page_cur_move_to_next(page_cursor);
2282  ut_ad(err == DB_SUCCESS);
2283 
2284 func_exit:
2285  if (page_zip
2286  && !(flags & BTR_KEEP_IBUF_BITMAP)
2287  && !dict_index_is_clust(index)
2288  && page_is_leaf(page)) {
2289  /* Update the free bits in the insert buffer. */
2290  ibuf_update_free_bits_zip(block, mtr);
2291  }
2292 
2293  return(err);
2294 }
2295 
2296 /*************************************************************/
2302 static
2303 void
2304 btr_cur_pess_upd_restore_supremum(
2305 /*==============================*/
2306  buf_block_t* block,
2307  const rec_t* rec,
2308  mtr_t* mtr)
2309 {
2310  page_t* page;
2311  buf_block_t* prev_block;
2312  ulint space;
2313  ulint zip_size;
2314  ulint prev_page_no;
2315 
2316  page = buf_block_get_frame(block);
2317 
2318  if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
2319  /* Updated record is not the first user record on its page */
2320 
2321  return;
2322  }
2323 
2324  space = buf_block_get_space(block);
2325  zip_size = buf_block_get_zip_size(block);
2326  prev_page_no = btr_page_get_prev(page, mtr);
2327 
2328  ut_ad(prev_page_no != FIL_NULL);
2329  prev_block = buf_page_get_with_no_latch(space, zip_size,
2330  prev_page_no, mtr);
2331 #ifdef UNIV_BTR_DEBUG
2332  ut_a(btr_page_get_next(prev_block->frame, mtr)
2333  == page_get_page_no(page));
2334 #endif /* UNIV_BTR_DEBUG */
2335 
2336  /* We must already have an x-latch on prev_block! */
2337  ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));
2338 
2339  lock_rec_reset_and_inherit_gap_locks(prev_block, block,
2340  PAGE_HEAP_NO_SUPREMUM,
2341  page_rec_get_heap_no(rec));
2342 }
2343 
2344 /*************************************************************/
2351 UNIV_INTERN
2352 dberr_t
2354 /*=======================*/
2355  ulint flags,
2357  btr_cur_t* cursor,
2360  ulint** offsets,
2364  mem_heap_t* entry_heap,
2367  big_rec_t** big_rec,
2369  const upd_t* update,
2372  ulint cmpl_info,
2374  que_thr_t* thr,
2376  trx_id_t trx_id,
2377  mtr_t* mtr)
2379 {
2380  big_rec_t* big_rec_vec = NULL;
2381  big_rec_t* dummy_big_rec;
2383  buf_block_t* block;
2384  page_t* page;
2385  page_zip_des_t* page_zip;
2386  rec_t* rec;
2387  page_cur_t* page_cursor;
2388  dberr_t err;
2389  dberr_t optim_err;
2390  roll_ptr_t roll_ptr;
2391  ibool was_first;
2392  ulint n_reserved = 0;
2393  ulint n_ext;
2394 
2395  *offsets = NULL;
2396  *big_rec = NULL;
2397 
2398  block = btr_cur_get_block(cursor);
2399  page = buf_block_get_frame(block);
2400  page_zip = buf_block_get_page_zip(block);
2401  index = cursor->index;
2402 
2403  ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
2404  MTR_MEMO_X_LOCK));
2405  ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
2406 #ifdef UNIV_ZIP_DEBUG
2407  ut_a(!page_zip || page_zip_validate(page_zip, page, index));
2408 #endif /* UNIV_ZIP_DEBUG */
2409  /* The insert buffer tree should never be updated in place. */
2410  ut_ad(!dict_index_is_ibuf(index));
2411  ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
2412  || dict_index_is_clust(index));
2413  ut_ad(thr_get_trx(thr)->id == trx_id
2414  || (flags & ~BTR_KEEP_POS_FLAG)
2415  == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
2416  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
2417 
2418  err = optim_err = btr_cur_optimistic_update(
2419  flags | BTR_KEEP_IBUF_BITMAP,
2420  cursor, offsets, offsets_heap, update,
2421  cmpl_info, thr, trx_id, mtr);
2422 
2423  switch (err) {
2424  case DB_ZIP_OVERFLOW:
2425  case DB_UNDERFLOW:
2426  case DB_OVERFLOW:
2427  break;
2428  default:
2429  err_exit:
2430  /* We suppressed this with BTR_KEEP_IBUF_BITMAP.
2431  For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
2432  already reset by btr_cur_update_alloc_zip() if the
2433  page was recompressed. */
2434  if (page_zip
2435  && optim_err != DB_ZIP_OVERFLOW
2436  && !dict_index_is_clust(index)
2437  && page_is_leaf(page)) {
2438  ibuf_update_free_bits_zip(block, mtr);
2439  }
2440 
2441  return(err);
2442  }
2443 
2444  /* Do lock checking and undo logging */
2445  err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
2446  update, cmpl_info,
2447  thr, mtr, &roll_ptr);
2448  if (err != DB_SUCCESS) {
2449  goto err_exit;
2450  }
2451 
2452  if (optim_err == DB_OVERFLOW) {
2453  ulint reserve_flag;
2454 
2455  /* First reserve enough free space for the file segments
2456  of the index tree, so that the update will not fail because
2457  of lack of space */
2458 
2459  ulint n_extents = cursor->tree_height / 16 + 3;
2460 
2461  if (flags & BTR_NO_UNDO_LOG_FLAG) {
2462  reserve_flag = FSP_CLEANING;
2463  } else {
2464  reserve_flag = FSP_NORMAL;
2465  }
2466 
2467  if (!fsp_reserve_free_extents(&n_reserved, index->space,
2468  n_extents, reserve_flag, mtr)) {
2469  err = DB_OUT_OF_FILE_SPACE;
2470  goto err_exit;
2471  }
2472  }
2473 
2474  rec = btr_cur_get_rec(cursor);
2475 
2476  *offsets = rec_get_offsets(
2477  rec, index, *offsets, ULINT_UNDEFINED, offsets_heap);
2478 
2479  dtuple_t* new_entry = row_rec_to_index_entry(
2480  rec, index, *offsets, &n_ext, entry_heap);
2481 
2482  /* The page containing the clustered index record
2483  corresponding to new_entry is latched in mtr. If the
2484  clustered index record is delete-marked, then its externally
2485  stored fields cannot have been purged yet, because then the
2486  purge would also have removed the clustered index record
2487  itself. Thus the following call is safe. */
2488  row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
2489  FALSE, entry_heap);
2490  if (!(flags & BTR_KEEP_SYS_FLAG)) {
2491  row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
2492  roll_ptr);
2493  row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
2494  trx_id);
2495  }
2496 
2497  if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(*offsets)) {
2498  /* We are in a transaction rollback undoing a row
2499  update: we must free possible externally stored fields
2500  which got new values in the update, if they are not
2501  inherited values. They can be inherited if we have
2502  updated the primary key to another value, and then
2503  update it back again. */
2504 
2505  ut_ad(big_rec_vec == NULL);
2506 
2507  btr_rec_free_updated_extern_fields(
2508  index, rec, page_zip, *offsets, update,
2509  trx_is_recv(thr_get_trx(thr))
2510  ? RB_RECOVERY : RB_NORMAL, mtr);
2511  }
2512 
2513  /* We have to set appropriate extern storage bits in the new
2514  record to be inserted: we have to remember which fields were such */
2515 
2516  ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
2517  ut_ad(rec_offs_validate(rec, index, *offsets));
2518  n_ext += btr_push_update_extern_fields(new_entry, update, entry_heap);
2519 
2520  if (page_zip) {
2521  ut_ad(page_is_comp(page));
2523  rec_get_converted_size(index, new_entry, n_ext),
2524  TRUE,
2525  dict_index_get_n_fields(index),
2526  page_zip_get_size(page_zip))) {
2527 
2528  goto make_external;
2529  }
2530  } else if (page_zip_rec_needs_ext(
2531  rec_get_converted_size(index, new_entry, n_ext),
2532  page_is_comp(page), 0, 0)) {
2533 make_external:
2534  big_rec_vec = dtuple_convert_big_rec(index, new_entry, &n_ext);
2535  if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
2536 
2537  /* We cannot goto return_after_reservations,
2538  because we may need to update the
2539  IBUF_BITMAP_FREE bits, which was suppressed by
2540  BTR_KEEP_IBUF_BITMAP. */
2541 #ifdef UNIV_ZIP_DEBUG
2542  ut_a(!page_zip
2543  || page_zip_validate(page_zip, page, index));
2544 #endif /* UNIV_ZIP_DEBUG */
2545  if (n_reserved > 0) {
2547  index->space, n_reserved);
2548  }
2549 
2550  err = DB_TOO_BIG_RECORD;
2551  goto err_exit;
2552  }
2553 
2554  ut_ad(page_is_leaf(page));
2555  ut_ad(dict_index_is_clust(index));
2556  ut_ad(flags & BTR_KEEP_POS_FLAG);
2557  }
2558 
2559  /* Store state of explicit locks on rec on the page infimum record,
2560  before deleting rec. The page infimum acts as a dummy carrier of the
2561  locks, taking care also of lock releases, before we can move the locks
2562  back on the actual record. There is a special case: if we are
2563  inserting on the root page and the insert causes a call of
2564  btr_root_raise_and_insert. Therefore we cannot in the lock system
2565  delete the lock structs set on the root page even if the root
2566  page carries just node pointers. */
2567 
2568  lock_rec_store_on_page_infimum(block, rec);
2569 
2571 
2572 #ifdef UNIV_ZIP_DEBUG
2573  ut_a(!page_zip || page_zip_validate(page_zip, page, index));
2574 #endif /* UNIV_ZIP_DEBUG */
2575  page_cursor = btr_cur_get_page_cur(cursor);
2576 
2577  page_cur_delete_rec(page_cursor, index, *offsets, mtr);
2578 
2579  page_cur_move_to_prev(page_cursor);
2580 
2581  rec = btr_cur_insert_if_possible(cursor, new_entry,
2582  offsets, offsets_heap, n_ext, mtr);
2583 
2584  if (rec) {
2585  page_cursor->rec = rec;
2586 
2587  lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
2588  rec, block);
2589 
2590  if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
2591  /* The new inserted record owns its possible externally
2592  stored fields */
2593  btr_cur_unmark_extern_fields(
2594  page_zip, rec, index, *offsets, mtr);
2595  }
2596 
2597  bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
2598 
2599  if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
2600  if (adjust) {
2601  rec_offs_make_valid(
2602  page_cursor->rec, index, *offsets);
2603  }
2604  } else if (page_zip &&
2605  !dict_index_is_clust(index)
2606  && page_is_leaf(page)) {
2607  /* Update the free bits in the insert buffer.
2608  This is the same block which was skipped by
2609  BTR_KEEP_IBUF_BITMAP. */
2610  ibuf_update_free_bits_zip(block, mtr);
2611  }
2612 
2613  err = DB_SUCCESS;
2614  goto return_after_reservations;
2615  } else {
2616  /* If the page is compressed and it initially
2617  compresses very well, and there is a subsequent insert
2618  of a badly-compressing record, it is possible for
2619  btr_cur_optimistic_update() to return DB_UNDERFLOW and
2620  btr_cur_insert_if_possible() to return FALSE. */
2621  ut_a(page_zip || optim_err != DB_UNDERFLOW);
2622 
2623  /* Out of space: reset the free bits.
2624  This is the same block which was skipped by
2625  BTR_KEEP_IBUF_BITMAP. */
2626  if (!dict_index_is_clust(index) && page_is_leaf(page)) {
2627  ibuf_reset_free_bits(block);
2628  }
2629  }
2630 
2631  if (big_rec_vec) {
2632  ut_ad(page_is_leaf(page));
2633  ut_ad(dict_index_is_clust(index));
2634  ut_ad(flags & BTR_KEEP_POS_FLAG);
2635 
2636  /* btr_page_split_and_insert() in
2637  btr_cur_pessimistic_insert() invokes
2638  mtr_memo_release(mtr, index->lock, MTR_MEMO_X_LOCK).
2639  We must keep the index->lock when we created a
2640  big_rec, so that row_upd_clust_rec() can store the
2641  big_rec in the same mini-transaction. */
2642 
2643  mtr_x_lock(dict_index_get_lock(index), mtr);
2644  }
2645 
2646  /* Was the record to be updated positioned as the first user
2647  record on its page? */
2648  was_first = page_cur_is_before_first(page_cursor);
2649 
2650  /* Lock checks and undo logging were already performed by
2651  btr_cur_upd_lock_and_undo(). We do not try
2652  btr_cur_optimistic_insert() because
2653  btr_cur_insert_if_possible() already failed above. */
2654 
2655  err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
2656  | BTR_NO_LOCKING_FLAG
2657  | BTR_KEEP_SYS_FLAG,
2658  cursor, offsets, offsets_heap,
2659  new_entry, &rec,
2660  &dummy_big_rec, n_ext, NULL, mtr);
2661  ut_a(rec);
2662  ut_a(err == DB_SUCCESS);
2663  ut_a(dummy_big_rec == NULL);
2664  ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
2665  page_cursor->rec = rec;
2666 
2667  if (dict_index_is_sec_or_ibuf(index)) {
2668  /* Update PAGE_MAX_TRX_ID in the index page header.
2669  It was not updated by btr_cur_pessimistic_insert()
2670  because of BTR_NO_LOCKING_FLAG. */
2671  buf_block_t* rec_block;
2672 
2673  rec_block = btr_cur_get_block(cursor);
2674 
2675  page_update_max_trx_id(rec_block,
2676  buf_block_get_page_zip(rec_block),
2677  trx_id, mtr);
2678  }
2679 
2680  if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
2681  /* The new inserted record owns its possible externally
2682  stored fields */
2683  buf_block_t* rec_block = btr_cur_get_block(cursor);
2684 
2685 #ifdef UNIV_ZIP_DEBUG
2686  ut_a(!page_zip || page_zip_validate(page_zip, page, index));
2687  page = buf_block_get_frame(rec_block);
2688 #endif /* UNIV_ZIP_DEBUG */
2689  page_zip = buf_block_get_page_zip(rec_block);
2690 
2691  btr_cur_unmark_extern_fields(page_zip,
2692  rec, index, *offsets, mtr);
2693  }
2694 
2695  lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
2696  rec, block);
2697 
2698  /* If necessary, restore also the correct lock state for a new,
2699  preceding supremum record created in a page split. While the old
2700  record was nonexistent, the supremum might have inherited its locks
2701  from a wrong record. */
2702 
2703  if (!was_first) {
2704  btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
2705  rec, mtr);
2706  }
2707 
2708 return_after_reservations:
2709 #ifdef UNIV_ZIP_DEBUG
2710  ut_a(!page_zip || page_zip_validate(page_zip, page, index));
2711 #endif /* UNIV_ZIP_DEBUG */
2712 
2713  if (n_reserved > 0) {
2714  fil_space_release_free_extents(index->space, n_reserved);
2715  }
2716 
2717  *big_rec = big_rec_vec;
2718 
2719  return(err);
2720 }
2721 
2722 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/
2723 
2724 /****************************************************************/
2727 UNIV_INLINE
2728 void
2729 btr_cur_del_mark_set_clust_rec_log(
2730 /*===============================*/
2731  rec_t* rec,
2732  dict_index_t* index,
2733  trx_id_t trx_id,
2734  roll_ptr_t roll_ptr,
2735  mtr_t* mtr)
2736 {
2737  byte* log_ptr;
2738 
2739  ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
2740 
2741  log_ptr = mlog_open_and_write_index(mtr, rec, index,
2742  page_rec_is_comp(rec)
2745  1 + 1 + DATA_ROLL_PTR_LEN
2746  + 14 + 2);
2747 
2748  if (!log_ptr) {
2749  /* Logging in mtr is switched off during crash recovery */
2750  return;
2751  }
2752 
2753  *log_ptr++ = 0;
2754  *log_ptr++ = 1;
2755 
2757  index, trx_id, roll_ptr, log_ptr, mtr);
2758  mach_write_to_2(log_ptr, page_offset(rec));
2759  log_ptr += 2;
2760 
2761  mlog_close(mtr, log_ptr);
2762 }
2763 #endif /* !UNIV_HOTBACKUP */
2764 
2765 /****************************************************************/
2769 UNIV_INTERN
2770 byte*
2772 /*=================================*/
2773  byte* ptr,
2774  byte* end_ptr,
2775  page_t* page,
2776  page_zip_des_t* page_zip,
2777  dict_index_t* index)
2778 {
2779  ulint flags;
2780  ulint val;
2781  ulint pos;
2782  trx_id_t trx_id;
2783  roll_ptr_t roll_ptr;
2784  ulint offset;
2785  rec_t* rec;
2786 
2787  ut_ad(!page
2788  || !!page_is_comp(page) == dict_table_is_comp(index->table));
2789 
2790  if (end_ptr < ptr + 2) {
2791 
2792  return(NULL);
2793  }
2794 
2795  flags = mach_read_from_1(ptr);
2796  ptr++;
2797  val = mach_read_from_1(ptr);
2798  ptr++;
2799 
2800  ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
2801 
2802  if (ptr == NULL) {
2803 
2804  return(NULL);
2805  }
2806 
2807  if (end_ptr < ptr + 2) {
2808 
2809  return(NULL);
2810  }
2811 
2812  offset = mach_read_from_2(ptr);
2813  ptr += 2;
2814 
2815  ut_a(offset <= UNIV_PAGE_SIZE);
2816 
2817  if (page) {
2818  rec = page + offset;
2819 
2820  /* We do not need to reserve btr_search_latch, as the page
2821  is only being recovered, and there cannot be a hash index to
2822  it. Besides, these fields are being updated in place
2823  and the adaptive hash index does not depend on them. */
2824 
2825  btr_rec_set_deleted_flag(rec, page_zip, val);
2826 
2827  if (!(flags & BTR_KEEP_SYS_FLAG)) {
2828  mem_heap_t* heap = NULL;
2829  ulint offsets_[REC_OFFS_NORMAL_SIZE];
2830  rec_offs_init(offsets_);
2831 
2833  rec, page_zip,
2834  rec_get_offsets(rec, index, offsets_,
2835  ULINT_UNDEFINED, &heap),
2836  pos, trx_id, roll_ptr);
2837  if (UNIV_LIKELY_NULL(heap)) {
2838  mem_heap_free(heap);
2839  }
2840  }
2841  }
2842 
2843  return(ptr);
2844 }
2845 
2846 #ifndef UNIV_HOTBACKUP
2847 /***********************************************************/
2853 UNIV_INTERN
2854 dberr_t
2856 /*===========================*/
2857  buf_block_t* block,
2858  rec_t* rec,
2859  dict_index_t* index,
2860  const ulint* offsets,
2861  que_thr_t* thr,
2862  mtr_t* mtr)
2863 {
2864  roll_ptr_t roll_ptr;
2865  dberr_t err;
2866  page_zip_des_t* page_zip;
2867  trx_t* trx;
2868 
2869  ut_ad(dict_index_is_clust(index));
2870  ut_ad(rec_offs_validate(rec, index, offsets));
2871  ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
2872  ut_ad(buf_block_get_frame(block) == page_align(rec));
2873  ut_ad(page_is_leaf(page_align(rec)));
2874 
2875 #ifdef UNIV_DEBUG
2876  if (btr_cur_print_record_ops && thr) {
2877  btr_cur_trx_report(thr_get_trx(thr)->id, index, "del mark ");
2878  rec_print_new(stderr, rec, offsets);
2879  }
2880 #endif /* UNIV_DEBUG */
2881 
2882  ut_ad(dict_index_is_clust(index));
2883  ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
2884 
2885  err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block,
2886  rec, index, offsets, thr);
2887 
2888  if (err != DB_SUCCESS) {
2889 
2890  return(err);
2891  }
2892 
2893  err = trx_undo_report_row_operation(0, TRX_UNDO_MODIFY_OP, thr,
2894  index, NULL, NULL, 0, rec, offsets,
2895  &roll_ptr);
2896  if (err != DB_SUCCESS) {
2897 
2898  return(err);
2899  }
2900 
2901  /* The btr_search_latch is not needed here, because
2902  the adaptive hash index does not depend on the delete-mark
2903  and the delete-mark is being updated in place. */
2904 
2905  page_zip = buf_block_get_page_zip(block);
2906 
2907  btr_blob_dbg_set_deleted_flag(rec, index, offsets, TRUE);
2908  btr_rec_set_deleted_flag(rec, page_zip, TRUE);
2909 
2910  trx = thr_get_trx(thr);
2911 
2912  if (dict_index_is_online_ddl(index)) {
2914  rec, index, offsets, false,
2915  trx_read_trx_id(row_get_trx_id_offset(index, offsets)
2916  + rec));
2917  }
2918 
2919  row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr);
2920 
2921  btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id,
2922  roll_ptr, mtr);
2923 
2924  return(err);
2925 }
2926 
2927 /****************************************************************/
2930 UNIV_INLINE
2931 void
2932 btr_cur_del_mark_set_sec_rec_log(
2933 /*=============================*/
2934  rec_t* rec,
2935  ibool val,
2936  mtr_t* mtr)
2937 {
2938  byte* log_ptr;
2939  ut_ad(val <= 1);
2940 
2941  log_ptr = mlog_open(mtr, 11 + 1 + 2);
2942 
2943  if (!log_ptr) {
2944  /* Logging in mtr is switched off during crash recovery:
2945  in that case mlog_open returns NULL */
2946  return;
2947  }
2948 
2950  rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
2951  mach_write_to_1(log_ptr, val);
2952  log_ptr++;
2953 
2954  mach_write_to_2(log_ptr, page_offset(rec));
2955  log_ptr += 2;
2956 
2957  mlog_close(mtr, log_ptr);
2958 }
2959 #endif /* !UNIV_HOTBACKUP */
2960 
2961 /****************************************************************/
2965 UNIV_INTERN
2966 byte*
2968 /*===============================*/
2969  byte* ptr,
2970  byte* end_ptr,
2971  page_t* page,
2972  page_zip_des_t* page_zip)
2973 {
2974  ulint val;
2975  ulint offset;
2976  rec_t* rec;
2977 
2978  if (end_ptr < ptr + 3) {
2979 
2980  return(NULL);
2981  }
2982 
2983  val = mach_read_from_1(ptr);
2984  ptr++;
2985 
2986  offset = mach_read_from_2(ptr);
2987  ptr += 2;
2988 
2989  ut_a(offset <= UNIV_PAGE_SIZE);
2990 
2991  if (page) {
2992  rec = page + offset;
2993 
2994  /* We do not need to reserve btr_search_latch, as the page
2995  is only being recovered, and there cannot be a hash index to
2996  it. Besides, the delete-mark flag is being updated in place
2997  and the adaptive hash index does not depend on it. */
2998 
2999  btr_rec_set_deleted_flag(rec, page_zip, val);
3000  }
3001 
3002  return(ptr);
3003 }
3004 
3005 #ifndef UNIV_HOTBACKUP
3006 /***********************************************************/
3009 UNIV_INTERN
3010 dberr_t
3012 /*=========================*/
3013  ulint flags,
3014  btr_cur_t* cursor,
3015  ibool val,
3016  que_thr_t* thr,
3017  mtr_t* mtr)
3018 {
3019  buf_block_t* block;
3020  rec_t* rec;
3021  dberr_t err;
3022 
3023  block = btr_cur_get_block(cursor);
3024  rec = btr_cur_get_rec(cursor);
3025 
3026 #ifdef UNIV_DEBUG
3027  if (btr_cur_print_record_ops && thr) {
3028  btr_cur_trx_report(thr_get_trx(thr)->id, cursor->index,
3029  "del mark ");
3030  rec_print(stderr, rec, cursor->index);
3031  }
3032 #endif /* UNIV_DEBUG */
3033 
3035  btr_cur_get_block(cursor),
3036  rec, cursor->index, thr, mtr);
3037  if (err != DB_SUCCESS) {
3038 
3039  return(err);
3040  }
3041 
3042  ut_ad(!!page_rec_is_comp(rec)
3043  == dict_table_is_comp(cursor->index->table));
3044 
3045  /* We do not need to reserve btr_search_latch, as the
3046  delete-mark flag is being updated in place and the adaptive
3047  hash index does not depend on it. */
3049 
3050  btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
3051 
3052  return(DB_SUCCESS);
3053 }
3054 
3055 /***********************************************************/
3058 UNIV_INTERN
3059 void
3061 /*==============================*/
3062  rec_t* rec,
3063  page_zip_des_t* page_zip,
3067  ibool val,
3068  mtr_t* mtr)
3069 {
3070  /* We do not need to reserve btr_search_latch, as the page
3071  has just been read to the buffer pool and there cannot be
3072  a hash index to it. Besides, the delete-mark flag is being
3073  updated in place and the adaptive hash index does not depend
3074  on it. */
3075 
3076  btr_rec_set_deleted_flag(rec, page_zip, val);
3077 
3078  btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
3079 }
3080 
3081 /*==================== B-TREE RECORD REMOVE =========================*/
3082 
3083 /*************************************************************/
3090 UNIV_INTERN
3091 ibool
3093 /*=======================*/
3094  btr_cur_t* cursor,
3097  ibool adjust,
3099  mtr_t* mtr)
3100 {
3101  ut_ad(mtr_memo_contains(mtr,
3103  MTR_MEMO_X_LOCK));
3104  ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
3105  MTR_MEMO_PAGE_X_FIX));
3106 
3107  return(btr_cur_compress_recommendation(cursor, mtr)
3108  && btr_compress(cursor, adjust, mtr));
3109 }
3110 
3111 /*******************************************************/
3116 UNIV_INTERN
3117 ibool
3119 /*===========================*/
3120  btr_cur_t* cursor,
3124 #ifdef UNIV_DEBUG
3125  ulint flags,
3126 #endif /* UNIV_DEBUG */
3127  mtr_t* mtr)
3131 {
3132  buf_block_t* block;
3133  rec_t* rec;
3134  mem_heap_t* heap = NULL;
3135  ulint offsets_[REC_OFFS_NORMAL_SIZE];
3136  ulint* offsets = offsets_;
3137  ibool no_compress_needed;
3138  rec_offs_init(offsets_);
3139 
3140  ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
3141  ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
3142  MTR_MEMO_PAGE_X_FIX));
3143  /* This is intended only for leaf page deletions */
3144 
3145  block = btr_cur_get_block(cursor);
3146 
3147  ut_ad(page_is_leaf(buf_block_get_frame(block)));
3149  || dict_index_is_clust(cursor->index)
3150  || (flags & BTR_CREATE_FLAG));
3151 
3152  rec = btr_cur_get_rec(cursor);
3153  offsets = rec_get_offsets(rec, cursor->index, offsets,
3154  ULINT_UNDEFINED, &heap);
3155 
3156  no_compress_needed = !rec_offs_any_extern(offsets)
3157  && btr_cur_can_delete_without_compress(
3158  cursor, rec_offs_size(offsets), mtr);
3159 
3160  if (no_compress_needed) {
3161 
3162  page_t* page = buf_block_get_frame(block);
3163  page_zip_des_t* page_zip= buf_block_get_page_zip(block);
3164 
3165  lock_update_delete(block, rec);
3166 
3168 
3169  if (page_zip) {
3170 #ifdef UNIV_ZIP_DEBUG
3171  ut_a(page_zip_validate(page_zip, page, cursor->index));
3172 #endif /* UNIV_ZIP_DEBUG */
3173  page_cur_delete_rec(btr_cur_get_page_cur(cursor),
3174  cursor->index, offsets, mtr);
3175 #ifdef UNIV_ZIP_DEBUG
3176  ut_a(page_zip_validate(page_zip, page, cursor->index));
3177 #endif /* UNIV_ZIP_DEBUG */
3178 
3179  /* On compressed pages, the IBUF_BITMAP_FREE
3180  space is not affected by deleting (purging)
3181  records, because it is defined as the minimum
3182  of space available *without* reorganize, and
3183  space available in the modification log. */
3184  } else {
3185  const ulint max_ins
3187  page, 1);
3188 
3189  page_cur_delete_rec(btr_cur_get_page_cur(cursor),
3190  cursor->index, offsets, mtr);
3191 
3192  /* The change buffer does not handle inserts
3193  into non-leaf pages, into clustered indexes,
3194  or into the change buffer. */
3195  if (page_is_leaf(page)
3196  && !dict_index_is_clust(cursor->index)
3197  && !dict_index_is_ibuf(cursor->index)) {
3198  ibuf_update_free_bits_low(block, max_ins, mtr);
3199  }
3200  }
3201  }
3202 
3203  if (UNIV_LIKELY_NULL(heap)) {
3204  mem_heap_free(heap);
3205  }
3206 
3207  return(no_compress_needed);
3208 }
3209 
3210 /*************************************************************/
3218 UNIV_INTERN
3219 ibool
3221 /*=======================*/
3222  dberr_t* err,
3227  ibool has_reserved_extents,
3231  btr_cur_t* cursor,
3235  ulint flags,
3236  enum trx_rb_ctx rb_ctx,
3237  mtr_t* mtr)
3238 {
3239  buf_block_t* block;
3240  page_t* page;
3241  page_zip_des_t* page_zip;
3243  rec_t* rec;
3244  ulint n_reserved = 0;
3245  ibool success;
3246  ibool ret = FALSE;
3247  ulint level;
3248  mem_heap_t* heap;
3249  ulint* offsets;
3250 
3251  block = btr_cur_get_block(cursor);
3252  page = buf_block_get_frame(block);
3253  index = btr_cur_get_index(cursor);
3254 
3255  ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
3257  || dict_index_is_clust(index)
3258  || (flags & BTR_CREATE_FLAG));
3259  ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
3260  MTR_MEMO_X_LOCK));
3261  ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3262  if (!has_reserved_extents) {
3263  /* First reserve enough free space for the file segments
3264  of the index tree, so that the node pointer updates will
3265  not fail because of lack of space */
3266 
3267  ulint n_extents = cursor->tree_height / 32 + 1;
3268 
3269  success = fsp_reserve_free_extents(&n_reserved,
3270  index->space,
3271  n_extents,
3272  FSP_CLEANING, mtr);
3273  if (!success) {
3274  *err = DB_OUT_OF_FILE_SPACE;
3275 
3276  return(FALSE);
3277  }
3278  }
3279 
3280  heap = mem_heap_create(1024);
3281  rec = btr_cur_get_rec(cursor);
3282  page_zip = buf_block_get_page_zip(block);
3283 #ifdef UNIV_ZIP_DEBUG
3284  ut_a(!page_zip || page_zip_validate(page_zip, page, index));
3285 #endif /* UNIV_ZIP_DEBUG */
3286 
3287  offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
3288 
3289  if (rec_offs_any_extern(offsets)) {
3290  btr_rec_free_externally_stored_fields(index,
3291  rec, offsets, page_zip,
3292  rb_ctx, mtr);
3293 #ifdef UNIV_ZIP_DEBUG
3294  ut_a(!page_zip || page_zip_validate(page_zip, page, index));
3295 #endif /* UNIV_ZIP_DEBUG */
3296  }
3297 
3298  if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
3299  && UNIV_UNLIKELY(dict_index_get_page(index)
3300  != buf_block_get_page_no(block))) {
3301 
3302  /* If there is only one record, drop the whole page in
3303  btr_discard_page, if this is not the root page */
3304 
3305  btr_discard_page(cursor, mtr);
3306 
3307  ret = TRUE;
3308 
3309  goto return_after_reservations;
3310  }
3311 
3312  if (flags == 0) {
3313  lock_update_delete(block, rec);
3314  }
3315 
3316  level = btr_page_get_level(page, mtr);
3317 
3318  if (level > 0
3319  && UNIV_UNLIKELY(rec == page_rec_get_next(
3320  page_get_infimum_rec(page)))) {
3321 
3322  rec_t* next_rec = page_rec_get_next(rec);
3323 
3324  if (btr_page_get_prev(page, mtr) == FIL_NULL) {
3325 
3326  /* If we delete the leftmost node pointer on a
3327  non-leaf level, we must mark the new leftmost node
3328  pointer as the predefined minimum record */
3329 
3330  /* This will make page_zip_validate() fail until
3331  page_cur_delete_rec() completes. This is harmless,
3332  because everything will take place within a single
3333  mini-transaction and because writing to the redo log
3334  is an atomic operation (performed by mtr_commit()). */
3335  btr_set_min_rec_mark(next_rec, mtr);
3336  } else {
3337  /* Otherwise, if we delete the leftmost node pointer
3338  on a page, we have to change the father node pointer
3339  so that it is equal to the new leftmost node pointer
3340  on the page */
3341 
3342  btr_node_ptr_delete(index, block, mtr);
3343 
3344  dtuple_t* node_ptr = dict_index_build_node_ptr(
3345  index, next_rec, buf_block_get_page_no(block),
3346  heap, level);
3347 
3348  btr_insert_on_non_leaf_level(
3349  flags, index, level + 1, node_ptr, mtr);
3350  }
3351  }
3352 
3354 
3355  page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
3356 #ifdef UNIV_ZIP_DEBUG
3357  ut_a(!page_zip || page_zip_validate(page_zip, page, index));
3358 #endif /* UNIV_ZIP_DEBUG */
3359 
3360  ut_ad(btr_check_node_ptr(index, block, mtr));
3361 
3362 return_after_reservations:
3363  *err = DB_SUCCESS;
3364 
3365  mem_heap_free(heap);
3366 
3367  if (ret == FALSE) {
3368  ret = btr_cur_compress_if_useful(cursor, FALSE, mtr);
3369  }
3370 
3371  if (n_reserved > 0) {
3372  fil_space_release_free_extents(index->space, n_reserved);
3373  }
3374 
3375  return(ret);
3376 }
3377 
3378 /*******************************************************************/
3381 static
3382 void
3383 btr_cur_add_path_info(
3384 /*==================*/
3385  btr_cur_t* cursor,
3386  ulint height,
3388  ulint root_height)
3389 {
3390  btr_path_t* slot;
3391  const rec_t* rec;
3392  const page_t* page;
3393 
3394  ut_a(cursor->path_arr);
3395 
3396  if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
3397  /* Do nothing; return empty path */
3398 
3399  slot = cursor->path_arr;
3400  slot->nth_rec = ULINT_UNDEFINED;
3401 
3402  return;
3403  }
3404 
3405  if (height == 0) {
3406  /* Mark end of slots for path */
3407  slot = cursor->path_arr + root_height + 1;
3408  slot->nth_rec = ULINT_UNDEFINED;
3409  }
3410 
3411  rec = btr_cur_get_rec(cursor);
3412 
3413  slot = cursor->path_arr + (root_height - height);
3414 
3415  page = page_align(rec);
3416 
3417  slot->nth_rec = page_rec_get_n_recs_before(rec);
3418  slot->n_recs = page_get_n_recs(page);
3419  slot->page_no = page_get_page_no(page);
3420  slot->page_level = btr_page_get_level_low(page);
3421 }
3422 
3423 /*******************************************************************/
3435 static
3436 ib_int64_t
3437 btr_estimate_n_rows_in_range_on_level(
3438 /*==================================*/
3439  dict_index_t* index,
3440  btr_path_t* slot1,
3441  btr_path_t* slot2,
3442  ib_int64_t n_rows_on_prev_level,
3447  ibool* is_n_rows_exact)
3450 {
3451  ulint space;
3452  ib_int64_t n_rows;
3453  ulint n_pages_read;
3454  ulint page_no;
3455  ulint zip_size;
3456  ulint level;
3457 
3458  space = dict_index_get_space(index);
3459 
3460  n_rows = 0;
3461  n_pages_read = 0;
3462 
3463  /* Assume by default that we will scan all pages between
3464  slot1->page_no and slot2->page_no */
3465  *is_n_rows_exact = TRUE;
3466 
3467  /* add records from slot1->page_no which are to the right of
3468  the record which serves as a left border of the range, if any */
3469  if (slot1->nth_rec < slot1->n_recs) {
3470  n_rows += slot1->n_recs - slot1->nth_rec;
3471  }
3472 
3473  /* add records from slot2->page_no which are to the left of
3474  the record which servers as a right border of the range, if any */
3475  if (slot2->nth_rec > 1) {
3476  n_rows += slot2->nth_rec - 1;
3477  }
3478 
3479  /* count the records in the pages between slot1->page_no and
3480  slot2->page_no (non inclusive), if any */
3481 
3482  zip_size = fil_space_get_zip_size(space);
3483 
3484  /* Do not read more than this number of pages in order not to hurt
3485  performance with this code which is just an estimation. If we read
3486  this many pages before reaching slot2->page_no then we estimate the
3487  average from the pages scanned so far */
3488 # define N_PAGES_READ_LIMIT 10
3489 
3490  page_no = slot1->page_no;
3491  level = slot1->page_level;
3492 
3493  do {
3494  mtr_t mtr;
3495  page_t* page;
3496  buf_block_t* block;
3497 
3498  mtr_start(&mtr);
3499 
3500  /* Fetch the page. Because we are not holding the
3501  index->lock, the tree may have changed and we may be
3502  attempting to read a page that is no longer part of
3503  the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
3504  silence a debug assertion about this. */
3505  block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH,
3506  NULL, BUF_GET_POSSIBLY_FREED,
3507  __FILE__, __LINE__, &mtr);
3508 
3509  page = buf_block_get_frame(block);
3510 
3511  /* It is possible that the tree has been reorganized in the
3512  meantime and this is a different page. If this happens the
3513  calculated estimate will be bogus, which is not fatal as
3514  this is only an estimate. We are sure that a page with
3515  page_no exists because InnoDB never frees pages, only
3516  reuses them. */
3517  if (fil_page_get_type(page) != FIL_PAGE_INDEX
3518  || btr_page_get_index_id(page) != index->id
3519  || btr_page_get_level_low(page) != level) {
3520 
3521  /* The page got reused for something else */
3522  mtr_commit(&mtr);
3523  goto inexact;
3524  }
3525 
3526  /* It is possible but highly unlikely that the page was
3527  originally written by an old version of InnoDB that did
3528  not initialize FIL_PAGE_TYPE on other than B-tree pages.
3529  For example, this could be an almost-empty BLOB page
3530  that happens to contain the magic values in the fields
3531  that we checked above. */
3532 
3533  n_pages_read++;
3534 
3535  if (page_no != slot1->page_no) {
3536  /* Do not count the records on slot1->page_no,
3537  we already counted them before this loop. */
3538  n_rows += page_get_n_recs(page);
3539  }
3540 
3541  page_no = btr_page_get_next(page, &mtr);
3542 
3543  mtr_commit(&mtr);
3544 
3545  if (n_pages_read == N_PAGES_READ_LIMIT
3546  || page_no == FIL_NULL) {
3547  /* Either we read too many pages or
3548  we reached the end of the level without passing
3549  through slot2->page_no, the tree must have changed
3550  in the meantime */
3551  goto inexact;
3552  }
3553 
3554  } while (page_no != slot2->page_no);
3555 
3556  return(n_rows);
3557 
3558 inexact:
3559 
3560  *is_n_rows_exact = FALSE;
3561 
3562  /* We did interrupt before reaching slot2->page */
3563 
3564  if (n_pages_read > 0) {
3565  /* The number of pages on this level is
3566  n_rows_on_prev_level, multiply it by the
3567  average number of recs per page so far */
3568  n_rows = n_rows_on_prev_level
3569  * n_rows / n_pages_read;
3570  } else {
3571  /* The tree changed before we could even
3572  start with slot1->page_no */
3573  n_rows = 10;
3574  }
3575 
3576  return(n_rows);
3577 }
3578 
3579 /*******************************************************************/
3582 UNIV_INTERN
3583 ib_int64_t
3585 /*=========================*/
3586  dict_index_t* index,
3587  const dtuple_t* tuple1,
3588  ulint mode1,
3589  const dtuple_t* tuple2,
3590  ulint mode2)
3591 {
3594  btr_cur_t cursor;
3595  btr_path_t* slot1;
3596  btr_path_t* slot2;
3597  ibool diverged;
3598  ibool diverged_lot;
3599  ulint divergence_level;
3600  ib_int64_t n_rows;
3601  ibool is_n_rows_exact;
3602  ulint i;
3603  mtr_t mtr;
3604  ib_int64_t table_n_rows;
3605 
3606  table_n_rows = dict_table_get_n_rows(index->table);
3607 
3608  mtr_start(&mtr);
3609 
3610  cursor.path_arr = path1;
3611 
3612  if (dtuple_get_n_fields(tuple1) > 0) {
3613 
3614  btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
3616  &cursor, 0,
3617  __FILE__, __LINE__, &mtr);
3618  } else {
3619  btr_cur_open_at_index_side(true, index,
3621  &cursor, 0, &mtr);
3622  }
3623 
3624  mtr_commit(&mtr);
3625 
3626  mtr_start(&mtr);
3627 
3628  cursor.path_arr = path2;
3629 
3630  if (dtuple_get_n_fields(tuple2) > 0) {
3631 
3632  btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
3634  &cursor, 0,
3635  __FILE__, __LINE__, &mtr);
3636  } else {
3637  btr_cur_open_at_index_side(false, index,
3639  &cursor, 0, &mtr);
3640  }
3641 
3642  mtr_commit(&mtr);
3643 
3644  /* We have the path information for the range in path1 and path2 */
3645 
3646  n_rows = 1;
3647  is_n_rows_exact = TRUE;
3648  diverged = FALSE; /* This becomes true when the path is not
3649  the same any more */
3650  diverged_lot = FALSE; /* This becomes true when the paths are
3651  not the same or adjacent any more */
3652  divergence_level = 1000000; /* This is the level where paths diverged
3653  a lot */
3654  for (i = 0; ; i++) {
3656 
3657  slot1 = path1 + i;
3658  slot2 = path2 + i;
3659 
3660  if (slot1->nth_rec == ULINT_UNDEFINED
3661  || slot2->nth_rec == ULINT_UNDEFINED) {
3662 
3663  if (i > divergence_level + 1 && !is_n_rows_exact) {
3664  /* In trees whose height is > 1 our algorithm
3665  tends to underestimate: multiply the estimate
3666  by 2: */
3667 
3668  n_rows = n_rows * 2;
3669  }
3670 
3671  DBUG_EXECUTE_IF("bug14007649", return(n_rows););
3672 
3673  /* Do not estimate the number of rows in the range
3674  to over 1 / 2 of the estimated rows in the whole
3675  table */
3676 
3677  if (n_rows > table_n_rows / 2 && !is_n_rows_exact) {
3678 
3679  n_rows = table_n_rows / 2;
3680 
3681  /* If there are just 0 or 1 rows in the table,
3682  then we estimate all rows are in the range */
3683 
3684  if (n_rows == 0) {
3685  n_rows = table_n_rows;
3686  }
3687  }
3688 
3689  return(n_rows);
3690  }
3691 
3692  if (!diverged && slot1->nth_rec != slot2->nth_rec) {
3693 
3694  diverged = TRUE;
3695 
3696  if (slot1->nth_rec < slot2->nth_rec) {
3697  n_rows = slot2->nth_rec - slot1->nth_rec;
3698 
3699  if (n_rows > 1) {
3700  diverged_lot = TRUE;
3701  divergence_level = i;
3702  }
3703  } else {
3704  /* It is possible that
3705  slot1->nth_rec >= slot2->nth_rec
3706  if, for example, we have a single page
3707  tree which contains (inf, 5, 6, supr)
3708  and we select where x > 20 and x < 30;
3709  in this case slot1->nth_rec will point
3710  to the supr record and slot2->nth_rec
3711  will point to 6 */
3712  n_rows = 0;
3713  }
3714 
3715  } else if (diverged && !diverged_lot) {
3716 
3717  if (slot1->nth_rec < slot1->n_recs
3718  || slot2->nth_rec > 1) {
3719 
3720  diverged_lot = TRUE;
3721  divergence_level = i;
3722 
3723  n_rows = 0;
3724 
3725  if (slot1->nth_rec < slot1->n_recs) {
3726  n_rows += slot1->n_recs
3727  - slot1->nth_rec;
3728  }
3729 
3730  if (slot2->nth_rec > 1) {
3731  n_rows += slot2->nth_rec - 1;
3732  }
3733  }
3734  } else if (diverged_lot) {
3735 
3736  n_rows = btr_estimate_n_rows_in_range_on_level(
3737  index, slot1, slot2, n_rows,
3738  &is_n_rows_exact);
3739  }
3740  }
3741 }
3742 
3743 /*******************************************************************/
3748 static
3749 void
3750 btr_record_not_null_field_in_rec(
3751 /*=============================*/
3752  ulint n_unique,
3755  const ulint* offsets,
3758  ib_uint64_t* n_not_null)
3760 {
3761  ulint i;
3762 
3763  ut_ad(rec_offs_n_fields(offsets) >= n_unique);
3764 
3765  if (n_not_null == NULL) {
3766  return;
3767  }
3768 
3769  for (i = 0; i < n_unique; i++) {
3770  if (rec_offs_nth_sql_null(offsets, i)) {
3771  break;
3772  }
3773 
3774  n_not_null[i]++;
3775  }
3776 }
3777 
3778 /*******************************************************************/
3787 UNIV_INTERN
3788 void
3790 /*======================================*/
3791  dict_index_t* index)
3792 {
3793  btr_cur_t cursor;
3794  page_t* page;
3795  rec_t* rec;
3796  ulint n_cols;
3797  ulint matched_fields;
3798  ulint matched_bytes;
3799  ib_uint64_t* n_diff;
3800  ib_uint64_t* n_not_null;
3801  ibool stats_null_not_equal;
3802  ullint n_sample_pages; /* number of pages to sample */
3803  ulint not_empty_flag = 0;
3804  ulint total_external_size = 0;
3805  ulint i;
3806  ulint j;
3807  ullint add_on;
3808  mtr_t mtr;
3809  mem_heap_t* heap = NULL;
3810  ulint* offsets_rec = NULL;
3811  ulint* offsets_next_rec = NULL;
3812 
3813  n_cols = dict_index_get_n_unique(index);
3814 
3815  heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
3816  * n_cols
3817  + dict_index_get_n_fields(index)
3818  * (sizeof *offsets_rec
3819  + sizeof *offsets_next_rec));
3820 
3821  n_diff = (ib_uint64_t*) mem_heap_zalloc(
3822  heap, n_cols * sizeof(ib_int64_t));
3823 
3824  n_not_null = NULL;
3825 
3826  /* Check srv_innodb_stats_method setting, and decide whether we
3827  need to record non-null value and also decide if NULL is
3828  considered equal (by setting stats_null_not_equal value) */
3829  switch (srv_innodb_stats_method) {
3830  case SRV_STATS_NULLS_IGNORED:
3831  n_not_null = (ib_uint64_t*) mem_heap_zalloc(
3832  heap, n_cols * sizeof *n_not_null);
3833  /* fall through */
3834 
3835  case SRV_STATS_NULLS_UNEQUAL:
3836  /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
3837  case, we will treat NULLs as unequal value */
3838  stats_null_not_equal = TRUE;
3839  break;
3840 
3841  case SRV_STATS_NULLS_EQUAL:
3842  stats_null_not_equal = FALSE;
3843  break;
3844 
3845  default:
3846  ut_error;
3847  }
3848 
3849  /* It makes no sense to test more pages than are contained
3850  in the index, thus we lower the number if it is too high */
3851  if (srv_stats_transient_sample_pages > index->stat_index_size) {
3852  if (index->stat_index_size > 0) {
3853  n_sample_pages = index->stat_index_size;
3854  } else {
3855  n_sample_pages = 1;
3856  }
3857  } else {
3858  n_sample_pages = srv_stats_transient_sample_pages;
3859  }
3860 
3861  /* We sample some pages in the index to get an estimate */
3862 
3863  for (i = 0; i < n_sample_pages; i++) {
3864  mtr_start(&mtr);
3865 
3866  btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr);
3867 
3868  /* Count the number of different key values for each prefix of
3869  the key on this index page. If the prefix does not determine
3870  the index record uniquely in the B-tree, then we subtract one
3871  because otherwise our algorithm would give a wrong estimate
3872  for an index where there is just one key value. */
3873 
3874  page = btr_cur_get_page(&cursor);
3875 
3876  rec = page_rec_get_next(page_get_infimum_rec(page));
3877 
3878  if (!page_rec_is_supremum(rec)) {
3879  not_empty_flag = 1;
3880  offsets_rec = rec_get_offsets(rec, index, offsets_rec,
3881  ULINT_UNDEFINED, &heap);
3882 
3883  if (n_not_null != NULL) {
3884  btr_record_not_null_field_in_rec(
3885  n_cols, offsets_rec, n_not_null);
3886  }
3887  }
3888 
3889  while (!page_rec_is_supremum(rec)) {
3890  rec_t* next_rec = page_rec_get_next(rec);
3891  if (page_rec_is_supremum(next_rec)) {
3892  total_external_size +=
3893  btr_rec_get_externally_stored_len(
3894  rec, offsets_rec);
3895  break;
3896  }
3897 
3898  matched_fields = 0;
3899  matched_bytes = 0;
3900  offsets_next_rec = rec_get_offsets(next_rec, index,
3901  offsets_next_rec,
3902  ULINT_UNDEFINED,
3903  &heap);
3904 
3905  cmp_rec_rec_with_match(rec, next_rec,
3906  offsets_rec, offsets_next_rec,
3907  index, stats_null_not_equal,
3908  &matched_fields,
3909  &matched_bytes);
3910 
3911  for (j = matched_fields; j < n_cols; j++) {
3912  /* We add one if this index record has
3913  a different prefix from the previous */
3914 
3915  n_diff[j]++;
3916  }
3917 
3918  if (n_not_null != NULL) {
3919  btr_record_not_null_field_in_rec(
3920  n_cols, offsets_next_rec, n_not_null);
3921  }
3922 
3923  total_external_size
3924  += btr_rec_get_externally_stored_len(
3925  rec, offsets_rec);
3926 
3927  rec = next_rec;
3928  /* Initialize offsets_rec for the next round
3929  and assign the old offsets_rec buffer to
3930  offsets_next_rec. */
3931  {
3932  ulint* offsets_tmp = offsets_rec;
3933  offsets_rec = offsets_next_rec;
3934  offsets_next_rec = offsets_tmp;
3935  }
3936  }
3937 
3938 
3939  if (n_cols == dict_index_get_n_unique_in_tree(index)) {
3940 
3941  /* If there is more than one leaf page in the tree,
3942  we add one because we know that the first record
3943  on the page certainly had a different prefix than the
3944  last record on the previous index page in the
3945  alphabetical order. Before this fix, if there was
3946  just one big record on each clustered index page, the
3947  algorithm grossly underestimated the number of rows
3948  in the table. */
3949 
3950  if (btr_page_get_prev(page, &mtr) != FIL_NULL
3951  || btr_page_get_next(page, &mtr) != FIL_NULL) {
3952 
3953  n_diff[n_cols - 1]++;
3954  }
3955  }
3956 
3957  mtr_commit(&mtr);
3958  }
3959 
3960  /* If we saw k borders between different key values on
3961  n_sample_pages leaf pages, we can estimate how many
3962  there will be in index->stat_n_leaf_pages */
3963 
3964  /* We must take into account that our sample actually represents
3965  also the pages used for external storage of fields (those pages are
3966  included in index->stat_n_leaf_pages) */
3967 
3968  for (j = 0; j < n_cols; j++) {
3969  index->stat_n_diff_key_vals[j]
3971  n_diff[j], index, n_sample_pages,
3972  total_external_size, not_empty_flag);
3973 
3974  /* If the tree is small, smaller than
3975  10 * n_sample_pages + total_external_size, then
3976  the above estimate is ok. For bigger trees it is common that we
3977  do not see any borders between key values in the few pages
3978  we pick. But still there may be n_sample_pages
3979  different key values, or even more. Let us try to approximate
3980  that: */
3981 
3982  add_on = index->stat_n_leaf_pages
3983  / (10 * (n_sample_pages
3984  + total_external_size));
3985 
3986  if (add_on > n_sample_pages) {
3987  add_on = n_sample_pages;
3988  }
3989 
3990  index->stat_n_diff_key_vals[j] += add_on;
3991 
3992  index->stat_n_sample_sizes[j] = n_sample_pages;
3993 
3994  /* Update the stat_n_non_null_key_vals[] with our
3995  sampled result. stat_n_non_null_key_vals[] is created
3996  and initialized to zero in dict_index_add_to_cache(),
3997  along with stat_n_diff_key_vals[] array */
3998  if (n_not_null != NULL) {
3999  index->stat_n_non_null_key_vals[j] =
4001  n_not_null[j], index, n_sample_pages,
4002  total_external_size, not_empty_flag);
4003  }
4004  }
4005 
4006  mem_heap_free(heap);
4007 }
4008 
4009 /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
4010 
4011 /***********************************************************/
4014 static
4015 ulint
4016 btr_rec_get_field_ref_offs(
4017 /*=======================*/
4018  const ulint* offsets,
4019  ulint n)
4020 {
4021  ulint field_ref_offs;
4022  ulint local_len;
4023 
4024  ut_a(rec_offs_nth_extern(offsets, n));
4025  field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
4026  ut_a(local_len != UNIV_SQL_NULL);
4027  ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
4028 
4029  return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
4030 }
4031 
4037 #define btr_rec_get_field_ref(rec, offsets, n) \
4038  ((rec) + btr_rec_get_field_ref_offs(offsets, n))
4039 
4040 /***********************************************************/
4043 static
4044 ulint
4045 btr_rec_get_externally_stored_len(
4046 /*==============================*/
4047  const rec_t* rec,
4048  const ulint* offsets)
4049 {
4050  ulint n_fields;
4051  ulint total_extern_len = 0;
4052  ulint i;
4053 
4054  ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
4055 
4056  if (!rec_offs_any_extern(offsets)) {
4057  return(0);
4058  }
4059 
4060  n_fields = rec_offs_n_fields(offsets);
4061 
4062  for (i = 0; i < n_fields; i++) {
4063  if (rec_offs_nth_extern(offsets, i)) {
4064 
4065  ulint extern_len = mach_read_from_4(
4066  btr_rec_get_field_ref(rec, offsets, i)
4067  + BTR_EXTERN_LEN + 4);
4068 
4069  total_extern_len += ut_calc_align(extern_len,
4070  UNIV_PAGE_SIZE);
4071  }
4072  }
4073 
4074  return(total_extern_len / UNIV_PAGE_SIZE);
4075 }
4076 
4077 /*******************************************************************/
4079 static
4080 void
4081 btr_cur_set_ownership_of_extern_field(
4082 /*==================================*/
4083  page_zip_des_t* page_zip,
4085  rec_t* rec,
4086  dict_index_t* index,
4087  const ulint* offsets,
4088  ulint i,
4089  ibool val,
4090  mtr_t* mtr)
4091 {
4092  byte* data;
4093  ulint local_len;
4094  ulint byte_val;
4095 
4096  data = rec_get_nth_field(rec, offsets, i, &local_len);
4097  ut_ad(rec_offs_nth_extern(offsets, i));
4098  ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
4099 
4100  local_len -= BTR_EXTERN_FIELD_REF_SIZE;
4101 
4102  byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);
4103 
4104  if (val) {
4105  byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
4106  } else {
4107 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
4108  ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
4109 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
4110  byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
4111  }
4112 
4113  if (page_zip) {
4114  mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
4115  page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
4116  } else if (mtr != NULL) {
4117 
4118  mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
4119  MLOG_1BYTE, mtr);
4120  } else {
4121  mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
4122  }
4123 
4124  btr_blob_dbg_owner(rec, index, offsets, i, val);
4125 }
4126 
4127 /*******************************************************************/
4132 UNIV_INTERN
4133 void
4135 /*============================*/
4136  page_zip_des_t* page_zip,
4138  rec_t* rec,
4139  dict_index_t* index,
4140  const ulint* offsets,
4141  const upd_t* update,
4142  mtr_t* mtr)
4143 {
4144  ulint i;
4145 
4146  ut_ad(rec_offs_validate(rec, index, offsets));
4147  ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
4148  ut_ad(rec_offs_any_extern(offsets));
4149  ut_ad(mtr);
4150 
4151  for (i = 0; i < rec_offs_n_fields(offsets); i++) {
4152  if (rec_offs_nth_extern(offsets, i)
4153  && !upd_get_field_by_field_no(update, i)) {
4154  btr_cur_set_ownership_of_extern_field(
4155  page_zip, rec, index, offsets, i, FALSE, mtr);
4156  }
4157  }
4158 }
4159 
4160 /*******************************************************************/
4164 static
4165 void
4166 btr_cur_unmark_extern_fields(
4167 /*=========================*/
4168  page_zip_des_t* page_zip,
4170  rec_t* rec,
4171  dict_index_t* index,
4172  const ulint* offsets,
4173  mtr_t* mtr)
4174 {
4175  ulint n;
4176  ulint i;
4177 
4178  ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
4179  n = rec_offs_n_fields(offsets);
4180 
4181  if (!rec_offs_any_extern(offsets)) {
4182 
4183  return;
4184  }
4185 
4186  for (i = 0; i < n; i++) {
4187  if (rec_offs_nth_extern(offsets, i)) {
4188 
4189  btr_cur_set_ownership_of_extern_field(
4190  page_zip, rec, index, offsets, i, TRUE, mtr);
4191  }
4192  }
4193 }
4194 
4195 /*******************************************************************/
4200 UNIV_INTERN
4201 ulint
4203 /*==========================*/
4204  dtuple_t* tuple,
4205  const upd_t* update,
4206  mem_heap_t* heap)
4207 {
4208  ulint n_pushed = 0;
4209  ulint n;
4210  const upd_field_t* uf;
4211 
4212  ut_ad(tuple);
4213  ut_ad(update);
4214 
4215  uf = update->fields;
4216  n = upd_get_n_fields(update);
4217 
4218  for (; n--; uf++) {
4219  if (dfield_is_ext(&uf->new_val)) {
4220  dfield_t* field
4221  = dtuple_get_nth_field(tuple, uf->field_no);
4222 
4223  if (!dfield_is_ext(field)) {
4224  dfield_set_ext(field);
4225  n_pushed++;
4226  }
4227 
4228  switch (uf->orig_len) {
4229  byte* data;
4230  ulint len;
4231  byte* buf;
4232  case 0:
4233  break;
4235  /* Restore the original locally stored
4236  part of the column. In the undo log,
4237  InnoDB writes a longer prefix of externally
4238  stored columns, so that column prefixes
4239  in secondary indexes can be reconstructed. */
4240  dfield_set_data(field, (byte*) dfield_get_data(field)
4241  + dfield_get_len(field)
4244  dfield_set_ext(field);
4245  break;
4246  default:
4247  /* Reconstruct the original locally
4248  stored part of the column. The data
4249  will have to be copied. */
4251 
4252  data = (byte*) dfield_get_data(field);
4253  len = dfield_get_len(field);
4254 
4255  buf = (byte*) mem_heap_alloc(heap,
4256  uf->orig_len);
4257  /* Copy the locally stored prefix. */
4258  memcpy(buf, data,
4259  uf->orig_len
4261  /* Copy the BLOB pointer. */
4262  memcpy(buf + uf->orig_len
4264  data + len - BTR_EXTERN_FIELD_REF_SIZE,
4266 
4267  dfield_set_data(field, buf, uf->orig_len);
4268  dfield_set_ext(field);
4269  }
4270  }
4271  }
4272 
4273  return(n_pushed);
4274 }
4275 
4276 /*******************************************************************/
4279 static
4280 ulint
4281 btr_blob_get_part_len(
4282 /*==================*/
4283  const byte* blob_header)
4284 {
4285  return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
4286 }
4287 
4288 /*******************************************************************/
4291 static
4292 ulint
4293 btr_blob_get_next_page_no(
4294 /*======================*/
4295  const byte* blob_header)
4296 {
4297  return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
4298 }
4299 
4300 /*******************************************************************/
4302 static
4303 void
4304 btr_blob_free(
4305 /*==========*/
4306  buf_block_t* block,
4307  ibool all,
4309  mtr_t* mtr)
4310 {
4311  buf_pool_t* buf_pool = buf_pool_from_block(block);
4312  ulint space = buf_block_get_space(block);
4313  ulint page_no = buf_block_get_page_no(block);
4314 
4315  ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
4316 
4317  mtr_commit(mtr);
4318 
4319  buf_pool_mutex_enter(buf_pool);
4320 
4321  /* Only free the block if it is still allocated to
4322  the same file page. */
4323 
4324  if (buf_block_get_state(block)
4326  && buf_block_get_space(block) == space
4327  && buf_block_get_page_no(block) == page_no) {
4328 
4329  if (!buf_LRU_free_page(&block->page, all)
4330  && all && block->page.zip.data) {
4331  /* Attempt to deallocate the uncompressed page
4332  if the whole block cannot be deallocted. */
4333 
4334  buf_LRU_free_page(&block->page, false);
4335  }
4336  }
4337 
4338  buf_pool_mutex_exit(buf_pool);
4339 }
4340 
4341 /*******************************************************************/
4347 UNIV_INTERN
4348 dberr_t
4350 /*============================*/
4351  dict_index_t* index,
4353  buf_block_t* rec_block,
4354  rec_t* rec,
4355  const ulint* offsets,
4359  const big_rec_t*big_rec_vec,
4361  mtr_t* btr_mtr,
4363  enum blob_op op)
4364 {
4365  ulint rec_page_no;
4366  byte* field_ref;
4367  ulint extern_len;
4368  ulint store_len;
4369  ulint page_no;
4370  ulint space_id;
4371  ulint zip_size;
4372  ulint prev_page_no;
4373  ulint hint_page_no;
4374  ulint i;
4375  mtr_t mtr;
4376  mtr_t* alloc_mtr;
4377  mem_heap_t* heap = NULL;
4378  page_zip_des_t* page_zip;
4379  z_stream c_stream;
4380  buf_block_t** freed_pages = NULL;
4381  ulint n_freed_pages = 0;
4382  dberr_t error = DB_SUCCESS;
4383 
4384  ut_ad(rec_offs_validate(rec, index, offsets));
4385  ut_ad(rec_offs_any_extern(offsets));
4386  ut_ad(btr_mtr);
4387  ut_ad(mtr_memo_contains(btr_mtr, dict_index_get_lock(index),
4388  MTR_MEMO_X_LOCK));
4389  ut_ad(mtr_memo_contains(btr_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
4390  ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
4391  ut_a(dict_index_is_clust(index));
4392 
4393  page_zip = buf_block_get_page_zip(rec_block);
4395  == buf_block_get_zip_size(rec_block));
4396 
4397  space_id = buf_block_get_space(rec_block);
4398  zip_size = buf_block_get_zip_size(rec_block);
4399  rec_page_no = buf_block_get_page_no(rec_block);
4401 
4402  if (page_zip) {
4403  int err;
4404 
4405  /* Zlib deflate needs 128 kilobytes for the default
4406  window size, plus 512 << memLevel, plus a few
4407  kilobytes for small objects. We use reduced memLevel
4408  to limit the memory consumption, and preallocate the
4409  heap, hoping to avoid memory fragmentation. */
4410  heap = mem_heap_create(250000);
4411  page_zip_set_alloc(&c_stream, heap);
4412 
4413  err = deflateInit2(&c_stream, page_zip_level,
4414  Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
4415  ut_a(err == Z_OK);
4416  }
4417 
4418  if (btr_blob_op_is_update(op)) {
4419  /* Avoid reusing pages that have been previously freed
4420  in btr_mtr. */
4421  if (btr_mtr->n_freed_pages) {
4422  if (heap == NULL) {
4423  heap = mem_heap_create(
4424  btr_mtr->n_freed_pages
4425  * sizeof *freed_pages);
4426  }
4427 
4428  freed_pages = static_cast<buf_block_t**>(
4430  heap,
4431  btr_mtr->n_freed_pages
4432  * sizeof *freed_pages));
4433  n_freed_pages = 0;
4434  }
4435 
4436  /* Because btr_mtr will be committed after mtr, it is
4437  possible that the tablespace has been extended when
4438  the B-tree record was updated or inserted, or it will
4439  be extended while allocating pages for big_rec.
4440 
4441  TODO: In mtr (not btr_mtr), write a redo log record
4442  about extending the tablespace to its current size,
4443  and remember the current size. Whenever the tablespace
4444  grows as pages are allocated, write further redo log
4445  records to mtr. (Currently tablespace extension is not
4446  covered by the redo log. If it were, the record would
4447  only be written to btr_mtr, which is committed after
4448  mtr.) */
4449  alloc_mtr = btr_mtr;
4450  } else {
4451  /* Use the local mtr for allocations. */
4452  alloc_mtr = &mtr;
4453  }
4454 
4455 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
4456  /* All pointers to externally stored columns in the record
4457  must either be zero or they must be pointers to inherited
4458  columns, owned by this record or an earlier record version. */
4459  for (i = 0; i < rec_offs_n_fields(offsets); i++) {
4460  if (!rec_offs_nth_extern(offsets, i)) {
4461  continue;
4462  }
4463  field_ref = btr_rec_get_field_ref(rec, offsets, i);
4464 
4465  ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
4466  /* Either this must be an update in place,
4467  or the BLOB must be inherited, or the BLOB pointer
4468  must be zero (will be written in this function). */
4469  ut_a(op == BTR_STORE_UPDATE
4470  || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
4471  || !memcmp(field_ref, field_ref_zero,
4473  }
4474 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
4475  /* We have to create a file segment to the tablespace
4476  for each field and put the pointer to the field in rec */
4477 
4478  for (i = 0; i < big_rec_vec->n_fields; i++) {
4479  field_ref = btr_rec_get_field_ref(
4480  rec, offsets, big_rec_vec->fields[i].field_no);
4481 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
4482  /* A zero BLOB pointer should have been initially inserted. */
4483  ut_a(!memcmp(field_ref, field_ref_zero,
4485 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
4486  extern_len = big_rec_vec->fields[i].len;
4487  UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
4488  extern_len);
4489 
4490  ut_a(extern_len > 0);
4491 
4492  prev_page_no = FIL_NULL;
4493 
4494  if (page_zip) {
4495  int err = deflateReset(&c_stream);
4496  ut_a(err == Z_OK);
4497 
4498  c_stream.next_in = (Bytef*)
4499  big_rec_vec->fields[i].data;
4500  c_stream.avail_in = extern_len;
4501  }
4502 
4503  for (;;) {
4504  buf_block_t* block;
4505  page_t* page;
4506 
4507  mtr_start(&mtr);
4508 
4509  if (prev_page_no == FIL_NULL) {
4510  hint_page_no = 1 + rec_page_no;
4511  } else {
4512  hint_page_no = prev_page_no + 1;
4513  }
4514 
4515 alloc_another:
4516  block = btr_page_alloc(index, hint_page_no,
4517  FSP_NO_DIR, 0, alloc_mtr, &mtr);
4518  if (UNIV_UNLIKELY(block == NULL)) {
4519  mtr_commit(&mtr);
4520  error = DB_OUT_OF_FILE_SPACE;
4521  goto func_exit;
4522  }
4523 
4524  if (rw_lock_get_x_lock_count(&block->lock) > 1) {
4525  /* This page must have been freed in
4526  btr_mtr previously. Put it aside, and
4527  allocate another page for the BLOB data. */
4528  ut_ad(alloc_mtr == btr_mtr);
4530  ut_ad(n_freed_pages < btr_mtr->n_freed_pages);
4531  freed_pages[n_freed_pages++] = block;
4532  goto alloc_another;
4533  }
4534 
4535  page_no = buf_block_get_page_no(block);
4536  page = buf_block_get_frame(block);
4537 
4538  if (prev_page_no != FIL_NULL) {
4539  buf_block_t* prev_block;
4540  page_t* prev_page;
4541 
4542  prev_block = buf_page_get(space_id, zip_size,
4543  prev_page_no,
4544  RW_X_LATCH, &mtr);
4545  buf_block_dbg_add_level(prev_block,
4546  SYNC_EXTERN_STORAGE);
4547  prev_page = buf_block_get_frame(prev_block);
4548 
4549  if (page_zip) {
4551  prev_page + FIL_PAGE_NEXT,
4552  page_no, MLOG_4BYTES, &mtr);
4553  memcpy(buf_block_get_page_zip(
4554  prev_block)
4555  ->data + FIL_PAGE_NEXT,
4556  prev_page + FIL_PAGE_NEXT, 4);
4557  } else {
4559  prev_page + FIL_PAGE_DATA
4561  page_no, MLOG_4BYTES, &mtr);
4562  }
4563 
4564  } else if (dict_index_is_online_ddl(index)) {
4565  row_log_table_blob_alloc(index, page_no);
4566  }
4567 
4568  if (page_zip) {
4569  int err;
4570  page_zip_des_t* blob_page_zip;
4571 
4572  /* Write FIL_PAGE_TYPE to the redo log
4573  separately, before logging any other
4574  changes to the page, so that the debug
4575  assertions in
4576  recv_parse_or_apply_log_rec_body() can
4577  be made simpler. Before InnoDB Plugin
4578  1.0.4, the initialization of
4579  FIL_PAGE_TYPE was logged as part of
4580  the mlog_log_string() below. */
4581 
4583  prev_page_no == FIL_NULL
4586  MLOG_2BYTES, &mtr);
4587 
4588  c_stream.next_out = page
4589  + FIL_PAGE_DATA;
4590  c_stream.avail_out
4591  = page_zip_get_size(page_zip)
4592  - FIL_PAGE_DATA;
4593 
4594  err = deflate(&c_stream, Z_FINISH);
4595  ut_a(err == Z_OK || err == Z_STREAM_END);
4596  ut_a(err == Z_STREAM_END
4597  || c_stream.avail_out == 0);
4598 
4599  /* Write the "next BLOB page" pointer */
4601  FIL_NULL, MLOG_4BYTES, &mtr);
4602  /* Initialize the unused "prev page" pointer */
4604  FIL_NULL, MLOG_4BYTES, &mtr);
4605  /* Write a back pointer to the record
4606  into the otherwise unused area. This
4607  information could be useful in
4608  debugging. Later, we might want to
4609  implement the possibility to relocate
4610  BLOB pages. Then, we would need to be
4611  able to adjust the BLOB pointer in the
4612  record. We do not store the heap
4613  number of the record, because it can
4614  change in page_zip_reorganize() or
4615  btr_page_reorganize(). However, also
4616  the page number of the record may
4617  change when B-tree nodes are split or
4618  merged. */
4619  mlog_write_ulint(page
4621  space_id,
4622  MLOG_4BYTES, &mtr);
4623  mlog_write_ulint(page
4625  rec_page_no,
4626  MLOG_4BYTES, &mtr);
4627 
4628  /* Zero out the unused part of the page. */
4629  memset(page + page_zip_get_size(page_zip)
4630  - c_stream.avail_out,
4631  0, c_stream.avail_out);
4633  page_zip_get_size(page_zip)
4635  &mtr);
4636  /* Copy the page to compressed storage,
4637  because it will be flushed to disk
4638  from there. */
4639  blob_page_zip = buf_block_get_page_zip(block);
4640  ut_ad(blob_page_zip);
4641  ut_ad(page_zip_get_size(blob_page_zip)
4642  == page_zip_get_size(page_zip));
4643  memcpy(blob_page_zip->data, page,
4644  page_zip_get_size(page_zip));
4645 
4646  if (err == Z_OK && prev_page_no != FIL_NULL) {
4647 
4648  goto next_zip_page;
4649  }
4650 
4651  if (alloc_mtr == &mtr) {
4652  rec_block = buf_page_get(
4653  space_id, zip_size,
4654  rec_page_no,
4655  RW_X_LATCH, &mtr);
4656  buf_block_dbg_add_level(
4657  rec_block,
4658  SYNC_NO_ORDER_CHECK);
4659  }
4660 
4661  if (err == Z_STREAM_END) {
4662  mach_write_to_4(field_ref
4663  + BTR_EXTERN_LEN, 0);
4664  mach_write_to_4(field_ref
4665  + BTR_EXTERN_LEN + 4,
4666  c_stream.total_in);
4667  } else {
4668  memset(field_ref + BTR_EXTERN_LEN,
4669  0, 8);
4670  }
4671 
4672  if (prev_page_no == FIL_NULL) {
4673  btr_blob_dbg_add_blob(
4674  rec, big_rec_vec->fields[i]
4675  .field_no, page_no, index,
4676  "store");
4677 
4678  mach_write_to_4(field_ref
4680  space_id);
4681 
4682  mach_write_to_4(field_ref
4684  page_no);
4685 
4686  mach_write_to_4(field_ref
4688  FIL_PAGE_NEXT);
4689  }
4690 
4692  page_zip, rec, index, offsets,
4693  big_rec_vec->fields[i].field_no,
4694  alloc_mtr);
4695 
4696 next_zip_page:
4697  prev_page_no = page_no;
4698 
4699  /* Commit mtr and release the
4700  uncompressed page frame to save memory. */
4701  btr_blob_free(block, FALSE, &mtr);
4702 
4703  if (err == Z_STREAM_END) {
4704  break;
4705  }
4706  } else {
4709  MLOG_2BYTES, &mtr);
4710 
4711  if (extern_len > (UNIV_PAGE_SIZE
4712  - FIL_PAGE_DATA
4714  - FIL_PAGE_DATA_END)) {
4715  store_len = UNIV_PAGE_SIZE
4716  - FIL_PAGE_DATA
4719  } else {
4720  store_len = extern_len;
4721  }
4722 
4723  mlog_write_string(page + FIL_PAGE_DATA
4725  (const byte*)
4726  big_rec_vec->fields[i].data
4727  + big_rec_vec->fields[i].len
4728  - extern_len,
4729  store_len, &mtr);
4730  mlog_write_ulint(page + FIL_PAGE_DATA
4732  store_len, MLOG_4BYTES, &mtr);
4733  mlog_write_ulint(page + FIL_PAGE_DATA
4735  FIL_NULL, MLOG_4BYTES, &mtr);
4736 
4737  extern_len -= store_len;
4738 
4739  if (alloc_mtr == &mtr) {
4740  rec_block = buf_page_get(
4741  space_id, zip_size,
4742  rec_page_no,
4743  RW_X_LATCH, &mtr);
4744  buf_block_dbg_add_level(
4745  rec_block,
4746  SYNC_NO_ORDER_CHECK);
4747  }
4748 
4749  mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
4750  MLOG_4BYTES, alloc_mtr);
4751  mlog_write_ulint(field_ref
4752  + BTR_EXTERN_LEN + 4,
4753  big_rec_vec->fields[i].len
4754  - extern_len,
4755  MLOG_4BYTES, alloc_mtr);
4756 
4757  if (prev_page_no == FIL_NULL) {
4758  btr_blob_dbg_add_blob(
4759  rec, big_rec_vec->fields[i]
4760  .field_no, page_no, index,
4761  "store");
4762 
4763  mlog_write_ulint(field_ref
4765  space_id, MLOG_4BYTES,
4766  alloc_mtr);
4767 
4768  mlog_write_ulint(field_ref
4770  page_no, MLOG_4BYTES,
4771  alloc_mtr);
4772 
4773  mlog_write_ulint(field_ref
4775  FIL_PAGE_DATA,
4776  MLOG_4BYTES,
4777  alloc_mtr);
4778  }
4779 
4780  prev_page_no = page_no;
4781 
4782  mtr_commit(&mtr);
4783 
4784  if (extern_len == 0) {
4785  break;
4786  }
4787  }
4788  }
4789 
4790  DBUG_EXECUTE_IF("btr_store_big_rec_extern",
4791  error = DB_OUT_OF_FILE_SPACE;
4792  goto func_exit;);
4793  }
4794 
4795 func_exit:
4796  if (page_zip) {
4797  deflateEnd(&c_stream);
4798  }
4799 
4800  if (n_freed_pages) {
4801  ulint i;
4802 
4803  ut_ad(alloc_mtr == btr_mtr);
4805 
4806  for (i = 0; i < n_freed_pages; i++) {
4807  btr_page_free_low(index, freed_pages[i], 0, alloc_mtr);
4808  }
4809  }
4810 
4811  if (heap != NULL) {
4812  mem_heap_free(heap);
4813  }
4814 
4815 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
4816  /* All pointers to externally stored columns in the record
4817  must be valid. */
4818  for (i = 0; i < rec_offs_n_fields(offsets); i++) {
4819  if (!rec_offs_nth_extern(offsets, i)) {
4820  continue;
4821  }
4822 
4823  field_ref = btr_rec_get_field_ref(rec, offsets, i);
4824 
4825  /* The pointer must not be zero if the operation
4826  succeeded. */
4827  ut_a(0 != memcmp(field_ref, field_ref_zero,
4829  || error != DB_SUCCESS);
4830  /* The column must not be disowned by this record. */
4831  ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
4832  }
4833 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
4834  return(error);
4835 }
4836 
4837 /*******************************************************************/
4839 static
4840 void
4841 btr_check_blob_fil_page_type(
4842 /*=========================*/
4843  ulint space_id,
4844  ulint page_no,
4845  const page_t* page,
4846  ibool read)
4847 {
4848  ulint type = fil_page_get_type(page);
4849 
4850  ut_a(space_id == page_get_space_id(page));
4851  ut_a(page_no == page_get_page_no(page));
4852 
4853  if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
4854  ulint flags = fil_space_get_flags(space_id);
4855 
4856 #ifndef UNIV_DEBUG /* Improve debug test coverage */
4857  if (dict_tf_get_format(flags) == UNIV_FORMAT_A) {
4858  /* Old versions of InnoDB did not initialize
4859  FIL_PAGE_TYPE on BLOB pages. Do not print
4860  anything about the type mismatch when reading
4861  a BLOB page that is in Antelope format.*/
4862  return;
4863  }
4864 #endif /* !UNIV_DEBUG */
4865 
4866  ut_print_timestamp(stderr);
4867  fprintf(stderr,
4868  " InnoDB: FIL_PAGE_TYPE=%lu"
4869  " on BLOB %s space %lu page %lu flags %lx\n",
4870  (ulong) type, read ? "read" : "purge",
4871  (ulong) space_id, (ulong) page_no, (ulong) flags);
4872  ut_error;
4873  }
4874 }
4875 
4876 /*******************************************************************/
4881 UNIV_INTERN
4882 void
4884 /*=============================*/
4885  dict_index_t* index,
4893  byte* field_ref,
4894  const rec_t* rec,
4896  const ulint* offsets,
4898  page_zip_des_t* page_zip,
4900  ulint i,
4902  enum trx_rb_ctx rb_ctx,
4903  mtr_t* local_mtr __attribute__((unused)))
4906 {
4907  page_t* page;
4908  const ulint space_id = mach_read_from_4(
4909  field_ref + BTR_EXTERN_SPACE_ID);
4910  const ulint start_page = mach_read_from_4(
4911  field_ref + BTR_EXTERN_PAGE_NO);
4912  ulint rec_zip_size = dict_table_zip_size(index->table);
4913  ulint ext_zip_size;
4914  ulint page_no;
4915  ulint next_page_no;
4916  mtr_t mtr;
4917 
4918  ut_ad(dict_index_is_clust(index));
4919  ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
4920  MTR_MEMO_X_LOCK));
4921  ut_ad(mtr_memo_contains_page(local_mtr, field_ref,
4922  MTR_MEMO_PAGE_X_FIX));
4923  ut_ad(!rec || rec_offs_validate(rec, index, offsets));
4924  ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
4925 
4926  if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
4928  /* In the rollback, we may encounter a clustered index
4929  record with some unwritten off-page columns. There is
4930  nothing to free then. */
4931  ut_a(rb_ctx != RB_NONE);
4932  return;
4933  }
4934 
4935  ut_ad(space_id == index->space);
4936 
4937  if (UNIV_UNLIKELY(space_id != dict_index_get_space(index))) {
4938  ext_zip_size = fil_space_get_zip_size(space_id);
4939  /* This must be an undo log record in the system tablespace,
4940  that is, in row_purge_upd_exist_or_extern().
4941  Currently, externally stored records are stored in the
4942  same tablespace as the referring records. */
4943  ut_ad(!page_get_space_id(page_align(field_ref)));
4944  ut_ad(!rec);
4945  ut_ad(!page_zip);
4946  } else {
4947  ext_zip_size = rec_zip_size;
4948  }
4949 
4950  if (!rec) {
4951  /* This is a call from row_purge_upd_exist_or_extern(). */
4952  ut_ad(!page_zip);
4953  rec_zip_size = 0;
4954  }
4955 
4956 #ifdef UNIV_BLOB_DEBUG
4957  if (!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)
4958  && !((field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
4959  && (rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY))) {
4960  /* This off-page column will be freed.
4961  Check that no references remain. */
4962 
4963  btr_blob_dbg_t b;
4964 
4965  b.blob_page_no = start_page;
4966 
4967  if (rec) {
4968  /* Remove the reference from the record to the
4969  BLOB. If the BLOB were not freed, the
4970  reference would be removed when the record is
4971  removed. Freeing the BLOB will overwrite the
4972  BTR_EXTERN_PAGE_NO in the field_ref of the
4973  record with FIL_NULL, which would make the
4974  btr_blob_dbg information inconsistent with the
4975  record. */
4976  b.ref_page_no = page_get_page_no(page_align(rec));
4977  b.ref_heap_no = page_rec_get_heap_no(rec);
4978  b.ref_field_no = i;
4979  btr_blob_dbg_rbt_delete(index, &b, "free");
4980  }
4981 
4982  btr_blob_dbg_assert_empty(index, b.blob_page_no);
4983  }
4984 #endif /* UNIV_BLOB_DEBUG */
4985 
4986  for (;;) {
4987 #ifdef UNIV_SYNC_DEBUG
4988  buf_block_t* rec_block;
4989 #endif /* UNIV_SYNC_DEBUG */
4990  buf_block_t* ext_block;
4991 
4992  mtr_start(&mtr);
4993 
4994 #ifdef UNIV_SYNC_DEBUG
4995  rec_block =
4996 #endif /* UNIV_SYNC_DEBUG */
4998  rec_zip_size,
4999  page_get_page_no(page_align(field_ref)),
5000  RW_X_LATCH, &mtr);
5001  buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
5002  page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
5003 
5004  if (/* There is no external storage data */
5005  page_no == FIL_NULL
5006  /* This field does not own the externally stored field */
5007  || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
5008  & BTR_EXTERN_OWNER_FLAG)
5009  /* Rollback and inherited field */
5010  || ((rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY)
5011  && (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
5012  & BTR_EXTERN_INHERITED_FLAG))) {
5013 
5014  /* Do not free */
5015  mtr_commit(&mtr);
5016 
5017  return;
5018  }
5019 
5020  if (page_no == start_page && dict_index_is_online_ddl(index)) {
5021  row_log_table_blob_free(index, start_page);
5022  }
5023 
5024  ext_block = buf_page_get(space_id, ext_zip_size, page_no,
5025  RW_X_LATCH, &mtr);
5026  buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
5027  page = buf_block_get_frame(ext_block);
5028 
5029  if (ext_zip_size) {
5030  /* Note that page_zip will be NULL
5031  in row_purge_upd_exist_or_extern(). */
5032  switch (fil_page_get_type(page)) {
5033  case FIL_PAGE_TYPE_ZBLOB:
5034  case FIL_PAGE_TYPE_ZBLOB2:
5035  break;
5036  default:
5037  ut_error;
5038  }
5039  next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
5040 
5041  btr_page_free_low(index, ext_block, 0, &mtr);
5042 
5043  if (page_zip != NULL) {
5044  mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
5045  next_page_no);
5046  mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
5047  0);
5048  page_zip_write_blob_ptr(page_zip, rec, index,
5049  offsets, i, &mtr);
5050  } else {
5051  mlog_write_ulint(field_ref
5053  next_page_no,
5054  MLOG_4BYTES, &mtr);
5055  mlog_write_ulint(field_ref
5056  + BTR_EXTERN_LEN + 4, 0,
5057  MLOG_4BYTES, &mtr);
5058  }
5059  } else {
5060  ut_a(!page_zip);
5061  btr_check_blob_fil_page_type(space_id, page_no, page,
5062  FALSE);
5063 
5064  next_page_no = mach_read_from_4(
5065  page + FIL_PAGE_DATA
5067 
5068  /* We must supply the page level (= 0) as an argument
5069  because we did not store it on the page (we save the
5070  space overhead from an index page header. */
5071 
5072  btr_page_free_low(index, ext_block, 0, &mtr);
5073 
5075  next_page_no,
5076  MLOG_4BYTES, &mtr);
5077  /* Zero out the BLOB length. If the server
5078  crashes during the execution of this function,
5079  trx_rollback_or_clean_all_recovered() could
5080  dereference the half-deleted BLOB, fetching a
5081  wrong prefix for the BLOB. */
5082  mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
5083  0,
5084  MLOG_4BYTES, &mtr);
5085  }
5086 
5087  /* Commit mtr and release the BLOB block to save memory. */
5088  btr_blob_free(ext_block, TRUE, &mtr);
5089  }
5090 }
5091 
5092 /***********************************************************/
5094 static
5095 void
5096 btr_rec_free_externally_stored_fields(
5097 /*==================================*/
5098  dict_index_t* index,
5100  rec_t* rec,
5101  const ulint* offsets,
5102  page_zip_des_t* page_zip,
5104  enum trx_rb_ctx rb_ctx,
5105  mtr_t* mtr)
5108 {
5109  ulint n_fields;
5110  ulint i;
5111 
5112  ut_ad(rec_offs_validate(rec, index, offsets));
5113  ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
5114  /* Free possible externally stored fields in the record */
5115 
5116  ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
5117  n_fields = rec_offs_n_fields(offsets);
5118 
5119  for (i = 0; i < n_fields; i++) {
5120  if (rec_offs_nth_extern(offsets, i)) {
5122  index, btr_rec_get_field_ref(rec, offsets, i),
5123  rec, offsets, page_zip, i, rb_ctx, mtr);
5124  }
5125  }
5126 }
5127 
5128 /***********************************************************/
5131 static
5132 void
5133 btr_rec_free_updated_extern_fields(
5134 /*===============================*/
5135  dict_index_t* index,
5137  rec_t* rec,
5138  page_zip_des_t* page_zip,
5140  const ulint* offsets,
5141  const upd_t* update,
5142  enum trx_rb_ctx rb_ctx,
5143  mtr_t* mtr)
5145 {
5146  ulint n_fields;
5147  ulint i;
5148 
5149  ut_ad(rec_offs_validate(rec, index, offsets));
5150  ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
5151 
5152  /* Free possible externally stored fields in the record */
5153 
5154  n_fields = upd_get_n_fields(update);
5155 
5156  for (i = 0; i < n_fields; i++) {
5157  const upd_field_t* ufield = upd_get_nth_field(update, i);
5158 
5159  if (rec_offs_nth_extern(offsets, ufield->field_no)) {
5160  ulint len;
5161  byte* data = rec_get_nth_field(
5162  rec, offsets, ufield->field_no, &len);
5164 
5166  index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
5167  rec, offsets, page_zip,
5168  ufield->field_no, rb_ctx, mtr);
5169  }
5170  }
5171 }
5172 
5173 /*******************************************************************/
5177 static
5178 ulint
5179 btr_copy_blob_prefix(
5180 /*=================*/
5181  byte* buf,
5183  ulint len,
5184  ulint space_id,
5185  ulint page_no,
5186  ulint offset)
5187 {
5188  ulint copied_len = 0;
5189 
5190  for (;;) {
5191  mtr_t mtr;
5192  buf_block_t* block;
5193  const page_t* page;
5194  const byte* blob_header;
5195  ulint part_len;
5196  ulint copy_len;
5197 
5198  mtr_start(&mtr);
5199 
5200  block = buf_page_get(space_id, 0, page_no, RW_S_LATCH, &mtr);
5201  buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
5202  page = buf_block_get_frame(block);
5203 
5204  btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);
5205 
5206  blob_header = page + offset;
5207  part_len = btr_blob_get_part_len(blob_header);
5208  copy_len = ut_min(part_len, len - copied_len);
5209 
5210  memcpy(buf + copied_len,
5211  blob_header + BTR_BLOB_HDR_SIZE, copy_len);
5212  copied_len += copy_len;
5213 
5214  page_no = btr_blob_get_next_page_no(blob_header);
5215 
5216  mtr_commit(&mtr);
5217 
5218  if (page_no == FIL_NULL || copy_len != part_len) {
5219  UNIV_MEM_ASSERT_RW(buf, copied_len);
5220  return(copied_len);
5221  }
5222 
5223  /* On other BLOB pages except the first the BLOB header
5224  always is at the page data start: */
5225 
5226  offset = FIL_PAGE_DATA;
5227 
5228  ut_ad(copied_len <= len);
5229  }
5230 }
5231 
5232 /*******************************************************************/
5236 static
5237 ulint
5238 btr_copy_zblob_prefix(
5239 /*==================*/
5240  byte* buf,
5242  ulint len,
5243  ulint zip_size,
5244  ulint space_id,
5245  ulint page_no,
5246  ulint offset)
5247 {
5248  ulint page_type = FIL_PAGE_TYPE_ZBLOB;
5249  mem_heap_t* heap;
5250  int err;
5251  z_stream d_stream;
5252 
5253  d_stream.next_out = buf;
5254  d_stream.avail_out = len;
5255  d_stream.next_in = Z_NULL;
5256  d_stream.avail_in = 0;
5257 
5258  /* Zlib inflate needs 32 kilobytes for the default
5259  window size, plus a few kilobytes for small objects. */
5260  heap = mem_heap_create(40000);
5261  page_zip_set_alloc(&d_stream, heap);
5262 
5263  ut_ad(ut_is_2pow(zip_size));
5264  ut_ad(zip_size >= UNIV_ZIP_SIZE_MIN);
5265  ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
5266  ut_ad(space_id);
5267 
5268  err = inflateInit(&d_stream);
5269  ut_a(err == Z_OK);
5270 
5271  for (;;) {
5272  buf_page_t* bpage;
5273  ulint next_page_no;
5274 
5275  /* There is no latch on bpage directly. Instead,
5276  bpage is protected by the B-tree page latch that
5277  is being held on the clustered index record, or,
5278  in row_merge_copy_blobs(), by an exclusive table lock. */
5279  bpage = buf_page_get_zip(space_id, zip_size, page_no);
5280 
5281  if (UNIV_UNLIKELY(!bpage)) {
5282  ut_print_timestamp(stderr);
5283  fprintf(stderr,
5284  " InnoDB: Cannot load"
5285  " compressed BLOB"
5286  " page %lu space %lu\n",
5287  (ulong) page_no, (ulong) space_id);
5288  goto func_exit;
5289  }
5290 
5291  if (UNIV_UNLIKELY
5292  (fil_page_get_type(bpage->zip.data) != page_type)) {
5293  ut_print_timestamp(stderr);
5294  fprintf(stderr,
5295  " InnoDB: Unexpected type %lu of"
5296  " compressed BLOB"
5297  " page %lu space %lu\n",
5298  (ulong) fil_page_get_type(bpage->zip.data),
5299  (ulong) page_no, (ulong) space_id);
5300  ut_ad(0);
5301  goto end_of_blob;
5302  }
5303 
5304  next_page_no = mach_read_from_4(bpage->zip.data + offset);
5305 
5306  if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
5307  /* When the BLOB begins at page header,
5308  the compressed data payload does not
5309  immediately follow the next page pointer. */
5310  offset = FIL_PAGE_DATA;
5311  } else {
5312  offset += 4;
5313  }
5314 
5315  d_stream.next_in = bpage->zip.data + offset;
5316  d_stream.avail_in = zip_size - offset;
5317 
5318  err = inflate(&d_stream, Z_NO_FLUSH);
5319  switch (err) {
5320  case Z_OK:
5321  if (!d_stream.avail_out) {
5322  goto end_of_blob;
5323  }
5324  break;
5325  case Z_STREAM_END:
5326  if (next_page_no == FIL_NULL) {
5327  goto end_of_blob;
5328  }
5329  /* fall through */
5330  default:
5331 inflate_error:
5332  ut_print_timestamp(stderr);
5333  fprintf(stderr,
5334  " InnoDB: inflate() of"
5335  " compressed BLOB"
5336  " page %lu space %lu returned %d (%s)\n",
5337  (ulong) page_no, (ulong) space_id,
5338  err, d_stream.msg);
5339  case Z_BUF_ERROR:
5340  goto end_of_blob;
5341  }
5342 
5343  if (next_page_no == FIL_NULL) {
5344  if (!d_stream.avail_in) {
5345  ut_print_timestamp(stderr);
5346  fprintf(stderr,
5347  " InnoDB: unexpected end of"
5348  " compressed BLOB"
5349  " page %lu space %lu\n",
5350  (ulong) page_no,
5351  (ulong) space_id);
5352  } else {
5353  err = inflate(&d_stream, Z_FINISH);
5354  switch (err) {
5355  case Z_STREAM_END:
5356  case Z_BUF_ERROR:
5357  break;
5358  default:
5359  goto inflate_error;
5360  }
5361  }
5362 
5363 end_of_blob:
5364  buf_page_release_zip(bpage);
5365  goto func_exit;
5366  }
5367 
5368  buf_page_release_zip(bpage);
5369 
5370  /* On other BLOB pages except the first
5371  the BLOB header always is at the page header: */
5372 
5373  page_no = next_page_no;
5374  offset = FIL_PAGE_NEXT;
5375  page_type = FIL_PAGE_TYPE_ZBLOB2;
5376  }
5377 
5378 func_exit:
5379  inflateEnd(&d_stream);
5380  mem_heap_free(heap);
5381  UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
5382  return(d_stream.total_out);
5383 }
5384 
5385 /*******************************************************************/
5390 static
5391 ulint
5392 btr_copy_externally_stored_field_prefix_low(
5393 /*========================================*/
5394  byte* buf,
5396  ulint len,
5397  ulint zip_size,
5399  ulint space_id,
5400  ulint page_no,
5401  ulint offset)
5402 {
5403  if (UNIV_UNLIKELY(len == 0)) {
5404  return(0);
5405  }
5406 
5407  if (zip_size) {
5408  return(btr_copy_zblob_prefix(buf, len, zip_size,
5409  space_id, page_no, offset));
5410  } else {
5411  return(btr_copy_blob_prefix(buf, len, space_id,
5412  page_no, offset));
5413  }
5414 }
5415 
5416 /*******************************************************************/
5421 UNIV_INTERN
5422 ulint
5424 /*====================================*/
5425  byte* buf,
5426  ulint len,
5427  ulint zip_size,
5429  const byte* data,
5433  ulint local_len)
5434 {
5435  ulint space_id;
5436  ulint page_no;
5437  ulint offset;
5438 
5439  ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
5440 
5441  local_len -= BTR_EXTERN_FIELD_REF_SIZE;
5442 
5443  if (UNIV_UNLIKELY(local_len >= len)) {
5444  memcpy(buf, data, len);
5445  return(len);
5446  }
5447 
5448  memcpy(buf, data, local_len);
5449  data += local_len;
5450 
5452 
5453  if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
5454  /* The externally stored part of the column has been
5455  (partially) deleted. Signal the half-deleted BLOB
5456  to the caller. */
5457 
5458  return(0);
5459  }
5460 
5461  space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
5462 
5463  page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
5464 
5465  offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
5466 
5467  return(local_len
5468  + btr_copy_externally_stored_field_prefix_low(buf + local_len,
5469  len - local_len,
5470  zip_size,
5471  space_id, page_no,
5472  offset));
5473 }
5474 
5475 /*******************************************************************/
5479 UNIV_INTERN
5480 byte*
5482 /*=============================*/
5483  ulint* len,
5484  const byte* data,
5488  ulint zip_size,
5490  ulint local_len,
5491  mem_heap_t* heap)
5492 {
5493  ulint space_id;
5494  ulint page_no;
5495  ulint offset;
5496  ulint extern_len;
5497  byte* buf;
5498 
5499  ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
5500 
5501  local_len -= BTR_EXTERN_FIELD_REF_SIZE;
5502 
5503  space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);
5504 
5505  page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);
5506 
5507  offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);
5508 
5509  /* Currently a BLOB cannot be bigger than 4 GB; we
5510  leave the 4 upper bytes in the length field unused */
5511 
5512  extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);
5513 
5514  buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);
5515 
5516  memcpy(buf, data, local_len);
5517  *len = local_len
5518  + btr_copy_externally_stored_field_prefix_low(buf + local_len,
5519  extern_len,
5520  zip_size,
5521  space_id,
5522  page_no, offset);
5523 
5524  return(buf);
5525 }
5526 
5527 /*******************************************************************/
5530 UNIV_INTERN
5531 byte*
5533 /*=================================*/
5534  const rec_t* rec,
5536  const ulint* offsets,
5537  ulint zip_size,
5539  ulint no,
5540  ulint* len,
5541  mem_heap_t* heap)
5542 {
5543  ulint local_len;
5544  const byte* data;
5545 
5546  ut_a(rec_offs_nth_extern(offsets, no));
5547 
5548  /* An externally stored field can contain some initial
5549  data from the field, and in the last 20 bytes it has the
5550  space id, page number, and offset where the rest of the
5551  field data is stored, and the data length in addition to
5552  the data stored locally. We may need to store some data
5553  locally to get the local record length above the 128 byte
5554  limit so that field offsets are stored in two bytes, and
5555  the extern bit is available in those two bytes. */
5556 
5557  data = rec_get_nth_field(rec, offsets, no, &local_len);
5558 
5559  ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
5560 
5561  if (UNIV_UNLIKELY
5562  (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
5564  /* The externally stored field was not written yet.
5565  This record should only be seen by
5566  recv_recovery_rollback_active() or any
5567  TRX_ISO_READ_UNCOMMITTED transactions. */
5568  return(NULL);
5569  }
5570 
5571  return(btr_copy_externally_stored_field(len, data,
5572  zip_size, local_len, heap));
5573 }
5574 #endif /* !UNIV_HOTBACKUP */