MySQL 5.6.14 Source Code Document
buf0flu.cc
1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify it under
6 the terms of the GNU General Public License as published by the Free Software
7 Foundation; version 2 of the License.
8 
9 This program is distributed in the hope that it will be useful, but WITHOUT
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12 
13 You should have received a copy of the GNU General Public License along with
14 this program; if not, write to the Free Software Foundation, Inc.,
15 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
16 
17 *****************************************************************************/
18 
19 /**************************************************/
26 #include "buf0flu.h"
27 
28 #ifdef UNIV_NONINL
29 #include "buf0flu.ic"
30 #endif
31 
32 #include "buf0buf.h"
33 #include "buf0checksum.h"
34 #include "srv0start.h"
35 #include "srv0srv.h"
36 #include "page0zip.h"
37 #ifndef UNIV_HOTBACKUP
38 #include "ut0byte.h"
39 #include "ut0lst.h"
40 #include "page0page.h"
41 #include "fil0fil.h"
42 #include "buf0lru.h"
43 #include "buf0rea.h"
44 #include "ibuf0ibuf.h"
45 #include "log0log.h"
46 #include "os0file.h"
47 #include "trx0sys.h"
48 #include "srv0mon.h"
49 #include "mysql/plugin.h"
50 #include "mysql/service_thd_wait.h"
51 
53 static ulint buf_lru_flush_page_count = 0;
54 
60 UNIV_INTERN ibool buf_page_cleaner_is_active = FALSE;
61 
64 #define PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE 100
65 
66 #ifdef UNIV_PFS_THREAD
67 UNIV_INTERN mysql_pfs_key_t buf_page_cleaner_thread_key;
68 #endif /* UNIV_PFS_THREAD */
69 
74 #define BUF_LRU_MIN_LEN 256
75 
76 /* @} */
77 
78 /******************************************************************/
81 static inline
82 void
83 incr_flush_list_size_in_bytes(
84 /*==========================*/
85  buf_block_t* block,
86  buf_pool_t* buf_pool)
87 {
89  ulint zip_size = page_zip_get_size(&block->page.zip);
90  buf_pool->stat.flush_list_bytes += zip_size ? zip_size : UNIV_PAGE_SIZE;
91  ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size);
92 }
93 
94 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
95 /******************************************************************/
98 static
99 ibool
100 buf_flush_validate_low(
101 /*===================*/
102  buf_pool_t* buf_pool);
104 /******************************************************************/
107 static
108 ibool
109 buf_flush_validate_skip(
110 /*====================*/
111  buf_pool_t* buf_pool)
112 {
114 # define BUF_FLUSH_VALIDATE_SKIP 23
115 
118  static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
119 
120  /* There is a race condition below, but it does not matter,
121  because this call is only for heuristic purposes. We want to
122  reduce the call frequency of the costly buf_flush_validate_low()
123  check in debug builds. */
124  if (--buf_flush_validate_count > 0) {
125  return(TRUE);
126  }
127 
128  buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
129  return(buf_flush_validate_low(buf_pool));
130 }
131 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
132 
133 /*******************************************************************/
135 UNIV_INLINE
136 void
137 buf_flush_set_hp(
138 /*=============*/
139  buf_pool_t* buf_pool,
140  const buf_page_t* bpage)
141 {
142  ut_ad(buf_flush_list_mutex_own(buf_pool));
143  ut_ad(buf_pool->flush_list_hp == NULL || bpage == NULL);
144  ut_ad(!bpage || buf_page_in_file(bpage));
145  ut_ad(!bpage || bpage->in_flush_list);
146  ut_ad(!bpage || buf_pool_from_bpage(bpage) == buf_pool);
147 
148  buf_pool->flush_list_hp = bpage;
149 }
150 
151 /*******************************************************************/
154 UNIV_INLINE
155 bool
156 buf_flush_is_hp(
157 /*============*/
158  buf_pool_t* buf_pool,
159  const buf_page_t* bpage)
160 {
161  ut_ad(buf_flush_list_mutex_own(buf_pool));
162 
163  return(buf_pool->flush_list_hp == bpage);
164 }
165 
166 /*******************************************************************/
172 UNIV_INLINE
173 void
174 buf_flush_update_hp(
175 /*================*/
176  buf_pool_t* buf_pool,
177  buf_page_t* bpage)
178 {
179  ut_ad(buf_flush_list_mutex_own(buf_pool));
180 
181  if (buf_flush_is_hp(buf_pool, bpage)) {
182  buf_flush_set_hp(buf_pool, NULL);
183  MONITOR_INC(MONITOR_FLUSH_HP_RESCAN);
184  }
185 }
186 
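The three helpers above implement a small hazard-pointer protocol for the flush list: the scanning thread publishes the block it intends to visit next, and any thread that removes a block resets the pointer so the scanner knows its saved position is stale and must restart from the tail. A minimal sketch of that handshake, using hypothetical node/list types rather than the real buf_pool_t members:

    /* Illustration only; the real code keeps the pointer in
    buf_pool->flush_list_hp and protects it with the flush list mutex. */
    typedef struct hp_node { struct hp_node* prev; } hp_node_t;
    typedef struct { hp_node_t* hazard; } hp_list_t;

    /* Scanner: publish the next node before releasing the list mutex. */
    static void hp_set(hp_list_t* l, hp_node_t* n) { l->hazard = n; }

    /* Remover: if the unlinked node is the published one, clear the
    pointer so the scanner restarts instead of following a stale link. */
    static void hp_on_remove(hp_list_t* l, hp_node_t* n) {
        if (l->hazard == n) { l->hazard = NULL; }
    }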
187 /******************************************************************/
192 static
193 buf_page_t*
194 buf_flush_insert_in_flush_rbt(
195 /*==========================*/
196  buf_page_t* bpage)
197 {
198  const ib_rbt_node_t* c_node;
199  const ib_rbt_node_t* p_node;
200  buf_page_t* prev = NULL;
201  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
202 
203  ut_ad(buf_flush_list_mutex_own(buf_pool));
204 
205  /* Insert this buffer into the rbt. */
206  c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
207  ut_a(c_node != NULL);
208 
209  /* Get the predecessor. */
210  p_node = rbt_prev(buf_pool->flush_rbt, c_node);
211 
212  if (p_node != NULL) {
213  buf_page_t** value;
214  value = rbt_value(buf_page_t*, p_node);
215  prev = *value;
216  ut_a(prev != NULL);
217  }
218 
219  return(prev);
220 }
221 
222 /*********************************************************/
224 static
225 void
226 buf_flush_delete_from_flush_rbt(
227 /*============================*/
228  buf_page_t* bpage)
229 {
230 #ifdef UNIV_DEBUG
231  ibool ret = FALSE;
232 #endif /* UNIV_DEBUG */
233  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
234 
235  ut_ad(buf_flush_list_mutex_own(buf_pool));
236 
237 #ifdef UNIV_DEBUG
238  ret =
239 #endif /* UNIV_DEBUG */
240  rbt_delete(buf_pool->flush_rbt, &bpage);
241 
242  ut_ad(ret);
243 }
244 
245 /*****************************************************************/
255 static
256 int
257 buf_flush_block_cmp(
258 /*================*/
259  const void* p1,
260  const void* p2)
261 {
262  int ret;
263  const buf_page_t* b1 = *(const buf_page_t**) p1;
264  const buf_page_t* b2 = *(const buf_page_t**) p2;
265 #ifdef UNIV_DEBUG
266  buf_pool_t* buf_pool = buf_pool_from_bpage(b1);
267 #endif /* UNIV_DEBUG */
268 
269  ut_ad(b1 != NULL);
270  ut_ad(b2 != NULL);
271 
272  ut_ad(buf_flush_list_mutex_own(buf_pool));
273 
274  ut_ad(b1->in_flush_list);
275  ut_ad(b2->in_flush_list);
276 
277  if (b2->oldest_modification > b1->oldest_modification) {
278  return(1);
279  } else if (b2->oldest_modification < b1->oldest_modification) {
280  return(-1);
281  }
282 
283  /* If oldest_modification is same then decide on the space. */
284  ret = (int)(b2->space - b1->space);
285 
286  /* Or else decide ordering on the offset field. */
287  return(ret ? ret : (int)(b2->offset - b1->offset));
288 }
289 
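To make the tie-breaking above concrete with hypothetical values: for two blocks with the same oldest_modification, b1 = {space 5, offset 7} and b2 = {space 5, offset 9}, the space difference is 0, so the comparison falls through to the offsets and returns 9 - 7 = 2; ties are therefore broken first on space id and then on page offset.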
290 /********************************************************************/
294 UNIV_INTERN
295 void
296 buf_flush_init_flush_rbt(void)
297 /*==========================*/
298 {
299  ulint i;
300 
301  for (i = 0; i < srv_buf_pool_instances; i++) {
302  buf_pool_t* buf_pool;
303 
304  buf_pool = buf_pool_from_array(i);
305 
306  buf_flush_list_mutex_enter(buf_pool);
307 
308  /* Create red black tree for speedy insertions in flush list. */
309  buf_pool->flush_rbt = rbt_create(
310  sizeof(buf_page_t*), buf_flush_block_cmp);
311 
312  buf_flush_list_mutex_exit(buf_pool);
313  }
314 }
315 
316 /********************************************************************/
318 UNIV_INTERN
319 void
320 buf_flush_free_flush_rbt(void)
321 /*==========================*/
322 {
323  ulint i;
324 
325  for (i = 0; i < srv_buf_pool_instances; i++) {
326  buf_pool_t* buf_pool;
327 
328  buf_pool = buf_pool_from_array(i);
329 
330  buf_flush_list_mutex_enter(buf_pool);
331 
332 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
333  ut_a(buf_flush_validate_low(buf_pool));
334 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
335 
336  rbt_free(buf_pool->flush_rbt);
337  buf_pool->flush_rbt = NULL;
338 
339  buf_flush_list_mutex_exit(buf_pool);
340  }
341 }
342 
343 /********************************************************************/
345 UNIV_INTERN
346 void
347 buf_flush_insert_into_flush_list(
348 /*=============================*/
349  buf_pool_t* buf_pool,
350  buf_block_t* block,
351  lsn_t lsn)
352 {
353  ut_ad(!buf_pool_mutex_own(buf_pool));
354  ut_ad(log_flush_order_mutex_own());
355  ut_ad(mutex_own(&block->mutex));
356 
357  buf_flush_list_mutex_enter(buf_pool);
358 
359  ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
360  || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
361  <= lsn));
362 
363  /* If we are in the recovery then we need to update the flush
364  red-black tree as well. */
365  if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
366  buf_flush_list_mutex_exit(buf_pool);
367  buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
368  return;
369  }
370 
371  ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
372  ut_ad(!block->page.in_flush_list);
373 
374  ut_d(block->page.in_flush_list = TRUE);
375  block->page.oldest_modification = lsn;
376  UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
377  incr_flush_list_size_in_bytes(block, buf_pool);
378 
379 #ifdef UNIV_DEBUG_VALGRIND
380  {
381  ulint zip_size = buf_block_get_zip_size(block);
382 
383  if (zip_size) {
384  UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
385  } else {
386  UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
387  }
388  }
389 #endif /* UNIV_DEBUG_VALGRIND */
390 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
391  ut_a(buf_flush_validate_skip(buf_pool));
392 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
393 
394  buf_flush_list_mutex_exit(buf_pool);
395 }
396 
397 /********************************************************************/
401 UNIV_INTERN
402 void
403 buf_flush_insert_sorted_into_flush_list(
404 /*====================================*/
405  buf_pool_t* buf_pool,
406  buf_block_t* block,
407  lsn_t lsn)
408 {
409  buf_page_t* prev_b;
410  buf_page_t* b;
411 
412  ut_ad(!buf_pool_mutex_own(buf_pool));
413  ut_ad(log_flush_order_mutex_own());
414  ut_ad(mutex_own(&block->mutex));
415  ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
416 
417  buf_flush_list_mutex_enter(buf_pool);
418 
419  /* The field in_LRU_list is protected by buf_pool->mutex, which
420  we are not holding. However, while a block is in the flush
421  list, it is dirty and cannot be discarded, neither from the
422  page_hash nor from the LRU list. At most, the uncompressed
423  page frame of a compressed block may be discarded or created
424  (copying the block->page to or from a buf_page_t that is
425  dynamically allocated from buf_buddy_alloc()). Because those
426  transitions hold block->mutex and the flush list mutex (via
427  buf_flush_relocate_on_flush_list()), there is no possibility
428  of a race condition in the assertions below. */
429  ut_ad(block->page.in_LRU_list);
430  ut_ad(block->page.in_page_hash);
431  /* buf_buddy_block_register() will take a block in the
432  BUF_BLOCK_MEMORY state, not a file page. */
433  ut_ad(!block->page.in_zip_hash);
434 
435  ut_ad(!block->page.in_flush_list);
436  ut_d(block->page.in_flush_list = TRUE);
437  block->page.oldest_modification = lsn;
438 
439 #ifdef UNIV_DEBUG_VALGRIND
440  {
441  ulint zip_size = buf_block_get_zip_size(block);
442 
443  if (zip_size) {
444  UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
445  } else {
446  UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
447  }
448  }
449 #endif /* UNIV_DEBUG_VALGRIND */
450 
451  prev_b = NULL;
452 
453  /* For the most part when this function is called the flush_rbt
454  should not be NULL. In a very rare boundary case it is possible
455  that the flush_rbt has already been freed by the recovery thread
456  before the last page was hooked up in the flush_list by the
457  io-handler thread. In that case we'll just do a simple
458  linear search in the else block. */
459  if (buf_pool->flush_rbt) {
460 
461  prev_b = buf_flush_insert_in_flush_rbt(&block->page);
462 
463  } else {
464 
465  b = UT_LIST_GET_FIRST(buf_pool->flush_list);
466 
467  while (b && b->oldest_modification
468  > block->page.oldest_modification) {
469  ut_ad(b->in_flush_list);
470  prev_b = b;
471  b = UT_LIST_GET_NEXT(list, b);
472  }
473  }
474 
475  if (prev_b == NULL) {
476  UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
477  } else {
478  UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
479  prev_b, &block->page);
480  }
481 
482  incr_flush_list_size_in_bytes(block, buf_pool);
483 
484 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
485  ut_a(buf_flush_validate_low(buf_pool));
486 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
487 
488  buf_flush_list_mutex_exit(buf_pool);
489 }
490 
491 /********************************************************************/
495 UNIV_INTERN
496 ibool
497 buf_flush_ready_for_replace(
498 /*========================*/
499  buf_page_t* bpage)
501 {
502 #ifdef UNIV_DEBUG
503  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
504  ut_ad(buf_pool_mutex_own(buf_pool));
505 #endif
506  ut_ad(mutex_own(buf_page_get_mutex(bpage)));
507  ut_ad(bpage->in_LRU_list);
508 
509  if (UNIV_LIKELY(buf_page_in_file(bpage))) {
510 
511  return(bpage->oldest_modification == 0
512  && buf_page_get_io_fix(bpage) == BUF_IO_NONE
513  && bpage->buf_fix_count == 0);
514  }
515 
516  ut_print_timestamp(stderr);
517  fprintf(stderr,
518  " InnoDB: Error: buffer block state %lu"
519  " in the LRU list!\n",
520  (ulong) buf_page_get_state(bpage));
521  ut_print_buf(stderr, bpage, sizeof(buf_page_t));
522  putc('\n', stderr);
523 
524  return(FALSE);
525 }
526 
527 /********************************************************************/
530 UNIV_INTERN
531 bool
532 buf_flush_ready_for_flush(
533 /*======================*/
534  buf_page_t* bpage,
536  buf_flush_t flush_type)
537 {
538 #ifdef UNIV_DEBUG
539  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
540  ut_ad(buf_pool_mutex_own(buf_pool));
541 #endif /* UNIV_DEBUG */
542 
543  ut_a(buf_page_in_file(bpage));
544  ut_ad(mutex_own(buf_page_get_mutex(bpage)));
545  ut_ad(flush_type < BUF_FLUSH_N_TYPES);
546 
547  if (bpage->oldest_modification == 0
548  || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
549  return(false);
550  }
551 
552  ut_ad(bpage->in_flush_list);
553 
554  switch (flush_type) {
555  case BUF_FLUSH_LIST:
556  return(true);
557 
558  case BUF_FLUSH_LRU:
559  case BUF_FLUSH_SINGLE_PAGE:
560  /* Because any thread may call single page flush, even
561  when owning locks on pages, to avoid deadlocks, we must
562  make sure that it is not buffer fixed.
563  The same holds true for LRU flush because a user thread
564  may end up waiting for an LRU flush to end while
565  holding locks on other pages. */
566  return(bpage->buf_fix_count == 0);
567  case BUF_FLUSH_N_TYPES:
568  break;
569  }
570 
571  ut_error;
572  return(false);
573 }
574 
575 /********************************************************************/
577 UNIV_INTERN
578 void
579 buf_flush_remove(
580 /*=============*/
581  buf_page_t* bpage)
582 {
583  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
584  ulint zip_size;
585 
586  ut_ad(buf_pool_mutex_own(buf_pool));
587  ut_ad(mutex_own(buf_page_get_mutex(bpage)));
588  ut_ad(bpage->in_flush_list);
589 
590  buf_flush_list_mutex_enter(buf_pool);
591 
592  switch (buf_page_get_state(bpage)) {
593  case BUF_BLOCK_POOL_WATCH:
594  case BUF_BLOCK_ZIP_PAGE:
595  /* Clean compressed pages should not be on the flush list */
596  case BUF_BLOCK_NOT_USED:
597  case BUF_BLOCK_READY_FOR_USE:
598  case BUF_BLOCK_MEMORY:
599  case BUF_BLOCK_REMOVE_HASH:
600  ut_error;
601  return;
602  case BUF_BLOCK_ZIP_DIRTY:
603  buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
604  UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
605 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
606  buf_LRU_insert_zip_clean(bpage);
607 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
608  break;
609  case BUF_BLOCK_FILE_PAGE:
610  UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
611  break;
612  }
613 
614  /* If the flush_rbt is active then delete from there as well. */
615  if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
616  buf_flush_delete_from_flush_rbt(bpage);
617  }
618 
619  /* Must be done after we have removed it from the flush_rbt
620  because we assert on in_flush_list in comparison function. */
621  ut_d(bpage->in_flush_list = FALSE);
622 
623  zip_size = page_zip_get_size(&bpage->zip);
624  buf_pool->stat.flush_list_bytes -= zip_size ? zip_size : UNIV_PAGE_SIZE;
625 
626  bpage->oldest_modification = 0;
627 
628 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
629  ut_a(buf_flush_validate_skip(buf_pool));
630 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
631 
632  buf_flush_update_hp(buf_pool, bpage);
633  buf_flush_list_mutex_exit(buf_pool);
634 }
635 
636 /*******************************************************************/
647 UNIV_INTERN
648 void
649 buf_flush_relocate_on_flush_list(
650 /*=============================*/
651  buf_page_t* bpage,
652  buf_page_t* dpage)
653 {
654  buf_page_t* prev;
655  buf_page_t* prev_b = NULL;
656  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
657 
658  ut_ad(buf_pool_mutex_own(buf_pool));
659  /* Must reside in the same buffer pool. */
660  ut_ad(buf_pool == buf_pool_from_bpage(dpage));
661 
662  ut_ad(mutex_own(buf_page_get_mutex(bpage)));
663 
664  buf_flush_list_mutex_enter(buf_pool);
665 
666  /* FIXME: At this point we have both buf_pool and flush_list
667  mutexes. Theoretically removal of a block from flush list is
668  only covered by flush_list mutex but currently we do
669  have buf_pool mutex in buf_flush_remove() therefore this block
670  is guaranteed to be in the flush list. We need to check if
671  this will work without the assumption of block removing code
672  having the buf_pool mutex. */
673  ut_ad(bpage->in_flush_list);
674  ut_ad(dpage->in_flush_list);
675 
676  /* If recovery is active we must swap the control blocks in
677  the flush_rbt as well. */
678  if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
679  buf_flush_delete_from_flush_rbt(bpage);
680  prev_b = buf_flush_insert_in_flush_rbt(dpage);
681  }
682 
683  /* Must be done after we have removed it from the flush_rbt
684  because we assert on in_flush_list in comparison function. */
685  ut_d(bpage->in_flush_list = FALSE);
686 
687  prev = UT_LIST_GET_PREV(list, bpage);
688  UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
689 
690  if (prev) {
691  ut_ad(prev->in_flush_list);
692  UT_LIST_INSERT_AFTER(
693  list,
694  buf_pool->flush_list,
695  prev, dpage);
696  } else {
697  UT_LIST_ADD_FIRST(
698  list,
699  buf_pool->flush_list,
700  dpage);
701  }
702 
703  /* Just an extra check. Previous in flush_list
704  should be the same control block as in flush_rbt. */
705  ut_a(!buf_pool->flush_rbt || prev_b == prev);
706 
707 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
708  ut_a(buf_flush_validate_low(buf_pool));
709 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
710 
711  buf_flush_update_hp(buf_pool, bpage);
712  buf_flush_list_mutex_exit(buf_pool);
713 }
714 
715 /********************************************************************/
717 UNIV_INTERN
718 void
720 /*=====================*/
721  buf_page_t* bpage)
722 {
723  buf_flush_t flush_type;
724  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
725 
726  ut_ad(bpage);
727 
728  buf_flush_remove(bpage);
729 
730  flush_type = buf_page_get_flush_type(bpage);
731  buf_pool->n_flush[flush_type]--;
732 
733  /* fprintf(stderr, "n pending flush %lu\n",
734  buf_pool->n_flush[flush_type]); */
735 
736  if (buf_pool->n_flush[flush_type] == 0
737  && buf_pool->init_flush[flush_type] == FALSE) {
738 
739  /* The running flush batch has ended */
740 
741  os_event_set(buf_pool->no_flush[flush_type]);
742  }
743 
744  buf_dblwr_update(bpage, flush_type);
745 }
746 #endif /* !UNIV_HOTBACKUP */
747 
748 /********************************************************************/
750 UNIV_INTERN
751 void
752 buf_flush_update_zip_checksum(
753 /*==========================*/
754  buf_frame_t* page,
755  ulint zip_size,
756  lsn_t lsn)
757 {
758  ut_a(zip_size > 0);
759 
760  ib_uint32_t checksum = page_zip_calc_checksum(
761  page, zip_size,
762  static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm));
763 
764  mach_write_to_8(page + FIL_PAGE_LSN, lsn);
765  memset(page + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
766  mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
767 }
768 
769 /********************************************************************/
771 UNIV_INTERN
772 void
773 buf_flush_init_for_writing(
774 /*=======================*/
775  byte* page,
776  void* page_zip_,
777  lsn_t newest_lsn)
779 {
780  ib_uint32_t checksum = 0 /* silence bogus gcc warning */;
781 
782  ut_ad(page);
783 
784  if (page_zip_) {
785  page_zip_des_t* page_zip;
786  ulint zip_size;
787 
788  page_zip = static_cast<page_zip_des_t*>(page_zip_);
789  zip_size = page_zip_get_size(page_zip);
790 
791  ut_ad(zip_size);
792  ut_ad(ut_is_2pow(zip_size));
793  ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
794 
795  switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
796  case FIL_PAGE_TYPE_ALLOCATED:
797  case FIL_PAGE_INODE:
798  case FIL_PAGE_IBUF_BITMAP:
799  case FIL_PAGE_TYPE_FSP_HDR:
800  case FIL_PAGE_TYPE_XDES:
801  /* These are essentially uncompressed pages. */
802  memcpy(page_zip->data, page, zip_size);
803  /* fall through */
804  case FIL_PAGE_TYPE_ZBLOB:
805  case FIL_PAGE_TYPE_ZBLOB2:
806  case FIL_PAGE_INDEX:
807 
808  buf_flush_update_zip_checksum(
809  page_zip->data, zip_size, newest_lsn);
810 
811  return;
812  }
813 
814  ut_print_timestamp(stderr);
815  fputs(" InnoDB: ERROR: The compressed page to be written"
816  " seems corrupt:", stderr);
817  ut_print_buf(stderr, page, zip_size);
818  fputs("\nInnoDB: Possibly older version of the page:", stderr);
819  ut_print_buf(stderr, page_zip->data, zip_size);
820  putc('\n', stderr);
821  ut_error;
822  }
823 
824  /* Write the newest modification lsn to the page header and trailer */
825  mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
826 
827  mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
828  newest_lsn);
829 
830  /* Store the new formula checksum */
831 
832  switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
833  case SRV_CHECKSUM_ALGORITHM_CRC32:
834  case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
835  checksum = buf_calc_page_crc32(page);
836  break;
837  case SRV_CHECKSUM_ALGORITHM_INNODB:
838  case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
839  checksum = (ib_uint32_t) buf_calc_page_new_checksum(page);
840  break;
841  case SRV_CHECKSUM_ALGORITHM_NONE:
842  case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
843  checksum = BUF_NO_CHECKSUM_MAGIC;
844  break;
845  /* no default so the compiler will emit a warning if new enum
846  is added and not handled here */
847  }
848 
849  mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
850 
851  /* We overwrite the first 4 bytes of the end lsn field to store
852  the old formula checksum. Since it depends also on the field
853  FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
854  new formula checksum. */
855 
856  if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB
857  || srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB) {
858 
859  checksum = (ib_uint32_t) buf_calc_page_old_checksum(page);
860 
861  /* In other cases we use the value assigned from above.
862  If CRC32 is used then it is faster to use that checksum
863  (calculated above) instead of calculating another one.
864  We can afford to store something other than
865  buf_calc_page_old_checksum() or BUF_NO_CHECKSUM_MAGIC in
866  this field because the file will not be readable by old
867  versions of MySQL/InnoDB anyway (older than MySQL 5.6.3) */
868  }
869 
870  mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
871  checksum);
872 }
873 
874 #ifndef UNIV_HOTBACKUP
875 /********************************************************************/
880 static
881 void
882 buf_flush_write_block_low(
883 /*======================*/
884  buf_page_t* bpage,
885  buf_flush_t flush_type,
886  bool sync)
887 {
888  ulint zip_size = buf_page_get_zip_size(bpage);
889  page_t* frame = NULL;
890 
891 #ifdef UNIV_DEBUG
892  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
893  ut_ad(!buf_pool_mutex_own(buf_pool));
894 #endif
895 
896 #ifdef UNIV_LOG_DEBUG
897  static ibool univ_log_debug_warned;
898 #endif /* UNIV_LOG_DEBUG */
899 
900  ut_ad(buf_page_in_file(bpage));
901 
902  /* We are not holding buf_pool->mutex or block_mutex here.
903  Nevertheless, it is safe to access bpage, because it is
904  io_fixed and oldest_modification != 0. Thus, it cannot be
905  relocated in the buffer pool or removed from flush_list or
906  LRU_list. */
907  ut_ad(!buf_pool_mutex_own(buf_pool));
908  ut_ad(!buf_flush_list_mutex_own(buf_pool));
909  ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
910  ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
911  ut_ad(bpage->oldest_modification != 0);
912 
913 #ifdef UNIV_IBUF_COUNT_DEBUG
914  ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
915 #endif
916  ut_ad(bpage->newest_modification != 0);
917 
918 #ifdef UNIV_LOG_DEBUG
919  if (!univ_log_debug_warned) {
920  univ_log_debug_warned = TRUE;
921  fputs("Warning: cannot force log to disk if"
922  " UNIV_LOG_DEBUG is defined!\n"
923  "Crash recovery will not work!\n",
924  stderr);
925  }
926 #else
927  /* Force the log to the disk before writing the modified block */
928  log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
929 #endif
930  switch (buf_page_get_state(bpage)) {
931  case BUF_BLOCK_POOL_WATCH:
932  case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
933  case BUF_BLOCK_NOT_USED:
934  case BUF_BLOCK_READY_FOR_USE:
935  case BUF_BLOCK_MEMORY:
936  case BUF_BLOCK_REMOVE_HASH:
937  ut_error;
938  break;
939  case BUF_BLOCK_ZIP_DIRTY:
940  frame = bpage->zip.data;
941 
942  ut_a(page_zip_verify_checksum(frame, zip_size));
943 
944  mach_write_to_8(frame + FIL_PAGE_LSN,
945  bpage->newest_modification);
946  memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
947  break;
948  case BUF_BLOCK_FILE_PAGE:
949  frame = bpage->zip.data;
950  if (!frame) {
951  frame = ((buf_block_t*) bpage)->frame;
952  }
953 
954  buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
955  bpage->zip.data
956  ? &bpage->zip : NULL,
957  bpage->newest_modification);
958  break;
959  }
960 
961  if (!srv_use_doublewrite_buf || !buf_dblwr) {
962  fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
963  sync, buf_page_get_space(bpage), zip_size,
964  buf_page_get_page_no(bpage), 0,
965  zip_size ? zip_size : UNIV_PAGE_SIZE,
966  frame, bpage);
967  } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
968  buf_dblwr_write_single_page(bpage, sync);
969  } else {
970  ut_ad(!sync);
971  buf_dblwr_add_to_batch(bpage);
972  }
973 
974  /* When doing single page flushing the IO is done synchronously
975  and we flush the changes to disk only for the tablespace we
976  are working on. */
977  if (sync) {
978  ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE);
979  fil_flush(buf_page_get_space(bpage));
980  buf_page_io_complete(bpage);
981  }
982 
983  /* Increment the counter of I/O operations used
984  for selecting LRU policy. */
985  buf_LRU_stat_inc_io();
986 }
987 
988 /********************************************************************/
995 UNIV_INTERN
996 void
997 buf_flush_page(
998 /*===========*/
999  buf_pool_t* buf_pool,
1000  buf_page_t* bpage,
1001  buf_flush_t flush_type,
1002  bool sync)
1003 {
1004  ib_mutex_t* block_mutex;
1005  ibool is_uncompressed;
1006 
1007  ut_ad(flush_type < BUF_FLUSH_N_TYPES);
1008  ut_ad(buf_pool_mutex_own(buf_pool));
1009  ut_ad(buf_page_in_file(bpage));
1010  ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE);
1011 
1012  block_mutex = buf_page_get_mutex(bpage);
1013  ut_ad(mutex_own(block_mutex));
1014 
1015  ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
1016 
1017  buf_page_set_io_fix(bpage, BUF_IO_WRITE);
1018 
1019  buf_page_set_flush_type(bpage, flush_type);
1020 
1021  if (buf_pool->n_flush[flush_type] == 0) {
1022 
1023  os_event_reset(buf_pool->no_flush[flush_type]);
1024  }
1025 
1026  buf_pool->n_flush[flush_type]++;
1027 
1028  is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
1029  ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
1030 
1031  switch (flush_type) {
1032  ibool is_s_latched;
1033  case BUF_FLUSH_LIST:
1034  /* If the simulated aio thread is not running, we must
1035  not wait for any latch, as we may end up in a deadlock:
1036  if buf_fix_count == 0, then we know we need not wait */
1037 
1038  is_s_latched = (bpage->buf_fix_count == 0);
1039  if (is_s_latched && is_uncompressed) {
1040  rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
1041  BUF_IO_WRITE);
1042  }
1043 
1044  mutex_exit(block_mutex);
1045  buf_pool_mutex_exit(buf_pool);
1046 
1047  /* Even though bpage is not protected by any mutex at
1048  this point, it is safe to access bpage, because it is
1049  io_fixed and oldest_modification != 0. Thus, it
1050  cannot be relocated in the buffer pool or removed from
1051  flush_list or LRU_list. */
1052 
1053  if (!is_s_latched) {
1054  buf_dblwr_flush_buffered_writes();
1055 
1056  if (is_uncompressed) {
1057  rw_lock_s_lock_gen(&((buf_block_t*) bpage)
1058  ->lock, BUF_IO_WRITE);
1059  }
1060  }
1061 
1062  break;
1063 
1064  case BUF_FLUSH_LRU:
1065  case BUF_FLUSH_SINGLE_PAGE:
1066  /* VERY IMPORTANT:
1067  Because any thread may call single page flush, even when
1068  owning locks on pages, to avoid deadlocks, we must make
1069  sure that the s-lock is acquired on the page without
1070  waiting: this is accomplished because
1071  buf_flush_ready_for_flush() must hold, and that requires
1072  the page not to be bufferfixed.
1073  The same holds true for LRU flush because a user thread
1074  may end up waiting for an LRU flush to end while
1075  holding locks on other pages. */
1076 
1077  if (is_uncompressed) {
1078  rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
1079  BUF_IO_WRITE);
1080  }
1081 
1082  /* Note that the s-latch is acquired before releasing the
1083  buf_pool mutex: this ensures that the latch is acquired
1084  immediately. */
1085 
1086  mutex_exit(block_mutex);
1087  buf_pool_mutex_exit(buf_pool);
1088  break;
1089 
1090  default:
1091  ut_error;
1092  }
1093 
1094  /* Even though bpage is not protected by any mutex at this
1095  point, it is safe to access bpage, because it is io_fixed and
1096  oldest_modification != 0. Thus, it cannot be relocated in the
1097  buffer pool or removed from flush_list or LRU_list. */
1098 
1099 #ifdef UNIV_DEBUG
1100  if (buf_debug_prints) {
1101  fprintf(stderr,
1102  "Flushing %u space %u page %u\n",
1103  flush_type, bpage->space, bpage->offset);
1104  }
1105 #endif /* UNIV_DEBUG */
1106  buf_flush_write_block_low(bpage, flush_type, sync);
1107 }
1108 
1109 # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
1110 /********************************************************************/
1116 UNIV_INTERN
1117 ibool
1118 buf_flush_page_try(
1119 /*===============*/
1120  buf_pool_t* buf_pool,
1121  buf_block_t* block)
1122 {
1123  ut_ad(buf_pool_mutex_own(buf_pool));
1124  ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
1125  ut_ad(mutex_own(&block->mutex));
1126 
1127  if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) {
1128  return(FALSE);
1129  }
1130 
1131  /* The following call will release the buffer pool and
1132  block mutex. */
1133  buf_flush_page(buf_pool, &block->page, BUF_FLUSH_SINGLE_PAGE, true);
1134  return(TRUE);
1135 }
1136 # endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
1137 /***********************************************************/
1140 static
1141 bool
1142 buf_flush_check_neighbor(
1143 /*=====================*/
1144  ulint space,
1145  ulint offset,
1146  buf_flush_t flush_type)
1148 {
1149  buf_page_t* bpage;
1150  buf_pool_t* buf_pool = buf_pool_get(space, offset);
1151  bool ret;
1152 
1153  ut_ad(flush_type == BUF_FLUSH_LRU
1154  || flush_type == BUF_FLUSH_LIST);
1155 
1156  buf_pool_mutex_enter(buf_pool);
1157 
1158  /* We only want to flush pages from this buffer pool. */
1159  bpage = buf_page_hash_get(buf_pool, space, offset);
1160 
1161  if (!bpage) {
1162 
1163  buf_pool_mutex_exit(buf_pool);
1164  return(false);
1165  }
1166 
1167  ut_a(buf_page_in_file(bpage));
1168 
1169  /* We avoid flushing 'non-old' blocks in an LRU flush,
1170  because the flushed blocks are soon freed */
1171 
1172  ret = false;
1173  if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) {
1174  ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
1175 
1176  mutex_enter(block_mutex);
1177  if (buf_flush_ready_for_flush(bpage, flush_type)) {
1178  ret = true;
1179  }
1180  mutex_exit(block_mutex);
1181  }
1182  buf_pool_mutex_exit(buf_pool);
1183 
1184  return(ret);
1185 }
1186 
1187 /***********************************************************/
1190 static
1191 ulint
1192 buf_flush_try_neighbors(
1193 /*====================*/
1194  ulint space,
1195  ulint offset,
1196  buf_flush_t flush_type,
1198  ulint n_flushed,
1200  ulint n_to_flush)
1202 {
1203  ulint i;
1204  ulint low;
1205  ulint high;
1206  ulint count = 0;
1207  buf_pool_t* buf_pool = buf_pool_get(space, offset);
1208 
1209  ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1210 
1211  if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
1212  || srv_flush_neighbors == 0) {
1213  /* If there is little space or neighbor flushing is
1214  not enabled then just flush the victim. */
1215  low = offset;
1216  high = offset + 1;
1217  } else {
1218  /* When flushed, dirty blocks are searched in
1219  neighborhoods of this size, and flushed along with the
1220  original page. */
1221 
1222  ulint buf_flush_area;
1223 
1224  buf_flush_area = ut_min(
1225  BUF_READ_AHEAD_AREA(buf_pool),
1226  buf_pool->curr_size / 16);
1227 
1228  low = (offset / buf_flush_area) * buf_flush_area;
1229  high = (offset / buf_flush_area + 1) * buf_flush_area;
1230 
1231  if (srv_flush_neighbors == 1) {
1232  /* adjust 'low' and 'high' to limit
1233  for contiguous dirty area */
1234  if (offset > low) {
1235  for (i = offset - 1;
1236  i >= low
1237  && buf_flush_check_neighbor(
1238  space, i, flush_type);
1239  i--) {
1240  /* do nothing */
1241  }
1242  low = i + 1;
1243  }
1244 
1245  for (i = offset + 1;
1246  i < high
1247  && buf_flush_check_neighbor(
1248  space, i, flush_type);
1249  i++) {
1250  /* do nothing */
1251  }
1252  high = i;
1253  }
1254  }
1255 
1256  /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
1257 
1258  if (high > fil_space_get_size(space)) {
1259  high = fil_space_get_size(space);
1260  }
1261 
1262  for (i = low; i < high; i++) {
1263 
1264  buf_page_t* bpage;
1265 
1266  if ((count + n_flushed) >= n_to_flush) {
1267 
1268  /* We have already flushed enough pages and
1269  should call it a day. There is, however, one
1270  exception. If the page whose neighbors we
1271  are flushing has not been flushed yet then
1272  we'll try to flush the victim that we
1273  selected originally. */
1274  if (i <= offset) {
1275  i = offset;
1276  } else {
1277  break;
1278  }
1279  }
1280 
1281  buf_pool = buf_pool_get(space, i);
1282 
1283  buf_pool_mutex_enter(buf_pool);
1284 
1285  /* We only want to flush pages from this buffer pool. */
1286  bpage = buf_page_hash_get(buf_pool, space, i);
1287 
1288  if (!bpage) {
1289 
1290  buf_pool_mutex_exit(buf_pool);
1291  continue;
1292  }
1293 
1294  ut_a(buf_page_in_file(bpage));
1295 
1296  /* We avoid flushing 'non-old' blocks in an LRU flush,
1297  because the flushed blocks are soon freed */
1298 
1299  if (flush_type != BUF_FLUSH_LRU
1300  || i == offset
1301  || buf_page_is_old(bpage)) {
1302  ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
1303 
1304  mutex_enter(block_mutex);
1305 
1306  if (buf_flush_ready_for_flush(bpage, flush_type)
1307  && (i == offset || !bpage->buf_fix_count)) {
1308  /* We only try to flush those
1309  neighbors != offset where the buf fix
1310  count is zero, as we then know that we
1311  probably can latch the page without a
1312  semaphore wait. Semaphore waits are
1313  expensive because we must flush the
1314  doublewrite buffer before we start
1315  waiting. */
1316 
1317  buf_flush_page(buf_pool, bpage, flush_type, false);
1318  ut_ad(!mutex_own(block_mutex));
1319  ut_ad(!buf_pool_mutex_own(buf_pool));
1320  count++;
1321  continue;
1322  } else {
1323  mutex_exit(block_mutex);
1324  }
1325  }
1326  buf_pool_mutex_exit(buf_pool);
1327  }
1328 
1329  if (count > 0) {
1330  MONITOR_INC_VALUE_CUMULATIVE(
1331  MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
1332  MONITOR_FLUSH_NEIGHBOR_COUNT,
1333  MONITOR_FLUSH_NEIGHBOR_PAGES,
1334  (count - 1));
1335  }
1336 
1337  return(count);
1338 }
1339 
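As a worked example of the neighborhood computation above (numbers are hypothetical): with buf_flush_area = 64 and a victim at offset = 1000, low = (1000 / 64) * 64 = 960 and high = (1000 / 64 + 1) * 64 = 1024, so up to 64 contiguous pages around the victim are considered; with srv_flush_neighbors == 1 that window is then shrunk to the contiguous run of dirty pages containing the victim.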
1340 /********************************************************************/
1347 static
1348 ibool
1349 buf_flush_page_and_try_neighbors(
1350 /*=============================*/
1351  buf_page_t* bpage,
1354  buf_flush_t flush_type,
1356  ulint n_to_flush,
1358  ulint* count)
1360 {
1361  ib_mutex_t* block_mutex;
1362  ibool flushed = FALSE;
1363 #ifdef UNIV_DEBUG
1364  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1365 #endif /* UNIV_DEBUG */
1366 
1367  ut_ad(buf_pool_mutex_own(buf_pool));
1368 
1369  block_mutex = buf_page_get_mutex(bpage);
1370  mutex_enter(block_mutex);
1371 
1372  ut_a(buf_page_in_file(bpage));
1373 
1374  if (buf_flush_ready_for_flush(bpage, flush_type)) {
1375  ulint space;
1376  ulint offset;
1377  buf_pool_t* buf_pool;
1378 
1379  buf_pool = buf_pool_from_bpage(bpage);
1380 
1381  buf_pool_mutex_exit(buf_pool);
1382 
1383  /* These fields are protected by both the
1384  buffer pool mutex and block mutex. */
1385  space = buf_page_get_space(bpage);
1386  offset = buf_page_get_page_no(bpage);
1387 
1388  mutex_exit(block_mutex);
1389 
1390  /* Try to flush also all the neighbors */
1391  *count += buf_flush_try_neighbors(space,
1392  offset,
1393  flush_type,
1394  *count,
1395  n_to_flush);
1396 
1397  buf_pool_mutex_enter(buf_pool);
1398  flushed = TRUE;
1399  } else {
1400  mutex_exit(block_mutex);
1401  }
1402 
1403  ut_ad(buf_pool_mutex_own(buf_pool));
1404 
1405  return(flushed);
1406 }
1407 
1408 /*******************************************************************/
1417 static
1418 ulint
1419 buf_free_from_unzip_LRU_list_batch(
1420 /*===============================*/
1421  buf_pool_t* buf_pool,
1422  ulint max)
1424 {
1425  buf_block_t* block;
1426  ulint scanned = 0;
1427  ulint count = 0;
1428  ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
1429  ulint lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1430 
1431  ut_ad(buf_pool_mutex_own(buf_pool));
1432 
1433  block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1434  while (block != NULL && count < max
1435  && free_len < srv_LRU_scan_depth
1436  && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
1437 
1438  ++scanned;
1439  if (buf_LRU_free_page(&block->page, false)) {
1440  /* Block was freed. buf_pool->mutex potentially
1441  released and reacquired */
1442  ++count;
1443  block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1444 
1445  } else {
1446 
1447  block = UT_LIST_GET_PREV(unzip_LRU, block);
1448  }
1449 
1450  free_len = UT_LIST_GET_LEN(buf_pool->free);
1451  lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1452  }
1453 
1454  ut_ad(buf_pool_mutex_own(buf_pool));
1455 
1456  if (scanned) {
1457  MONITOR_INC_VALUE_CUMULATIVE(
1458  MONITOR_LRU_BATCH_SCANNED,
1459  MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1460  MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1461  scanned);
1462  }
1463 
1464  return(count);
1465 }
1466 
1467 /*******************************************************************/
1474 static
1475 ulint
1476 buf_flush_LRU_list_batch(
1477 /*=====================*/
1478  buf_pool_t* buf_pool,
1479  ulint max)
1481 {
1482  buf_page_t* bpage;
1483  ulint scanned = 0;
1484  ulint count = 0;
1485  ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
1486  ulint lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1487 
1488  ut_ad(buf_pool_mutex_own(buf_pool));
1489 
1490  bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1491  while (bpage != NULL && count < max
1492  && free_len < srv_LRU_scan_depth
1493  && lru_len > BUF_LRU_MIN_LEN) {
1494 
1495  ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
1496  ibool evict;
1497 
1498  mutex_enter(block_mutex);
1499  evict = buf_flush_ready_for_replace(bpage);
1500  mutex_exit(block_mutex);
1501 
1502  ++scanned;
1503 
1504  /* If the block is ready to be replaced we try to
1505  free it i.e.: put it on the free list.
1506  Otherwise we try to flush the block and its
1507  neighbors. In this case we'll put it on the
1508  free list in the next pass. We do this extra work
1509  of putting blocks to the free list instead of
1510  just flushing them because after every flush
1511  we have to restart the scan from the tail of
1512  the LRU list and if we don't clear the tail
1513  of the flushed pages then the scan becomes
1514  O(n*n). */
1515  if (evict) {
1516  if (buf_LRU_free_page(bpage, true)) {
1517  /* buf_pool->mutex was potentially
1518  released and reacquired. */
1519  bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1520  } else {
1521  bpage = UT_LIST_GET_PREV(LRU, bpage);
1522  }
1523  } else if (buf_flush_page_and_try_neighbors(
1524  bpage,
1525  BUF_FLUSH_LRU, max, &count)) {
1526 
1527  /* buf_pool->mutex was released.
1528  Restart the scan. */
1529  bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1530  } else {
1531  bpage = UT_LIST_GET_PREV(LRU, bpage);
1532  }
1533 
1534  free_len = UT_LIST_GET_LEN(buf_pool->free);
1535  lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1536  }
1537 
1538  /* We keep track of all flushes happening as part of LRU
1539  flush. When estimating the desired rate at which flush_list
1540  should be flushed, we factor in this value. */
1541  buf_lru_flush_page_count += count;
1542 
1543  ut_ad(buf_pool_mutex_own(buf_pool));
1544 
1545  if (scanned) {
1546  MONITOR_INC_VALUE_CUMULATIVE(
1547  MONITOR_LRU_BATCH_SCANNED,
1548  MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1549  MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1550  scanned);
1551  }
1552 
1553  return(count);
1554 }
1555 
1556 /*******************************************************************/
1562 static
1563 ulint
1564 buf_do_LRU_batch(
1565 /*=============*/
1566  buf_pool_t* buf_pool,
1567  ulint max)
1569 {
1570  ulint count = 0;
1571 
1572  if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
1573  count += buf_free_from_unzip_LRU_list_batch(buf_pool, max);
1574  }
1575 
1576  if (max > count) {
1577  count += buf_flush_LRU_list_batch(buf_pool, max - count);
1578  }
1579 
1580  return(count);
1581 }
1582 
1583 /*******************************************************************/
1589 static
1590 ulint
1591 buf_do_flush_list_batch(
1592 /*====================*/
1593  buf_pool_t* buf_pool,
1594  ulint min_n,
1598  lsn_t lsn_limit)
1603 {
1604  ulint count = 0;
1605  ulint scanned = 0;
1606 
1607  ut_ad(buf_pool_mutex_own(buf_pool));
1608 
1609  /* Start from the end of the list looking for a suitable
1610  block to be flushed. */
1611  buf_flush_list_mutex_enter(buf_pool);
1612  ulint len = UT_LIST_GET_LEN(buf_pool->flush_list);
1613 
1614  /* In order not to degenerate this scan to O(n*n) we attempt
1615  to preserve pointer of previous block in the flush list. To do
1616  so we declare it a hazard pointer. Any thread working on the
1617  flush list must check the hazard pointer and if it is removing
1618  the same block then it must reset it. */
1619  for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
1620  count < min_n && bpage != NULL && len > 0
1621  && bpage->oldest_modification < lsn_limit;
1622  ++scanned) {
1623 
1624  buf_page_t* prev;
1625 
1626  ut_a(bpage->oldest_modification > 0);
1627  ut_ad(bpage->in_flush_list);
1628 
1629  prev = UT_LIST_GET_PREV(list, bpage);
1630  buf_flush_set_hp(buf_pool, prev);
1631 
1632  buf_flush_list_mutex_exit(buf_pool);
1633 
1634 #ifdef UNIV_DEBUG
1635  bool flushed =
1636 #endif /* UNIV_DEBUG */
1637  buf_flush_page_and_try_neighbors(
1638  bpage, BUF_FLUSH_LIST, min_n, &count);
1639 
1640  buf_flush_list_mutex_enter(buf_pool);
1641 
1642  ut_ad(flushed || buf_flush_is_hp(buf_pool, prev));
1643 
1644  if (!buf_flush_is_hp(buf_pool, prev)) {
1645  /* The hazard pointer was reset by some other
1646  thread. Restart the scan. */
1647  ut_ad(buf_flush_is_hp(buf_pool, NULL));
1648  bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
1649  len = UT_LIST_GET_LEN(buf_pool->flush_list);
1650  } else {
1651  bpage = prev;
1652  --len;
1653  buf_flush_set_hp(buf_pool, NULL);
1654  }
1655 
1656  ut_ad(!bpage || bpage->in_flush_list);
1657  }
1658 
1659  buf_flush_list_mutex_exit(buf_pool);
1660 
1661  MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
1662  MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
1663  MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
1664  scanned);
1665 
1666  ut_ad(buf_pool_mutex_own(buf_pool));
1667 
1668  return(count);
1669 }
1670 
1671 /*******************************************************************/
1678 static
1679 ulint
1680 buf_flush_batch(
1681 /*============*/
1682  buf_pool_t* buf_pool,
1683  buf_flush_t flush_type,
1687  ulint min_n,
1690  lsn_t lsn_limit)
1695 {
1696  ulint count = 0;
1697 
1698  ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1699 #ifdef UNIV_SYNC_DEBUG
1700  ut_ad((flush_type != BUF_FLUSH_LIST)
1701  || sync_thread_levels_empty_except_dict());
1702 #endif /* UNIV_SYNC_DEBUG */
1703 
1704  buf_pool_mutex_enter(buf_pool);
1705 
1706  /* Note: The buffer pool mutex is released and reacquired within
1707  the flush functions. */
1708  switch (flush_type) {
1709  case BUF_FLUSH_LRU:
1710  count = buf_do_LRU_batch(buf_pool, min_n);
1711  break;
1712  case BUF_FLUSH_LIST:
1713  count = buf_do_flush_list_batch(buf_pool, min_n, lsn_limit);
1714  break;
1715  default:
1716  ut_error;
1717  }
1718 
1719  buf_pool_mutex_exit(buf_pool);
1720 
1721 #ifdef UNIV_DEBUG
1722  if (buf_debug_prints && count > 0) {
1723  fprintf(stderr, flush_type == BUF_FLUSH_LRU
1724  ? "Flushed %lu pages in LRU flush\n"
1725  : "Flushed %lu pages in flush list flush\n",
1726  (ulong) count);
1727  }
1728 #endif /* UNIV_DEBUG */
1729 
1730  return(count);
1731 }
1732 
1733 /******************************************************************/
1735 static
1736 void
1737 buf_flush_common(
1738 /*=============*/
1739  buf_flush_t flush_type,
1740  ulint page_count)
1741 {
1742  buf_dblwr_flush_buffered_writes();
1743 
1744  ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1745 
1746 #ifdef UNIV_DEBUG
1747  if (buf_debug_prints && page_count > 0) {
1748  fprintf(stderr, flush_type == BUF_FLUSH_LRU
1749  ? "Flushed %lu pages in LRU flush\n"
1750  : "Flushed %lu pages in flush list flush\n",
1751  (ulong) page_count);
1752  }
1753 #endif /* UNIV_DEBUG */
1754 
1755  srv_stats.buf_pool_flushed.add(page_count);
1756 }
1757 
1758 /******************************************************************/
1760 static
1761 ibool
1762 buf_flush_start(
1763 /*============*/
1764  buf_pool_t* buf_pool,
1765  buf_flush_t flush_type)
1767 {
1768  buf_pool_mutex_enter(buf_pool);
1769 
1770  if (buf_pool->n_flush[flush_type] > 0
1771  || buf_pool->init_flush[flush_type] == TRUE) {
1772 
1773  /* There is already a flush batch of the same type running */
1774 
1775  buf_pool_mutex_exit(buf_pool);
1776 
1777  return(FALSE);
1778  }
1779 
1780  buf_pool->init_flush[flush_type] = TRUE;
1781 
1782  buf_pool_mutex_exit(buf_pool);
1783 
1784  return(TRUE);
1785 }
1786 
1787 /******************************************************************/
1789 static
1790 void
1791 buf_flush_end(
1792 /*==========*/
1793  buf_pool_t* buf_pool,
1794  buf_flush_t flush_type)
1796 {
1797  buf_pool_mutex_enter(buf_pool);
1798 
1799  buf_pool->init_flush[flush_type] = FALSE;
1800 
1801  buf_pool->try_LRU_scan = TRUE;
1802 
1803  if (buf_pool->n_flush[flush_type] == 0) {
1804 
1805  /* The running flush batch has ended */
1806 
1807  os_event_set(buf_pool->no_flush[flush_type]);
1808  }
1809 
1810  buf_pool_mutex_exit(buf_pool);
1811 }
1812 
1813 /******************************************************************/
1815 UNIV_INTERN
1816 void
1817 buf_flush_wait_batch_end(
1818 /*=====================*/
1819  buf_pool_t* buf_pool,
1820  buf_flush_t type)
1822 {
1823  ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
1824 
1825  if (buf_pool == NULL) {
1826  ulint i;
1827 
1828  for (i = 0; i < srv_buf_pool_instances; ++i) {
1829  buf_pool_t* buf_pool;
1830 
1831  buf_pool = buf_pool_from_array(i);
1832 
1833  thd_wait_begin(NULL, THD_WAIT_DISKIO);
1834  os_event_wait(buf_pool->no_flush[type]);
1835  thd_wait_end(NULL);
1836  }
1837  } else {
1838  thd_wait_begin(NULL, THD_WAIT_DISKIO);
1839  os_event_wait(buf_pool->no_flush[type]);
1840  thd_wait_end(NULL);
1841  }
1842 }
1843 
1844 /*******************************************************************/
1851 static
1852 bool
1853 buf_flush_LRU(
1854 /*==========*/
1855  buf_pool_t* buf_pool,
1856  ulint min_n,
1859  ulint* n_processed)
1862 {
1863  ulint page_count;
1864 
1865  if (n_processed) {
1866  *n_processed = 0;
1867  }
1868 
1869  if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
1870  return(false);
1871  }
1872 
1873  page_count = buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0);
1874 
1875  buf_flush_end(buf_pool, BUF_FLUSH_LRU);
1876 
1877  buf_flush_common(BUF_FLUSH_LRU, page_count);
1878 
1879  if (n_processed) {
1880  *n_processed = page_count;
1881  }
1882 
1883  return(true);
1884 }
1885 
1886 /*******************************************************************/
1893 UNIV_INTERN
1894 bool
1895 buf_flush_list(
1896 /*===========*/
1897  ulint min_n,
1900  lsn_t lsn_limit,
1905  ulint* n_processed)
1909 {
1910  ulint i;
1911  bool success = true;
1912 
1913  if (n_processed) {
1914  *n_processed = 0;
1915  }
1916 
1917  if (min_n != ULINT_MAX) {
1918  /* Ensure that flushing is spread evenly amongst the
1919  buffer pool instances. When min_n is ULINT_MAX
1920  we need to flush everything up to the lsn limit
1921  so no limit here. */
1922  min_n = (min_n + srv_buf_pool_instances - 1)
1923  / srv_buf_pool_instances;
1924  }
1925 
1926  /* Flush to lsn_limit in all buffer pool instances */
1927  for (i = 0; i < srv_buf_pool_instances; i++) {
1928  buf_pool_t* buf_pool;
1929  ulint page_count = 0;
1930 
1931  buf_pool = buf_pool_from_array(i);
1932 
1933  if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) {
1934  /* We have two choices here. If lsn_limit was
1935  specified then skipping an instance of buffer
1936  pool means we cannot guarantee that all pages
1937  up to lsn_limit have been flushed. We can
1938  return right now with failure or we can try
1939  to flush remaining buffer pools up to the
1940  lsn_limit. We attempt to flush other buffer
1941  pools based on the assumption that it will
1942  help in the retry which will follow the
1943  failure. */
1944  success = false;
1945 
1946  continue;
1947  }
1948 
1949  page_count = buf_flush_batch(
1950  buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit);
1951 
1952  buf_flush_end(buf_pool, BUF_FLUSH_LIST);
1953 
1954  buf_flush_common(BUF_FLUSH_LIST, page_count);
1955 
1956  if (n_processed) {
1957  *n_processed += page_count;
1958  }
1959 
1960  if (page_count) {
1961  MONITOR_INC_VALUE_CUMULATIVE(
1962  MONITOR_FLUSH_BATCH_TOTAL_PAGE,
1963  MONITOR_FLUSH_BATCH_COUNT,
1964  MONITOR_FLUSH_BATCH_PAGES,
1965  page_count);
1966  }
1967  }
1968 
1969  return(success);
1970 }
1971 
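The rounding-up division above spreads the request across instances. For example (hypothetical numbers), a request for min_n = 100 pages with srv_buf_pool_instances = 8 becomes (100 + 7) / 8 = 13 pages per instance, slightly over-flushing rather than under-flushing, while min_n = ULINT_MAX is passed through unchanged so every instance flushes up to lsn_limit.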
1972 /******************************************************************/
1980 UNIV_INTERN
1981 ibool
1982 buf_flush_single_page_from_LRU(
1983 /*===========================*/
1984  buf_pool_t* buf_pool)
1985 {
1986  ulint scanned;
1987  buf_page_t* bpage;
1988  ib_mutex_t* block_mutex;
1989  ibool freed;
1990  bool evict_zip;
1991 
1992  buf_pool_mutex_enter(buf_pool);
1993 
1994  for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), scanned = 1;
1995  bpage != NULL;
1996  bpage = UT_LIST_GET_PREV(LRU, bpage), ++scanned) {
1997 
1998  block_mutex = buf_page_get_mutex(bpage);
1999  mutex_enter(block_mutex);
2000  if (buf_flush_ready_for_flush(bpage,
2001  BUF_FLUSH_SINGLE_PAGE)) {
2002  /* buf_flush_page() will release the block
2003  mutex */
2004  break;
2005  }
2006  mutex_exit(block_mutex);
2007  }
2008 
2009  MONITOR_INC_VALUE_CUMULATIVE(
2010  MONITOR_LRU_SINGLE_FLUSH_SCANNED,
2011  MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
2012  MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
2013  scanned);
2014 
2015  if (!bpage) {
2016  /* Can't find a single flushable page. */
2017  buf_pool_mutex_exit(buf_pool);
2018  return(FALSE);
2019  }
2020 
2021  /* The following call will release the buffer pool and
2022  block mutex. */
2023  buf_flush_page(buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true);
2024 
2025  /* At this point the page has been written to the disk.
2026  As we are not holding the buffer pool or block mutex,
2027  we cannot use the bpage safely. It may have been plucked out
2028  of the LRU list by some other thread or it may even have
2029  been relocated in case of a compressed page. We need to start
2030  the scan of LRU list again to remove the block from the LRU
2031  list and put it on the free list. */
2032  buf_pool_mutex_enter(buf_pool);
2033 
2034  for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
2035  bpage != NULL;
2036  bpage = UT_LIST_GET_PREV(LRU, bpage)) {
2037 
2038  ibool ready;
2039 
2040  block_mutex = buf_page_get_mutex(bpage);
2041  mutex_enter(block_mutex);
2042  ready = buf_flush_ready_for_replace(bpage);
2043  mutex_exit(block_mutex);
2044  if (ready) {
2045  break;
2046  }
2047 
2048  }
2049 
2050  if (!bpage) {
2051  /* Can't find a single replaceable page. */
2052  buf_pool_mutex_exit(buf_pool);
2053  return(FALSE);
2054  }
2055 
2056  evict_zip = !buf_LRU_evict_from_unzip_LRU(buf_pool);
2057 
2058  freed = buf_LRU_free_page(bpage, evict_zip);
2059  buf_pool_mutex_exit(buf_pool);
2060 
2061  return(freed);
2062 }
2063 
2064 /*********************************************************************/
2071 UNIV_INTERN
2072 ulint
2073 buf_flush_LRU_tail(void)
2074 /*====================*/
2075 {
2076  ulint total_flushed = 0;
2077 
2078  for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2079 
2080  buf_pool_t* buf_pool = buf_pool_from_array(i);
2081  ulint scan_depth;
2082 
2083  /* srv_LRU_scan_depth can be arbitrarily large value.
2084  We cap it with current LRU size. */
2085  buf_pool_mutex_enter(buf_pool);
2086  scan_depth = UT_LIST_GET_LEN(buf_pool->LRU);
2087  buf_pool_mutex_exit(buf_pool);
2088 
2089  scan_depth = ut_min(srv_LRU_scan_depth, scan_depth);
2090 
2091  /* We divide LRU flush into smaller chunks because
2092  there may be user threads waiting for the flush to
2093  end in buf_LRU_get_free_block(). */
2094  for (ulint j = 0;
2095  j < scan_depth;
2096  j += PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE) {
2097 
2098  ulint n_flushed = 0;
2099 
2100  /* Currently page_cleaner is the only thread
2101  that can trigger an LRU flush. It is possible
2102  that a batch triggered during last iteration is
2103  still running. */
2104  if (buf_flush_LRU(buf_pool,
2105  PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE,
2106  &n_flushed)) {
2107 
2108  /* Allowed only one batch per
2109  buffer pool instance. */
2110  buf_flush_wait_batch_end(
2111  buf_pool, BUF_FLUSH_LRU);
2112  }
2113 
2114  if (n_flushed) {
2115  total_flushed += n_flushed;
2116  } else {
2117  /* Nothing to flush */
2118  break;
2119  }
2120  }
2121  }
2122 
2123  if (total_flushed) {
2124  MONITOR_INC_VALUE_CUMULATIVE(
2125  MONITOR_LRU_BATCH_TOTAL_PAGE,
2126  MONITOR_LRU_BATCH_COUNT,
2127  MONITOR_LRU_BATCH_PAGES,
2128  total_flushed);
2129  }
2130 
2131  return(total_flushed);
2132 }
2133 
2134 /*********************************************************************/
2136 UNIV_INTERN
2137 void
2138 buf_flush_wait_LRU_batch_end(void)
2139 /*==============================*/
2140 {
2141  for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2142  buf_pool_t* buf_pool;
2143 
2144  buf_pool = buf_pool_from_array(i);
2145 
2146  buf_pool_mutex_enter(buf_pool);
2147 
2148  if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
2149  || buf_pool->init_flush[BUF_FLUSH_LRU]) {
2150 
2151  buf_pool_mutex_exit(buf_pool);
2152  buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
2153  } else {
2154  buf_pool_mutex_exit(buf_pool);
2155  }
2156  }
2157 }
2158 
2159 /*********************************************************************/
2163 static
2164 ulint
2165 page_cleaner_do_flush_batch(
2166 /*========================*/
2167  ulint n_to_flush,
2169  lsn_t lsn_limit)
2171 {
2172  ulint n_flushed;
2173 
2174  buf_flush_list(n_to_flush, lsn_limit, &n_flushed);
2175 
2176  return(n_flushed);
2177 }
2178 
2179 /*********************************************************************/
2183 static
2184 ulint
2185 af_get_pct_for_dirty()
2186 /*==================*/
2187 {
2188  ulint dirty_pct = buf_get_modified_ratio_pct();
2189 
2190  ut_a(srv_max_dirty_pages_pct_lwm
2191  <= srv_max_buf_pool_modified_pct);
2192 
2193  if (srv_max_dirty_pages_pct_lwm == 0) {
2194  /* The user has not set the option to preflush dirty
2195  pages as we approach the high water mark. */
2196  if (dirty_pct > srv_max_buf_pool_modified_pct) {
2197  /* We have crossed the high water mark of dirty
2198  pages. In this case we start flushing at 100% of
2199  innodb_io_capacity. */
2200  return(100);
2201  }
2202  } else if (dirty_pct > srv_max_dirty_pages_pct_lwm) {
2203  /* We should start flushing pages gradually. */
2204  return((dirty_pct * 100)
2205  / (srv_max_buf_pool_modified_pct + 1));
2206  }
2207 
2208  return(0);
2209 }
2210 
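A worked example of the gradual case (hypothetical settings): with srv_max_dirty_pages_pct_lwm = 10, srv_max_buf_pool_modified_pct = 75 and a measured dirty_pct of 60, the function returns (60 * 100) / (75 + 1) = 78, i.e. roughly 78% of innodb_io_capacity is requested well before the hard limit is reached.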
2211 /*********************************************************************/
2214 static
2215 ulint
2216 af_get_pct_for_lsn(
2217 /*===============*/
2218  lsn_t age)
2219 {
2220  lsn_t max_async_age;
2221  lsn_t lsn_age_factor;
2222  lsn_t af_lwm = (srv_adaptive_flushing_lwm
2223  * log_get_capacity()) / 100;
2224 
2225  if (age < af_lwm) {
2226  /* No adaptive flushing. */
2227  return(0);
2228  }
2229 
2230  max_async_age = log_get_max_modified_age_async();
2231 
2232  if (age < max_async_age && !srv_adaptive_flushing) {
2233  /* We have still not reached the max_async point and
2234  the user has disabled adaptive flushing. */
2235  return(0);
2236  }
2237 
2238  /* If we are here then we know that either:
2239  1) User has enabled adaptive flushing
2240  2) User may have disabled adaptive flushing but we have reached
2241  max_async_age. */
2242  lsn_age_factor = (age * 100) / max_async_age;
2243 
2244  ut_ad(srv_max_io_capacity >= srv_io_capacity);
2245  return(static_cast<ulint>(
2246  ((srv_max_io_capacity / srv_io_capacity)
2247  * (lsn_age_factor * sqrt((double)lsn_age_factor)))
2248  / 7.5));
2249 }
2250 
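To illustrate the formula above (hypothetical settings): with srv_io_capacity = 200, srv_max_io_capacity = 2000 and lsn_age_factor = 50, the result is ((2000 / 200) * (50 * sqrt(50))) / 7.5 = (10 * 353.6) / 7.5, or roughly 471, so the requested percentage grows super-linearly as the unflushed redo age approaches the async flush point.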
2251 /*********************************************************************/
2257 static
2258 ulint
2259 page_cleaner_flush_pages_if_needed(void)
2260 /*====================================*/
2261 {
2262  static lsn_t lsn_avg_rate = 0;
2263  static lsn_t prev_lsn = 0;
2264  static lsn_t last_lsn = 0;
2265  static ulint sum_pages = 0;
2266  static ulint last_pages = 0;
2267  static ulint prev_pages = 0;
2268  static ulint avg_page_rate = 0;
2269  static ulint n_iterations = 0;
2270  lsn_t oldest_lsn;
2271  lsn_t cur_lsn;
2272  lsn_t age;
2273  lsn_t lsn_rate;
2274  ulint n_pages = 0;
2275  ulint pct_for_dirty = 0;
2276  ulint pct_for_lsn = 0;
2277  ulint pct_total = 0;
2278  int age_factor = 0;
2279 
2280  cur_lsn = log_get_lsn();
2281 
2282  if (prev_lsn == 0) {
2283  /* First time around. */
2284  prev_lsn = cur_lsn;
2285  return(0);
2286  }
2287 
2288  if (prev_lsn == cur_lsn) {
2289  return(0);
2290  }
2291 
2292  /* We update our variables every srv_flushing_avg_loops
2293  iterations to smooth out transition in workload. */
2294  if (++n_iterations >= srv_flushing_avg_loops) {
2295 
2296  avg_page_rate = ((sum_pages / srv_flushing_avg_loops)
2297  + avg_page_rate) / 2;
2298 
2299  /* How much LSN we have generated since last call. */
2300  lsn_rate = (cur_lsn - prev_lsn) / srv_flushing_avg_loops;
2301 
2302  lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
2303 
2304  prev_lsn = cur_lsn;
2305 
2306  n_iterations = 0;
2307 
2308  sum_pages = 0;
2309  }
2310 
2311  oldest_lsn = buf_pool_get_oldest_modification();
2312 
2313  ut_ad(oldest_lsn <= log_get_lsn());
2314 
2315  age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0;
2316 
2317  pct_for_dirty = af_get_pct_for_dirty();
2318  pct_for_lsn = af_get_pct_for_lsn(age);
2319 
2320  pct_total = ut_max(pct_for_dirty, pct_for_lsn);
2321 
2322  /* Cap the maximum IO capacity that we are going to use by
2323  max_io_capacity. */
2324  n_pages = (PCT_IO(pct_total) + avg_page_rate) / 2;
2325 
2326  if (n_pages > srv_max_io_capacity) {
2327  n_pages = srv_max_io_capacity;
2328  }
2329 
2330  if (last_pages && cur_lsn - last_lsn > lsn_avg_rate / 2) {
2331  age_factor = prev_pages / last_pages;
2332  }
2333 
2334  MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
2335 
2336  prev_pages = n_pages;
2337  n_pages = page_cleaner_do_flush_batch(
2338  n_pages, oldest_lsn + lsn_avg_rate * (age_factor + 1));
2339 
2340  last_lsn= cur_lsn;
2341  last_pages= n_pages + 1;
2342 
2343  MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
2344  MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
2345  MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
2346  MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
2347 
2348  if (n_pages) {
2350  MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
2351  MONITOR_FLUSH_ADAPTIVE_COUNT,
2352  MONITOR_FLUSH_ADAPTIVE_PAGES,
2353  n_pages);
2354 
2355  sum_pages += n_pages;
2356  }
2357 
2358  return(n_pages);
2359 }
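Editor's note: page_cleaner_flush_pages_if_needed() takes the larger of the two heuristics above, blends it with the smoothed recent flushing rate, caps the result at srv_max_io_capacity, and pushes the flush-list LSN target ahead of the oldest modification by lsn_avg_rate * (age_factor + 1). The following sketch shows just that arithmetic, with the running averages passed in explicitly instead of kept in function-local statics; PCT_IO() is reproduced inline as a percentage of io_capacity and all parameter names are assumptions.

    #include <algorithm>
    #include <cstdint>

    struct FlushTarget {
        unsigned long n_pages;    /* pages to request in this batch */
        uint64_t      lsn_limit;  /* flush everything older than this LSN */
    };

    FlushTarget
    compute_flush_target(unsigned long pct_total,      /* max(pct_for_dirty, pct_for_lsn) */
                         unsigned long avg_page_rate,  /* smoothed pages flushed per iteration */
                         unsigned long io_capacity,
                         unsigned long max_io_capacity,
                         uint64_t      oldest_lsn,     /* oldest modification in any pool */
                         uint64_t      lsn_avg_rate,   /* smoothed redo generation rate */
                         int           age_factor)     /* prev_pages / last_pages heuristic */
    {
        /* PCT_IO(pct_total): the requested fraction of innodb_io_capacity. */
        unsigned long n_pages = (pct_total * io_capacity) / 100;

        /* Blend with the recent flushing rate to smooth out spikes,
        then cap the request at innodb_io_capacity_max. */
        n_pages = std::min((n_pages + avg_page_rate) / 2, max_io_capacity);

        return FlushTarget{
            n_pages,
            oldest_lsn + lsn_avg_rate * static_cast<uint64_t>(age_factor + 1)};
    }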
2360 
2361 /*********************************************************************/
2364 static
2365 void
2366 page_cleaner_sleep_if_needed(
2367 /*=========================*/
2368  ulint next_loop_time)
2370 {
2371  ulint cur_time = ut_time_ms();
2372 
2373  if (next_loop_time > cur_time) {
2374  /* Get sleep interval in micro seconds. We use
2375  ut_min() to avoid long sleep in case of
2376  wrap around. */
2377  os_thread_sleep(ut_min(1000000,
2378  (next_loop_time - cur_time)
2379  * 1000));
2380  }
2381 }
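Editor's note: page_cleaner_sleep_if_needed() sleeps only when the next scheduled iteration is still in the future, and clamps the sleep to one second so a wrapped or bogus timestamp cannot stall the cleaner for a long time. A sketch of the same computation using <chrono>/<thread> instead of ut_time_ms() and os_thread_sleep() follows; both timestamps are milliseconds since an arbitrary epoch.

    #include <algorithm>
    #include <chrono>
    #include <cstdint>
    #include <thread>

    void sleep_until_next_loop(uint64_t next_loop_time_ms, uint64_t now_ms)
    {
        if (next_loop_time_ms <= now_ms) {
            return;                     /* already late, do not sleep */
        }

        /* Clamp to one second to guard against wrap-around. */
        const uint64_t sleep_us =
            std::min<uint64_t>(1000000, (next_loop_time_ms - now_ms) * 1000);

        std::this_thread::sleep_for(std::chrono::microseconds(sleep_us));
    }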
2382 
2383 /******************************************************************/
2387 extern "C" UNIV_INTERN
2388 os_thread_ret_t
2389 DECLARE_THREAD(buf_flush_page_cleaner_thread)(
2390 /*==========================================*/
2391  void* arg __attribute__((unused)))
2394 {
2395  ulint next_loop_time = ut_time_ms() + 1000;
2396  ulint n_flushed = 0;
2397  ulint last_activity = srv_get_activity_count();
2398 
2399  ut_ad(!srv_read_only_mode);
2400 
2401 #ifdef UNIV_PFS_THREAD
2402  pfs_register_thread(buf_page_cleaner_thread_key);
2403 #endif /* UNIV_PFS_THREAD */
2404 
2405 #ifdef UNIV_DEBUG_THREAD_CREATION
2406  fprintf(stderr, "InnoDB: page_cleaner thread running, id %lu\n",
2407  os_thread_pf(os_thread_get_curr_id()));
2408 #endif /* UNIV_DEBUG_THREAD_CREATION */
2409 
2410  buf_page_cleaner_is_active = TRUE;
2411 
2412  while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
2413 
2414  /* The page_cleaner skips sleep if the server is
2415  idle and there are no pending IOs in the buffer pool
2416  and there is work to do. */
2417  if (srv_check_activity(last_activity)
2418  || buf_get_n_pending_read_ios()
2419  || n_flushed == 0) {
2420  page_cleaner_sleep_if_needed(next_loop_time);
2421  }
2422 
2423  next_loop_time = ut_time_ms() + 1000;
2424 
2425  if (srv_check_activity(last_activity)) {
2426  last_activity = srv_get_activity_count();
2427 
2428  /* Flush pages from end of LRU if required */
2429  n_flushed = buf_flush_LRU_tail();
2430 
2431  /* Flush pages from flush_list if required */
2432  n_flushed += page_cleaner_flush_pages_if_needed();
2433  } else {
2434  n_flushed = page_cleaner_do_flush_batch(
2435  PCT_IO(100),
2436  LSN_MAX);
2437 
2438  if (n_flushed) {
2439  MONITOR_INC_VALUE_CUMULATIVE(
2440  MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
2441  MONITOR_FLUSH_BACKGROUND_COUNT,
2442  MONITOR_FLUSH_BACKGROUND_PAGES,
2443  n_flushed);
2444  }
2445  }
2446  }
2447 
2448  ut_ad(srv_shutdown_state > 0);
2449  if (srv_fast_shutdown == 2) {
2450  /* In very fast shutdown we simulate a crash of
2451  the buffer pool. We are not required to do any flushing. */
2452  goto thread_exit;
2453  }
2454 
2455  /* In case of normal and slow shutdown the page_cleaner thread
2456  must wait for all other activity in the server to die down.
2457  Note that we can start flushing the buffer pool as soon as the
2458  server enters shutdown phase but we must stay alive long enough
2459  to ensure that any work done by the master or purge threads is
2460  also flushed.
2461  During shutdown we pass through two stages. In the first stage,
2462  when SRV_SHUTDOWN_CLEANUP is set, other threads like the master
2463  and the purge threads may be working as well. We start flushing
2464  the buffer pool but can't be sure that no new pages are being
2465  dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE phase. */
2466 
2467  do {
2468  n_flushed = page_cleaner_do_flush_batch(PCT_IO(100), LSN_MAX);
2469 
2470  /* We sleep only if there are no pages to flush */
2471  if (n_flushed == 0) {
2472  os_thread_sleep(100000);
2473  }
2474  } while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
2475 
2476  /* At this point all threads including the master and the purge
2477  thread must have been suspended. */
2478  ut_a(srv_get_active_thread_type() == SRV_NONE);
2479  ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
2480 
2481  /* We can now make a final sweep on flushing the buffer pool
2482  and exit after we have cleaned the whole buffer pool.
2483  It is important that we wait for any running batch that has
2484  been triggered by us to finish. Otherwise we can end up
2485  considering the end of that batch as the finish of our final
2486  sweep, and we'll come out of the loop leaving behind dirty pages
2487  in the flush_list. */
2488  buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
2489  buf_flush_wait_LRU_batch_end();
2490 
2491  bool success;
2492 
2493  do {
2494 
2495  success = buf_flush_list(PCT_IO(100), LSN_MAX, &n_flushed);
2496  buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
2497 
2498  } while (!success || n_flushed > 0);
2499 
2500  /* Some sanity checks */
2503  for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2504  buf_pool_t* buf_pool = buf_pool_from_array(i);
2505  ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0);
2506  }
2507 
2508  /* We have lived our life. Time to die. */
2509 
2510 thread_exit:
2511  buf_page_cleaner_is_active = FALSE;
2512 
2513  /* We count the number of threads in os_thread_exit(). A created
2514  thread should always use that to exit and not use return() to exit. */
2515  os_thread_exit(NULL);
2516 
2517  OS_THREAD_DUMMY_RETURN;
2518 }
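Editor's note: putting the pieces together, the page_cleaner thread runs roughly one iteration per second while the server is up, flushing adaptively when the server is active and at full innodb_io_capacity when it is idle, then keeps flushing through the shutdown phases until the flush lists are empty (or exits immediately on a very fast shutdown). The following is a highly simplified skeleton of that control flow; the callbacks (shutdown_phase, is_active, flush_adaptive, flush_everything) are hypothetical stand-ins, and the two shutdown flush loops of the real code are collapsed into one.

    #include <chrono>
    #include <functional>
    #include <thread>

    enum class Shutdown { none, cleanup, flush_phase, fast };

    void page_cleaner_loop(const std::function<Shutdown()>& shutdown_phase,
                           const std::function<bool()>& is_active,
                           const std::function<unsigned long()>& flush_adaptive,
                           const std::function<unsigned long()>& flush_everything)
    {
        unsigned long n_flushed = 0;

        while (shutdown_phase() == Shutdown::none) {
            /* Skip the sleep only when the server is idle and the last
            iteration still found pages to flush. */
            if (is_active() || n_flushed == 0) {
                std::this_thread::sleep_for(std::chrono::seconds(1));
            }

            n_flushed = is_active()
                ? flush_adaptive()       /* LRU tail + adaptive flush-list batch */
                : flush_everything();    /* idle server: flush at 100% capacity */
        }

        if (shutdown_phase() == Shutdown::fast) {
            return;                      /* crash-like shutdown: no flushing */
        }

        /* Clean shutdown: keep flushing until nothing is left dirty. */
        do {
            n_flushed = flush_everything();
            if (n_flushed == 0) {
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
            }
        } while (shutdown_phase() == Shutdown::cleanup || n_flushed > 0);
    }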
2519 
2520 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2521 
2523 struct Check {
2524  void operator()(const buf_page_t* elem)
2525  {
2526  ut_a(elem->in_flush_list);
2527  }
2528 };
2529 
2530 /******************************************************************/
2533 static
2534 ibool
2535 buf_flush_validate_low(
2536 /*===================*/
2537  buf_pool_t* buf_pool)
2538 {
2539  buf_page_t* bpage;
2540  const ib_rbt_node_t* rnode = NULL;
2541 
2542  ut_ad(buf_flush_list_mutex_own(buf_pool));
2543 
2544  UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list, Check());
2545 
2546  bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
2547 
2548  /* If we are in recovery mode i.e.: flush_rbt != NULL
2549  then each block in the flush_list must also be present
2550  in the flush_rbt. */
2551  if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
2552  rnode = rbt_first(buf_pool->flush_rbt);
2553  }
2554 
2555  while (bpage != NULL) {
2556  const lsn_t om = bpage->oldest_modification;
2557 
2558  ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
2559 
2560  ut_ad(bpage->in_flush_list);
2561 
2562  /* A page in buf_pool->flush_list can be in
2563  BUF_BLOCK_REMOVE_HASH state. This happens when a page
2564  is in the middle of being relocated. In that case the
2565  original descriptor can have this state and still be
2566  in the flush list waiting to acquire the
2567  buf_pool->flush_list_mutex to complete the relocation. */
2568  ut_a(buf_page_in_file(bpage)
2569  || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
2570  ut_a(om > 0);
2571 
2572  if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
2573  buf_page_t** prpage;
2574 
2575  ut_a(rnode);
2576  prpage = rbt_value(buf_page_t*, rnode);
2577 
2578  ut_a(*prpage);
2579  ut_a(*prpage == bpage);
2580  rnode = rbt_next(buf_pool->flush_rbt, rnode);
2581  }
2582 
2583  bpage = UT_LIST_GET_NEXT(list, bpage);
2584 
2585  ut_a(!bpage || om >= bpage->oldest_modification);
2586  }
2587 
2588  /* By this time we must have exhausted the traversal of
2589  flush_rbt (if active) as well. */
2590  ut_a(rnode == NULL);
2591 
2592  return(TRUE);
2593 }
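Editor's note: buf_flush_validate_low() asserts two invariants of the flush list: every entry has a non-zero oldest_modification, and the list is ordered by oldest_modification in descending order (with flush_rbt mirroring it during recovery). A minimal sketch of the ordering check over a plain std::vector, with a hypothetical Page struct in place of buf_page_t and the intrusive UT_LIST, follows.

    #include <cstdint>
    #include <vector>

    struct Page {
        uint64_t oldest_modification;   /* LSN of the first unflushed change */
    };

    bool flush_list_is_valid(const std::vector<Page>& flush_list)
    {
        uint64_t prev_om = UINT64_MAX;

        for (const Page& page : flush_list) {
            if (page.oldest_modification == 0) {
                return false;           /* clean pages never belong here */
            }
            if (page.oldest_modification > prev_om) {
                return false;           /* list must be descending by LSN */
            }
            prev_om = page.oldest_modification;
        }
        return true;
    }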
2594 
2595 /******************************************************************/
2598 UNIV_INTERN
2599 ibool
2600 buf_flush_validate(
2601 /*===============*/
2602  buf_pool_t* buf_pool)
2603 {
2604  ibool ret;
2605 
2606  buf_flush_list_mutex_enter(buf_pool);
2607 
2608  ret = buf_flush_validate_low(buf_pool);
2609 
2610  buf_flush_list_mutex_exit(buf_pool);
2611 
2612  return(ret);
2613 }
2614 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
2615 #endif /* !UNIV_HOTBACKUP */
2616 
2617 #ifdef UNIV_DEBUG
2618 /******************************************************************/
2622 UNIV_INTERN
2623 ulint
2624 buf_pool_get_dirty_pages_count(
2625 /*===========================*/
2626  buf_pool_t* buf_pool,
2627  ulint id)
2629 {
2630  ulint count = 0;
2631 
2632  buf_pool_mutex_enter(buf_pool);
2633  buf_flush_list_mutex_enter(buf_pool);
2634 
2635  buf_page_t* bpage;
2636 
2637  for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
2638  bpage != 0;
2639  bpage = UT_LIST_GET_NEXT(list, bpage)) {
2640 
2641  ut_ad(buf_page_in_file(bpage));
2642  ut_ad(bpage->in_flush_list);
2643  ut_ad(bpage->oldest_modification > 0);
2644 
2645  if (buf_page_get_space(bpage) == id) {
2646  ++count;
2647  }
2648  }
2649 
2650  buf_flush_list_mutex_exit(buf_pool);
2651  buf_pool_mutex_exit(buf_pool);
2652 
2653  return(count);
2654 }
2655 
2656 /******************************************************************/
2659 UNIV_INTERN
2660 ulint
2661 buf_flush_get_dirty_pages_count(
2662 /*============================*/
2663  ulint id)
2665 {
2666  ulint count = 0;
2667 
2668  for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
2669  buf_pool_t* buf_pool;
2670 
2671  buf_pool = buf_pool_from_array(i);
2672 
2673  count += buf_pool_get_dirty_pages_count(buf_pool, id);
2674  }
2675 
2676  return(count);
2677 }
2678 #endif /* UNIV_DEBUG */