MySQL 5.6.14 Source Code Document
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
buf0dblwr.cc
Go to the documentation of this file.
1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify it under
6 the terms of the GNU General Public License as published by the Free Software
7 Foundation; version 2 of the License.
8 
9 This program is distributed in the hope that it will be useful, but WITHOUT
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12 
13 You should have received a copy of the GNU General Public License along with
14 this program; if not, write to the Free Software Foundation, Inc.,
15 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
16 
17 *****************************************************************************/
18 
19 /**************************************************/
26 #include "buf0dblwr.h"
27 
28 #ifdef UNIV_NONINL
29 #include "buf0buf.ic"
30 #endif
31 
32 #include "buf0buf.h"
33 #include "buf0checksum.h"
34 #include "srv0start.h"
35 #include "srv0srv.h"
36 #include "page0zip.h"
37 #include "trx0sys.h"
38 
39 #ifndef UNIV_HOTBACKUP
40 
41 #ifdef UNIV_PFS_MUTEX
42 /* Key to register the mutex with performance schema */
43 UNIV_INTERN mysql_pfs_key_t buf_dblwr_mutex_key;
44 #endif /* UNIV_PFS_MUTEX */
45 
47 UNIV_INTERN buf_dblwr_t* buf_dblwr = NULL;
48 
50 UNIV_INTERN ibool buf_dblwr_being_created = FALSE;
51 
52 /****************************************************************/
/* Tells whether page_no belongs to the doublewrite buffer.
Returns FALSE when buf_dblwr is NULL (buffer not created yet), TRUE when
page_no lies in the range that starts at buf_dblwr->block1 or at
buf_dblwr->block2, and FALSE otherwise.
NOTE(review): the declarator line carrying the function name and the
tail of each upper-bound comparison are absent from this listing; the
range length is presumably TRX_SYS_DOUBLEWRITE_BLOCK_SIZE - confirm
against the full file. */
56 UNIV_INTERN
57 ibool
59 /*==================*/
60  ulint page_no)
61 {
62  if (buf_dblwr == NULL) {
63 
64  return(FALSE);
65  }
66 
67  if (page_no >= buf_dblwr->block1
68  && page_no < buf_dblwr->block1
70  return(TRUE);
71  }
72 
73  if (page_no >= buf_dblwr->block2
74  && page_no < buf_dblwr->block2
76  return(TRUE);
77  }
78 
79  return(FALSE);
80 }
81 
82 /****************************************************************/
/* Fetches page TRX_SYS_PAGE_NO of TRX_SYS_SPACE with an RW_X_LATCH
inside the mini-transaction mtr and returns a pointer to the byte at
offset TRX_SYS_DOUBLEWRITE within that page frame. The latch is
registered with SYNC_NO_ORDER_CHECK for the debug latching checks.
NOTE(review): the function-name line and the declaration of `block`
are absent from this listing; other code in this file calls
buf_dblwr_get(&mtr) with this shape - confirm. */
87 UNIV_INLINE
88 byte*
90 /*==========*/
91  mtr_t* mtr)
92 {
94 
95  block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
96  RW_X_LATCH, mtr);
97  buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
98 
99  return(buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE);
100 }
101 
102 /********************************************************************/
/* Makes outstanding data-file writes durable. According to the inline
comments it performs three steps in order: wake the simulated aio
thread, wait for posted async writes, then flush to disk.
NOTE(review): the function-name line and all three call statements are
absent from this listing; only the explanatory comments survive. */
105 UNIV_INLINE
106 void
108 /*======================*/
109 {
110  /* Wake possible simulated aio thread to actually post the
111  writes to the operating system */
113 
114  /* Wait until all async writes to tablespaces have been posted to
115  the OS */
117 
118  /* Now we flush the data to disk (for example, with fsync) */
120 }
121 
122 /****************************************************************/
/* Allocates and initialises the global buf_dblwr descriptor.
doublewrite points at the doublewrite header inside the TRX_SYS page;
the values at doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1 / _BLOCK2 are
read from it (presumably the first page numbers of the two blocks,
stored into buf_dblwr->block1 / block2 - the assignment lines are
missing from this listing).
Creates the mutex, zeroes the counters, and allocates the in_use flags,
the page-sized write buffer and buf_block_arr, each sized for
2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE pages. */
124 static
125 void
126 buf_dblwr_init(
127 /*===========*/
128  byte* doublewrite)
130 {
131  ulint buf_size;
132 
133  buf_dblwr = static_cast<buf_dblwr_t*>(
134  mem_zalloc(sizeof(buf_dblwr_t)));
135 
136  /* There are two blocks of same size in the doublewrite
137  buffer. */
138  buf_size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
139 
140  /* There must be at least one buffer for single page writes
141  and one buffer for batch writes. */
143  && srv_doublewrite_batch_size < buf_size);
144 
145  mutex_create(buf_dblwr_mutex_key,
146  &buf_dblwr->mutex, SYNC_DOUBLEWRITE);
147 
150  buf_dblwr->first_free = 0;
151  buf_dblwr->s_reserved = 0;
152  buf_dblwr->b_reserved = 0;
153 
155  doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
157  doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
158 
159  buf_dblwr->in_use = static_cast<bool*>(
160  mem_zalloc(buf_size * sizeof(bool)));
161 
 /* One extra page is allocated; presumably so that write_buf below
 can be aligned to UNIV_PAGE_SIZE (the alignment call line is
 missing from this listing). */
162  buf_dblwr->write_buf_unaligned = static_cast<byte*>(
163  ut_malloc((1 + buf_size) * UNIV_PAGE_SIZE));
164 
165  buf_dblwr->write_buf = static_cast<byte*>(
167  UNIV_PAGE_SIZE));
168 
169  buf_dblwr->buf_block_arr = static_cast<buf_page_t**>(
170  mem_zalloc(buf_size * sizeof(void*)));
171 }
172 
173 /****************************************************************/
/* Creates the doublewrite buffer in the system tablespace if it does
not exist yet, then initialises the in-memory descriptor.
Flow visible here: return at once if buf_dblwr is already set;
otherwise latch the doublewrite header via buf_dblwr_get(). If the
buffer already exists, call buf_dblwr_init() and return. Otherwise
create a file segment, allocate
2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + FSP_EXTENT_SIZE / 2 pages from it,
log the header fields with mlog_write_ulint(), make a checkpoint and
jump back to start_again so that the "already created" path runs.
Any allocation failure terminates the process with exit(EXIT_FAILURE).
NOTE(review): the function-name line and several condition/argument
lines are absent from this listing. */
176 UNIV_INTERN
177 void
179 /*==================*/
180 {
181  buf_block_t* block2;
182  buf_block_t* new_block;
183  byte* doublewrite;
184  byte* fseg_header;
185  ulint page_no;
186  ulint prev_page_no;
187  ulint i;
188  mtr_t mtr;
189 
190  if (buf_dblwr) {
191  /* Already inited */
192 
193  return;
194  }
195 
196 start_again:
197  mtr_start(&mtr);
199 
200  doublewrite = buf_dblwr_get(&mtr);
201 
204  /* The doublewrite buffer has already been created:
205  just read in some numbers */
206 
207  buf_dblwr_init(doublewrite);
208 
209  mtr_commit(&mtr);
210  buf_dblwr_being_created = FALSE;
211  return;
212  }
213 
214  ib_logf(IB_LOG_LEVEL_INFO,
215  "Doublewrite buffer not found: creating new");
216 
219  + FSP_EXTENT_SIZE / 2 + 100)
220  * UNIV_PAGE_SIZE)) {
221 
222  ib_logf(IB_LOG_LEVEL_ERROR,
223  "Cannot create doublewrite buffer: you must "
224  "increase your buffer pool size. Cannot continue "
225  "operation.");
226 
227  exit(EXIT_FAILURE);
228  }
229 
230  block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
232  + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
233 
234  /* fseg_create acquires a second latch on the page,
235  therefore we must declare it: */
236 
 /* NOTE(review): block2 is handed to buf_block_dbg_add_level()
 before the NULL check below - confirm the debug helper tolerates
 a NULL block, or move the call after the check. */
237  buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
238 
239  if (block2 == NULL) {
240  ib_logf(IB_LOG_LEVEL_ERROR,
241  "Cannot create doublewrite buffer: you must "
242  "increase your tablespace size. "
243  "Cannot continue operation.");
244 
245  /* We exit without committing the mtr to prevent
246  its modifications to the database getting to disk */
247 
248  exit(EXIT_FAILURE);
249  }
250 
251  fseg_header = doublewrite + TRX_SYS_DOUBLEWRITE_FSEG;
252  prev_page_no = 0;
253 
254  for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
255  + FSP_EXTENT_SIZE / 2; i++) {
256  new_block = fseg_alloc_free_page(
257  fseg_header, prev_page_no + 1, FSP_UP, &mtr);
258  if (new_block == NULL) {
259  ib_logf(IB_LOG_LEVEL_ERROR,
260  "Cannot create doublewrite buffer: you must "
261  "increase your tablespace size. "
262  "Cannot continue operation.");
263 
264  exit(EXIT_FAILURE);
265  }
266 
267  /* We read the allocated pages to the buffer pool;
268  when they are written to disk in a flush, the space
269  id and page number fields are also written to the
270  pages. When we at database startup read pages
271  from the doublewrite buffer, we know that if the
272  space id and page number in them are the same as
273  the page position in the tablespace, then the page
274  has not been written to in doublewrite. */
275 
276  ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
277  page_no = buf_block_get_page_no(new_block);
278 
 /* The first FSP_EXTENT_SIZE / 2 allocations are skipped; the
 page allocated at i == FSP_EXTENT_SIZE / 2 must be page
 FSP_EXTENT_SIZE and its number is logged into the header
 (twice; the target offsets are on lines missing here). */
279  if (i == FSP_EXTENT_SIZE / 2) {
280  ut_a(page_no == FSP_EXTENT_SIZE);
281  mlog_write_ulint(doublewrite
283  page_no, MLOG_4BYTES, &mtr);
284  mlog_write_ulint(doublewrite
287  page_no, MLOG_4BYTES, &mtr);
288 
289  } else if (i == FSP_EXTENT_SIZE / 2
291  ut_a(page_no == 2 * FSP_EXTENT_SIZE);
292  mlog_write_ulint(doublewrite
294  page_no, MLOG_4BYTES, &mtr);
295  mlog_write_ulint(doublewrite
298  page_no, MLOG_4BYTES, &mtr);
299 
300  } else if (i > FSP_EXTENT_SIZE / 2) {
301  ut_a(page_no == prev_page_no + 1);
302  }
303 
 /* Every 16th iteration: commit and restart the mtr, then
 re-fetch the header pointers, for the reason given below. */
304  if (((i + 1) & 15) == 0) {
305  /* rw_locks can only be recursively x-locked
306  2048 times. (on 32 bit platforms,
307  (lint) 0 - (X_LOCK_DECR * 2049)
308  is no longer a negative number, and thus
309  lock_word becomes like a shared lock).
310  For 4k page size this loop will
311  lock the fseg header too many times. Since
312  this code is not done while any other threads
313  are active, restart the MTR occasionally. */
314  mtr_commit(&mtr);
315  mtr_start(&mtr);
316  doublewrite = buf_dblwr_get(&mtr);
317  fseg_header = doublewrite
319  }
320 
321  prev_page_no = page_no;
322  }
323 
326  MLOG_4BYTES, &mtr);
330  MLOG_4BYTES, &mtr);
331 
332  mlog_write_ulint(doublewrite
335  MLOG_4BYTES, &mtr);
336  mtr_commit(&mtr);
337 
338  /* Flush the modified pages to disk and make a checkpoint */
339  log_make_checkpoint_at(LSN_MAX, TRUE);
340 
341  /* Remove doublewrite pages from LRU */
343 
344  ib_logf(IB_LOG_LEVEL_INFO, "Doublewrite buffer created");
345 
 /* Re-run from the top: the buffer now exists, so the
 "already created" branch initialises buf_dblwr and returns. */
346  goto start_again;
347 }
348 
349 /****************************************************************/
/* Startup routine: initialises the doublewrite descriptor from disk and,
when restore_corrupt_pages is TRUE, repairs data-file pages from their
doublewrite copies.
Steps visible here: read the TRX_SYS page with fil_io() into a private
aligned buffer (bypassing the buffer pool); if the doublewrite buffer
exists call buf_dblwr_init(), otherwise jump to leave_func; read both
doublewrite blocks into buf_dblwr->write_buf; then, for each of the
2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE stored pages, optionally reset its
space id field and write it back, and - unless the page is skipped by one
of the guard branches - re-read the target page from its tablespace and
overwrite it with the doublewrite copy if buf_page_is_corrupted()
reports it corrupt. If the doublewrite copy is corrupt too, the server
aborts via ut_error.
NOTE(review): the function-name line and several condition lines are
absent from this listing; return values of fil_io() are not inspected
in the visible code. */
356 UNIV_INTERN
357 void
359 /*============================*/
360  ibool restore_corrupt_pages)
361 {
362  byte* buf;
363  byte* read_buf;
364  byte* unaligned_read_buf;
365  ulint block1;
366  ulint block2;
367  byte* page;
368  ibool reset_space_ids = FALSE;
369  byte* doublewrite;
370  ulint space_id;
371  ulint page_no;
372  ulint i;
373 
374  /* We do the file i/o past the buffer pool */
375 
 /* Two pages are allocated so that a UNIV_PAGE_SIZE-aligned page
 fits inside the allocation. */
376  unaligned_read_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
377 
378  read_buf = static_cast<byte*>(
379  ut_align(unaligned_read_buf, UNIV_PAGE_SIZE));
380 
381  /* Read the trx sys header to check if we are using the doublewrite
382  buffer */
383 
384  fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0,
385  UNIV_PAGE_SIZE, read_buf, NULL);
386  doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
387 
390  /* The doublewrite buffer has been created */
391 
392  buf_dblwr_init(doublewrite);
393 
394  block1 = buf_dblwr->block1;
395  block2 = buf_dblwr->block2;
396 
397  buf = buf_dblwr->write_buf;
398  } else {
399  goto leave_func;
400  }
401 
404 
405  /* We are upgrading from a version < 4.1.x to a version where
406  multiple tablespaces are supported. We must reset the space id
407  field in the pages in the doublewrite buffer because starting
408  from this version the space id is stored to
409  FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
410 
411  reset_space_ids = TRUE;
412 
413  ib_logf(IB_LOG_LEVEL_INFO,
414  "Resetting space id's in the doublewrite buffer");
415  }
416 
417  /* Read the pages from the doublewrite buffer to memory */
418 
419  fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block1, 0,
420  TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
421  buf, NULL);
422  fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block2, 0,
423  TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
424  buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
425  NULL);
426  /* Check if any of these pages is half-written in data files, in the
427  intended position */
428 
429  page = buf;
430 
431  for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
432 
433  ulint source_page_no;
434  page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
435 
436  if (reset_space_ids) {
437 
438  space_id = 0;
439  mach_write_to_4(page
441  /* We do not need to calculate new checksums for the
442  pages because the field .._SPACE_ID does not affect
443  them. Write the page back to where we read it from. */
444 
445  if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
446  source_page_no = block1 + i;
447  } else {
448  source_page_no = block2
450  }
451 
452  fil_io(OS_FILE_WRITE, true, 0, 0, source_page_no, 0,
453  UNIV_PAGE_SIZE, page, NULL);
454  } else {
455 
456  space_id = mach_read_from_4(
458  }
459 
460  if (!restore_corrupt_pages) {
461  /* The database was shut down gracefully: no need to
462  restore pages */
463 
464  } else if (!fil_tablespace_exists_in_mem(space_id)) {
465  /* Maybe we have dropped the single-table tablespace
466  and this page once belonged to it: do nothing */
467 
468  } else if (!fil_check_adress_in_tablespace(space_id,
469  page_no)) {
470  ib_logf(IB_LOG_LEVEL_WARN,
471  "A page in the doublewrite buffer is not "
472  "within space bounds; space id %lu "
473  "page number %lu, page %lu in "
474  "doublewrite buf.",
475  (ulong) space_id, (ulong) page_no, (ulong) i);
476 
477  } else if (space_id == TRX_SYS_SPACE
478  && ((page_no >= block1
479  && page_no
480  < block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
481  || (page_no >= block2
482  && page_no
483  < (block2
484  + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)))) {
485 
486  /* It is an unwritten doublewrite buffer page:
487  do nothing */
488  } else {
489  ulint zip_size = fil_space_get_zip_size(space_id);
490 
491  /* Read in the actual page from the file */
492  fil_io(OS_FILE_READ, true, space_id, zip_size,
493  page_no, 0,
494  zip_size ? zip_size : UNIV_PAGE_SIZE,
495  read_buf, NULL);
496 
497  /* Check if the page is corrupt */
498 
499  if (buf_page_is_corrupted(true, read_buf, zip_size)) {
500 
501  fprintf(stderr,
502  "InnoDB: Warning: database page"
503  " corruption or a failed\n"
504  "InnoDB: file read of"
505  " space %lu page %lu.\n"
506  "InnoDB: Trying to recover it from"
507  " the doublewrite buffer.\n",
508  (ulong) space_id, (ulong) page_no);
509 
 /* The doublewrite copy itself must be intact
 before it can be used for the repair. */
510  if (buf_page_is_corrupted(true,
511  page, zip_size)) {
512  fprintf(stderr,
513  "InnoDB: Dump of the page:\n");
515  read_buf, zip_size,
517  fprintf(stderr,
518  "InnoDB: Dump of"
519  " corresponding page"
520  " in doublewrite buffer:\n");
522  page, zip_size,
524 
525  fprintf(stderr,
526  "InnoDB: Also the page in the"
527  " doublewrite buffer"
528  " is corrupt.\n"
529  "InnoDB: Cannot continue"
530  " operation.\n"
531  "InnoDB: You can try to"
532  " recover the database"
533  " with the my.cnf\n"
534  "InnoDB: option:\n"
535  "InnoDB:"
536  " innodb_force_recovery=6\n");
537  ut_error;
538  }
539 
540  /* Write the good page from the
541  doublewrite buffer to the intended
542  position */
543 
544  fil_io(OS_FILE_WRITE, true, space_id,
545  zip_size, page_no, 0,
546  zip_size ? zip_size : UNIV_PAGE_SIZE,
547  page, NULL);
548 
549  ib_logf(IB_LOG_LEVEL_INFO,
550  "Recovered the page from"
551  " the doublewrite buffer.");
552  }
553  }
554 
555  page += UNIV_PAGE_SIZE;
556  }
557 
559 
 /* Common exit: release the temporary read buffer. */
560 leave_func:
561  ut_free(unaligned_read_buf);
562 }
563 
564 /****************************************************************/
/* Tears down the global doublewrite descriptor.
Requires buf_dblwr to be non-NULL (ut_a) and, in debug builds, that no
single-page or batch slots are still reserved. Clears the
buf_block_arr and in_use pointers, frees the mutex and resets buf_dblwr
to NULL.
NOTE(review): the function-name line and the statements that actually
release the event objects, buffers and the descriptor itself are absent
from this listing. */
566 UNIV_INTERN
567 void
569 /*================*/
570 {
571  /* Free the double write data structures. */
572  ut_a(buf_dblwr != NULL);
573  ut_ad(buf_dblwr->s_reserved == 0);
574  ut_ad(buf_dblwr->b_reserved == 0);
575 
580 
582  buf_dblwr->buf_block_arr = NULL;
583 
585  buf_dblwr->in_use = NULL;
586 
587  mutex_free(&buf_dblwr->mutex);
589  buf_dblwr = NULL;
590 }
591 
592 /********************************************************************/
/* Updates the doublewrite bookkeeping for bpage according to
flush_type. Does nothing when the doublewrite buffer is disabled
(srv_use_doublewrite_buf false) or not created.
BUF_FLUSH_LIST / BUF_FLUSH_LRU: under buf_dblwr->mutex; once b_reserved
is 0 the mutex is dropped for the data-file sync (call line missing),
then first_free is reset to 0 and batch_running to false so the memory
buffer can be reused.
Remaining case (its label line is missing; presumably the single-page
flush type): searches the slots from srv_doublewrite_batch_size up for
bpage, clears that slot and its in_use flag, and asserts it was found.
BUF_FLUSH_N_TYPES is not a valid argument and triggers ut_error.
NOTE(review): the function-name line and the counter-decrement /
event-signalling lines are absent from this listing. */
594 UNIV_INTERN
595 void
597 /*=============*/
598  const buf_page_t* bpage,
599  buf_flush_t flush_type)
600 {
601  if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
602  return;
603  }
604 
605  switch (flush_type) {
606  case BUF_FLUSH_LIST:
607  case BUF_FLUSH_LRU:
608  mutex_enter(&buf_dblwr->mutex);
609 
611  ut_ad(buf_dblwr->b_reserved > 0);
613 
615 
616  if (buf_dblwr->b_reserved == 0) {
617  mutex_exit(&buf_dblwr->mutex);
618  /* This will finish the batch. Sync data files
619  to the disk. */
621  mutex_enter(&buf_dblwr->mutex);
622 
623  /* We can now reuse the doublewrite memory buffer: */
624  buf_dblwr->first_free = 0;
625  buf_dblwr->batch_running = false;
627  }
628 
629  mutex_exit(&buf_dblwr->mutex);
630  break;
632  {
633  const ulint size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
634  ulint i;
635  mutex_enter(&buf_dblwr->mutex);
636  for (i = srv_doublewrite_batch_size; i < size; ++i) {
637  if (buf_dblwr->buf_block_arr[i] == bpage) {
639  buf_dblwr->buf_block_arr[i] = NULL;
640  buf_dblwr->in_use[i] = false;
641  break;
642  }
643  }
644 
645  /* The block we are looking for must exist as a
646  reserved block. */
647  ut_a(i < size);
648  }
650  mutex_exit(&buf_dblwr->mutex);
651  break;
652  case BUF_FLUSH_N_TYPES:
653  ut_error;
654  }
655 }
656 
657 /********************************************************************/
/* Sanity-checks the LSN fields of an uncompressed page image.
Compares the 4 bytes at page + FIL_PAGE_LSN + 4 with 4 bytes located
relative to the end of the page (page + UNIV_PAGE_SIZE minus an offset
on a line missing from this listing) and, when they differ, prints a
timestamped error with both values to stderr. The visible code only
reports the mismatch; it does not abort.
page: start of the page frame to check. */
659 static
660 void
661 buf_dblwr_check_page_lsn(
662 /*=====================*/
663  const page_t* page)
664 {
665  if (memcmp(page + (FIL_PAGE_LSN + 4),
666  page + (UNIV_PAGE_SIZE
668  4)) {
669 
670  ut_print_timestamp(stderr);
671  fprintf(stderr,
672  " InnoDB: ERROR: The page to be written"
673  " seems corrupt!\n"
674  "InnoDB: The low 4 bytes of LSN fields do not match "
675  "(" ULINTPF " != " ULINTPF ")!"
676  " Noticed in the buffer pool.\n",
678  page + FIL_PAGE_LSN + 4),
680  page + UNIV_PAGE_SIZE
682  }
683 }
684 
685 /********************************************************************/
/* Reports an apparently corrupt index page and deliberately crashes.
Prints a timestamped message naming the block's page number and space
id to stderr, then calls ut_error so that the corrupt page is never
written to a data file. Never returns normally.
block: the buffer block found to be corrupt.
NOTE(review): the first statement of the body is absent from this
listing. */
688 static
689 void
690 buf_dblwr_assert_on_corrupt_block(
691 /*==============================*/
692  const buf_block_t* block)
693 {
695 
696  ut_print_timestamp(stderr);
697  fprintf(stderr,
698  " InnoDB: Apparent corruption of an"
699  " index page n:o %lu in space %lu\n"
700  "InnoDB: to be written to data file."
701  " We intentionally crash server\n"
702  "InnoDB: to prevent corrupt data"
703  " from ending up in data\n"
704  "InnoDB: files.\n",
705  (ulong) buf_block_get_page_no(block),
706  (ulong) buf_block_get_space(block));
707 
708  ut_error;
709 }
710 
711 /********************************************************************/
/* Validates a buffer block before it is written out.
Compressed pages (block->page.zip.data set) are skipped, as is whatever
the first half of the opening condition excludes (that line is missing
from this listing). Otherwise the LSN fields are checked with
buf_dblwr_check_page_lsn(); then, only if
block->check_index_page_at_flush is set, the page is run through
page_simple_validate_new() or page_simple_validate_old() depending on
page_is_comp(), and a failure crashes the server through
buf_dblwr_assert_on_corrupt_block().
block: the block whose frame is to be checked. */
714 static
715 void
716 buf_dblwr_check_block(
717 /*==================*/
718  const buf_block_t* block)
719 {
721  || block->page.zip.data) {
722  /* No simple validate for compressed pages exists. */
723  return;
724  }
725 
726  buf_dblwr_check_page_lsn(block->frame);
727 
728  if (!block->check_index_page_at_flush) {
729  return;
730  }
731 
732  if (page_is_comp(block->frame)) {
733  if (!page_simple_validate_new(block->frame)) {
734  buf_dblwr_assert_on_corrupt_block(block);
735  }
736  } else if (!page_simple_validate_old(block->frame)) {
737 
738  buf_dblwr_assert_on_corrupt_block(block);
739  }
740 }
741 
742 /********************************************************************/
/* Writes one page to its final position in its own data file.
bpage: the page to write; must be non-NULL and a file page (ut_a).
sync: true issues a plain OS_FILE_WRITE; false adds
OS_AIO_SIMULATED_WAKE_LATER to the request flags.
A compressed page (bpage->zip.data set) is written from zip.data with
its zip size as the length. Otherwise bpage is treated as a buf_block_t,
its LSN fields are checked, and UNIV_PAGE_SIZE bytes of the frame are
written. In both cases the page/block pointer is passed as the last
fil_io() argument.
NOTE(review): one statement between the cast and the LSN check is
absent from this listing. */
745 static
746 void
747 buf_dblwr_write_block_to_datafile(
748 /*==============================*/
749  const buf_page_t* bpage,
750  bool sync)
752 {
753  ut_a(bpage);
754  ut_a(buf_page_in_file(bpage));
755 
756  const ulint flags = sync
757  ? OS_FILE_WRITE
758  : OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER;
759 
760  if (bpage->zip.data) {
761  fil_io(flags, sync, buf_page_get_space(bpage),
762  buf_page_get_zip_size(bpage),
763  buf_page_get_page_no(bpage), 0,
764  buf_page_get_zip_size(bpage),
765  (void*) bpage->zip.data,
766  (void*) bpage);
767 
768  return;
769  }
770 
771 
772  const buf_block_t* block = (buf_block_t*) bpage;
774  buf_dblwr_check_page_lsn(block->frame);
775 
776  fil_io(flags, sync, buf_block_get_space(block), 0,
777  buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
778  (void*) block->frame, (void*) block);
779 }
780 
781 /********************************************************************/
/* Flushes the batch of pages collected in the doublewrite memory buffer.
When the doublewrite buffer is disabled or absent it only syncs (call
line missing) and returns. Otherwise, under buf_dblwr->mutex: return if
nothing is buffered (first_free == 0); if another batch is running,
wait on b_event and retry; else mark batch_running and snapshot
first_free. Outside the mutex it then validates each buffered page,
writes the first and (if needed) second doublewrite block synchronously
to TRX_SYS_SPACE, flushes that space with fil_flush(), and finally
queues every buffered page to its own data file with
buf_dblwr_write_block_to_datafile(..., false).
NOTE(review): the function-name line, the loop condition, the length
computations and the final aio wake-up call are absent from this
listing. */
787 UNIV_INTERN
788 void
790 /*=================================*/
791 {
792  byte* write_buf;
793  ulint first_free;
794  ulint len;
795 
796  if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
797  /* Sync the writes to the disk. */
799  return;
800  }
801 
802 try_again:
803  mutex_enter(&buf_dblwr->mutex);
804 
805  /* Write first to doublewrite buffer blocks. We use synchronous
806  aio and thus know that file write has been completed when the
807  control returns. */
808 
809  if (buf_dblwr->first_free == 0) {
810 
811  mutex_exit(&buf_dblwr->mutex);
812 
813  return;
814  }
815 
816  if (buf_dblwr->batch_running) {
817  /* Another thread is running the batch right now. Wait
818  for it to finish. */
 /* The event is reset while the mutex is still held, and the
 returned count is passed to the wait after the mutex is
 released. */
819  ib_int64_t sig_count = os_event_reset(buf_dblwr->b_event);
820  mutex_exit(&buf_dblwr->mutex);
821 
822  os_event_wait_low(buf_dblwr->b_event, sig_count);
823  goto try_again;
824  }
825 
828 
829  /* Disallow anyone else to post to doublewrite buffer or to
830  start another batch of flushing. */
831  buf_dblwr->batch_running = true;
832  first_free = buf_dblwr->first_free;
833 
834  /* Now safe to release the mutex. Note that though no other
835  thread is allowed to post to the doublewrite batch flushing
836  but any threads working on single page flushes are allowed
837  to proceed. */
838  mutex_exit(&buf_dblwr->mutex);
839 
840  write_buf = buf_dblwr->write_buf;
841 
842  for (ulint len2 = 0, i = 0;
844  len2 += UNIV_PAGE_SIZE, i++) {
845 
846  const buf_block_t* block;
847 
848  block = (buf_block_t*) buf_dblwr->buf_block_arr[i];
849 
851  || block->page.zip.data) {
852  /* No simple validate for compressed
853  pages exists. */
854  continue;
855  }
856 
857  /* Check that the actual page in the buffer pool is
858  not corrupt and the LSN values are sane. */
859  buf_dblwr_check_block(block);
860 
861  /* Check that the page as written to the doublewrite
862  buffer has sane LSN values. */
863  buf_dblwr_check_page_lsn(write_buf + len2);
864  }
865 
866  /* Write out the first block of the doublewrite buffer */
868  buf_dblwr->first_free) * UNIV_PAGE_SIZE;
869 
870  fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
871  buf_dblwr->block1, 0, len,
872  (void*) write_buf, NULL);
873 
875  /* No unwritten pages in the second block. */
876  goto flush;
877  }
878 
879  /* Write out the second block of the doublewrite buffer. */
881  * UNIV_PAGE_SIZE;
882 
883  write_buf = buf_dblwr->write_buf
884  + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
885 
886  fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
887  buf_dblwr->block2, 0, len,
888  (void*) write_buf, NULL);
889 
890 flush:
891  /* increment the doublewrite flushed pages counter */
894 
895  /* Now flush the doublewrite buffer data to disk */
896  fil_flush(TRX_SYS_SPACE);
897 
898  /* We know that the writes have been flushed to disk now
899  and in recovery we will find them in the doublewrite buffer
900  blocks. Next do the writes to the intended positions. */
901 
902  /* Up to this point first_free and buf_dblwr->first_free are
903  same because we have set the buf_dblwr->batch_running flag
904  disallowing any other thread to post any request but we
905  can't safely access buf_dblwr->first_free in the loop below.
906  This is so because it is possible that after we are done with
907  the last iteration and before we terminate the loop, the batch
908  gets finished in the IO helper thread and another thread posts
909  a new batch setting buf_dblwr->first_free to a higher value.
910  If this happens and we are using buf_dblwr->first_free in the
911  loop termination condition then we'll end up dispatching
912  the same block twice from two different threads. */
913  ut_ad(first_free == buf_dblwr->first_free);
914  for (ulint i = 0; i < first_free; i++) {
915  buf_dblwr_write_block_to_datafile(
916  buf_dblwr->buf_block_arr[i], false);
917  }
918 
919  /* Wake possible simulated aio thread to actually post the
920  writes to the operating system. We don't flush the files
921  at this point. We leave it to the IO helper thread to flush
922  datafiles when the whole batch has been processed. */
924 }
925 
926 /********************************************************************/
/* Copies bpage into the next free slot of the doublewrite memory buffer
so that it is written as part of a batch.
bpage must be a file page (ut_a). Under buf_dblwr->mutex: if a batch is
currently running, wait on b_event and retry; a second retry path
(condition line missing - presumably "batch area full") releases the
mutex, does work on a missing line, and retries. The page image is then
copied to write_buf at slot first_free: compressed pages are copied for
zip_size bytes and the rest of the slot is zero-filled; uncompressed
pages are copied for UNIV_PAGE_SIZE bytes from the block frame.
NOTE(review): the function-name line and the lines that record bpage in
buf_block_arr, advance first_free / b_reserved and handle the "buffer
became full" case are absent from this listing. */
930 UNIV_INTERN
931 void
933 /*====================*/
934  buf_page_t* bpage)
935 {
936  ulint zip_size;
937 
938  ut_a(buf_page_in_file(bpage));
939 
940 try_again:
941  mutex_enter(&buf_dblwr->mutex);
942 
944 
945  if (buf_dblwr->batch_running) {
946 
947  /* This is not nearly as bad as it looks. There is only
948  page_cleaner thread which does background flushing
949  in batches therefore it is unlikely to be a contention
950  point. The only exception is when a user thread is
951  forced to do a flush batch because of a sync
952  checkpoint. */
953  ib_int64_t sig_count = os_event_reset(buf_dblwr->b_event);
954  mutex_exit(&buf_dblwr->mutex);
955 
956  os_event_wait_low(buf_dblwr->b_event, sig_count);
957  goto try_again;
958  }
959 
961  mutex_exit(&(buf_dblwr->mutex));
962 
964 
965  goto try_again;
966  }
967 
968  zip_size = buf_page_get_zip_size(bpage);
969 
970  if (zip_size) {
971  UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
972  /* Copy the compressed page and clear the rest. */
973  memcpy(buf_dblwr->write_buf
974  + UNIV_PAGE_SIZE * buf_dblwr->first_free,
975  bpage->zip.data, zip_size);
976  memset(buf_dblwr->write_buf
977  + UNIV_PAGE_SIZE * buf_dblwr->first_free
978  + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
979  } else {
981  UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
982  UNIV_PAGE_SIZE);
983 
984  memcpy(buf_dblwr->write_buf
985  + UNIV_PAGE_SIZE * buf_dblwr->first_free,
986  ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
987  }
988 
990 
993 
997 
999  mutex_exit(&(buf_dblwr->mutex));
1000 
1002 
1003  return;
1004  }
1005 
1006  mutex_exit(&(buf_dblwr->mutex));
1007 }
1008 
1009 /********************************************************************/
/* Writes a single page through the doublewrite buffer and then to its
data file.
bpage: page to write; must be a file page. The doublewrite buffer must
be enabled and created (ut_a).
sync: forwarded to buf_dblwr_write_block_to_datafile() for the final
data-file write.
Single-page flushes use the slots from srv_doublewrite_batch_size to
2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE - 1. The function validates the page,
reserves a free slot under buf_dblwr->mutex (waiting on s_event while
all n_slots are taken), writes the page image synchronously into that
slot in TRX_SYS_SPACE, flushes TRX_SYS_SPACE, and only then writes the
page to its intended position.
NOTE(review): the function-name line, the os_event_reset() call, the
statement recording bpage in buf_block_arr, and the block1/block2
selection condition are absent from this listing. */
1017 UNIV_INTERN
1018 void
1020 /*========================*/
1021  buf_page_t* bpage,
1022  bool sync)
1023 {
1024  ulint n_slots;
1025  ulint size;
1026  ulint zip_size;
1027  ulint offset;
1028  ulint i;
1029 
1030  ut_a(buf_page_in_file(bpage));
1031  ut_a(srv_use_doublewrite_buf);
1032  ut_a(buf_dblwr != NULL);
1033 
1034  /* total number of slots available for single page flushes
1035  starts from srv_doublewrite_batch_size to the end of the
1036  buffer. */
1037  size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
1039  n_slots = size - srv_doublewrite_batch_size;
1040 
1041  if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
1042 
1043  /* Check that the actual page in the buffer pool is
1044  not corrupt and the LSN values are sane. */
1045  buf_dblwr_check_block((buf_block_t*) bpage);
1046 
1047  /* Check that the page as written to the doublewrite
1048  buffer has sane LSN values. */
 /* NOTE(review): for uncompressed pages
 buf_dblwr_check_block() above already runs
 buf_dblwr_check_page_lsn() on the same frame. */
1049  if (!bpage->zip.data) {
1050  buf_dblwr_check_page_lsn(
1051  ((buf_block_t*) bpage)->frame);
1052  }
1053  }
1054 
1055 retry:
1056  mutex_enter(&buf_dblwr->mutex);
1057  if (buf_dblwr->s_reserved == n_slots) {
1058 
1059  /* All slots are reserved. */
1060  ib_int64_t sig_count =
1062  mutex_exit(&buf_dblwr->mutex);
1063  os_event_wait_low(buf_dblwr->s_event, sig_count);
1064 
1065  goto retry;
1066  }
1067 
1068  for (i = srv_doublewrite_batch_size; i < size; ++i) {
1069 
1070  if (!buf_dblwr->in_use[i]) {
1071  break;
1072  }
1073  }
1074 
1075  /* We are guaranteed to find a slot. */
1076  ut_a(i < size);
1077  buf_dblwr->in_use[i] = true;
1078  buf_dblwr->s_reserved++;
1080 
1081  /* increment the doublewrite flushed pages counter */
1084 
1085  mutex_exit(&buf_dblwr->mutex);
1086 
1087  /* Let's see if we are going to write in the first or second
1088  block of the doublewrite buffer. */
1090  offset = buf_dblwr->block1 + i;
1091  } else {
1092  offset = buf_dblwr->block2 + i
1094  }
1095 
1096  /* We deal with compressed and uncompressed pages a little
1097  differently here. In case of uncompressed pages we can
1098  directly write the block to the allocated slot in the
1099  doublewrite buffer in the system tablespace and then after
1100  syncing the system table space we can proceed to write the page
1101  in the datafile.
1102  In case of compressed page we first do a memcpy of the block
1103  to the in-memory buffer of doublewrite before proceeding to
1104  write it. This is so because we want to pad the remaining
1105  bytes in the doublewrite page with zeros. */
1106 
1107  zip_size = buf_page_get_zip_size(bpage);
1108  if (zip_size) {
1109  memcpy(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i,
1110  bpage->zip.data, zip_size);
1111  memset(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i
1112  + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
1113 
1114  fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
1115  offset, 0, UNIV_PAGE_SIZE,
1116  (void*) (buf_dblwr->write_buf
1117  + UNIV_PAGE_SIZE * i), NULL);
1118  } else {
1119  /* It is a regular page. Write it directly to the
1120  doublewrite buffer */
1121  fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
1122  offset, 0, UNIV_PAGE_SIZE,
1123  (void*) ((buf_block_t*) bpage)->frame,
1124  NULL);
1125  }
1126 
1127  /* Now flush the doublewrite buffer data to disk */
1128  fil_flush(TRX_SYS_SPACE);
1129 
1130  /* We know that the write has been flushed to disk now
1131  and during recovery we will find it in the doublewrite buffer
1132  blocks. Next do the write to the intended position. */
1133  buf_dblwr_write_block_to_datafile(bpage, sync);
1134 }
1135 #endif /* !UNIV_HOTBACKUP */