MySQL 5.6.14 Source Code Document
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
os0file.cc
Go to the documentation of this file.
1 /***********************************************************************
2 
3 Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2009, Percona Inc.
5 
6 Portions of this file contain modifications contributed and copyrighted
7 by Percona Inc.. Those modifications are
8 gratefully acknowledged and are described briefly in the InnoDB
9 documentation. The contributions by Percona Inc. are incorporated with
10 their permission, and subject to the conditions contained in the file
11 COPYING.Percona.
12 
13 This program is free software; you can redistribute it and/or modify it
14 under the terms of the GNU General Public License as published by the
15 Free Software Foundation; version 2 of the License.
16 
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
20 Public License for more details.
21 
22 You should have received a copy of the GNU General Public License along with
23 this program; if not, write to the Free Software Foundation, Inc.,
24 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
25 
26 ***********************************************************************/
27 
28 /**************************************************/
35 #include "os0file.h"
36 
37 #ifdef UNIV_NONINL
38 #include "os0file.ic"
39 #endif
40 
41 #include "ut0mem.h"
42 #include "srv0srv.h"
43 #include "srv0start.h"
44 #include "fil0fil.h"
45 #include "buf0buf.h"
46 #include "srv0mon.h"
47 #ifndef UNIV_HOTBACKUP
48 # include "os0sync.h"
49 # include "os0thread.h"
50 #else /* !UNIV_HOTBACKUP */
51 # ifdef __WIN__
52 /* Add includes for the _stat() call to compile on Windows */
53 # include <sys/types.h>
54 # include <sys/stat.h>
55 # include <errno.h>
56 # endif /* __WIN__ */
57 #endif /* !UNIV_HOTBACKUP */
58 
59 #if defined(LINUX_NATIVE_AIO)
60 #include <libaio.h>
61 #endif
62 
64 static const ulint IO_IBUF_SEGMENT = 0;
65 
67 static const ulint IO_LOG_SEGMENT = 1;
68 
69 /* This specifies the file permissions InnoDB uses when it creates files in
70 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
71 my_umask */
72 
73 #ifndef __WIN__
74 
75 UNIV_INTERN ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
76 #else
77 
78 UNIV_INTERN ulint os_innodb_umask = 0;
79 #endif /* __WIN__ */
80 
81 #ifndef UNIV_HOTBACKUP
82 /* We use these mutexes to protect lseek + file i/o operation, if the
83 OS does not provide an atomic pread or pwrite, or similar */
84 #define OS_FILE_N_SEEK_MUTEXES 16
85 UNIV_INTERN os_ib_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
86 
87 /* In simulated aio, merge at most this many consecutive i/os */
88 #define OS_AIO_MERGE_N_CONSECUTIVE 64
89 
90 /**********************************************************************
91 
92 InnoDB AIO Implementation:
93 =========================
94 
95 We support native AIO on Windows and Linux. On the rest of the platforms
96 we simulate AIO with special io-threads that service the IO requests.
97 
98 Simulated AIO:
99 ==============
100 
101 On platforms where we 'simulate' AIO, the following is a rough explanation
102 of the high-level design.
103 There are four io-threads (for ibuf, log, read, write).
104 All synchronous IO requests are serviced by the calling thread using
105 os_file_write/os_file_read. The Asynchronous requests are queued up
106 in an array (there are four such arrays) by the calling thread.
107 Later these requests are picked up by the io-thread and are serviced
108 synchronously.
109 
110 Windows native AIO:
111 ==================
112 
113 If srv_use_native_aio is not set then Windows follows the same
114 code path as simulated AIO. If the flag is set then the native AIO interface
115 is used. On Windows, one of the limitations is that if a file is opened
116 for AIO, no synchronous IO can be done on it. Therefore we have an
117 extra fifth array to queue up synchronous IO requests.
118 There are innodb_file_io_threads helper threads. These threads work
119 on the four arrays mentioned above in Simulated AIO. No thread is
120 required for the sync array.
121 If a synchronous IO request is made, it is first queued in the sync
122 array. Then the calling thread itself waits on the request, thus
123 making the call synchronous.
124 If an AIO request is made the calling thread not only queues it in the
125 array but also submits the requests. The helper thread then collects
126 the completed IO request and calls completion routine on it.
127 
128 Linux native AIO:
129 =================
130 
131 If we have libaio installed on the system and innodb_use_native_aio
132 is set to TRUE we follow the code path of native AIO, otherwise we
133 do simulated AIO.
134 There are innodb_file_io_threads helper threads. These threads work
135 on the four arrays mentioned above in Simulated AIO.
136 If a synchronous IO request is made, it is handled by calling
137 os_file_write/os_file_read.
138 If an AIO request is made the calling thread not only queues it in the
139 array but also submits the requests. The helper thread then collects
140 the completed IO request and calls completion routine on it.
141 
142 **********************************************************************/
143 
145 UNIV_INTERN ibool os_aio_print_debug = FALSE;
146 
147 #ifdef UNIV_PFS_IO
148 /* Keys to register InnoDB I/O with performance schema */
149 UNIV_INTERN mysql_pfs_key_t innodb_file_data_key;
150 UNIV_INTERN mysql_pfs_key_t innodb_file_log_key;
151 UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key;
152 #endif /* UNIV_PFS_IO */
153 
156  ibool is_read;
157  ulint pos;
159  ibool reserved;
161  ulint len;
163  byte* buf;
164  ulint type;
167  const char* name;
174  void* message2;
178 #ifdef WIN_ASYNC_IO
179  HANDLE handle;
181  OVERLAPPED control;
183 #elif defined(LINUX_NATIVE_AIO)
184  struct iocb control; /* Linux control block for aio */
185  int n_bytes; /* bytes written/read. */
186  int ret; /* AIO return code */
187 #endif /* WIN_ASYNC_IO */
188 };
189 
201  ulint n_slots;
204  ulint n_segments;
209  ulint cur_seg;
213  ulint n_reserved;
217 #ifdef __WIN__
218  HANDLE* handles;
225 #endif /* __WIN__ */
226 
227 #if defined(LINUX_NATIVE_AIO)
228  io_context_t* aio_ctx;
229  /* completion queue for IO. There is
230  one such queue per segment. Each thread
231  will work on one ctx exclusively. */
232  struct io_event* aio_events;
233  /* The array to collect completed IOs.
234  There is one such event for each
235  possible pending IO. The size of the
236  array is equal to n_slots. */
237 #endif /* LINUX_NATIVE_AIO */
238 };
239 
240 #if defined(LINUX_NATIVE_AIO)
241 
242 #define OS_AIO_REAP_TIMEOUT (500000000UL)
243 
245 #define OS_AIO_IO_SETUP_RETRY_SLEEP (500000UL)
246 
248 #define OS_AIO_IO_SETUP_RETRY_ATTEMPTS 5
249 #endif
250 
252 static os_event_t* os_aio_segment_wait_events = NULL;
253 
256 static os_aio_array_t* os_aio_read_array = NULL;
257 static os_aio_array_t* os_aio_write_array = NULL;
258 static os_aio_array_t* os_aio_ibuf_array = NULL;
259 static os_aio_array_t* os_aio_log_array = NULL;
260 static os_aio_array_t* os_aio_sync_array = NULL;
261 /* @} */
262 
264 static ulint os_aio_n_segments = ULINT_UNDEFINED;
265 
268 static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
269 #endif /* !UNIV_HOTBACKUP */
270 
271 UNIV_INTERN ulint os_n_file_reads = 0;
272 UNIV_INTERN ulint os_bytes_read_since_printout = 0;
273 UNIV_INTERN ulint os_n_file_writes = 0;
274 UNIV_INTERN ulint os_n_fsyncs = 0;
275 UNIV_INTERN ulint os_n_file_reads_old = 0;
276 UNIV_INTERN ulint os_n_file_writes_old = 0;
277 UNIV_INTERN ulint os_n_fsyncs_old = 0;
278 UNIV_INTERN time_t os_last_printout;
279 
280 UNIV_INTERN ibool os_has_said_disk_full = FALSE;
281 
282 #if !defined(UNIV_HOTBACKUP) \
283  && (!defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8)
284 
285 static os_ib_mutex_t os_file_count_mutex;
286 #endif /* !UNIV_HOTBACKUP && (!HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8) */
287 
289 UNIV_INTERN ulint os_file_n_pending_preads = 0;
291 UNIV_INTERN ulint os_file_n_pending_pwrites = 0;
293 UNIV_INTERN ulint os_n_pending_writes = 0;
295 UNIV_INTERN ulint os_n_pending_reads = 0;
296 
297 #ifdef UNIV_DEBUG
298 # ifndef UNIV_HOTBACKUP
299 /**********************************************************************/
302 UNIV_INTERN
303 ibool
304 os_aio_validate_skip(void)
305 /*======================*/
306 {
308 # define OS_AIO_VALIDATE_SKIP 13
309 
312  static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
313 
314  /* There is a race condition below, but it does not matter,
315  because this call is only for heuristic purposes. We want to
316  reduce the call frequency of the costly os_aio_validate()
317  check in debug builds. */
318  if (--os_aio_validate_count > 0) {
319  return(TRUE);
320  }
321 
322  os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
323  return(os_aio_validate());
324 }
325 # endif /* !UNIV_HOTBACKUP */
326 #endif /* UNIV_DEBUG */
327 
328 #ifdef __WIN__
329 /***********************************************************************/
333 UNIV_INTERN
334 ulint
335 os_get_os_version(void)
336 /*===================*/
337 {
338  OSVERSIONINFO os_info;
339 
340  os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
341 
342  ut_a(GetVersionEx(&os_info));
343 
344  if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
345  return(OS_WIN31);
346  } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
347  return(OS_WIN95);
348  } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
349  switch (os_info.dwMajorVersion) {
350  case 3:
351  case 4:
352  return(OS_WINNT);
353  case 5:
354  return (os_info.dwMinorVersion == 0)
355  ? OS_WIN2000 : OS_WINXP;
356  case 6:
357  return (os_info.dwMinorVersion == 0)
358  ? OS_WINVISTA : OS_WIN7;
359  default:
360  return(OS_WIN7);
361  }
362  } else {
363  ut_error;
364  return(0);
365  }
366 }
367 #endif /* __WIN__ */
368 
369 /***********************************************************************/
375 static
376 ulint
377 os_file_get_last_error_low(
378 /*=======================*/
379  bool report_all_errors,
381  bool on_error_silent)
383 {
384 #ifdef __WIN__
385 
386  ulint err = (ulint) GetLastError();
387  if (err == ERROR_SUCCESS) {
388  return(0);
389  }
390 
391  if (report_all_errors
392  || (!on_error_silent
393  && err != ERROR_DISK_FULL
394  && err != ERROR_FILE_EXISTS)) {
395 
396  ut_print_timestamp(stderr);
397  fprintf(stderr,
398  " InnoDB: Operating system error number %lu"
399  " in a file operation.\n", (ulong) err);
400 
401  if (err == ERROR_PATH_NOT_FOUND) {
402  fprintf(stderr,
403  "InnoDB: The error means the system"
404  " cannot find the path specified.\n");
405 
406  if (srv_is_being_started) {
407  fprintf(stderr,
408  "InnoDB: If you are installing InnoDB,"
409  " remember that you must create\n"
410  "InnoDB: directories yourself, InnoDB"
411  " does not create them.\n");
412  }
413  } else if (err == ERROR_ACCESS_DENIED) {
414  fprintf(stderr,
415  "InnoDB: The error means mysqld does not have"
416  " the access rights to\n"
417  "InnoDB: the directory. It may also be"
418  " you have created a subdirectory\n"
419  "InnoDB: of the same name as a data file.\n");
420  } else if (err == ERROR_SHARING_VIOLATION
421  || err == ERROR_LOCK_VIOLATION) {
422  fprintf(stderr,
423  "InnoDB: The error means that another program"
424  " is using InnoDB's files.\n"
425  "InnoDB: This might be a backup or antivirus"
426  " software or another instance\n"
427  "InnoDB: of MySQL."
428  " Please close it to get rid of this error.\n");
429  } else if (err == ERROR_WORKING_SET_QUOTA
430  || err == ERROR_NO_SYSTEM_RESOURCES) {
431  fprintf(stderr,
432  "InnoDB: The error means that there are no"
433  " sufficient system resources or quota to"
434  " complete the operation.\n");
435  } else if (err == ERROR_OPERATION_ABORTED) {
436  fprintf(stderr,
437  "InnoDB: The error means that the I/O"
438  " operation has been aborted\n"
439  "InnoDB: because of either a thread exit"
440  " or an application request.\n"
441  "InnoDB: Retry attempt is made.\n");
442  } else {
443  fprintf(stderr,
444  "InnoDB: Some operating system error numbers"
445  " are described at\n"
446  "InnoDB: "
447  REFMAN
448  "operating-system-error-codes.html\n");
449  }
450  }
451 
452  fflush(stderr);
453 
454  if (err == ERROR_FILE_NOT_FOUND) {
455  return(OS_FILE_NOT_FOUND);
456  } else if (err == ERROR_DISK_FULL) {
457  return(OS_FILE_DISK_FULL);
458  } else if (err == ERROR_FILE_EXISTS) {
459  return(OS_FILE_ALREADY_EXISTS);
460  } else if (err == ERROR_SHARING_VIOLATION
461  || err == ERROR_LOCK_VIOLATION) {
462  return(OS_FILE_SHARING_VIOLATION);
463  } else if (err == ERROR_WORKING_SET_QUOTA
464  || err == ERROR_NO_SYSTEM_RESOURCES) {
465  return(OS_FILE_INSUFFICIENT_RESOURCE);
466  } else if (err == ERROR_OPERATION_ABORTED) {
467  return(OS_FILE_OPERATION_ABORTED);
468  } else {
469  return(100 + err);
470  }
471 #else
472  int err = errno;
473  if (err == 0) {
474  return(0);
475  }
476 
477  if (report_all_errors
478  || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
479 
480  ut_print_timestamp(stderr);
481  fprintf(stderr,
482  " InnoDB: Operating system error number %d"
483  " in a file operation.\n", err);
484 
485  if (err == ENOENT) {
486  fprintf(stderr,
487  "InnoDB: The error means the system"
488  " cannot find the path specified.\n");
489 
490  if (srv_is_being_started) {
491  fprintf(stderr,
492  "InnoDB: If you are installing InnoDB,"
493  " remember that you must create\n"
494  "InnoDB: directories yourself, InnoDB"
495  " does not create them.\n");
496  }
497  } else if (err == EACCES) {
498  fprintf(stderr,
499  "InnoDB: The error means mysqld does not have"
500  " the access rights to\n"
501  "InnoDB: the directory.\n");
502  } else {
503  if (strerror(err) != NULL) {
504  fprintf(stderr,
505  "InnoDB: Error number %d"
506  " means '%s'.\n",
507  err, strerror(err));
508  }
509 
510 
511  fprintf(stderr,
512  "InnoDB: Some operating system"
513  " error numbers are described at\n"
514  "InnoDB: "
515  REFMAN
516  "operating-system-error-codes.html\n");
517  }
518  }
519 
520  fflush(stderr);
521 
522  switch (err) {
523  case ENOSPC:
524  return(OS_FILE_DISK_FULL);
525  case ENOENT:
526  return(OS_FILE_NOT_FOUND);
527  case EEXIST:
528  return(OS_FILE_ALREADY_EXISTS);
529  case EXDEV:
530  case ENOTDIR:
531  case EISDIR:
532  return(OS_FILE_PATH_ERROR);
533  case EAGAIN:
534  if (srv_use_native_aio) {
535  return(OS_FILE_AIO_RESOURCES_RESERVED);
536  }
537  break;
538  case EINTR:
539  if (srv_use_native_aio) {
540  return(OS_FILE_AIO_INTERRUPTED);
541  }
542  break;
543  }
544  return(100 + err);
545 #endif
546 }
547 
548 /***********************************************************************/
554 UNIV_INTERN
555 ulint
557 /*===================*/
558  bool report_all_errors)
560 {
561  return(os_file_get_last_error_low(report_all_errors, false));
562 }
563 
564 /****************************************************************/
569 static
570 ibool
571 os_file_handle_error_cond_exit(
572 /*===========================*/
573  const char* name,
574  const char* operation,
575  ibool should_exit,
577  ibool on_error_silent)
580 {
581  ulint err;
582 
583  err = os_file_get_last_error_low(false, on_error_silent);
584 
585  switch (err) {
586  case OS_FILE_DISK_FULL:
587  /* We only print a warning about disk full once */
588 
589  if (os_has_said_disk_full) {
590 
591  return(FALSE);
592  }
593 
594  /* Disk full error is reported irrespective of the
595  on_error_silent setting. */
596 
597  if (name) {
598  ut_print_timestamp(stderr);
599  fprintf(stderr,
600  " InnoDB: Encountered a problem with"
601  " file %s\n", name);
602  }
603 
604  ut_print_timestamp(stderr);
605  fprintf(stderr,
606  " InnoDB: Disk is full. Try to clean the disk"
607  " to free space.\n");
608 
609  os_has_said_disk_full = TRUE;
610 
611  fflush(stderr);
612 
613  return(FALSE);
614 
615  case OS_FILE_AIO_RESOURCES_RESERVED:
616  case OS_FILE_AIO_INTERRUPTED:
617 
618  return(TRUE);
619 
620  case OS_FILE_PATH_ERROR:
621  case OS_FILE_ALREADY_EXISTS:
622 
623  return(FALSE);
624 
625  case OS_FILE_SHARING_VIOLATION:
626 
627  os_thread_sleep(10000000); /* 10 sec */
628  return(TRUE);
629 
630  case OS_FILE_OPERATION_ABORTED:
631  case OS_FILE_INSUFFICIENT_RESOURCE:
632 
633  os_thread_sleep(100000); /* 100 ms */
634  return(TRUE);
635 
636  default:
637 
638  /* If it is an operation that can crash on error then it
639  is better to ignore on_error_silent and print an error message
640  to the log. */
641 
642  if (should_exit || !on_error_silent) {
643  ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS "
644  "error " ULINTPF ".%s", name ? name : "(unknown)",
645  operation, err, should_exit
646  ? " Cannot continue operation" : "");
647  }
648 
649  if (should_exit) {
650  exit(1);
651  }
652  }
653 
654  return(FALSE);
655 }
656 
657 /****************************************************************/
660 static
661 ibool
662 os_file_handle_error(
663 /*=================*/
664  const char* name,
665  const char* operation)
666 {
667  /* exit in case of unknown error */
668  return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE));
669 }
670 
671 /****************************************************************/
674 static
675 ibool
676 os_file_handle_error_no_exit(
677 /*=========================*/
678  const char* name,
679  const char* operation,
680  ibool on_error_silent)
682 {
683  /* don't exit in case of unknown error */
684  return(os_file_handle_error_cond_exit(
685  name, operation, FALSE, on_error_silent));
686 }
687 
688 #undef USE_FILE_LOCK
689 #define USE_FILE_LOCK
690 #if defined(UNIV_HOTBACKUP) || defined(__WIN__)
691 /* InnoDB Hot Backup does not lock the data files.
692  * On Windows, mandatory locking is used.
693  */
694 # undef USE_FILE_LOCK
695 #endif
696 #ifdef USE_FILE_LOCK
697 /****************************************************************/
700 static
701 int
702 os_file_lock(
703 /*=========*/
704  int fd,
705  const char* name)
706 {
707  struct flock lk;
708 
710 
711  lk.l_type = F_WRLCK;
712  lk.l_whence = SEEK_SET;
713  lk.l_start = lk.l_len = 0;
714 
715  if (fcntl(fd, F_SETLK, &lk) == -1) {
716 
717  ib_logf(IB_LOG_LEVEL_ERROR,
718  "Unable to lock %s, error: %d", name, errno);
719 
720  if (errno == EAGAIN || errno == EACCES) {
721  ib_logf(IB_LOG_LEVEL_INFO,
722  "Check that you do not already have "
723  "another mysqld process using the "
724  "same InnoDB data or log files.");
725  }
726 
727  return(-1);
728  }
729 
730  return(0);
731 }
732 #endif /* USE_FILE_LOCK */
733 
734 #ifndef UNIV_HOTBACKUP
735 /****************************************************************/
737 UNIV_INTERN
738 void
740 /*===================*/
741 {
742 #if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
743  os_file_count_mutex = os_mutex_create();
744 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */
745 
746  for (ulint i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
747  os_file_seek_mutexes[i] = os_mutex_create();
748  }
749 }
750 
751 /***********************************************************************/
755 UNIV_INTERN
756 FILE*
758 /*========================*/
759 {
760  FILE* file = NULL;
761  int fd = innobase_mysql_tmpfile();
762 
764 
765  if (fd >= 0) {
766  file = fdopen(fd, "w+b");
767  }
768 
769  if (!file) {
770  ut_print_timestamp(stderr);
771  fprintf(stderr,
772  " InnoDB: Error: unable to create temporary file;"
773  " errno: %d\n", errno);
774  if (fd >= 0) {
775  close(fd);
776  }
777  }
778 
779  return(file);
780 }
781 #endif /* !UNIV_HOTBACKUP */
782 
783 /***********************************************************************/
789 UNIV_INTERN
792 /*============*/
793  const char* dirname,
795  ibool error_is_fatal)
800 {
801  os_file_dir_t dir;
802 #ifdef __WIN__
803  LPWIN32_FIND_DATA lpFindFileData;
804  char path[OS_FILE_MAX_PATH + 3];
805 
806  ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
807 
808  strcpy(path, dirname);
809  strcpy(path + strlen(path), "\\*");
810 
811  /* Note that in Windows opening the 'directory stream' also retrieves
812  the first entry in the directory. Since it is '.', that is no problem,
813  as we will skip over the '.' and '..' entries anyway. */
814 
815  lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
816  ut_malloc(sizeof(WIN32_FIND_DATA)));
817 
818  dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
819 
820  ut_free(lpFindFileData);
821 
822  if (dir == INVALID_HANDLE_VALUE) {
823 
824  if (error_is_fatal) {
825  os_file_handle_error(dirname, "opendir");
826  }
827 
828  return(NULL);
829  }
830 
831  return(dir);
832 #else
833  dir = opendir(dirname);
834 
835  if (dir == NULL && error_is_fatal) {
836  os_file_handle_error(dirname, "opendir");
837  }
838 
839  return(dir);
840 #endif /* __WIN__ */
841 }
842 
843 /***********************************************************************/
846 UNIV_INTERN
847 int
849 /*=============*/
850  os_file_dir_t dir)
851 {
852 #ifdef __WIN__
853  BOOL ret;
854 
855  ret = FindClose(dir);
856 
857  if (!ret) {
858  os_file_handle_error_no_exit(NULL, "closedir", FALSE);
859 
860  return(-1);
861  }
862 
863  return(0);
864 #else
865  int ret;
866 
867  ret = closedir(dir);
868 
869  if (ret) {
870  os_file_handle_error_no_exit(NULL, "closedir", FALSE);
871  }
872 
873  return(ret);
874 #endif /* __WIN__ */
875 }
876 
877 /***********************************************************************/
881 UNIV_INTERN
882 int
884 /*======================*/
885  const char* dirname,
886  os_file_dir_t dir,
887  os_file_stat_t* info)
888 {
889 #ifdef __WIN__
890  LPWIN32_FIND_DATA lpFindFileData;
891  BOOL ret;
892 
893  lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
894  ut_malloc(sizeof(WIN32_FIND_DATA)));
895 next_file:
896  ret = FindNextFile(dir, lpFindFileData);
897 
898  if (ret) {
899  ut_a(strlen((char*) lpFindFileData->cFileName)
900  < OS_FILE_MAX_PATH);
901 
902  if (strcmp((char*) lpFindFileData->cFileName, ".") == 0
903  || strcmp((char*) lpFindFileData->cFileName, "..") == 0) {
904 
905  goto next_file;
906  }
907 
908  strcpy(info->name, (char*) lpFindFileData->cFileName);
909 
910  info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
911  + (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
912  << 32);
913 
914  if (lpFindFileData->dwFileAttributes
915  & FILE_ATTRIBUTE_REPARSE_POINT) {
916  /* TODO: test Windows symlinks */
917  /* TODO: MySQL has apparently its own symlink
918  implementation in Windows, dbname.sym can
919  redirect a database directory:
920  REFMAN "windows-symbolic-links.html" */
921  info->type = OS_FILE_TYPE_LINK;
922  } else if (lpFindFileData->dwFileAttributes
923  & FILE_ATTRIBUTE_DIRECTORY) {
924  info->type = OS_FILE_TYPE_DIR;
925  } else {
926  /* It is probably safest to assume that all other
927  file types are normal. Better to check them rather
928  than blindly skip them. */
929 
930  info->type = OS_FILE_TYPE_FILE;
931  }
932  }
933 
934  ut_free(lpFindFileData);
935 
936  if (ret) {
937  return(0);
938  } else if (GetLastError() == ERROR_NO_MORE_FILES) {
939 
940  return(1);
941  } else {
942  os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE);
943  return(-1);
944  }
945 #else
946  struct dirent* ent;
947  char* full_path;
948  int ret;
949  struct stat statinfo;
950 #ifdef HAVE_READDIR_R
951  char dirent_buf[sizeof(struct dirent)
952  + _POSIX_PATH_MAX + 100];
953  /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
954  the max file name len; but in most standards, the
955  length is NAME_MAX; we add 100 to be even safer */
956 #endif
957 
958 next_file:
959 
960 #ifdef HAVE_READDIR_R
961  ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
962 
963  if (ret != 0
964 #ifdef UNIV_AIX
965  /* On AIX, only if we got non-NULL 'ent' (result) value and
966  a non-zero 'ret' (return) value, it indicates a failed
967  readdir_r() call. An NULL 'ent' with an non-zero 'ret'
968  would indicate the "end of the directory" is reached. */
969  && ent != NULL
970 #endif
971  ) {
972  fprintf(stderr,
973  "InnoDB: cannot read directory %s, error %lu\n",
974  dirname, (ulong) ret);
975 
976  return(-1);
977  }
978 
979  if (ent == NULL) {
980  /* End of directory */
981 
982  return(1);
983  }
984 
985  ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
986 #else
987  ent = readdir(dir);
988 
989  if (ent == NULL) {
990 
991  return(1);
992  }
993 #endif
994  ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
995 
996  if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
997 
998  goto next_file;
999  }
1000 
1001  strcpy(info->name, ent->d_name);
1002 
1003  full_path = static_cast<char*>(
1004  ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10));
1005 
1006  sprintf(full_path, "%s/%s", dirname, ent->d_name);
1007 
1008  ret = stat(full_path, &statinfo);
1009 
1010  if (ret) {
1011 
1012  if (errno == ENOENT) {
1013  /* readdir() returned a file that does not exist,
1014  it must have been deleted in the meantime. Do what
1015  would have happened if the file was deleted before
1016  readdir() - ignore and go to the next entry.
1017  If this is the last entry then info->name will still
1018  contain the name of the deleted file when this
1019  function returns, but this is not an issue since the
1020  caller shouldn't be looking at info when end of
1021  directory is returned. */
1022 
1023  ut_free(full_path);
1024 
1025  goto next_file;
1026  }
1027 
1028  os_file_handle_error_no_exit(full_path, "stat", FALSE);
1029 
1030  ut_free(full_path);
1031 
1032  return(-1);
1033  }
1034 
1035  info->size = (ib_int64_t) statinfo.st_size;
1036 
1037  if (S_ISDIR(statinfo.st_mode)) {
1038  info->type = OS_FILE_TYPE_DIR;
1039  } else if (S_ISLNK(statinfo.st_mode)) {
1040  info->type = OS_FILE_TYPE_LINK;
1041  } else if (S_ISREG(statinfo.st_mode)) {
1042  info->type = OS_FILE_TYPE_FILE;
1043  } else {
1044  info->type = OS_FILE_TYPE_UNKNOWN;
1045  }
1046 
1047  ut_free(full_path);
1048 
1049  return(0);
1050 #endif
1051 }
1052 
1053 /*****************************************************************/
1061 UNIV_INTERN
1062 ibool
1064 /*=====================*/
1065  const char* pathname,
1067  ibool fail_if_exists)
1069 {
1070 #ifdef __WIN__
1071  BOOL rcode;
1072 
1073  rcode = CreateDirectory((LPCTSTR) pathname, NULL);
1074  if (!(rcode != 0
1075  || (GetLastError() == ERROR_ALREADY_EXISTS
1076  && !fail_if_exists))) {
1077 
1078  os_file_handle_error_no_exit(
1079  pathname, "CreateDirectory", FALSE);
1080 
1081  return(FALSE);
1082  }
1083 
1084  return(TRUE);
1085 #else
1086  int rcode;
1087 
1088  rcode = mkdir(pathname, 0770);
1089 
1090  if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
1091  /* failure */
1092  os_file_handle_error_no_exit(pathname, "mkdir", FALSE);
1093 
1094  return(FALSE);
1095  }
1096 
1097  return (TRUE);
1098 #endif /* __WIN__ */
1099 }
1100 
1101 /****************************************************************/
1107 UNIV_INTERN
1108 os_file_t
1110 /*=======================*/
1111  const char* name,
1113  ulint create_mode,
1114  ulint access_type,
1116  ibool* success)
1117 {
1118  os_file_t file;
1119  ibool retry;
1120 
1121  *success = FALSE;
1122 #ifdef __WIN__
1123  DWORD access;
1124  DWORD create_flag;
1125  DWORD attributes = 0;
1126 
1127  ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1128  ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1129 
1130  if (create_mode == OS_FILE_OPEN) {
1131 
1132  create_flag = OPEN_EXISTING;
1133 
1134  } else if (srv_read_only_mode) {
1135 
1136  create_flag = OPEN_EXISTING;
1137 
1138  } else if (create_mode == OS_FILE_CREATE) {
1139 
1140  create_flag = CREATE_NEW;
1141 
1142  } else if (create_mode == OS_FILE_CREATE_PATH) {
1143 
1145 
1146  /* Create subdirs along the path if needed */
1147  *success = os_file_create_subdirs_if_needed(name);
1148 
1149  if (!*success) {
1150 
1151  ib_logf(IB_LOG_LEVEL_ERROR,
1152  "Unable to create subdirectories '%s'",
1153  name);
1154 
1155  return((os_file_t) -1);
1156  }
1157 
1158  create_flag = CREATE_NEW;
1159  create_mode = OS_FILE_CREATE;
1160 
1161  } else {
1162  ib_logf(IB_LOG_LEVEL_ERROR,
1163  "Unknown file create mode (%lu) for file '%s'",
1164  create_mode, name);
1165 
1166  return((os_file_t) -1);
1167  }
1168 
1169  if (access_type == OS_FILE_READ_ONLY) {
1170  access = GENERIC_READ;
1171  } else if (srv_read_only_mode) {
1172 
1173  ib_logf(IB_LOG_LEVEL_INFO,
1174  "read only mode set. Unable to "
1175  "open file '%s' in RW mode, trying RO mode", name);
1176 
1177  access = GENERIC_READ;
1178 
1179  } else if (access_type == OS_FILE_READ_WRITE) {
1180  access = GENERIC_READ | GENERIC_WRITE;
1181  } else {
1182  ib_logf(IB_LOG_LEVEL_ERROR,
1183  "Unknown file access type (%lu) for file '%s'",
1184  access_type, name);
1185 
1186  return((os_file_t) -1);
1187  }
1188 
1189  do {
1190  /* Use default security attributes and no template file. */
1191 
1192  file = CreateFile(
1193  (LPCTSTR) name, access, FILE_SHARE_READ, NULL,
1194  create_flag, attributes, NULL);
1195 
1196  if (file == INVALID_HANDLE_VALUE) {
1197 
1198  *success = FALSE;
1199 
1200  retry = os_file_handle_error(
1201  name, create_mode == OS_FILE_OPEN ?
1202  "open" : "create");
1203 
1204  } else {
1205  *success = TRUE;
1206  retry = false;
1207  }
1208 
1209  } while (retry);
1210 
1211 #else /* __WIN__ */
1212  int create_flag;
1213 
1214  ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1215  ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1216 
1217  if (create_mode == OS_FILE_OPEN) {
1218 
1219  if (access_type == OS_FILE_READ_ONLY) {
1220  create_flag = O_RDONLY;
1221  } else if (srv_read_only_mode) {
1222  create_flag = O_RDONLY;
1223  } else {
1224  create_flag = O_RDWR;
1225  }
1226 
1227  } else if (srv_read_only_mode) {
1228 
1229  create_flag = O_RDONLY;
1230 
1231  } else if (create_mode == OS_FILE_CREATE) {
1232 
1233  create_flag = O_RDWR | O_CREAT | O_EXCL;
1234 
1235  } else if (create_mode == OS_FILE_CREATE_PATH) {
1236 
1237  /* Create subdirs along the path if needed */
1238 
1239  *success = os_file_create_subdirs_if_needed(name);
1240 
1241  if (!*success) {
1242 
1243  ib_logf(IB_LOG_LEVEL_ERROR,
1244  "Unable to create subdirectories '%s'",
1245  name);
1246 
1247  return((os_file_t) -1);
1248  }
1249 
1250  create_flag = O_RDWR | O_CREAT | O_EXCL;
1251  create_mode = OS_FILE_CREATE;
1252  } else {
1253 
1254  ib_logf(IB_LOG_LEVEL_ERROR,
1255  "Unknown file create mode (%lu) for file '%s'",
1256  create_mode, name);
1257 
1258  return((os_file_t) -1);
1259  }
1260 
1261  do {
1262  file = ::open(name, create_flag, os_innodb_umask);
1263 
1264  if (file == -1) {
1265  *success = FALSE;
1266 
1267  retry = os_file_handle_error(
1268  name,
1269  create_mode == OS_FILE_OPEN
1270  ? "open" : "create");
1271  } else {
1272  *success = TRUE;
1273  retry = false;
1274  }
1275 
1276  } while (retry);
1277 
1278 #ifdef USE_FILE_LOCK
1279  if (!srv_read_only_mode
1280  && *success
1281  && access_type == OS_FILE_READ_WRITE
1282  && os_file_lock(file, name)) {
1283 
1284  *success = FALSE;
1285  close(file);
1286  file = -1;
1287  }
1288 #endif /* USE_FILE_LOCK */
1289 
1290 #endif /* __WIN__ */
1291 
1292  return(file);
1293 }
1294 
1295 /****************************************************************/
/* Opens an existing file or creates a new one WITHOUT calling the
os_file_handle_error*() helpers: a failed open is reported to the caller
only through the returned handle and *success.
NOTE(review): the line carrying the function name (source line 1303) is
absent from this listing.

name		path of the file; asserted non-NULL
create_mode	OS_FILE_OPEN or OS_FILE_CREATE; the OS_FILE_ON_ERROR_SILENT
		and OS_FILE_ON_ERROR_NO_EXIT bits must not be set (asserted).
		Any other value is logged as unknown and -1 is returned
access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE or
		OS_FILE_READ_ALLOW_DELETE
success		out: set to FALSE on entry, TRUE only if the open succeeded
		(and, under USE_FILE_LOCK, taking the lock did not fail)

When srv_read_only_mode is set, a create request degrades to opening the
existing file and the file is opened for reading only.
Returns the file handle; (os_file_t) -1 or the failing value returned by
the OS open call on error. */
1301 UNIV_INTERN
1302 os_file_t
1304 /*=========================================*/
1305  const char* name,
1307  ulint create_mode,
1308  ulint access_type,
1312  ibool* success)
1313 {
1314  os_file_t file;
1315 
1316  *success = FALSE;
1317 #ifdef __WIN__
1318  DWORD access;
1319  DWORD create_flag;
1320  DWORD attributes = 0;
1321  DWORD share_mode = FILE_SHARE_READ;
1322 
1323  ut_a(name);
1324 
1325  ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1326  ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1327 
/* Map the InnoDB create mode to the CreateFile() disposition. */
1328  if (create_mode == OS_FILE_OPEN) {
1329  create_flag = OPEN_EXISTING;
1330  } else if (srv_read_only_mode) {
1331  create_flag = OPEN_EXISTING;
1332  } else if (create_mode == OS_FILE_CREATE) {
1333  create_flag = CREATE_NEW;
1334  } else {
1335 
1336  ib_logf(IB_LOG_LEVEL_ERROR,
1337  "Unknown file create mode (%lu) for file '%s'",
1338  create_mode, name);
1339 
1340  return((os_file_t) -1);
1341  }
1342 
/* Map the InnoDB access type to the CreateFile() desired access. */
1343  if (access_type == OS_FILE_READ_ONLY) {
1344  access = GENERIC_READ;
1345  } else if (srv_read_only_mode) {
1346  access = GENERIC_READ;
1347  } else if (access_type == OS_FILE_READ_WRITE) {
1348  access = GENERIC_READ | GENERIC_WRITE;
1349  } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
1350 
/* NOTE(review): source line 1351 is not visible in this listing. */
1352 
1353  access = GENERIC_READ;
1354 
/* Let other openers write to and delete the file while we hold it. */
1358  share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
1359  } else {
1360  ib_logf(IB_LOG_LEVEL_ERROR,
1361  "Unknown file access type (%lu) for file '%s'",
1362  access_type, name);
1363 
1364  return((os_file_t) -1);
1365  }
1366 
1367  file = CreateFile((LPCTSTR) name,
1368  access,
1369  share_mode,
1370  NULL, // Security attributes
1371  create_flag,
1372  attributes,
1373  NULL); // No template file
1374 
1375  *success = (file != INVALID_HANDLE_VALUE);
1376 #else /* __WIN__ */
1377  int create_flag;
1378 
1379  ut_a(name);
1380 
1381  ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1382  ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1383 
/* Derive the open(2) flags from the create mode and access type. */
1384  if (create_mode == OS_FILE_OPEN) {
1385 
1386  if (access_type == OS_FILE_READ_ONLY) {
1387 
1388  create_flag = O_RDONLY;
1389 
1390  } else if (srv_read_only_mode) {
1391 
1392  create_flag = O_RDONLY;
1393 
1394  } else {
1395 
1396  ut_a(access_type == OS_FILE_READ_WRITE
1397  || access_type == OS_FILE_READ_ALLOW_DELETE);
1398 
1399  create_flag = O_RDWR;
1400  }
1401 
1402  } else if (srv_read_only_mode) {
1403 
1404  create_flag = O_RDONLY;
1405 
1406  } else if (create_mode == OS_FILE_CREATE) {
1407 
/* O_EXCL makes the open fail if the file already exists. */
1408  create_flag = O_RDWR | O_CREAT | O_EXCL;
1409 
1410  } else {
1411  ib_logf(IB_LOG_LEVEL_ERROR,
1412  "Unknown file create mode (%lu) for file '%s'",
1413  create_mode, name);
1414 
1415  return((os_file_t) -1);
1416  }
1417 
1418  file = ::open(name, create_flag, os_innodb_umask);
1419 
1420  *success = file == -1 ? FALSE : TRUE;
1421 
1422 #ifdef USE_FILE_LOCK
/* Only read-write opens outside read-only mode are locked. A nonzero
result from os_file_lock() is treated as failure: the descriptor is
closed and the call reports failure. */
1423  if (!srv_read_only_mode
1424  && *success
1425  && access_type == OS_FILE_READ_WRITE
1426  && os_file_lock(file, name)) {
1427 
1428  *success = FALSE;
1429  close(file);
1430  file = -1;
1431 
1432  }
1433 #endif /* USE_FILE_LOCK */
1434 
1435 #endif /* __WIN__ */
1436 
1437  return(file);
1438 }
1439 
1440 /****************************************************************/
/* Tries to turn off OS caching for an already opened file descriptor.
NOTE(review): the line carrying the function name (source line 1444) is
absent from this listing; the signature matches the
os_file_set_nocache(file, name, mode_str) call made elsewhere in this file.

fd		file descriptor to change
file_name	file name, used only in the diagnostic message
operation_name	operation label, used only in the diagnostic message

All three parameters are marked unused because on a platform that has
neither branch below the body compiles to nothing.
Failures are logged and otherwise ignored; nothing is returned. */
1442 UNIV_INTERN
1443 void
1445 /*================*/
1446  int fd
1447  __attribute__((unused)),
1448  const char* file_name
1450  __attribute__((unused)),
1451  const char* operation_name __attribute__((unused)))
1454 {
1455  /* some versions of Solaris may not have DIRECTIO_ON */
1456 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
1457  if (directio(fd, DIRECTIO_ON) == -1) {
/* Save errno before logging so the logging call cannot clobber it. */
1458  int errno_save = errno;
1459 
1460  ib_logf(IB_LOG_LEVEL_ERROR,
1461  "Failed to set DIRECTIO_ON on file %s: %s: %s, "
1462  "continuing anyway.",
1463  file_name, operation_name, strerror(errno_save));
1464  }
1465 #elif defined(O_DIRECT)
/* NOTE(review): F_SETFL is passed O_DIRECT alone, not the current
F_GETFL flags OR-ed with O_DIRECT; presumably other settable status
flags on fd are replaced -- confirm against the fcntl() specification. */
1466  if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
1467  int errno_save = errno;
1468 
1469  ib_logf(IB_LOG_LEVEL_ERROR,
1470  "Failed to set O_DIRECT on file %s: %s: %s, "
1471  "continuing anyway",
1472  file_name, operation_name, strerror(errno_save));
1473 
1474  if (errno_save == EINVAL) {
1475  ib_logf(IB_LOG_LEVEL_ERROR,
1476  "O_DIRECT is known to result in 'Invalid "
1477  "argument' on Linux on tmpfs, see MySQL "
1478  "Bug#26662");
1479  }
1480  }
1481 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
1482 }
1483 
1484 /****************************************************************/
/* Opens an existing file or creates a new one, retrying for as long as
the error handler asks for a retry.
NOTE(review): the line carrying the function name (source line 1492) is
absent from this listing.

name		path of the file
create_mode	one of OS_FILE_OPEN, OS_FILE_OPEN_RAW, OS_FILE_OPEN_RETRY,
		OS_FILE_CREATE or OS_FILE_OVERWRITE, optionally OR-ed with
		OS_FILE_ON_ERROR_NO_EXIT and/or OS_FILE_ON_ERROR_SILENT.
		The two flag bits are extracted and then masked off
purpose		OS_FILE_AIO or OS_FILE_NORMAL
type		OS_LOG_FILE or OS_DATA_FILE (asserted on the non-Windows path)
success		out: TRUE if the file was opened (and, where used, locked)

Returns the file handle, or (os_file_t) -1 / the failing value of the OS
open call.
NOTE(review): the early returns for an unknown create mode or purpose do
not assign *success, so the caller sees whatever value it held before the
call. */
1490 UNIV_INTERN
1491 os_file_t
1493 /*================*/
1494  const char* name,
1496  ulint create_mode,
1497  ulint purpose,
1504  ulint type,
1505  ibool* success)
1506 {
1507  os_file_t file;
1508  ibool retry;
1509  ibool on_error_no_exit;
1510  ibool on_error_silent;
1511 
/* Debug hook: when the "ib_create_table_fail_disk_full" keyword is
active, report a disk-full failure without touching the file system. */
1512 #ifdef __WIN__
1513  DBUG_EXECUTE_IF(
1514  "ib_create_table_fail_disk_full",
1515  *success = FALSE;
1516  SetLastError(ERROR_DISK_FULL);
1517  return((os_file_t) -1);
1518  );
1519 #else /* __WIN__ */
1520  DBUG_EXECUTE_IF(
1521  "ib_create_table_fail_disk_full",
1522  *success = FALSE;
1523  errno = ENOSPC;
1524  return((os_file_t) -1);
1525  );
1526 #endif /* __WIN__ */
1527 
1528 #ifdef __WIN__
1529  DWORD create_flag;
1530  DWORD share_mode = FILE_SHARE_READ;
1531 
/* Split the error-handling flags off create_mode so the comparisons
below see the bare mode value. */
1532  on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
1533  ? TRUE : FALSE;
1534 
1535  on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
1536  ? TRUE : FALSE;
1537 
1538  create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
1539  create_mode &= ~OS_FILE_ON_ERROR_SILENT;
1540 
1541  if (create_mode == OS_FILE_OPEN_RAW) {
1542 
/* NOTE(review): source line 1543 is not visible in this listing. */
1544 
1545  create_flag = OPEN_EXISTING;
1546 
1547  /* On Windows Physical devices require admin privileges and
1548  have to have the write-share mode set. See the remarks
1549  section for the CreateFile() function documentation in MSDN. */
1550 
1551  share_mode |= FILE_SHARE_WRITE;
1552 
1553  } else if (create_mode == OS_FILE_OPEN
1554  || create_mode == OS_FILE_OPEN_RETRY) {
1555 
1556  create_flag = OPEN_EXISTING;
1557 
1558  } else if (srv_read_only_mode) {
1559 
/* In read-only mode a create/overwrite request only opens the file. */
1560  create_flag = OPEN_EXISTING;
1561 
1562  } else if (create_mode == OS_FILE_CREATE) {
1563 
1564  create_flag = CREATE_NEW;
1565 
1566  } else if (create_mode == OS_FILE_OVERWRITE) {
1567 
1568  create_flag = CREATE_ALWAYS;
1569 
1570  } else {
1571  ib_logf(IB_LOG_LEVEL_ERROR,
1572  "Unknown file create mode (%lu) for file '%s'",
1573  create_mode, name);
1574 
1575  return((os_file_t) -1);
1576  }
1577 
1578  DWORD attributes = 0;
1579 
1580 #ifdef UNIV_HOTBACKUP
1581  attributes |= FILE_FLAG_NO_BUFFERING;
1582 #else
1583  if (purpose == OS_FILE_AIO) {
1584 
1585 #ifdef WIN_ASYNC_IO
1586  /* If specified, use asynchronous (overlapped) io and no
1587  buffering of writes in the OS */
1588 
1589  if (srv_use_native_aio) {
1590  attributes |= FILE_FLAG_OVERLAPPED;
1591  }
1592 #endif /* WIN_ASYNC_IO */
1593 
1594  } else if (purpose == OS_FILE_NORMAL) {
1595  /* Use default setting. */
1596  } else {
1597  ib_logf(IB_LOG_LEVEL_ERROR,
1598  "Unknown purpose flag (%lu) while opening file '%s'",
1599  purpose, name);
1600 
1601  return((os_file_t)(-1));
1602  }
1603 
1604 #ifdef UNIV_NON_BUFFERED_IO
1605  // TODO: Create a bug, this looks wrong. The flush log
1606  // parameter is dynamic.
1607  if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1608 
1609  /* Do not use unbuffered i/o for the log files because
1610  value 2 denotes that we do not flush the log at every
1611  commit, but only once per second */
1612 
1613  } else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {
1614 
1615  attributes |= FILE_FLAG_NO_BUFFERING;
1616  }
1617 #endif /* UNIV_NON_BUFFERED_IO */
1618 
1619 #endif /* UNIV_HOTBACKUP */
/* Always request read access; add write access unless read-only mode. */
1620  DWORD access = GENERIC_READ;
1621 
1622  if (!srv_read_only_mode) {
1623  access |= GENERIC_WRITE;
1624  }
1625 
1626  do {
1627  /* Use default security attributes and no template file. */
1628  file = CreateFile(
1629  (LPCTSTR) name, access, share_mode, NULL,
1630  create_flag, attributes, NULL);
1631 
1632  if (file == INVALID_HANDLE_VALUE) {
1633  const char* operation;
1634 
/* Label used in the error report: "create" only when a new file was
really being created. */
1635  operation = (create_mode == OS_FILE_CREATE
1636  && !srv_read_only_mode)
1637  ? "create" : "open";
1638 
1639  *success = FALSE;
1640 
1641  if (on_error_no_exit) {
1642  retry = os_file_handle_error_no_exit(
1643  name, operation, on_error_silent);
1644  } else {
1645  retry = os_file_handle_error(name, operation);
1646  }
1647  } else {
1648  *success = TRUE;
1649  retry = FALSE;
1650  }
1651 
1652  } while (retry);
1653 
1654 #else /* __WIN__ */
1655  int create_flag;
/* mode_str is only used as the operation label handed to
os_file_set_nocache() below. */
1656  const char* mode_str = NULL;
1657 
/* Split the error-handling flags off create_mode so the comparisons
below see the bare mode value. */
1658  on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
1659  ? TRUE : FALSE;
1660  on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
1661  ? TRUE : FALSE;
1662 
1663  create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
1664  create_mode &= ~OS_FILE_ON_ERROR_SILENT;
1665 
1666  if (create_mode == OS_FILE_OPEN
1667  || create_mode == OS_FILE_OPEN_RAW
1668  || create_mode == OS_FILE_OPEN_RETRY) {
1669 
1670  mode_str = "OPEN";
1671 
1672  create_flag = srv_read_only_mode ? O_RDONLY : O_RDWR;
1673 
1674  } else if (srv_read_only_mode) {
1675 
/* In read-only mode a create/overwrite request only opens the file. */
1676  mode_str = "OPEN";
1677 
1678  create_flag = O_RDONLY;
1679 
1680  } else if (create_mode == OS_FILE_CREATE) {
1681 
1682  mode_str = "CREATE";
1683  create_flag = O_RDWR | O_CREAT | O_EXCL;
1684 
1685  } else if (create_mode == OS_FILE_OVERWRITE) {
1686 
1687  mode_str = "OVERWRITE";
1688  create_flag = O_RDWR | O_CREAT | O_TRUNC;
1689 
1690  } else {
1691  ib_logf(IB_LOG_LEVEL_ERROR,
1692  "Unknown file create mode (%lu) for file '%s'",
1693  create_mode, name);
1694 
1695  return((os_file_t) -1);
1696  }
1697 
1698  ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
1699  ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
1700 
1701 #ifdef O_SYNC
1702  /* We let O_SYNC only affect log files; note that we map O_DSYNC to
1703  O_SYNC because the datasync options seemed to corrupt files in 2001
1704  in both Linux and Solaris */
1705 
1706  if (!srv_read_only_mode
1707  && type == OS_LOG_FILE
1708  && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
1709 
1710  create_flag |= O_SYNC;
1711  }
1712 #endif /* O_SYNC */
1713 
1714  do {
1715  file = ::open(name, create_flag, os_innodb_umask);
1716 
1717  if (file == -1) {
1718  const char* operation;
1719 
/* Label used in the error report: "create" only when a new file was
really being created. */
1720  operation = (create_mode == OS_FILE_CREATE
1721  && !srv_read_only_mode)
1722  ? "create" : "open";
1723 
1724  *success = FALSE;
1725 
1726  if (on_error_no_exit) {
1727  retry = os_file_handle_error_no_exit(
1728  name, operation, on_error_silent);
1729  } else {
1730  retry = os_file_handle_error(name, operation);
1731  }
1732  } else {
1733  *success = TRUE;
1734  retry = false;
1735  }
1736 
1737  } while (retry);
1738 
1739  /* We disable OS caching (O_DIRECT) only on data files */
1740 
1741  if (!srv_read_only_mode
1742  && *success
1743  && type != OS_LOG_FILE
1744  && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
1745  || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
1746 
1747  os_file_set_nocache(file, name, mode_str);
1748  }
1749 
1750 #ifdef USE_FILE_LOCK
/* Every successful open except OS_FILE_OPEN_RAW is locked unless in
read-only mode. A nonzero result from os_file_lock() means the lock was
not obtained. */
1751  if (!srv_read_only_mode
1752  && *success
1753  && create_mode != OS_FILE_OPEN_RAW
1754  && os_file_lock(file, name)) {
1755 
1756  if (create_mode == OS_FILE_OPEN_RETRY) {
1757 
/* NOTE(review): source line 1758 is not visible in this listing. */
1759 
1760  ib_logf(IB_LOG_LEVEL_INFO,
1761  "Retrying to lock the first data file");
1762 
/* Up to 100 further lock attempts, with os_thread_sleep(1000000)
before each one; return as soon as the lock is obtained. */
1763  for (int i = 0; i < 100; i++) {
1764  os_thread_sleep(1000000);
1765 
1766  if (!os_file_lock(file, name)) {
1767  *success = TRUE;
1768  return(file);
1769  }
1770  }
1771 
1772  ib_logf(IB_LOG_LEVEL_INFO,
1773  "Unable to open the first data file");
1774  }
1775 
/* Lock not obtained: give the descriptor back and report failure. */
1776  *success = FALSE;
1777  close(file);
1778  file = -1;
1779  }
1780 #endif /* USE_FILE_LOCK */
1781 
1782 #endif /* __WIN__ */
1783 
1784  return(file);
1785 }
1786 
1787 /***********************************************************************/
/* Deletes a file; a file that does not exist is not treated as an error.
NOTE(review): the line carrying the function name (source line 1792) is
absent from this listing.

name	path of the file to delete

Returns true if the file was deleted or did not exist, false otherwise.
On Windows the delete is retried with a one second sleep between
attempts and abandoned once the attempt count exceeds 2000. */
1790 UNIV_INTERN
1791 bool
1793 /*==========================*/
1794  const char* name)
1796 {
1797 #ifdef __WIN__
1798  bool ret;
1799  ulint count = 0;
1800 loop:
1801  /* In Windows, deleting an .ibd file may fail if ibbackup is copying
1802  it */
1803 
1804  ret = DeleteFile((LPCTSTR) name);
1805 
1806  if (ret) {
1807  return(true);
1808  }
1809 
1810  DWORD lasterr = GetLastError();
1811  if (lasterr == ERROR_FILE_NOT_FOUND
1812  || lasterr == ERROR_PATH_NOT_FOUND) {
1813  /* the file does not exist, this not an error */
1814 
1815  return(true);
1816  }
1817 
1818  count++;
1819 
/* After the first 100 failures, report every tenth failure. */
1820  if (count > 100 && 0 == (count % 10)) {
1821  os_file_get_last_error(true); /* print error information */
1822 
1823  ib_logf(IB_LOG_LEVEL_WARN, "Delete of file %s failed.", name);
1824  }
1825 
1826  os_thread_sleep(1000000); /* sleep for a second */
1827 
1828  if (count > 2000) {
1829 
1830  return(false);
1831  }
1832 
1833  goto loop;
1834 #else
1835  int ret;
1836 
1837  ret = unlink(name);
1838 
/* ENOENT (no such file) counts as success; any other failure is
reported through the non-exiting error handler. */
1839  if (ret != 0 && errno != ENOENT) {
1840  os_file_handle_error_no_exit(name, "delete", FALSE);
1841 
1842  return(false);
1843  }
1844 
1845  return(true);
1846 #endif /* __WIN__ */
1847 }
1848 
1849 /***********************************************************************/
/* Deletes a file. Unlike the "if exists" variant above, a missing file
makes this function return false.
NOTE(review): the line carrying the function name (source line 1854) is
absent from this listing.

name	path of the file to delete

Returns true if the file was deleted, false otherwise.
On Windows the delete is retried with a one second sleep between
attempts and abandoned once the attempt count exceeds 2000; a
file-not-found error ends the loop immediately. */
1852 UNIV_INTERN
1853 bool
1855 /*================*/
1856  const char* name)
1858 {
1859 #ifdef __WIN__
1860  BOOL ret;
1861  ulint count = 0;
1862 loop:
1863  /* In Windows, deleting an .ibd file may fail if ibbackup is copying
1864  it */
1865 
1866  ret = DeleteFile((LPCTSTR) name);
1867 
1868  if (ret) {
1869  return(true);
1870  }
1871 
1872  if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1873  /* If the file does not exist, we classify this as a 'mild'
1874  error and return */
1875 
1876  return(false);
1877  }
1878 
1879  count++;
1880 
/* After the first 100 failures, report every tenth failure. */
1881  if (count > 100 && 0 == (count % 10)) {
1882  os_file_get_last_error(true); /* print error information */
1883 
1884  fprintf(stderr,
1885  "InnoDB: Warning: cannot delete file %s\n"
1886  "InnoDB: Are you running ibbackup"
1887  " to back up the file?\n", name);
1888  }
1889 
1890  os_thread_sleep(1000000); /* sleep for a second */
1891 
1892  if (count > 2000) {
1893 
1894  return(false);
1895  }
1896 
1897  goto loop;
1898 #else
1899  int ret;
1900 
1901  ret = unlink(name);
1902 
/* Every unlink() failure, including a missing file, is reported
through the non-exiting error handler. */
1903  if (ret != 0) {
1904  os_file_handle_error_no_exit(name, "delete", FALSE);
1905 
1906  return(false);
1907  }
1908 
1909  return(true);
1910 #endif
1911 }
1912 
1913 /***********************************************************************/
/* Renames a file.
NOTE(review): the line carrying the function name (source line 1920) is
absent from this listing.

oldpath	current path of the file
newpath	path to rename it to

Returns TRUE on success; on failure the error is reported through
os_file_handle_error_no_exit() (with oldpath as the file name) and FALSE
is returned. */
1918 UNIV_INTERN
1919 ibool
1921 /*================*/
1922  const char* oldpath,
1924  const char* newpath)
1925 {
1926 #ifdef UNIV_DEBUG
/* Debug-build sanity checks on the two paths. */
1927  os_file_type_t type;
1928  ibool exists;
1929 
1930  /* New path must not exist. */
1931  ut_ad(os_file_status(newpath, &exists, &type));
1932  ut_ad(!exists);
1933 
1934  /* Old path must exist. */
1935  ut_ad(os_file_status(oldpath, &exists, &type));
1936  ut_ad(exists);
1937 #endif /* UNIV_DEBUG */
1938 
1939 #ifdef __WIN__
1940  BOOL ret;
1941 
1942  ret = MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath);
1943 
1944  if (ret) {
1945  return(TRUE);
1946  }
1947 
1948  os_file_handle_error_no_exit(oldpath, "rename", FALSE);
1949 
1950  return(FALSE);
1951 #else
1952  int ret;
1953 
1954  ret = rename(oldpath, newpath);
1955 
1956  if (ret != 0) {
1957  os_file_handle_error_no_exit(oldpath, "rename", FALSE);
1958 
1959  return(FALSE);
1960  }
1961 
1962  return(TRUE);
1963 #endif /* __WIN__ */
1964 }
1965 
1966 /***********************************************************************/
/* Closes a file handle; a failure is passed to os_file_handle_error().
NOTE(review): the line carrying the function name (source line 1973) is
absent from this listing.

file	handle to close (asserted non-zero on Windows)

Returns TRUE on success, FALSE if the close failed and the error handler
returned. */
1971 UNIV_INTERN
1972 ibool
1974 /*===============*/
1975  os_file_t file)
1976 {
1977 #ifdef __WIN__
1978  BOOL ret;
1979 
1980  ut_a(file);
1981 
1982  ret = CloseHandle(file);
1983 
1984  if (ret) {
1985  return(TRUE);
1986  }
1987 
/* No file name is available here, hence the NULL name argument. */
1988  os_file_handle_error(NULL, "close");
1989 
1990  return(FALSE);
1991 #else
1992  int ret;
1993 
1994  ret = close(file);
1995 
1996  if (ret == -1) {
/* No file name is available here, hence the NULL name argument. */
1997  os_file_handle_error(NULL, "close");
1998 
1999  return(FALSE);
2000  }
2001 
2002  return(TRUE);
2003 #endif /* __WIN__ */
2004 }
2005 
2006 #ifdef UNIV_HOTBACKUP
2007 /***********************************************************************/
/* Closes a file handle without reporting or handling a failure; the
caller learns of it only from the return value. Compiled only when
UNIV_HOTBACKUP is defined.

file	handle to close (asserted non-zero on Windows)

Returns TRUE if the OS close call succeeded, FALSE otherwise. */
2010 UNIV_INTERN
2011 ibool
2012 os_file_close_no_error_handling(
2013 /*============================*/
2014  os_file_t file)
2015 {
2016 #ifdef __WIN__
2017  BOOL ret;
2018 
2019  ut_a(file);
2020 
2021  ret = CloseHandle(file);
2022 
2023  if (ret) {
2024  return(TRUE);
2025  }
2026 
2027  return(FALSE);
2028 #else
2029  int ret;
2030 
2031  ret = close(file);
2032 
2033  if (ret == -1) {
2034 
2035  return(FALSE);
2036  }
2037 
2038  return(TRUE);
2039 #endif /* __WIN__ */
2040 }
2041 #endif /* UNIV_HOTBACKUP */
2042 
2043 /***********************************************************************/
/* Returns the size of an open file in bytes.
NOTE(review): the return-type and function-name lines (source lines
2047-2048) and the declaration of the local `offset` used in the Windows
branch (source line 2053) are absent from this listing.

file	handle of an open file

Returns the size as os_offset_t. On Windows a GetFileSize() failure
yields (os_offset_t) -1; elsewhere the value is the lseek() result cast
to os_offset_t. */
2046 UNIV_INTERN
2049 /*=============*/
2050  os_file_t file)
2051 {
2052 #ifdef __WIN__
2054  DWORD high;
2055  DWORD low;
2056 
2057  low = GetFileSize(file, &high);
2058 
/* 0xFFFFFFFF is also a possible low word of a valid size, so it only
means failure when GetLastError() reports an error as well. */
2059  if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
2060  return((os_offset_t) -1);
2061  }
2062 
/* Combine the two 32-bit halves into one 64-bit size. */
2063  offset = (os_offset_t) low | ((os_offset_t) high << 32);
2064 
2065  return(offset);
2066 #else
/* The size is obtained by seeking to the end of the file, so the
descriptor's file offset is left there. */
2067  return((os_offset_t) lseek(file, 0, SEEK_END));
2068 #endif /* __WIN__ */
2069 }
2070 
2071 /***********************************************************************/
/* Fills a file with zero bytes from offset 0 up to the requested size,
then flushes it.
NOTE(review): the line carrying the function name (source line 2076) is
absent from this listing.

name	file name, passed through to os_file_write()
file	handle of the open file
size	number of bytes to write

Returns TRUE if every write and the final flush succeeded, FALSE
otherwise. For sizes of 100 MB or more a progress line is printed to
stderr.
NOTE(review): if 0 < size < UNIV_PAGE_SIZE then buf_size is 0 and every
write below is zero-length, so current_size never advances unless
os_file_write() fails on it -- confirm that callers always pass a
multiple of the page size. */
2074 UNIV_INTERN
2075 ibool
2077 /*=============*/
2078  const char* name,
2080  os_file_t file,
2081  os_offset_t size)
2082 {
2083  os_offset_t current_size;
2084  ibool ret;
2085  byte* buf;
2086  byte* buf2;
2087  ulint buf_size;
2088 
2089  current_size = 0;
2090 
2091  /* Write up to 64 pages at a time (1 megabyte when UNIV_PAGE_SIZE
 is 16 kilobytes). */
2092  buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE))
2093  * UNIV_PAGE_SIZE;
/* One extra page is allocated so that an aligned start address can be
found inside the allocation; buf2 keeps the pointer to free. */
2094  buf2 = static_cast<byte*>(ut_malloc(buf_size + UNIV_PAGE_SIZE));
2095 
2096  /* Align the buffer for possible raw i/o */
2097  buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
2098 
2099  /* Write buffer full of zeros */
2100  memset(buf, 0, buf_size);
2101 
2102  if (size >= (os_offset_t) 100 << 20) {
2103 
2104  fprintf(stderr, "InnoDB: Progress in MB:");
2105  }
2106 
2107  while (current_size < size) {
2108  ulint n_bytes;
2109 
/* The last chunk may be shorter than the buffer. */
2110  if (size - current_size < (os_offset_t) buf_size) {
2111  n_bytes = (ulint) (size - current_size);
2112  } else {
2113  n_bytes = buf_size;
2114  }
2115 
2116  ret = os_file_write(name, file, buf, current_size, n_bytes);
2117  if (!ret) {
2118  ut_free(buf2);
2119  goto error_handling;
2120  }
2121 
2122  /* Print about progress for each 100 MB written */
2123  if ((current_size + n_bytes) / (100 << 20)
2124  != current_size / (100 << 20)) {
2125 
2126  fprintf(stderr, " %lu00",
2127  (ulong) ((current_size + n_bytes)
2128  / (100 << 20)));
2129  }
2130 
2131  current_size += n_bytes;
2132  }
2133 
2134  if (size >= (os_offset_t) 100 << 20) {
2135 
2136  fprintf(stderr, "\n");
2137  }
2138 
2139  ut_free(buf2);
2140 
2141  ret = os_file_flush(file);
2142 
2143  if (ret) {
2144  return(TRUE);
2145  }
2146 
2147 error_handling:
2148  return(FALSE);
2149 }
2150 
2151 /***********************************************************************/
/* Truncates a stdio stream's underlying file at the current position.
NOTE(review): the line carrying the function name (source line 2156) is
absent from this listing.

file	stdio stream whose file is truncated

Returns nonzero on success and zero on failure: on Windows the result of
SetEndOfFile(), elsewhere the negated result of ftruncate() called with
the ftell() position. */
2154 UNIV_INTERN
2155 ibool
2157 /*============*/
2158  FILE* file)
2159 {
2160 #ifdef __WIN__
2161  HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
2162  return(SetEndOfFile(h));
2163 #else /* __WIN__ */
2164  return(!ftruncate(fileno(file), ftell(file)));
2165 #endif /* __WIN__ */
2166 }
2167 
2168 #ifndef __WIN__
2169 /***********************************************************************/
/* Wrapper around fsync() that keeps retrying while the call fails with
ENOLCK. There is no upper bound on the number of retries.

file	descriptor to synchronize

Returns the result of the last fsync() call: 0 on success, -1 on a
failure other than ENOLCK (errno is left as set by fsync()). */
2175 static
2176 int
2177 os_file_fsync(
2178 /*==========*/
2179  os_file_t file)
2180 {
2181  int ret;
2182  int failures;
2183  ibool retry;
2184 
2185  failures = 0;
2186 
2187  do {
2188  ret = fsync(file);
2189 
/* Every attempt, including retries, is counted. */
2190  os_n_fsyncs++;
2191 
2192  if (ret == -1 && errno == ENOLCK) {
2193 
/* Report the first ENOLCK failure and every 100th one after it. */
2194  if (failures % 100 == 0) {
2195 
2196  ut_print_timestamp(stderr);
2197  fprintf(stderr,
2198  " InnoDB: fsync(): "
2199  "No locks available; retrying\n");
2200  }
2201 
2202  os_thread_sleep(200000 /* 0.2 sec */);
2203 
2204  failures++;
2205 
2206  retry = TRUE;
2207  } else {
2208 
2209  retry = FALSE;
2210  }
2211  } while (retry);
2212 
2213  return(ret);
2214 }
2215 #endif /* !__WIN__ */
2216 
2217 /***********************************************************************/
/* Flushes the OS write buffers of a file to disk. A failed flush is
treated as fatal, apart from the raw-device exceptions noted below.
NOTE(review): the line carrying the function name (source line 2223) is
absent from this listing.

file	handle of an open file (asserted non-zero on Windows)

Returns TRUE on success, and also when the flush failed with the
"not supported on a raw device" error while srv_start_raw_disk_in_use is
set. Every other failure goes through os_file_handle_error() and then
ut_error, so the trailing return(FALSE) is presumably never reached --
confirm against the definition of ut_error. */
2221 UNIV_INTERN
2222 ibool
2224 /*===============*/
2225  os_file_t file)
2226 {
2227 #ifdef __WIN__
2228  BOOL ret;
2229 
2230  ut_a(file);
2231 
2232  os_n_fsyncs++;
2233 
2234  ret = FlushFileBuffers(file);
2235 
2236  if (ret) {
2237  return(TRUE);
2238  }
2239 
2240  /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
2241  actually a raw device, we choose to ignore that error if we are using
2242  raw disks */
2243 
2244  if (srv_start_raw_disk_in_use && GetLastError()
2245  == ERROR_INVALID_FUNCTION) {
2246  return(TRUE);
2247  }
2248 
2249  os_file_handle_error(NULL, "flush");
2250 
2251  /* It is a fatal error if a file flush does not succeed, because then
2252  the database can get corrupt on disk */
2253  ut_error;
2254 
2255  return(FALSE);
2256 #else
2257  int ret;
2258 
2259 #if defined(HAVE_DARWIN_THREADS)
2260 # ifndef F_FULLFSYNC
2261  /* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
2262 # define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
2263 # elif F_FULLFSYNC != 51
2264 # error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
2265 # endif
2266  /* Apple has disabled fsync() for internal disk drives in OS X. That
2267  caused corruption for a user when he tested a power outage. Let us in
2268  OS X use a nonstandard flush method recommended by an Apple
2269  engineer. */
2270 
2271  if (!srv_have_fullfsync) {
2272  /* If we are not on an operating system that supports this,
2273  then fall back to a plain fsync. */
2274 
2275  ret = os_file_fsync(file);
2276  } else {
2277  ret = fcntl(file, F_FULLFSYNC, NULL);
2278 
2279  if (ret) {
2280  /* If we are not on a file system that supports this,
2281  then fall back to a plain fsync. */
2282  ret = os_file_fsync(file);
2283  }
2284  }
2285 #else
2286  ret = os_file_fsync(file);
2287 #endif
2288 
2289  if (ret == 0) {
2290  return(TRUE);
2291  }
2292 
2293  /* Since Linux returns EINVAL if the 'file' is actually a raw device,
2294  we choose to ignore that error if we are using raw disks */
2295 
2296  if (srv_start_raw_disk_in_use && errno == EINVAL) {
2297 
2298  return(TRUE);
2299  }
2300 
2301  ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed");
2302 
2303  os_file_handle_error(NULL, "flush");
2304 
2305  /* It is a fatal error if a file flush does not succeed, because then
2306  the database can get corrupt on disk */
2307  ut_error;
2308 
2309  return(FALSE);
2310 #endif
2311 }
2312 
2313 #ifndef __WIN__
2314 /*******************************************************************/
2317 static __attribute__((nonnull, warn_unused_result))
2318 ssize_t
2319 os_file_pread(
2320 /*==========*/
2321  os_file_t file,
2322  void* buf,
2323  ulint n,
2325 {
2326  off_t offs;
2327 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2328  ssize_t n_bytes;
2329 #endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */
2330 
2331  ut_ad(n);
2332 
2333  /* If off_t is > 4 bytes in size, then we assume we can pass a
2334  64-bit address */
2335  offs = (off_t) offset;
2336 
2337  if (sizeof(off_t) <= 4) {
2338  if (offset != (os_offset_t) offs) {
2339  ib_logf(IB_LOG_LEVEL_ERROR,
2340  "File read at offset > 4 GB");
2341  }
2342  }
2343 
2344  os_n_file_reads++;
2345 
2346 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2347 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2348  (void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
2349  (void) os_atomic_increment_ulint(&os_file_n_pending_preads, 1);
2350  MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
2351 #else
2352  os_mutex_enter(os_file_count_mutex);
2355  MONITOR_INC(MONITOR_OS_PENDING_READS);
2356  os_mutex_exit(os_file_count_mutex);
2357 #endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD == 8 */
2358 
2359  n_bytes = pread(file, buf, n, offs);
2360 
2361 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2362  (void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
2363  (void) os_atomic_decrement_ulint(&os_file_n_pending_preads, 1);
2364  MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
2365 #else
2366  os_mutex_enter(os_file_count_mutex);
2369  MONITOR_DEC(MONITOR_OS_PENDING_READS);
2370  os_mutex_exit(os_file_count_mutex);
2371 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD == 8 */
2372 
2373  return(n_bytes);
2374 #else
2375  {
2376  off_t ret_offset;
2377  ssize_t ret;
2378 #ifndef UNIV_HOTBACKUP
2379  ulint i;
2380 #endif /* !UNIV_HOTBACKUP */
2381 
2382 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2383  (void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
2384  MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
2385 #else
2386  os_mutex_enter(os_file_count_mutex);
2388  MONITOR_INC(MONITOR_OS_PENDING_READS);
2389  os_mutex_exit(os_file_count_mutex);
2390 #endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD == 8 */
2391 #ifndef UNIV_HOTBACKUP
2392  /* Protect the seek / read operation with a mutex */
2393  i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2394 
2395  os_mutex_enter(os_file_seek_mutexes[i]);
2396 #endif /* !UNIV_HOTBACKUP */
2397 
2398  ret_offset = lseek(file, offs, SEEK_SET);
2399 
2400  if (ret_offset < 0) {
2401  ret = -1;
2402  } else {
2403  ret = read(file, buf, (ssize_t) n);
2404  }
2405 
2406 #ifndef UNIV_HOTBACKUP
2407  os_mutex_exit(os_file_seek_mutexes[i]);
2408 #endif /* !UNIV_HOTBACKUP */
2409 
2410 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2411  (void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
2412  MONITOR_ATOIC_DEC(MONITOR_OS_PENDING_READS);
2413 #else
2414  os_mutex_enter(os_file_count_mutex);
2416  MONITOR_DEC(MONITOR_OS_PENDING_READS);
2417  os_mutex_exit(os_file_count_mutex);
2418 #endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD_SIZE == 8 */
2419 
2420  return(ret);
2421  }
2422 #endif
2423 }
2424 
2425 /*******************************************************************/
/* Writes n bytes to a file at the given offset, either with pwrite() or,
when that path is compiled out, with lseek() followed by write() under
one of the seek mutexes.
NOTE(review): the `offset` parameter line (source line 2435), source
line 2441, and the non-atomic counter updates inside the
os_file_count_mutex sections (source lines 2459-2460, 2473-2474, 2492,
2519) are absent from this listing.

file	descriptor to write to
buf	source buffer
n	number of bytes to write; asserted non-zero in debug builds
offset	file offset to write at (used below as os_offset_t)

Returns the value returned by pwrite()/write(): the number of bytes
written, or -1 on error (also -1 if the lseek() fails in the fallback
path). */
2428 static __attribute__((nonnull, warn_unused_result))
2429 ssize_t
2430 os_file_pwrite(
2431 /*===========*/
2432  os_file_t file,
2433  const void* buf,
2434  ulint n,
2436 {
2437  ssize_t ret;
2438  off_t offs;
2439 
2440  ut_ad(n);
2442 
2443  /* If off_t is > 4 bytes in size, then we assume we can pass a
2444  64-bit address */
2445  offs = (off_t) offset;
2446 
/* With a 32-bit off_t an offset that does not survive the cast is only
logged; the write below still proceeds with the truncated value. */
2447  if (sizeof(off_t) <= 4) {
2448  if (offset != (os_offset_t) offs) {
2449  ib_logf(IB_LOG_LEVEL_ERROR,
2450  "File write at offset > 4 GB.");
2451  }
2452  }
2453 
2454  os_n_file_writes++;
2455 
/* Note that the second half of this condition tests HAVE_BROKEN_PREAD,
the same macro the read path uses. */
2456 #if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
2457 #if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
2458  os_mutex_enter(os_file_count_mutex);
2461  MONITOR_INC(MONITOR_OS_PENDING_WRITES);
2462  os_mutex_exit(os_file_count_mutex);
2463 #else
2464  (void) os_atomic_increment_ulint(&os_n_pending_writes, 1);
2465  (void) os_atomic_increment_ulint(&os_file_n_pending_pwrites, 1);
2466  MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
2467 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */
2468 
2469  ret = pwrite(file, buf, (ssize_t) n, offs);
2470 
2471 #if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
2472  os_mutex_enter(os_file_count_mutex);
2475  MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2476  os_mutex_exit(os_file_count_mutex);
2477 #else
2478  (void) os_atomic_decrement_ulint(&os_n_pending_writes, 1);
2479  (void) os_atomic_decrement_ulint(&os_file_n_pending_pwrites, 1);
2480  MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
2481 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */
2482 
2483  return(ret);
2484 #else
2485  {
2486  off_t ret_offset;
2487 # ifndef UNIV_HOTBACKUP
2488  ulint i;
2489 # endif /* !UNIV_HOTBACKUP */
2490 
2491  os_mutex_enter(os_file_count_mutex);
2493  MONITOR_INC(MONITOR_OS_PENDING_WRITES);
2494  os_mutex_exit(os_file_count_mutex);
2495 
2496 # ifndef UNIV_HOTBACKUP
2497  /* Protect the seek / write operation with a mutex */
/* The mutex is picked by descriptor value modulo the table size. */
2498  i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2499 
2500  os_mutex_enter(os_file_seek_mutexes[i]);
2501 # endif /* !UNIV_HOTBACKUP */
2502 
2503  ret_offset = lseek(file, offs, SEEK_SET);
2504 
2505  if (ret_offset < 0) {
2506  ret = -1;
2507 
2508  goto func_exit;
2509  }
2510 
2511  ret = write(file, buf, (ssize_t) n);
2512 
2513 func_exit:
2514 # ifndef UNIV_HOTBACKUP
2515  os_mutex_exit(os_file_seek_mutexes[i]);
2516 # endif /* !UNIV_HOTBACKUP */
2517 
2518  os_mutex_enter(os_file_count_mutex);
2520  MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2521  os_mutex_exit(os_file_count_mutex);
2522 
2523  return(ret);
2524  }
2525 #endif /* HAVE_PWRITE && !HAVE_BROKEN_PREAD */
2526 }
2527 #endif
2528 
2529 /*******************************************************************/
/* Reads exactly n bytes from a file at the given offset. A failed or
short read is handed to os_file_handle_error(); if that asks for a retry
the read is repeated, otherwise a fatal message is printed and ut_error
is raised.
NOTE(review): the line carrying the function name (source line 2536) and
the pending-read counter updates inside the os_file_count_mutex sections
of the Windows branch (source lines 2570, 2591, 2605) are absent from
this listing.

file	handle of an open file
buf	destination buffer
offset	file offset to read from
n	number of bytes to read (must fit in 32 bits on Windows, asserted)

Returns TRUE if all n bytes were read. The final return(FALSE) follows
ut_error and is presumably never reached -- confirm against the
definition of ut_error. */
2534 UNIV_INTERN
2535 ibool
2537 /*==============*/
2538  os_file_t file,
2539  void* buf,
2540  os_offset_t offset,
2541  ulint n)
2542 {
2543 #ifdef __WIN__
2544  BOOL ret;
2545  DWORD len;
2546  DWORD ret2;
2547  DWORD low;
2548  DWORD high;
2549  ibool retry;
2550 #ifndef UNIV_HOTBACKUP
2551  ulint i;
2552 #endif /* !UNIV_HOTBACKUP */
2553 
2554  /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2555  no more than 32 bits. */
2556  ut_a((n & 0xFFFFFFFFUL) == n);
2557 
/* The statistics are updated before the retry label, so a retried read
is counted once. */
2558  os_n_file_reads++;
2559  os_bytes_read_since_printout += n;
2560 
2561 try_again:
2562  ut_ad(file);
2563  ut_ad(buf);
2564  ut_ad(n > 0);
2565 
/* Split the 64-bit offset into the two halves SetFilePointer() takes. */
2566  low = (DWORD) offset & 0xFFFFFFFF;
2567  high = (DWORD) (offset >> 32);
2568 
2569  os_mutex_enter(os_file_count_mutex);
2571  MONITOR_INC(MONITOR_OS_PENDING_READS);
2572  os_mutex_exit(os_file_count_mutex);
2573 
2574 #ifndef UNIV_HOTBACKUP
2575  /* Protect the seek / read operation with a mutex */
2576  i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2577 
2578  os_mutex_enter(os_file_seek_mutexes[i]);
2579 #endif /* !UNIV_HOTBACKUP */
2580 
2581  ret2 = SetFilePointer(
2582  file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
2583 
/* 0xFFFFFFFF can be a valid low word, so it is a failure only when
GetLastError() reports an error too. */
2584  if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2585 
2586 #ifndef UNIV_HOTBACKUP
2587  os_mutex_exit(os_file_seek_mutexes[i]);
2588 #endif /* !UNIV_HOTBACKUP */
2589 
2590  os_mutex_enter(os_file_count_mutex);
2592  MONITOR_DEC(MONITOR_OS_PENDING_READS);
2593  os_mutex_exit(os_file_count_mutex);
2594 
2595  goto error_handling;
2596  }
2597 
2598  ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2599 
2600 #ifndef UNIV_HOTBACKUP
2601  os_mutex_exit(os_file_seek_mutexes[i]);
2602 #endif /* !UNIV_HOTBACKUP */
2603 
2604  os_mutex_enter(os_file_count_mutex);
2606  MONITOR_DEC(MONITOR_OS_PENDING_READS);
2607  os_mutex_exit(os_file_count_mutex);
2608 
/* Only a complete read counts as success. */
2609  if (ret && len == n) {
2610  return(TRUE);
2611  }
2612 #else /* __WIN__ */
2613  ibool retry;
2614  ssize_t ret;
2615 
2616  os_bytes_read_since_printout += n;
2617 
2618 try_again:
2619  ret = os_file_pread(file, buf, n, offset);
2620 
/* Only a complete read counts as success; an error return of -1 does
not match n after the cast either. */
2621  if ((ulint) ret == n) {
2622 
2623  return(TRUE);
2624  }
2625 
2626  ib_logf(IB_LOG_LEVEL_ERROR,
2627  "Tried to read "ULINTPF" bytes at offset " UINT64PF". "
2628  "Was only able to read %ld.", n, offset, (lint) ret);
2629 #endif /* __WIN__ */
2630 #ifdef __WIN__
2631 error_handling:
2632 #endif
2633  retry = os_file_handle_error(NULL, "read");
2634 
2635  if (retry) {
2636  goto try_again;
2637  }
2638 
2639  fprintf(stderr,
2640  "InnoDB: Fatal error: cannot read from file."
2641  " OS error number %lu.\n",
2642 #ifdef __WIN__
2643  (ulong) GetLastError()
2644 #else
2645  (ulong) errno
2646 #endif /* __WIN__ */
2647  );
2648  fflush(stderr);
2649 
2650  ut_error;
2651 
2652  return(FALSE);
2653 }
2654 
2655 /*******************************************************************/
/* Reads exactly n bytes from a file at the given offset. Unlike the
aborting variant above, a failure is passed to
os_file_handle_error_no_exit(); if that does not ask for a retry the
function simply returns FALSE.
NOTE(review): the line carrying the function name (source line 2663) and
the pending-read counter updates inside the os_file_count_mutex sections
of the Windows branch (source lines 2697, 2718, 2732) are absent from
this listing.

file	handle of an open file
buf	destination buffer
offset	file offset to read from
n	number of bytes to read (must fit in 32 bits on Windows, asserted)

Returns TRUE if all n bytes were read, FALSE otherwise. */
2661 UNIV_INTERN
2662 ibool
2664 /*================================*/
2665  os_file_t file,
2666  void* buf,
2667  os_offset_t offset,
2668  ulint n)
2669 {
2670 #ifdef __WIN__
2671  BOOL ret;
2672  DWORD len;
2673  DWORD ret2;
2674  DWORD low;
2675  DWORD high;
2676  ibool retry;
2677 #ifndef UNIV_HOTBACKUP
2678  ulint i;
2679 #endif /* !UNIV_HOTBACKUP */
2680 
2681  /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2682  no more than 32 bits. */
2683  ut_a((n & 0xFFFFFFFFUL) == n);
2684 
/* The statistics are updated before the retry label, so a retried read
is counted once. */
2685  os_n_file_reads++;
2686  os_bytes_read_since_printout += n;
2687 
2688 try_again:
2689  ut_ad(file);
2690  ut_ad(buf);
2691  ut_ad(n > 0);
2692 
/* Split the 64-bit offset into the two halves SetFilePointer() takes. */
2693  low = (DWORD) offset & 0xFFFFFFFF;
2694  high = (DWORD) (offset >> 32);
2695 
2696  os_mutex_enter(os_file_count_mutex);
2698  MONITOR_INC(MONITOR_OS_PENDING_READS);
2699  os_mutex_exit(os_file_count_mutex);
2700 
2701 #ifndef UNIV_HOTBACKUP
2702  /* Protect the seek / read operation with a mutex */
2703  i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2704 
2705  os_mutex_enter(os_file_seek_mutexes[i]);
2706 #endif /* !UNIV_HOTBACKUP */
2707 
2708  ret2 = SetFilePointer(
2709  file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
2710 
/* 0xFFFFFFFF can be a valid low word, so it is a failure only when
GetLastError() reports an error too. */
2711  if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2712 
2713 #ifndef UNIV_HOTBACKUP
2714  os_mutex_exit(os_file_seek_mutexes[i]);
2715 #endif /* !UNIV_HOTBACKUP */
2716 
2717  os_mutex_enter(os_file_count_mutex);
2719  MONITOR_DEC(MONITOR_OS_PENDING_READS);
2720  os_mutex_exit(os_file_count_mutex);
2721 
2722  goto error_handling;
2723  }
2724 
2725  ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2726 
2727 #ifndef UNIV_HOTBACKUP
2728  os_mutex_exit(os_file_seek_mutexes[i]);
2729 #endif /* !UNIV_HOTBACKUP */
2730 
2731  os_mutex_enter(os_file_count_mutex);
2733  MONITOR_DEC(MONITOR_OS_PENDING_READS);
2734  os_mutex_exit(os_file_count_mutex);
2735 
/* Only a complete read counts as success. */
2736  if (ret && len == n) {
2737  return(TRUE);
2738  }
2739 #else /* __WIN__ */
2740  ibool retry;
2741  ssize_t ret;
2742 
2743  os_bytes_read_since_printout += n;
2744 
2745 try_again:
2746  ret = os_file_pread(file, buf, n, offset);
2747 
/* Only a complete read counts as success; an error return of -1 does
not match n after the cast either. */
2748  if ((ulint) ret == n) {
2749 
2750  return(TRUE);
2751  }
2752 #endif /* __WIN__ */
2753 #ifdef __WIN__
2754 error_handling:
2755 #endif
2756  retry = os_file_handle_error_no_exit(NULL, "read", FALSE);
2757 
2758  if (retry) {
2759  goto try_again;
2760  }
2761 
2762  return(FALSE);
2763 }
2764 
2765 /*******************************************************************/
/* Rewinds file and reads at most size - 1 bytes from its beginning into
str, then NUL-terminates str after the last byte read. Does nothing, and
leaves str untouched, when size is 0.
NOTE(review): the declarator line that carries the function name is absent
from this listing. */
2769 UNIV_INTERN
2770 void
2772 /*================*/
2773  FILE* file, /*!< in: stream to read from; it is rewound first */
2774  char* str, /*!< out: buffer that receives the text */
2775  ulint size) /*!< in: size of str in bytes, terminator included */
2776 {
2777  size_t flen;
2778 
2779  if (size == 0) {
2780  return;
2781  }
2782 
2783  rewind(file);
 /* One byte of str is held back for the terminator written below. */
2784  flen = fread(str, 1, size - 1, file);
2785  str[flen] = '\0';
2786 }
2787 
2788 /*******************************************************************/
/* Writes n bytes from buf to file, starting at byte offset `offset`.
name is used only in the diagnostics printed on failure.
Windows branch: seeks with SetFilePointer() and writes with WriteFile();
while GetLastError() reports ERROR_LOCK_VIOLATION the operation is repeated,
at most 100 times, with os_thread_sleep(1000000) between attempts.
Other platforms: a single os_file_pwrite() call, no retry.
The detailed failure report is printed only while os_has_said_disk_full is
unset, and the flag is set right after the first report.
NOTE(review): the declarator line that carries the function name is absent
from this listing, and the embedded line numbers skip lines inside the body
(2798, 2804, 2832, 2853, 2880), so statements may be missing there.
@return TRUE if exactly n bytes were written, FALSE otherwise */
2793 UNIV_INTERN
2794 ibool
2796 /*===============*/
2797  const char* name, /*!< in: file name, used in error messages only */
2799  os_file_t file, /*!< in: handle of the file to write to */
2800  const void* buf, /*!< in: data to write, at least n bytes */
2801  os_offset_t offset, /*!< in: byte offset at which to start writing */
2802  ulint n) /*!< in: number of bytes to write */
2803 {
2805 
2806 #ifdef __WIN__
2807  BOOL ret;
2808  DWORD len;
2809  DWORD ret2;
2810  DWORD low;
2811  DWORD high;
2812  ulint n_retries = 0;
2813  ulint err;
2814 #ifndef UNIV_HOTBACKUP
2815  ulint i;
2816 #endif /* !UNIV_HOTBACKUP */
2817 
2818  /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2819  no more than 32 bits. */
2820  ut_a((n & 0xFFFFFFFFUL) == n);
2821 
2822  os_n_file_writes++;
2823 
2824  ut_ad(file);
2825  ut_ad(buf);
2826  ut_ad(n > 0);
2827 retry:
 /* SetFilePointer() takes the 64-bit offset as two 32-bit halves. */
2828  low = (DWORD) offset & 0xFFFFFFFF;
2829  high = (DWORD) (offset >> 32);
2830 
2831  os_mutex_enter(os_file_count_mutex);
2833  MONITOR_INC(MONITOR_OS_PENDING_WRITES);
2834  os_mutex_exit(os_file_count_mutex);
2835 
2836 #ifndef UNIV_HOTBACKUP
2837  /* Protect the seek / write operation with a mutex */
2838  i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2839 
2840  os_mutex_enter(os_file_seek_mutexes[i]);
2841 #endif /* !UNIV_HOTBACKUP */
2842 
2843  ret2 = SetFilePointer(
2844  file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
2845 
 /* 0xFFFFFFFF can also be a legitimate low half of the new position,
 so the failure is confirmed through GetLastError(). */
2846  if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2847 
2848 #ifndef UNIV_HOTBACKUP
2849  os_mutex_exit(os_file_seek_mutexes[i]);
2850 #endif /* !UNIV_HOTBACKUP */
2851 
2852  os_mutex_enter(os_file_count_mutex);
2854  MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2855  os_mutex_exit(os_file_count_mutex);
2856 
2857  ut_print_timestamp(stderr);
2858 
 /* NOTE(review): GetLastError() is read again here, after several
 intervening calls; confirm none of them can change the thread's
 last-error value before it is printed. */
2859  fprintf(stderr,
2860  " InnoDB: Error: File pointer positioning to"
2861  " file %s failed at\n"
2862  "InnoDB: offset %llu. Operating system"
2863  " error number %lu.\n"
2864  "InnoDB: Some operating system error numbers"
2865  " are described at\n"
2866  "InnoDB: "
2867  REFMAN "operating-system-error-codes.html\n",
2868  name, offset, (ulong) GetLastError());
2869 
2870  return(FALSE);
2871  }
2872 
2873  ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
2874 
2875 #ifndef UNIV_HOTBACKUP
2876  os_mutex_exit(os_file_seek_mutexes[i]);
2877 #endif /* !UNIV_HOTBACKUP */
2878 
2879  os_mutex_enter(os_file_count_mutex);
2881  MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2882  os_mutex_exit(os_file_count_mutex);
2883 
 /* A short write counts as a failure, just like a failed call. */
2884  if (ret && len == n) {
2885 
2886  return(TRUE);
2887  }
2888 
2889  /* If some background file system backup tool is running, then, at
2890  least in Windows 2000, we may get here a specific error. Let us
2891  retry the operation 100 times, with 1 second waits. */
2892 
2893  if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
2894 
2895  os_thread_sleep(1000000);
2896 
2897  n_retries++;
2898 
2899  goto retry;
2900  }
2901 
2902  if (!os_has_said_disk_full) {
2903 
2904  err = (ulint) GetLastError();
2905 
2906  ut_print_timestamp(stderr);
2907 
2908  fprintf(stderr,
2909  " InnoDB: Error: Write to file %s failed"
2910  " at offset %llu.\n"
2911  "InnoDB: %lu bytes should have been written,"
2912  " only %lu were written.\n"
2913  "InnoDB: Operating system error number %lu.\n"
2914  "InnoDB: Check that your OS and file system"
2915  " support files of this size.\n"
2916  "InnoDB: Check also that the disk is not full"
2917  " or a disk quota exceeded.\n",
2918  name, offset,
2919  (ulong) n, (ulong) len, (ulong) err);
2920 
 /* NOTE(review): err holds a GetLastError() code, while strerror()
 interprets errno values; the text printed below may not describe
 the actual failure. */
2921  if (strerror((int) err) != NULL) {
2922  fprintf(stderr,
2923  "InnoDB: Error number %lu means '%s'.\n",
2924  (ulong) err, strerror((int) err));
2925  }
2926 
2927  fprintf(stderr,
2928  "InnoDB: Some operating system error numbers"
2929  " are described at\n"
2930  "InnoDB: "
2931  REFMAN "operating-system-error-codes.html\n");
2932 
2933  os_has_said_disk_full = TRUE;
2934  }
2935 
2936  return(FALSE);
2937 #else
2938  ssize_t ret;
2939 
2940  ret = os_file_pwrite(file, buf, n, offset);
2941 
2942  if ((ulint) ret == n) {
2943 
2944  return(TRUE);
2945  }
2946 
2947  if (!os_has_said_disk_full) {
2948 
2949  ut_print_timestamp(stderr);
2950 
 /* NOTE(review): errno is read here and again below, after
 ut_print_timestamp() and fprintf() have run; confirm it still
 holds the value left by the failed write. */
2951  fprintf(stderr,
2952  " InnoDB: Error: Write to file %s failed"
2953  " at offset "UINT64PF".\n"
2954  "InnoDB: %lu bytes should have been written,"
2955  " only %ld were written.\n"
2956  "InnoDB: Operating system error number %lu.\n"
2957  "InnoDB: Check that your OS and file system"
2958  " support files of this size.\n"
2959  "InnoDB: Check also that the disk is not full"
2960  " or a disk quota exceeded.\n",
2961  name, offset, n, (lint) ret,
2962  (ulint) errno);
2963  if (strerror(errno) != NULL) {
2964  fprintf(stderr,
2965  "InnoDB: Error number %d means '%s'.\n",
2966  errno, strerror(errno));
2967  }
2968 
2969  fprintf(stderr,
2970  "InnoDB: Some operating system error numbers"
2971  " are described at\n"
2972  "InnoDB: "
2973  REFMAN "operating-system-error-codes.html\n");
2974 
2975  os_has_said_disk_full = TRUE;
2976  }
2977 
2978  return(FALSE);
2979 #endif
2980 }
2981 
2982 /*******************************************************************/
/* Checks whether path exists and, if it does, what kind of object it is.
*exists is written on every TRUE return; *type is written only when the
path exists. Neither is written when FALSE is returned.
NOTE(review): the declarator line that carries the function name is absent
from this listing; the call os_file_status(subdir, &subdir_exists, &type)
further down in this file matches this signature.
@return TRUE if the stat call succeeded or reported ENOENT/ENOTDIR,
FALSE if it failed for any other reason */
2985 UNIV_INTERN
2986 ibool
2988 /*===========*/
2989  const char* path, /*!< in: pathname to examine */
2990  ibool* exists, /*!< out: TRUE if the path exists */
2991  os_file_type_t* type) /*!< out: type of the object, set only if it
 exists */
2992 {
2993 #ifdef __WIN__
2994  int ret;
2995  struct _stat64 statinfo;
2996 
2997  ret = _stat64(path, &statinfo);
2998  if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2999  /* file does not exist */
3000  *exists = FALSE;
3001  return(TRUE);
3002  } else if (ret) {
3003  /* file exists, but stat call failed */
3004 
3005  os_file_handle_error_no_exit(path, "stat", FALSE);
3006 
3007  return(FALSE);
3008  }
3009 
3010  if (_S_IFDIR & statinfo.st_mode) {
3011  *type = OS_FILE_TYPE_DIR;
3012  } else if (_S_IFREG & statinfo.st_mode) {
3013  *type = OS_FILE_TYPE_FILE;
3014  } else {
3015  *type = OS_FILE_TYPE_UNKNOWN;
3016  }
3017 
3018  *exists = TRUE;
3019 
3020  return(TRUE);
3021 #else
3022  int ret;
3023  struct stat statinfo;
3024 
3025  ret = stat(path, &statinfo);
3026  if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3027  /* file does not exist */
3028  *exists = FALSE;
3029  return(TRUE);
3030  } else if (ret) {
3031  /* file exists, but stat call failed */
3032 
3033  os_file_handle_error_no_exit(path, "stat", FALSE);
3034 
3035  return(FALSE);
3036  }
3037 
 /* NOTE(review): stat() follows symbolic links, so the S_ISLNK
 branch below is presumably never taken; lstat() would be needed to
 report links - confirm the intent. */
3038  if (S_ISDIR(statinfo.st_mode)) {
3039  *type = OS_FILE_TYPE_DIR;
3040  } else if (S_ISLNK(statinfo.st_mode)) {
3041  *type = OS_FILE_TYPE_LINK;
3042  } else if (S_ISREG(statinfo.st_mode)) {
3043  *type = OS_FILE_TYPE_FILE;
3044  } else {
3045  *type = OS_FILE_TYPE_UNKNOWN;
3046  }
3047 
3048  *exists = TRUE;
3049 
3050  return(TRUE);
3051 #endif
3052 }
3053 
3054 /*******************************************************************/
/* Stats path and fills stat_info with the object's type and its ctime,
atime, mtime and size. When check_rw_perm is true and the path is a regular
file, the file is additionally opened (read-write, or read-only when
srv_read_only_mode is set) and stat_info->rw_perm records whether that open
succeeded. rw_perm is not written in any other case.
NOTE(review): the declarator line that carries the function name is absent
from this listing.
@return DB_NOT_FOUND if the stat call reported ENOENT/ENOTDIR, DB_FAIL if it
failed for another reason, DB_SUCCESS otherwise */
3057 UNIV_INTERN
3058 dberr_t
3060 /*===============*/
3061  const char* path, /*!< in: pathname of the file */
3062  os_file_stat_t* stat_info, /*!< out: information about the file */
3064  bool check_rw_perm) /*!< in: if true, also probe whether a regular
 file can be opened and report it in rw_perm */
3066 {
3067  int ret;
3068 
3069 #ifdef __WIN__
3070  struct _stat64 statinfo;
3071 
3072  ret = _stat64(path, &statinfo);
3073 
3074  if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3075  /* file does not exist */
3076 
3077  return(DB_NOT_FOUND);
3078 
3079  } else if (ret) {
3080  /* file exists, but stat call failed */
3081 
3082  os_file_handle_error_no_exit(path, "stat", FALSE);
3083 
3084  return(DB_FAIL);
3085 
3086  } else if (_S_IFDIR & statinfo.st_mode) {
3087  stat_info->type = OS_FILE_TYPE_DIR;
3088  } else if (_S_IFREG & statinfo.st_mode) {
3089 
3090  DWORD access = GENERIC_READ;
3091 
3092  if (!srv_read_only_mode) {
3093  access |= GENERIC_WRITE;
3094  }
3095 
3096  stat_info->type = OS_FILE_TYPE_FILE;
3097 
3098  /* Check if we can open it with the access mode computed above
 (read-write unless srv_read_only_mode is set). */
3099 
3100  if (check_rw_perm) {
3101  HANDLE fh;
3102 
3103  fh = CreateFile(
3104  (LPCTSTR) path, // File to open
3105  access,
3106  0, // No sharing
3107  NULL, // Default security
3108  OPEN_EXISTING, // Existing file only
3109  FILE_ATTRIBUTE_NORMAL, // Normal file
3110  NULL); // No attr. template
3111 
3112  if (fh == INVALID_HANDLE_VALUE) {
3113  stat_info->rw_perm = false;
3114  } else {
3115  stat_info->rw_perm = true;
3116  CloseHandle(fh);
3117  }
3118  }
3119  } else {
3120  stat_info->type = OS_FILE_TYPE_UNKNOWN;
3121  }
3122 #else
3123  struct stat statinfo;
3124 
3125  ret = stat(path, &statinfo);
3126 
3127  if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3128  /* file does not exist */
3129 
3130  return(DB_NOT_FOUND);
3131 
3132  } else if (ret) {
3133  /* file exists, but stat call failed */
3134 
3135  os_file_handle_error_no_exit(path, "stat", FALSE);
3136 
3137  return(DB_FAIL);
3138 
3139  } else if (S_ISDIR(statinfo.st_mode)) {
3140  stat_info->type = OS_FILE_TYPE_DIR;
 /* NOTE(review): stat() follows symbolic links, so this S_ISLNK
 branch is presumably never taken; confirm whether lstat() was
 intended. */
3141  } else if (S_ISLNK(statinfo.st_mode)) {
3142  stat_info->type = OS_FILE_TYPE_LINK;
3143  } else if (S_ISREG(statinfo.st_mode)) {
3144  stat_info->type = OS_FILE_TYPE_FILE;
3145 
3146  if (check_rw_perm) {
3147  int fh;
3148  int access;
3149 
3150  access = !srv_read_only_mode ? O_RDWR : O_RDONLY;
3151 
 /* The file is opened only as a permission probe and is
 closed again immediately. */
3152  fh = ::open(path, access, os_innodb_umask);
3153 
3154  if (fh == -1) {
3155  stat_info->rw_perm = false;
3156  } else {
3157  stat_info->rw_perm = true;
3158  close(fh);
3159  }
3160  }
3161  } else {
3162  stat_info->type = OS_FILE_TYPE_UNKNOWN;
3163  }
3164 
3165 #endif /* _WIN_ */
3166 
 /* Common to both platform branches: copy the timestamps and the size
 out of the stat buffer. */
3167  stat_info->ctime = statinfo.st_ctime;
3168  stat_info->atime = statinfo.st_atime;
3169  stat_info->mtime = statinfo.st_mtime;
3170  stat_info->size = statinfo.st_size;
3171 
3172  return(DB_SUCCESS);
3173 }
3174 
3175 /* Path name separator character used by the path helpers in this file:
a backslash when __WIN__ is defined, a forward slash otherwise. */
3176 #ifdef __WIN__
3177 # define OS_FILE_PATH_SEPARATOR '\\'
3178 #else
3179 # define OS_FILE_PATH_SEPARATOR '/'
3180 #endif
3181 
3182 /****************************************************************/
/* Builds a new path in freshly allocated memory (mem_alloc): the directory
part of old_path, i.e. everything before its last OS_FILE_PATH_SEPARATOR or
all of old_path when it has none, followed by a separator, the part of
tablename after its last '/', and the suffix ".ibd".
NOTE(review): the declarator line that carries the function name is absent
from this listing. The result is presumably owned by the caller - confirm.
@return the newly allocated, NUL-terminated path */
3193 UNIV_INTERN
3194 char*
3196 /*======================*/
3197  const char* old_path, /*!< in: path whose directory part is reused */
3198  const char* tablename) /*!< in: table name, optionally prefixed with
 "database/" */
3199 {
3200  ulint dir_len;
3201  char* last_slash;
3202  char* base_name;
3203  char* new_path;
3204  ulint new_path_len;
3205 
3206  /* Split the tablename into its database and table name components.
3207  They are separated by a '/'. */
3208  last_slash = strrchr((char*) tablename, '/');
3209  base_name = last_slash ? last_slash + 1 : (char*) tablename;
3210 
3211  /* Find the offset of the last slash. We will strip off the
3212  old basename.ibd which starts after that slash. */
3213  last_slash = strrchr((char*) old_path, OS_FILE_PATH_SEPARATOR);
3214  dir_len = last_slash ? last_slash - old_path : strlen(old_path);
3215 
3216  /* allocate a new path and move the old directory path to it. */
 /* sizeof "/.ibd" accounts for the separator, the ".ibd" suffix and the
 terminating NUL. */
3217  new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
3218  new_path = static_cast<char*>(mem_alloc(new_path_len));
3219  memcpy(new_path, old_path, dir_len);
3220 
3221  ut_snprintf(new_path + dir_len,
3222  new_path_len - dir_len,
3223  "%c%s.ibd",
3224  OS_FILE_PATH_SEPARATOR,
3225  base_name);
3226 
3227  return(new_path);
3228 }
3229 
3230 /****************************************************************/
/* Builds a new path in freshly allocated memory (mem_alloc): the part of
data_dir_path before its last OS_FILE_PATH_SEPARATOR (or all of it when it
has none), followed by a separator, tablename, a '.' and extention. The
result is passed through srv_normalize_path_for_win() before it is returned.
NOTE(review): the declarator line that carries the function name is absent
from this listing. The result is presumably owned by the caller - confirm.
@return the newly allocated, NUL-terminated path */
3243 UNIV_INTERN
3244 char*
3246 /*=========================*/
3247  const char* data_dir_path, /*!< in: path whose leading part is reused */
3248  const char* tablename, /*!< in: table name, copied as is */
3249  const char* extention) /*!< in: file extension without the dot;
 debug builds assert it is 3 characters */
3250 {
3251  ulint data_dir_len;
3252  char* last_slash;
3253  char* new_path;
3254  ulint new_path_len;
3255 
3256  ut_ad(extention && strlen(extention) == 3);
3257 
3258  /* Find the offset of the last slash. We will strip off the
3259  old basename or tablename which starts after that slash. */
3260  last_slash = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
3261  data_dir_len = last_slash ? last_slash - data_dir_path : strlen(data_dir_path);
3262 
3263  /* allocate a new path and move the old directory path to it. */
 /* sizeof "/." accounts for the separator, the dot and the terminating
 NUL. */
3264  new_path_len = data_dir_len + strlen(tablename)
3265  + sizeof "/." + strlen(extention);
3266  new_path = static_cast<char*>(mem_alloc(new_path_len));
3267  memcpy(new_path, data_dir_path, data_dir_len);
3268  ut_snprintf(new_path + data_dir_len,
3269  new_path_len - data_dir_len,
3270  "%c%s.%s",
3271  OS_FILE_PATH_SEPARATOR,
3272  tablename,
3273  extention);
3274 
3275  srv_normalize_path_for_win(new_path);
3276 
3277  return(new_path);
3278 }
3279 
3280 /****************************************************************/
/* Rewrites data_dir_path in place: the text from the last '.' onwards is
cut off, and the path component directly above the final component is then
overwritten with that final component, so "dir/db/table.ext" becomes
"dir/table".
The function returns early, leaving the string as modified so far, when no
'.' is found (string untouched), when no separator is found after cutting the
extension, or when only one separator exists (the string is then cut at it).
NOTE(review): the declarator line that carries the function name is absent
from this listing. */
3292 UNIV_INTERN
3293 void
3295 /*========================*/
3296  char* data_dir_path) /*!< in/out: full path, modified in place */
3297 {
3298  char* ptr;
3299  char* tablename;
3300  ulint tablename_len;
3301 
3302  /* Replace the period before the extension with a null byte. */
3303  ptr = strrchr((char*) data_dir_path, '.');
3304  if (!ptr) {
3305  return;
3306  }
3307  ptr[0] = '\0';
3308 
3309  /* The tablename starts after the last slash. */
3310  ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
3311  if (!ptr) {
3312  return;
3313  }
3314  ptr[0] = '\0';
3315  tablename = ptr + 1;
3316 
3317  /* The databasename starts after the next to last slash. */
3318  ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
3319  if (!ptr) {
3320  return;
3321  }
3322  tablename_len = ut_strlen(tablename);
3323 
 /* Source and destination lie in the same buffer, hence memmove. */
3324  ut_memmove(++ptr, tablename, tablename_len);
3325 
3326  ptr[tablename_len] = '\0';
3327 }
3328 
3329 /****************************************************************/
/* Returns a newly allocated copy of the directory component of path:
"." when path contains no OS_FILE_PATH_SEPARATOR, "/" when the last
separator is the first character of path, and otherwise the text that
precedes the last separator.
NOTE(review): the declarator line that carries the function name is absent
from this listing; the call os_file_dirname(path) further down in this file
matches this signature. Note also that the root case returns the literal "/"
even where OS_FILE_PATH_SEPARATOR is a backslash.
@return newly allocated string (mem_strdup / mem_strdupl) */
3357 UNIV_INTERN
3358 char*
3360 /*============*/
3361  const char* path) /*!< in: pathname */
3362 {
3363  /* Find the offset of the last slash */
3364  const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
3365  if (!last_slash) {
3366  /* No slash in the path, return "." */
3367 
3368  return(mem_strdup("."));
3369  }
3370 
3371  /* Ok, there is a slash */
3372 
3373  if (last_slash == path) {
3374  /* last slash is the first char of the path */
3375 
3376  return(mem_strdup("/"));
3377  }
3378 
3379  /* Non-trivial directory component */
3380 
3381  return(mem_strdupl(path, last_slash - path));
3382 }
3383 
3384 /****************************************************************/
/* Makes sure that every directory leading to path exists, creating the
missing ones from the outermost inwards by recursing on the directory part
of path. Refuses to do anything, and logs an error, when srv_read_only_mode
is set.
NOTE(review): the declarator line that carries the function name is absent
from this listing; the recursive call below names it
os_file_create_subdirs_if_needed.
@return TRUE if the directory part is "." or a lone separator, if it already
exists, or if it was created; FALSE otherwise */
3387 UNIV_INTERN
3388 ibool
3390 /*=============================*/
3391  const char* path) /*!< in: path of a file whose parent directories
 are needed */
3392 {
3393  if (srv_read_only_mode) {
3394 
3395  ib_logf(IB_LOG_LEVEL_ERROR,
3396  "read only mode set. Can't create subdirectories '%s'",
3397  path);
3398 
3399  return(FALSE);
3400 
3401  }
3402 
 /* subdir is heap-allocated and freed on every path below. */
3403  char* subdir = os_file_dirname(path);
3404 
3405  if (strlen(subdir) == 1
3406  && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
3407  /* subdir is root or cwd, nothing to do */
3408  mem_free(subdir);
3409 
3410  return(TRUE);
3411  }
3412 
3413  /* Test if subdir exists */
3414  os_file_type_t type;
3415  ibool subdir_exists;
3416  ibool success = os_file_status(subdir, &subdir_exists, &type);
3417 
3418  if (success && !subdir_exists) {
3419 
3420  /* subdir does not exist, create it */
 /* The parents of subdir are created first, by recursion. */
3421  success = os_file_create_subdirs_if_needed(subdir);
3422 
3423  if (!success) {
3424  mem_free(subdir);
3425 
3426  return(FALSE);
3427  }
3428 
3429  success = os_file_create_directory(subdir, FALSE);
3430  }
3431 
3432  mem_free(subdir);
3433 
3434  return(success);
3435 }
3436 
3437 #ifndef UNIV_HOTBACKUP
3438 /****************************************************************/
3441 static
3443 os_aio_array_get_nth_slot(
3444 /*======================*/
3445  os_aio_array_t* array,
3446  ulint index)
3447 {
3448  ut_a(index < array->n_slots);
3449 
3450  return(&array->slots[index]);
3451 }
3452 
3453 #if defined(LINUX_NATIVE_AIO)
3454 /******************************************************************/
3457 static
3458 ibool
3459 os_aio_linux_create_io_ctx(
3460 /*=======================*/
3461  ulint max_events,
3462  io_context_t* io_ctx)
3463 {
3464  int ret;
3465  ulint retries = 0;
3466 
3467 retry:
3468  memset(io_ctx, 0x0, sizeof(*io_ctx));
3469 
3470  /* Initialize the io_ctx. Tell it how many pending
3471  IO requests this context will handle. */
3472 
3473  ret = io_setup(max_events, io_ctx);
3474  if (ret == 0) {
3475 #if defined(UNIV_AIO_DEBUG)
3476  fprintf(stderr,
3477  "InnoDB: Linux native AIO:"
3478  " initialized io_ctx for segment\n");
3479 #endif
3480  /* Success. Return now. */
3481  return(TRUE);
3482  }
3483 
3484  /* If we hit EAGAIN we'll make a few attempts before failing. */
3485 
3486  switch (ret) {
3487  case -EAGAIN:
3488  if (retries == 0) {
3489  /* First time around. */
3490  ut_print_timestamp(stderr);
3491  fprintf(stderr,
3492  " InnoDB: Warning: io_setup() failed"
3493  " with EAGAIN. Will make %d attempts"
3494  " before giving up.\n",
3495  OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3496  }
3497 
3498  if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
3499  ++retries;
3500  fprintf(stderr,
3501  "InnoDB: Warning: io_setup() attempt"
3502  " %lu failed.\n",
3503  retries);
3504  os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
3505  goto retry;
3506  }
3507 
3508  /* Have tried enough. Better call it a day. */
3509  ut_print_timestamp(stderr);
3510  fprintf(stderr,
3511  " InnoDB: Error: io_setup() failed"
3512  " with EAGAIN after %d attempts.\n",
3513  OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3514  break;
3515 
3516  case -ENOSYS:
3517  ut_print_timestamp(stderr);
3518  fprintf(stderr,
3519  " InnoDB: Error: Linux Native AIO interface"
3520  " is not supported on this platform. Please"
3521  " check your OS documentation and install"
3522  " appropriate binary of InnoDB.\n");
3523 
3524  break;
3525 
3526  default:
3527  ut_print_timestamp(stderr);
3528  fprintf(stderr,
3529  " InnoDB: Error: Linux Native AIO setup"
3530  " returned following error[%d]\n", -ret);
3531  break;
3532  }
3533 
3534  fprintf(stderr,
3535  "InnoDB: You can disable Linux Native AIO by"
3536  " setting innodb_use_native_aio = 0 in my.cnf\n");
3537  return(FALSE);
3538 }
3539 
3540 /******************************************************************/
3546 static
3547 ibool
3548 os_aio_native_aio_supported(void)
3549 /*=============================*/
3550 {
3551  int fd;
3552  io_context_t io_ctx;
3553  char name[1000];
3554 
3555  if (!os_aio_linux_create_io_ctx(1, &io_ctx)) {
3556  /* The platform does not support native aio. */
3557  return(FALSE);
3558  } else if (!srv_read_only_mode) {
3559  /* Now check if tmpdir supports native aio ops. */
3560  fd = innobase_mysql_tmpfile();
3561 
3562  if (fd < 0) {
3563  ib_logf(IB_LOG_LEVEL_WARN,
3564  "Unable to create temp file to check "
3565  "native AIO support.");
3566 
3567  return(FALSE);
3568  }
3569  } else {
3570 
3571  srv_normalize_path_for_win(srv_log_group_home_dir);
3572 
3573  ulint dirnamelen = strlen(srv_log_group_home_dir);
3574  ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
3575  memcpy(name, srv_log_group_home_dir, dirnamelen);
3576 
3577  /* Add a path separator if needed. */
3578  if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
3579  name[dirnamelen++] = SRV_PATH_SEPARATOR;
3580  }
3581 
3582  strcpy(name + dirnamelen, "ib_logfile0");
3583 
3584  fd = ::open(name, O_RDONLY);
3585 
3586  if (fd == -1) {
3587 
3588  ib_logf(IB_LOG_LEVEL_WARN,
3589  "Unable to open \"%s\" to check "
3590  "native AIO read support.", name);
3591 
3592  return(FALSE);
3593  }
3594  }
3595 
3596  struct io_event io_event;
3597 
3598  memset(&io_event, 0x0, sizeof(io_event));
3599 
3600  byte* buf = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2));
3601  byte* ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
3602 
3603  struct iocb iocb;
3604 
3605  /* Suppress valgrind warning. */
3606  memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
3607  memset(&iocb, 0x0, sizeof(iocb));
3608 
3609  struct iocb* p_iocb = &iocb;
3610 
3611  if (!srv_read_only_mode) {
3612  io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
3613  } else {
3614  ut_a(UNIV_PAGE_SIZE >= 512);
3615  io_prep_pread(p_iocb, fd, ptr, 512, 0);
3616  }
3617 
3618  int err = io_submit(io_ctx, 1, &p_iocb);
3619 
3620  if (err >= 1) {
3621  /* Now collect the submitted IO request. */
3622  err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
3623  }
3624 
3625  ut_free(buf);
3626  close(fd);
3627 
3628  switch (err) {
3629  case 1:
3630  return(TRUE);
3631 
3632  case -EINVAL:
3633  case -ENOSYS:
3634  ib_logf(IB_LOG_LEVEL_ERROR,
3635  "Linux Native AIO not supported. You can either "
3636  "move %s to a file system that supports native "
3637  "AIO or you can set innodb_use_native_aio to "
3638  "FALSE to avoid this message.",
3639  srv_read_only_mode ? name : "tmpdir");
3640 
3641  /* fall through. */
3642  default:
3643  ib_logf(IB_LOG_LEVEL_ERROR,
3644  "Linux Native AIO check on %s returned error[%d]",
3645  srv_read_only_mode ? name : "tmpdir", -err);
3646  }
3647 
3648  return(FALSE);
3649 }
3650 #endif /* LINUX_NATIVE_AIO */
3651 
3652 /******************************************************************/
3657 static
3659 os_aio_array_create(
3660 /*================*/
3661  ulint n,
3664  ulint n_segments)
3665 {
3666  os_aio_array_t* array;
3667 #ifdef WIN_ASYNC_IO
3668  OVERLAPPED* over;
3669 #elif defined(LINUX_NATIVE_AIO)
3670  struct io_event* io_event = NULL;
3671 #endif /* WIN_ASYNC_IO */
3672  ut_a(n > 0);
3673  ut_a(n_segments > 0);
3674 
3675  array = static_cast<os_aio_array_t*>(ut_malloc(sizeof(*array)));
3676  memset(array, 0x0, sizeof(*array));
3677 
3678  array->mutex = os_mutex_create();
3679  array->not_full = os_event_create();
3680  array->is_empty = os_event_create();
3681 
3682  os_event_set(array->is_empty);
3683 
3684  array->n_slots = n;
3685  array->n_segments = n_segments;
3686 
3687  array->slots = static_cast<os_aio_slot_t*>(
3688  ut_malloc(n * sizeof(*array->slots)));
3689 
3690  memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots)));
3691 #ifdef __WIN__
3692  array->handles = static_cast<HANDLE*>(ut_malloc(n * sizeof(HANDLE)));
3693 #endif /* __WIN__ */
3694 
3695 #if defined(LINUX_NATIVE_AIO)
3696  array->aio_ctx = NULL;
3697  array->aio_events = NULL;
3698 
3699  /* If we are not using native aio interface then skip this
3700  part of initialization. */
3701  if (!srv_use_native_aio) {
3702  goto skip_native_aio;
3703  }
3704 
3705  /* Initialize the io_context array. One io_context
3706  per segment in the array. */
3707 
3708  array->aio_ctx = static_cast<io_context**>(
3709  ut_malloc(n_segments * sizeof(*array->aio_ctx)));
3710 
3711  for (ulint i = 0; i < n_segments; ++i) {
3712  if (!os_aio_linux_create_io_ctx(n/n_segments,
3713  &array->aio_ctx[i])) {
3714  /* If something bad happened during aio setup
3715  we should call it a day and return right away.
3716  We don't care about any leaks because a failure
3717  to initialize the io subsystem means that the
3718  server (or atleast the innodb storage engine)
3719  is not going to startup. */
3720  return(NULL);
3721  }
3722  }
3723 
3724  /* Initialize the event array. One event per slot. */
3725  io_event = static_cast<struct io_event*>(
3726  ut_malloc(n * sizeof(*io_event)));
3727 
3728  memset(io_event, 0x0, sizeof(*io_event) * n);
3729  array->aio_events = io_event;
3730 
3731 skip_native_aio:
3732 #endif /* LINUX_NATIVE_AIO */
3733  for (ulint i = 0; i < n; i++) {
3734  os_aio_slot_t* slot;
3735 
3736  slot = os_aio_array_get_nth_slot(array, i);
3737 
3738  slot->pos = i;
3739  slot->reserved = FALSE;
3740 #ifdef WIN_ASYNC_IO
3741  slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
3742 
3743  over = &slot->control;
3744 
3745  over->hEvent = slot->handle;
3746 
3747  array->handles[i] = over->hEvent;
3748 
3749 #elif defined(LINUX_NATIVE_AIO)
3750  memset(&slot->control, 0x0, sizeof(slot->control));
3751  slot->n_bytes = 0;
3752  slot->ret = 0;
3753 #endif /* WIN_ASYNC_IO */
3754  }
3755 
3756  return(array);
3757 }
3758 
3759 /************************************************************************/
3761 static
3762 void
3763 os_aio_array_free(
3764 /*==============*/
3765  os_aio_array_t*& array)
3766 {
3767 #ifdef WIN_ASYNC_IO
3768  ulint i;
3769 
3770  for (i = 0; i < array->n_slots; i++) {
3771  os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
3772  CloseHandle(slot->handle);
3773  }
3774 #endif /* WIN_ASYNC_IO */
3775 
3776 #ifdef __WIN__
3777  ut_free(array->handles);
3778 #endif /* __WIN__ */
3779  os_mutex_free(array->mutex);
3780  os_event_free(array->not_full);
3781  os_event_free(array->is_empty);
3782 
3783 #if defined(LINUX_NATIVE_AIO)
3784  if (srv_use_native_aio) {
3785  ut_free(array->aio_events);
3786  ut_free(array->aio_ctx);
3787  }
3788 #endif /* LINUX_NATIVE_AIO */
3789 
3790  ut_free(array->slots);
3791  ut_free(array);
3792 
3793  array = 0;
3794 }
3795 
3796 /***********************************************************************
3797 Initializes the asynchronous io system. Creates one array each for ibuf
3798 and log i/o. Also creates one array each for read and write where each
3799 array is divided logically into n_read_segs and n_write_segs
3800 respectively. The caller must create an i/o handler thread for each
3801 segment in these arrays. This function also creates the sync array.
3802 No i/o handler thread needs to be created for that.
In read-only mode (srv_read_only_mode) only the read and sync arrays are
created, and the read segments are numbered from 0 instead of 2.
NOTE(review): the embedded listing numbers skip 3814 and 3826 inside the
body, so statements may be missing there.
@return TRUE on success, FALSE if one of the aio arrays could not be
created */
3803 UNIV_INTERN
3804 ibool
3805 os_aio_init(
3806 /*========*/
3807  ulint n_per_seg, /*!< in: maximum number of pending aio
3808  operations allowed per segment */
3809  ulint n_read_segs, /*!< in: number of reader threads */
3810  ulint n_write_segs, /*!< in: number of writer threads */
3811  ulint n_slots_sync) /*!< in: number of slots in the sync aio
3812  array */
3813 {
3815 
3816 #if defined(LINUX_NATIVE_AIO)
3817  /* Check if native aio is supported on this system and tmpfs */
3818  if (srv_use_native_aio && !os_aio_native_aio_supported()) {
3819 
3820  ib_logf(IB_LOG_LEVEL_WARN, "Linux Native AIO disabled.");
3821 
3822  srv_use_native_aio = FALSE;
3823  }
3824 #endif /* LINUX_NATIVE_AIO */
3825 
3827 
3828  os_aio_read_array = os_aio_array_create(
3829  n_read_segs * n_per_seg, n_read_segs);
3830 
3831  if (os_aio_read_array == NULL) {
3832  return(FALSE);
3833  }
3834 
3835  ulint start = (srv_read_only_mode) ? 0 : 2;
3836  ulint n_segs = n_read_segs + start;
3837 
3838  /* 0 is the insert buffer (ibuf) segment and 1 is the log segment;
 the read segments follow them. */
3839  for (ulint i = start; i < n_segs; ++i) {
3840  ut_a(i < SRV_MAX_N_IO_THREADS);
3841  srv_io_thread_function[i] = "read thread";
3842  }
3843 
 /* n_segments accumulates the total number of segments created. */
3844  ulint n_segments = n_read_segs;
3845 
3846  if (!srv_read_only_mode) {
3847 
3848  os_aio_log_array = os_aio_array_create(n_per_seg, 1);
3849 
3850  if (os_aio_log_array == NULL) {
3851  return(FALSE);
3852  }
3853 
3854  ++n_segments;
3855 
3856  srv_io_thread_function[1] = "log thread";
3857 
3858  os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
3859 
3860  if (os_aio_ibuf_array == NULL) {
3861  return(FALSE);
3862  }
3863 
3864  ++n_segments;
3865 
3866  srv_io_thread_function[0] = "insert buffer thread";
3867 
3868  os_aio_write_array = os_aio_array_create(
3869  n_write_segs * n_per_seg, n_write_segs);
3870 
3871  if (os_aio_write_array == NULL) {
3872  return(FALSE);
3873  }
3874 
3875  n_segments += n_write_segs;
3876 
 /* The write segments come after the ibuf, log and read
 segments. */
3877  for (ulint i = start + n_read_segs; i < n_segments; ++i) {
3878  ut_a(i < SRV_MAX_N_IO_THREADS);
3879  srv_io_thread_function[i] = "write thread";
3880  }
3881 
3882  ut_ad(n_segments >= 4);
3883  } else {
3884  ut_ad(n_segments > 0);
3885  }
3886 
 /* The sync array is not counted in n_segments. */
3887  os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
3888 
3889  if (os_aio_sync_array == NULL) {
3890  return(FALSE);
3891  }
3892 
3893  os_aio_n_segments = n_segments;
3894 
3895  os_aio_validate();
3896 
 /* One wait event per segment. */
3897  os_aio_segment_wait_events = static_cast<os_event_t*>(
3898  ut_malloc(n_segments * sizeof *os_aio_segment_wait_events));
3899 
3900  for (ulint i = 0; i < n_segments; ++i) {
3901  os_aio_segment_wait_events[i] = os_event_create();
3902  }
3903 
3904  os_last_printout = ut_time();
3905 
3906  return(TRUE);
3907 
3908 }
3909 
3910 /***********************************************************************
3911 Frees the asynchronous io system. */
3912 UNIV_INTERN
3913 void
3914 os_aio_free(void)
3915 /*=============*/
3916 {
3917  if (os_aio_ibuf_array != 0) {
3918  os_aio_array_free(os_aio_ibuf_array);
3919  }
3920 
3921  if (os_aio_log_array != 0) {
3922  os_aio_array_free(os_aio_log_array);
3923  }
3924 
3925  if (os_aio_write_array != 0) {
3926  os_aio_array_free(os_aio_write_array);
3927  }
3928 
3929  if (os_aio_sync_array != 0) {
3930  os_aio_array_free(os_aio_sync_array);
3931  }
3932 
3933  os_aio_array_free(os_aio_read_array);
3934 
3935  for (ulint i = 0; i < os_aio_n_segments; i++) {
3936  os_event_free(os_aio_segment_wait_events[i]);
3937  }
3938 
3939  ut_free(os_aio_segment_wait_events);
3940  os_aio_segment_wait_events = 0;
3941  os_aio_n_segments = 0;
3942 }
3943 
3944 #ifdef WIN_ASYNC_IO
3945 /************************************************************************/
3948 static
3949 void
3950 os_aio_array_wake_win_aio_at_shutdown(
3951 /*==================================*/
3952  os_aio_array_t* array)
3953 {
3954  ulint i;
3955 
3956  for (i = 0; i < array->n_slots; i++) {
3957 
3958  SetEvent((array->slots + i)->handle);
3959  }
3960 }
3961 #endif
3962 
3963 /************************************************************************/
/* Wakes up the i/o handler threads.
With WIN_ASYNC_IO: signals every slot event of the read array and, when they
exist, of the write, ibuf and log arrays, and then also sets the simulated
aio segment events below.
With LINUX_NATIVE_AIO: returns at once when srv_use_native_aio is set.
In every remaining case: sets each event in os_aio_segment_wait_events.
NOTE(review): the declarator line that carries the function name is absent
from this listing. */
3966 UNIV_INTERN
3967 void
3969 /*=====================================*/
3970 {
3971 #ifdef WIN_ASYNC_IO
3972  /* This code wakes up all ai/o threads in Windows native aio */
3973  os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
3974  if (os_aio_write_array != 0) {
3975  os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
3976  }
3977 
3978  if (os_aio_ibuf_array != 0) {
3979  os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
3980  }
3981 
3982  if (os_aio_log_array != 0) {
3983  os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
3984  }
3985 
3986 #elif defined(LINUX_NATIVE_AIO)
3987 
3988  /* When using native AIO interface the io helper threads
3989  wait on io_getevents with a timeout value of 500ms. At
3990  each wake up these threads check the server status.
3991  No need to do anything to wake them up. */
3992 
3993  if (srv_use_native_aio) {
3994  return;
3995  }
3996 
3997  /* Fall through to simulated AIO handler wakeup if we are
3998  not using native AIO. */
3999 #endif /* WIN_ASYNC_IO */
4000 
4001  /* This loop wakes up all simulated ai/o threads */
4002 
4003  for (ulint i = 0; i < os_aio_n_segments; i++) {
4004 
4005  os_event_set(os_aio_segment_wait_events[i]);
4006  }
4007 }
4008 
4009 /************************************************************************/
/* Waits on the is_empty event of os_aio_write_array. Elsewhere in this file
that event is set by os_aio_array_free_slot() when the array's reserved
count drops to zero and reset by os_aio_array_reserve_slot() when the first
slot is taken.
NOTE(review): the listing numbering skips 4014 and 4017, so the declarator
line with the function name and one body line are absent from this
extract. */
4012 UNIV_INTERN
4013 void
4015 /*=====================================*/
4016 {
4018  os_event_wait(os_aio_write_array->is_empty);
4019 }
4020 
4021 /**********************************************************************/
4025 static
4026 ulint
4027 os_aio_get_segment_no_from_slot(
4028 /*============================*/
4029  os_aio_array_t* array,
4030  os_aio_slot_t* slot)
4031 {
4032  ulint segment;
4033  ulint seg_len;
4034 
4035  if (array == os_aio_ibuf_array) {
4037 
4038  segment = IO_IBUF_SEGMENT;
4039 
4040  } else if (array == os_aio_log_array) {
4042 
4043  segment = IO_LOG_SEGMENT;
4044 
4045  } else if (array == os_aio_read_array) {
4046  seg_len = os_aio_read_array->n_slots
4047  / os_aio_read_array->n_segments;
4048 
4049  segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
4050  } else {
4052  ut_a(array == os_aio_write_array);
4053 
4054  seg_len = os_aio_write_array->n_slots
4055  / os_aio_write_array->n_segments;
4056 
4057  segment = os_aio_read_array->n_segments + 2
4058  + slot->pos / seg_len;
4059  }
4060 
4061  return(segment);
4062 }
4063 
4064 /**********************************************************************/
4067 static
4068 ulint
4069 os_aio_get_array_and_local_segment(
4070 /*===============================*/
4071  os_aio_array_t** array,
4072  ulint global_segment)
4073 {
4074  ulint segment;
4075 
4076  ut_a(global_segment < os_aio_n_segments);
4077 
4078  if (srv_read_only_mode) {
4079  *array = os_aio_read_array;
4080 
4081  return(global_segment);
4082  } else if (global_segment == IO_IBUF_SEGMENT) {
4083  *array = os_aio_ibuf_array;
4084  segment = 0;
4085 
4086  } else if (global_segment == IO_LOG_SEGMENT) {
4087  *array = os_aio_log_array;
4088  segment = 0;
4089 
4090  } else if (global_segment < os_aio_read_array->n_segments + 2) {
4091  *array = os_aio_read_array;
4092 
4093  segment = global_segment - 2;
4094  } else {
4095  *array = os_aio_write_array;
4096 
4097  segment = global_segment - (os_aio_read_array->n_segments + 2);
4098  }
4099 
4100  return(segment);
4101 }
4102 
4103 /*******************************************************************/
/* Reserves a free slot in the given aio array and fills it with the request.
If every slot is reserved, waits on array->not_full and tries again.
Parameters as used below:
  type     - stored in slot->type; on the Linux native path it must be
             OS_FILE_READ or OS_FILE_WRITE
  array    - aio array to take the slot from
  message1 - stored in slot->message1
  message2 - stored in slot->message2
  file     - stored in slot->file
  name     - stored in slot->name
  buf      - i/o buffer, stored in slot->buf
  offset   - file offset, stored in slot->offset
  len      - length in bytes, stored in slot->len
Returns the reserved slot.
NOTE(review): the listing numbering skips 4108 (the return-type line) and
4163 (a statement inside the !srv_use_native_aio branch); both are absent
from this extract. */
4107 static
4109 os_aio_array_reserve_slot(
4110 /*======================*/
4111  ulint type,
4112  os_aio_array_t* array,
4113  fil_node_t* message1,
4115  void* message2,
4117  os_file_t file,
4118  const char* name,
4120  void* buf,
4122  os_offset_t offset,
4123  ulint len)
4124 {
4125  os_aio_slot_t* slot = NULL;
4126 #ifdef WIN_ASYNC_IO
4127  OVERLAPPED* control;
4128 
4129 #elif defined(LINUX_NATIVE_AIO)
4130 
4131  struct iocb* iocb;
4132  off_t aio_offset;
4133 
4134 #endif /* WIN_ASYNC_IO */
4135  ulint i;
4136  ulint counter;
4137  ulint slots_per_seg;
4138  ulint local_seg;
4139 
4140 #ifdef WIN_ASYNC_IO
 /* The length must be representable in 32 bits. */
4141  ut_a((len & 0xFFFFFFFFUL) == len);
4142 #endif /* WIN_ASYNC_IO */
4143 
4144  /* No need of a mutex. Only reading constant fields */
4145  slots_per_seg = array->n_slots / array->n_segments;
4146 
4147  /* We attempt to keep adjacent blocks in the same local
4148  segment. This can help in merging IO requests when we are
4149  doing simulated AIO */
4150  local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
4151  % array->n_segments;
4152 
4153 loop:
4154  os_mutex_enter(array->mutex);
4155 
 /* Array full: drop the mutex, wait for not_full, then retry. */
4156  if (array->n_reserved == array->n_slots) {
4157  os_mutex_exit(array->mutex);
4158 
4159  if (!srv_use_native_aio) {
4160  /* If the handler threads are suspended, wake them
4161  so that we get more slots */
4162 
4164  }
4165 
4166  os_event_wait(array->not_full);
4167 
4168  goto loop;
4169  }
4170 
4171  /* We start our search for an available slot from our preferred
4172  local segment and do a full scan of the array. We are
4173  guaranteed to find a slot in full scan. */
4174  for (i = local_seg * slots_per_seg, counter = 0;
4175  counter < array->n_slots;
4176  i++, counter++) {
4177 
 /* Wrap around to the start of the array. */
4178  i %= array->n_slots;
4179 
4180  slot = os_aio_array_get_nth_slot(array, i);
4181 
4182  if (slot->reserved == FALSE) {
4183  goto found;
4184  }
4185  }
4186 
4187  /* We MUST always be able to get hold of a reserved slot. */
4188  ut_error;
4189 
4190 found:
4191  ut_a(slot->reserved == FALSE);
4192  array->n_reserved++;
4193 
 /* First reservation: the array is no longer empty. */
4194  if (array->n_reserved == 1) {
4195  os_event_reset(array->is_empty);
4196  }
4197 
 /* Last free slot taken: later callers must wait on not_full. */
4198  if (array->n_reserved == array->n_slots) {
4199  os_event_reset(array->not_full);
4200  }
4201 
4202  slot->reserved = TRUE;
4203  slot->reservation_time = ut_time();
4204  slot->message1 = message1;
4205  slot->message2 = message2;
4206  slot->file = file;
4207  slot->name = name;
4208  slot->len = len;
4209  slot->type = type;
4210  slot->buf = static_cast<byte*>(buf);
4211  slot->offset = offset;
4212  slot->io_already_done = FALSE;
4213 
4214 #ifdef WIN_ASYNC_IO
 /* Split the 64-bit offset into the two halves of the OVERLAPPED
 structure and reset the slot's event handle. */
4215  control = &slot->control;
4216  control->Offset = (DWORD) offset & 0xFFFFFFFF;
4217  control->OffsetHigh = (DWORD) (offset >> 32);
4218  ResetEvent(slot->handle);
4219 
4220 #elif defined(LINUX_NATIVE_AIO)
4221 
4222  /* If we are not using native AIO skip this part. */
4223  if (!srv_use_native_aio) {
4224  goto skip_native_aio;
4225  }
4226 
4227  /* Check if we are dealing with 64 bit arch.
4228  If not then make sure that offset fits in 32 bits. */
4229  aio_offset = (off_t) offset;
4230 
4231  ut_a(sizeof(aio_offset) >= sizeof(offset)
4232  || ((os_offset_t) aio_offset) == offset);
4233 
4234  iocb = &slot->control;
4235 
4236  if (type == OS_FILE_READ) {
4237  io_prep_pread(iocb, file, buf, len, aio_offset);
4238  } else {
4239  ut_a(type == OS_FILE_WRITE);
4240  io_prep_pwrite(iocb, file, buf, len, aio_offset);
4241  }
4242 
 /* Back-pointer that os_aio_linux_collect() reads from the completed
 iocb to find the slot. */
4243  iocb->data = (void*) slot;
4244  slot->n_bytes = 0;
4245  slot->ret = 0;
4246 
4247 skip_native_aio:
4248 #endif /* LINUX_NATIVE_AIO */
4249  os_mutex_exit(array->mutex);
4250 
4251  return(slot);
4252 }
4253 
4254 /*******************************************************************/
4256 static
4257 void
4258 os_aio_array_free_slot(
4259 /*===================*/
4260  os_aio_array_t* array,
4261  os_aio_slot_t* slot)
4262 {
4263  os_mutex_enter(array->mutex);
4264 
4265  ut_ad(slot->reserved);
4266 
4267  slot->reserved = FALSE;
4268 
4269  array->n_reserved--;
4270 
4271  if (array->n_reserved == array->n_slots - 1) {
4272  os_event_set(array->not_full);
4273  }
4274 
4275  if (array->n_reserved == 0) {
4276  os_event_set(array->is_empty);
4277  }
4278 
4279 #ifdef WIN_ASYNC_IO
4280 
4281  ResetEvent(slot->handle);
4282 
4283 #elif defined(LINUX_NATIVE_AIO)
4284 
4285  if (srv_use_native_aio) {
4286  memset(&slot->control, 0x0, sizeof(slot->control));
4287  slot->n_bytes = 0;
4288  slot->ret = 0;
4289  /*fprintf(stderr, "Freed up Linux native slot.\n");*/
4290  } else {
4291  /* These fields should not be used if we are not
4292  using native AIO. */
4293  ut_ad(slot->n_bytes == 0);
4294  ut_ad(slot->ret == 0);
4295  }
4296 
4297 #endif
4298  os_mutex_exit(array->mutex);
4299 }
4300 
4301 /**********************************************************************/
4303 static
4304 void
4305 os_aio_simulated_wake_handler_thread(
4306 /*=================================*/
4307  ulint global_segment)
4309 {
4310  os_aio_array_t* array;
4311  ulint segment;
4312 
4313  ut_ad(!srv_use_native_aio);
4314 
4315  segment = os_aio_get_array_and_local_segment(&array, global_segment);
4316 
4317  ulint n = array->n_slots / array->n_segments;
4318 
4319  segment *= n;
4320 
4321  /* Look through n slots after the segment * n'th slot */
4322 
4323  os_mutex_enter(array->mutex);
4324 
4325  for (ulint i = 0; i < n; ++i) {
4326  const os_aio_slot_t* slot;
4327 
4328  slot = os_aio_array_get_nth_slot(array, segment + i);
4329 
4330  if (slot->reserved) {
4331 
4332  /* Found an i/o request */
4333 
4334  os_mutex_exit(array->mutex);
4335 
4336  os_event_t event;
4337 
4338  event = os_aio_segment_wait_events[global_segment];
4339 
4340  os_event_set(event);
4341 
4342  return;
4343  }
4344  }
4345 
4346  os_mutex_exit(array->mutex);
4347 }
4348 
4349 /**********************************************************************/
/* Wakes the simulated aio handler threads: does nothing when
srv_use_native_aio is set; otherwise clears
os_aio_recommend_sleep_for_read_threads and calls
os_aio_simulated_wake_handler_thread() for every global segment from 0 to
os_aio_n_segments - 1.
NOTE(review): the listing numbering skips 4353, so the declarator line
carrying the function name is absent from this extract. */
4351 UNIV_INTERN
4352 void
4354 /*=======================================*/
4355 {
4356  if (srv_use_native_aio) {
4357  /* We do not use simulated aio: do nothing */
4358 
4359  return;
4360  }
4361 
 /* Withdraw the sleep recommendation checked by the simulated handler
 for the read array. */
4362  os_aio_recommend_sleep_for_read_threads = FALSE;
4363 
4364  for (ulint i = 0; i < os_aio_n_segments; i++) {
4365  os_aio_simulated_wake_handler_thread(i);
4366  }
4367 }
4368 
4369 /**********************************************************************/
/* Recommends that the simulated aio read handler threads sleep. The body is
compiled only when __WIN__ is defined; on other platforms the function is
empty. On Windows it does nothing when srv_use_native_aio is set; otherwise
it sets os_aio_recommend_sleep_for_read_threads and resets the wait event of
every global segment that maps to os_aio_read_array.
NOTE(review): the listing numbering skips 4376, so the declarator line
carrying the function name is absent from this extract. */
4374 UNIV_INTERN
4375 void
4377 /*============================================*/
4378 {
4379 
4380 /* The idea of putting background IO threads to sleep is only for
4381 Windows when using simulated AIO. Windows XP seems to schedule
4382 background threads too eagerly to allow for coalescing during
4383 readahead requests. */
4384 #ifdef __WIN__
4385  os_aio_array_t* array;
4386 
4387  if (srv_use_native_aio) {
4388  /* We do not use simulated aio: do nothing */
4389 
4390  return;
4391  }
4392 
4393  os_aio_recommend_sleep_for_read_threads = TRUE;
4394 
4395  for (ulint i = 0; i < os_aio_n_segments; i++) {
 /* Only the array is needed here; the returned local segment
 number is ignored. */
4396  os_aio_get_array_and_local_segment(&array, i);
4397 
4398  if (array == os_aio_read_array) {
4399 
4400  os_event_reset(os_aio_segment_wait_events[i]);
4401  }
4402  }
4403 #endif /* __WIN__ */
4404 }
4405 
#if defined(LINUX_NATIVE_AIO)
/*******************************************************************
Submits the request stored in a reserved slot to the kernel with
io_submit(), using the io context of the segment the slot belongs to.
On failure errno is set to the negated return value of io_submit().
@return TRUE if exactly one request was queued, FALSE otherwise */
static
ibool
os_aio_linux_dispatch(
/*==================*/
	os_aio_array_t*	array,	/*!< in: aio array owning the slot */
	os_aio_slot_t*	slot)	/*!< in: reserved slot holding the prepared
				iocb */
{
	ut_ad(slot != NULL);
	ut_ad(array);

	ut_a(slot->reserved);

	/* The control block lives inside the slot; there is one
	io context per segment, selected from the slot position. */
	struct iocb*	request = &slot->control;
	const ulint	ctx_no = (slot->pos * array->n_segments)
		/ array->n_slots;

	const int	n_queued = io_submit(
		array->aio_ctx[ctx_no], 1, &request);

#if defined(UNIV_AIO_DEBUG)
	fprintf(stderr,
		"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
		(slot->type == OS_FILE_WRITE) ? 'w' : 'r', n_queued, slot,
		array->aio_ctx[ctx_no], (ulong) ctx_no);
#endif

	/* io_submit() yields the number of queued requests, or
	-errno on failure. */
	if (UNIV_UNLIKELY(n_queued != 1)) {
		errno = -n_queued;

		return(FALSE);
	}

	return(TRUE);
}
#endif /* LINUX_NATIVE_AIO */
4452 
4453 
4454 /*******************************************************************/
/* Requests a read or write, either performed synchronously or queued in one
of the aio arrays.
Parameters as used below:
  type     - OS_FILE_READ or OS_FILE_WRITE (anything else hits ut_error)
  mode     - OS_AIO_NORMAL, OS_AIO_IBUF, OS_AIO_LOG or OS_AIO_SYNC,
             optionally ORed with OS_AIO_SIMULATED_WAKE_LATER, which is
             split off into wake_later
  name     - file name, used for writes and for error reporting
  file     - handle of the file
  buf      - buffer to read into or write from
  offset   - file offset; debug-asserted to be a multiple of
             OS_FILE_LOG_BLOCK_SIZE
  n        - number of bytes; debug-asserted to be positive and a multiple
             of OS_FILE_LOG_BLOCK_SIZE
  message1 - stored in the reserved slot
  message2 - stored in the reserved slot
Returns TRUE when the request was queued, or the result of the synchronous
operation when one was performed; FALSE when dispatch failed and
os_file_handle_error() did not ask for a retry. */
4458 UNIV_INTERN
4459 ibool
4460 os_aio_func(
4461 /*========*/
4462  ulint type,
4463  ulint mode,
4476  const char* name,
4478  os_file_t file,
4479  void* buf,
4481  os_offset_t offset,
4482  ulint n,
4483  fil_node_t* message1,
4487  void* message2)
4491 {
4492  os_aio_array_t* array;
4493  os_aio_slot_t* slot;
4494 #ifdef WIN_ASYNC_IO
4495  ibool retval;
4496  BOOL ret = TRUE;
4497  DWORD len = (DWORD) n;
4498  struct fil_node_t* dummy_mess1;
4499  void* dummy_mess2;
4500  ulint dummy_type;
4501 #endif /* WIN_ASYNC_IO */
4502  ulint wake_later;
4503 
4504  ut_ad(file);
4505  ut_ad(buf);
4506  ut_ad(n > 0);
4507  ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
4508  ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
4509  ut_ad(os_aio_validate_skip());
4510 #ifdef WIN_ASYNC_IO
4511  ut_ad((n & 0xFFFFFFFFUL) == n);
4512 #endif
4513 
 /* Separate the wake-later flag from the mode proper. */
4514  wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
4515  mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
4516 
4517  if (mode == OS_AIO_SYNC
4518 #ifdef WIN_ASYNC_IO
4519  && !srv_use_native_aio
4520 #endif /* WIN_ASYNC_IO */
4521  ) {
4522  /* This is actually an ordinary synchronous read or write:
4523  no need to use an i/o-handler thread. NOTE that if we use
4524  Windows async i/o, Windows does not allow us to use
4525  ordinary synchronous os_file_read etc. on the same file,
4526  therefore we have built a special mechanism for synchronous
4527  wait in the Windows case.
4528  Also note that the Performance Schema instrumentation has
4529  been performed by current os_aio_func()'s wrapper function
4530  pfs_os_aio_func(). So we would no longer need to call
4531  Performance Schema instrumented os_file_read() and
4532  os_file_write(). Instead, we should use os_file_read_func()
4533  and os_file_write_func() */
4534 
4535  if (type == OS_FILE_READ) {
4536  return(os_file_read_func(file, buf, offset, n));
4537  }
4538 
4540  ut_a(type == OS_FILE_WRITE);
4541 
4542  return(os_file_write_func(name, file, buf, offset, n));
4543  }
4544 
4545 try_again:
 /* Pick the aio array that matches the mode; in read-only mode the ibuf
 and log modes use the read array. */
4546  switch (mode) {
4547  case OS_AIO_NORMAL:
4548  if (type == OS_FILE_READ) {
4549  array = os_aio_read_array;
4550  } else {
4552  array = os_aio_write_array;
4553  }
4554  break;
4555  case OS_AIO_IBUF:
4556  ut_ad(type == OS_FILE_READ);
4557  /* Reduce probability of deadlock bugs in connection with ibuf:
4558  do not let the ibuf i/o handler sleep */
4559 
4560  wake_later = FALSE;
4561 
4562  if (srv_read_only_mode) {
4563  array = os_aio_read_array;
4564  } else {
4565  array = os_aio_ibuf_array;
4566  }
4567  break;
4568  case OS_AIO_LOG:
4569  if (srv_read_only_mode) {
4570  array = os_aio_read_array;
4571  } else {
4572  array = os_aio_log_array;
4573  }
4574  break;
4575  case OS_AIO_SYNC:
4576  array = os_aio_sync_array;
4577 #if defined(LINUX_NATIVE_AIO)
4578  /* In Linux native AIO we don't use sync IO array. */
4579  ut_a(!srv_use_native_aio);
4580 #endif /* LINUX_NATIVE_AIO */
4581  break;
4582  default:
4583  ut_error;
4584  array = NULL; /* Eliminate compiler warning */
4585  }
4586 
4587  slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
4588  name, buf, offset, n);
 /* Native aio: hand the request to the OS right away. Simulated aio:
 wake the handler thread of the slot's segment unless the caller asked
 for a later wake-up. */
4589  if (type == OS_FILE_READ) {
4590  if (srv_use_native_aio) {
4591  os_n_file_reads++;
4592  os_bytes_read_since_printout += n;
4593 #ifdef WIN_ASYNC_IO
4594  ret = ReadFile(file, buf, (DWORD) n, &len,
4595  &(slot->control));
4596 
4597 #elif defined(LINUX_NATIVE_AIO)
4598  if (!os_aio_linux_dispatch(array, slot)) {
4599  goto err_exit;
4600  }
4601 #endif /* WIN_ASYNC_IO */
4602  } else {
4603  if (!wake_later) {
4604  os_aio_simulated_wake_handler_thread(
4605  os_aio_get_segment_no_from_slot(
4606  array, slot));
4607  }
4608  }
4609  } else if (type == OS_FILE_WRITE) {
4611  if (srv_use_native_aio) {
4612  os_n_file_writes++;
4613 #ifdef WIN_ASYNC_IO
4614  ret = WriteFile(file, buf, (DWORD) n, &len,
4615  &(slot->control));
4616 
4617 #elif defined(LINUX_NATIVE_AIO)
4618  if (!os_aio_linux_dispatch(array, slot)) {
4619  goto err_exit;
4620  }
4621 #endif /* WIN_ASYNC_IO */
4622  } else {
4623  if (!wake_later) {
4624  os_aio_simulated_wake_handler_thread(
4625  os_aio_get_segment_no_from_slot(
4626  array, slot));
4627  }
4628  }
4629  } else {
4630  ut_error;
4631  }
4632 
4633 #ifdef WIN_ASYNC_IO
4634  if (srv_use_native_aio) {
 /* Either the call completed at once with the full length, or it
 is pending. */
4635  if ((ret && len == n)
4636  || (!ret && GetLastError() == ERROR_IO_PENDING)) {
4637  /* aio was queued successfully! */
4638 
4639  if (mode == OS_AIO_SYNC) {
4640  /* We want a synchronous i/o operation on a
4641  file where we also use async i/o: in Windows
4642  we must use the same wait mechanism as for
4643  async i/o */
4644 
4645  retval = os_aio_windows_handle(
4646  ULINT_UNDEFINED, slot->pos,
4647  &dummy_mess1, &dummy_mess2,
4648  &dummy_type);
4649 
4650  return(retval);
4651  }
4652 
4653  return(TRUE);
4654  }
4655 
4656  goto err_exit;
4657  }
4658 #endif /* WIN_ASYNC_IO */
4659  /* aio was queued successfully! */
4660  return(TRUE);
4661 
4662 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
4663 err_exit:
4664 #endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
 /* Dispatch failed: give the slot back, then retry from array selection
 if os_file_handle_error() returns true. */
4665  os_aio_array_free_slot(array, slot);
4666 
4667  if (os_file_handle_error(
4668  name,type == OS_FILE_READ ? "aio read" : "aio write")) {
4669 
4670  goto try_again;
4671  }
4672 
4673  return(FALSE);
4674 }
4675 
#ifdef WIN_ASYNC_IO
/**********************************************************************
Waits for the completion of a Windows asynchronous i/o request, fetches
its result with GetOverlappedResult() and frees the slot. When the request
failed and os_file_handle_error() asks for a retry, the operation is
reissued and waited for before returning.

For the sync array (segment == ULINT_UNDEFINED) the wait is on the single
slot at position pos; otherwise it is on any of the slot handles that
belong to the given segment.

@return TRUE if the i/o completed with the full length, or if the server
is shutting down and the array holds no reserved slot (then *message1 and
*message2 are set to NULL and *type is left untouched); FALSE otherwise */
UNIV_INTERN
ibool
os_aio_windows_handle(
/*==================*/
	ulint		segment,	/*!< in: global segment number, or
					ULINT_UNDEFINED to wait on the sync
					array */
	ulint		pos,		/*!< in: slot position to wait for;
					used only with the sync array */
	fil_node_t**	message1,	/*!< out: message1 stored in the
					slot when it was reserved */
	void**		message2,	/*!< out: message2 stored in the
					slot when it was reserved */
	ulint*		type)		/*!< out: slot->type of the completed
					request */
{
	ulint		orig_seg	= segment;
	os_aio_array_t*	array;
	os_aio_slot_t*	slot;
	ulint		n;
	ulint		i;
	ibool		ret_val		= FALSE;
	BOOL		ret;
	DWORD		len;
	BOOL		retry		= FALSE;

	if (segment == ULINT_UNDEFINED) {
		segment = 0;
		array = os_aio_sync_array;
	} else {
		segment = os_aio_get_array_and_local_segment(&array, segment);
	}

	/* NOTE! We only access constant fields in os_aio_array. Therefore
	we do not have to acquire the protecting mutex yet */

	ut_ad(os_aio_validate_skip());
	ut_ad(segment < array->n_segments);

	/* Number of slots, and therefore of handles, per segment. */
	n = array->n_slots / array->n_segments;

	if (array == os_aio_sync_array) {

		WaitForSingleObject(
			os_aio_array_get_nth_slot(array, pos)->handle,
			INFINITE);

		i = pos;

	} else {
		if (orig_seg != ULINT_UNDEFINED) {
			srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
		}

		i = WaitForMultipleObjects(
			(DWORD) n, array->handles + segment * n,
			FALSE, INFINITE);
	}

	os_mutex_enter(array->mutex);

	/* Woken up at shutdown with nothing left to handle. */
	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
	    && array->n_reserved == 0) {
		*message1 = NULL;
		*message2 = NULL;
		os_mutex_exit(array->mutex);
		return(TRUE);
	}

	/* A successful wait returns WAIT_OBJECT_0 plus the index of the
	signalled handle, i.e. a value in [WAIT_OBJECT_0, WAIT_OBJECT_0 + n).
	The upper bound is exclusive: index n would already address a slot
	outside this segment. */
	ut_a(i >= WAIT_OBJECT_0 && i < WAIT_OBJECT_0 + n);

	slot = os_aio_array_get_nth_slot(array, i + segment * n);

	ut_a(slot->reserved);

	if (orig_seg != ULINT_UNDEFINED) {
		srv_set_io_thread_op_info(
			orig_seg, "get windows aio return value");
	}

	ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE);

	*message1 = slot->message1;
	*message2 = slot->message2;

	*type = slot->type;

	if (ret && len == slot->len) {

		ret_val = TRUE;
	} else if (os_file_handle_error(slot->name, "Windows aio")) {

		retry = TRUE;
	} else {

		ret_val = FALSE;
	}

	os_mutex_exit(array->mutex);

	if (retry) {
		/* retry failed read/write operation synchronously.
		No need to hold array->mutex. */

#ifdef UNIV_PFS_IO
		/* This read/write does not go through os_file_read
		and os_file_write APIs, need to register with
		performance schema explicitly here. */
		struct PSI_file_locker*	locker = NULL;
		register_pfs_file_io_begin(locker, slot->file, slot->len,
					   (slot->type == OS_FILE_WRITE)
					   ? PSI_FILE_WRITE
					   : PSI_FILE_READ,
					   __FILE__, __LINE__);
#endif

		/* The length is passed to the Win32 calls as a DWORD. */
		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);

		switch (slot->type) {
		case OS_FILE_WRITE:
			ret = WriteFile(slot->file, slot->buf,
					(DWORD) slot->len, &len,
					&(slot->control));

			break;
		case OS_FILE_READ:
			ret = ReadFile(slot->file, slot->buf,
				       (DWORD) slot->len, &len,
				       &(slot->control));

			break;
		default:
			ut_error;
		}

#ifdef UNIV_PFS_IO
		register_pfs_file_io_end(locker, len);
#endif

		if (!ret && GetLastError() == ERROR_IO_PENDING) {
			/* aio was queued successfully!
			We want a synchronous i/o operation on a
			file where we also use async i/o: in Windows
			we must use the same wait mechanism as for
			async i/o */

			ret = GetOverlappedResult(slot->file,
						  &(slot->control),
						  &len, TRUE);
		}

		ret_val = ret && len == slot->len;
	}

	os_aio_array_free_slot(array, slot);

	return(ret_val);
}
#endif
4850 
4851 #if defined(LINUX_NATIVE_AIO)
4852 /******************************************************************/
4863 static
4864 void
4865 os_aio_linux_collect(
4866 /*=================*/
4867  os_aio_array_t* array,
4868  ulint segment,
4869  ulint seg_size)
4870 {
4871  int i;
4872  int ret;
4873  ulint start_pos;
4874  ulint end_pos;
4875  struct timespec timeout;
4876  struct io_event* events;
4877  struct io_context* io_ctx;
4878 
4879  /* sanity checks. */
4880  ut_ad(array != NULL);
4881  ut_ad(seg_size > 0);
4882  ut_ad(segment < array->n_segments);
4883 
4884  /* Which part of event array we are going to work on. */
4885  events = &array->aio_events[segment * seg_size];
4886 
4887  /* Which io_context we are going to use. */
4888  io_ctx = array->aio_ctx[segment];
4889 
4890  /* Starting point of the segment we will be working on. */
4891  start_pos = segment * seg_size;
4892 
4893  /* End point. */
4894  end_pos = start_pos + seg_size;
4895 
4896 retry:
4897 
4898  /* Initialize the events. The timeout value is arbitrary.
4899  We probably need to experiment with it a little. */
4900  memset(events, 0, sizeof(*events) * seg_size);
4901  timeout.tv_sec = 0;
4902  timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
4903 
4904  ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
4905 
4906  if (ret > 0) {
4907  for (i = 0; i < ret; i++) {
4908  os_aio_slot_t* slot;
4909  struct iocb* control;
4910 
4911  control = (struct iocb*) events[i].obj;
4912  ut_a(control != NULL);
4913 
4914  slot = (os_aio_slot_t*) control->data;
4915 
4916  /* Some sanity checks. */
4917  ut_a(slot != NULL);
4918  ut_a(slot->reserved);
4919 
4920 #if defined(UNIV_AIO_DEBUG)
4921  fprintf(stderr,
4922  "io_getevents[%c]: slot[%p] ctx[%p]"
4923  " seg[%lu]\n",
4924  (slot->type == OS_FILE_WRITE) ? 'w' : 'r',
4925  slot, io_ctx, segment);
4926 #endif
4927 
4928  /* We are not scribbling previous segment. */
4929  ut_a(slot->pos >= start_pos);
4930 
4931  /* We have not overstepped to next segment. */
4932  ut_a(slot->pos < end_pos);
4933 
4934  /* Mark this request as completed. The error handling
4935  will be done in the calling function. */
4936  os_mutex_enter(array->mutex);
4937  slot->n_bytes = events[i].res;
4938  slot->ret = events[i].res2;
4939  slot->io_already_done = TRUE;
4940  os_mutex_exit(array->mutex);
4941  }
4942  return;
4943  }
4944 
4945  if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
4946  return;
4947  }
4948 
4949  /* This error handling is for any error in collecting the
4950  IO requests. The errors, if any, for any particular IO
4951  request are simply passed on to the calling routine. */
4952 
4953  switch (ret) {
4954  case -EAGAIN:
4955  /* Not enough resources! Try again. */
4956  case -EINTR:
4957  /* Interrupted! I have tested the behaviour in case of an
4958  interrupt. If we have some completed IOs available then
4959  the return code will be the number of IOs. We get EINTR only
4960  if there are no completed IOs and we have been interrupted. */
4961  case 0:
4962  /* No pending request! Go back and check again. */
4963  goto retry;
4964  }
4965 
4966  /* All other errors should cause a trap for now. */
4967  ut_print_timestamp(stderr);
4968  fprintf(stderr,
4969  " InnoDB: unexpected ret_code[%d] from io_getevents()!\n",
4970  ret);
4971  ut_error;
4972 }
4973 
4974 /**********************************************************************/
4982 UNIV_INTERN
4983 ibool
4984 os_aio_linux_handle(
4985 /*================*/
4986  ulint global_seg,
4992  fil_node_t**message1,
4993  void** message2,
4997  ulint* type)
4998 {
4999  ulint segment;
5000  os_aio_array_t* array;
5001  os_aio_slot_t* slot;
5002  ulint n;
5003  ulint i;
5004  ibool ret = FALSE;
5005 
5006  /* Should never be doing Sync IO here. */
5007  ut_a(global_seg != ULINT_UNDEFINED);
5008 
5009  /* Find the array and the local segment. */
5010  segment = os_aio_get_array_and_local_segment(&array, global_seg);
5011  n = array->n_slots / array->n_segments;
5012 
5013  /* Loop until we have found a completed request. */
5014  for (;;) {
5015  ibool any_reserved = FALSE;
5016  os_mutex_enter(array->mutex);
5017  for (i = 0; i < n; ++i) {
5018  slot = os_aio_array_get_nth_slot(
5019  array, i + segment * n);
5020  if (!slot->reserved) {
5021  continue;
5022  } else if (slot->io_already_done) {
5023  /* Something for us to work on. */
5024  goto found;
5025  } else {
5026  any_reserved = TRUE;
5027  }
5028  }
5029 
5030  os_mutex_exit(array->mutex);
5031 
5032  /* There is no completed request.
5033  If there is no pending request at all,
5034  and the system is being shut down, exit. */
5035  if (UNIV_UNLIKELY
5036  (!any_reserved
5038  *message1 = NULL;
5039  *message2 = NULL;
5040  return(TRUE);
5041  }
5042 
5043  /* Wait for some request. Note that we return
5044  from wait iff we have found a request. */
5045 
5046  srv_set_io_thread_op_info(global_seg,
5047  "waiting for completed aio requests");
5048  os_aio_linux_collect(array, segment, n);
5049  }
5050 
5051 found:
5052  /* Note that it may be that there are more then one completed
5053  IO requests. We process them one at a time. We may have a case
5054  here to improve the performance slightly by dealing with all
5055  requests in one sweep. */
5056  srv_set_io_thread_op_info(global_seg,
5057  "processing completed aio requests");
5058 
5059  /* Ensure that we are scribbling only our segment. */
5060  ut_a(i < n);
5061 
5062  ut_ad(slot != NULL);
5063  ut_ad(slot->reserved);
5064  ut_ad(slot->io_already_done);
5065 
5066  *message1 = slot->message1;
5067  *message2 = slot->message2;
5068 
5069  *type = slot->type;
5070 
5071  if (slot->ret == 0 && slot->n_bytes == (long) slot->len) {
5072 
5073  ret = TRUE;
5074  } else {
5075  errno = -slot->ret;
5076 
5077  /* os_file_handle_error does tell us if we should retry
5078  this IO. As it stands now, we don't do this retry when
5079  reaping requests from a different context than
5080  the dispatcher. This non-retry logic is the same for
5081  windows and linux native AIO.
5082  We should probably look into this to transparently
5083  re-submit the IO. */
5084  os_file_handle_error(slot->name, "Linux aio");
5085 
5086  ret = FALSE;
5087  }
5088 
5089  os_mutex_exit(array->mutex);
5090 
5091  os_aio_array_free_slot(array, slot);
5092 
5093  return(ret);
5094 }
5095 #endif /* LINUX_NATIVE_AIO */
5096 
5097 /**********************************************************************/
5101 UNIV_INTERN
5102 ibool
5104 /*====================*/
5105  ulint global_segment,
5110  fil_node_t**message1,
5115  void** message2,
5116  ulint* type)
5117 {
5118  os_aio_array_t* array;
5119  ulint segment;
5120  os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
5121  ulint n_consecutive;
5122  ulint total_len;
5123  ulint offs;
5124  os_offset_t lowest_offset;
5125  ulint biggest_age;
5126  ulint age;
5127  byte* combined_buf;
5128  byte* combined_buf2;
5129  ibool ret;
5130  ibool any_reserved;
5131  ulint n;
5132  os_aio_slot_t* aio_slot;
5133 
5134  /* Fix compiler warning */
5135  *consecutive_ios = NULL;
5136 
5137  segment = os_aio_get_array_and_local_segment(&array, global_segment);
5138 
5139 restart:
5140  /* NOTE! We only access constant fields in os_aio_array. Therefore
5141  we do not have to acquire the protecting mutex yet */
5142 
5143  srv_set_io_thread_op_info(global_segment,
5144  "looking for i/o requests (a)");
5145  ut_ad(os_aio_validate_skip());
5146  ut_ad(segment < array->n_segments);
5147 
5148  n = array->n_slots / array->n_segments;
5149 
5150  /* Look through n slots after the segment * n'th slot */
5151 
5152  if (array == os_aio_read_array
5153  && os_aio_recommend_sleep_for_read_threads) {
5154 
5155  /* Give other threads chance to add several i/os to the array
5156  at once. */
5157 
5158  goto recommended_sleep;
5159  }
5160 
5161  srv_set_io_thread_op_info(global_segment,
5162  "looking for i/o requests (b)");
5163 
5164  /* Check if there is a slot for which the i/o has already been
5165  done */
5166  any_reserved = FALSE;
5167 
5168  os_mutex_enter(array->mutex);
5169 
5170  for (ulint i = 0; i < n; i++) {
5171  os_aio_slot_t* slot;
5172 
5173  slot = os_aio_array_get_nth_slot(array, i + segment * n);
5174 
5175  if (!slot->reserved) {
5176  continue;
5177  } else if (slot->io_already_done) {
5178 
5179  if (os_aio_print_debug) {
5180  fprintf(stderr,
5181  "InnoDB: i/o for slot %lu"
5182  " already done, returning\n",
5183  (ulong) i);
5184  }
5185 
5186  aio_slot = slot;
5187  ret = TRUE;
5188  goto slot_io_done;
5189  } else {
5190  any_reserved = TRUE;
5191  }
5192  }
5193 
5194  /* There is no completed request.
5195  If there is no pending request at all,
5196  and the system is being shut down, exit. */
5197  if (!any_reserved && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
5198  os_mutex_exit(array->mutex);
5199  *message1 = NULL;
5200  *message2 = NULL;
5201  return(TRUE);
5202  }
5203 
5204  n_consecutive = 0;
5205 
5206  /* If there are at least 2 seconds old requests, then pick the oldest
5207  one to prevent starvation. If several requests have the same age,
5208  then pick the one at the lowest offset. */
5209 
5210  biggest_age = 0;
5211  lowest_offset = IB_UINT64_MAX;
5212 
5213  for (ulint i = 0; i < n; i++) {
5214  os_aio_slot_t* slot;
5215 
5216  slot = os_aio_array_get_nth_slot(array, i + segment * n);
5217 
5218  if (slot->reserved) {
5219 
5220  age = (ulint) difftime(
5221  ut_time(), slot->reservation_time);
5222 
5223  if ((age >= 2 && age > biggest_age)
5224  || (age >= 2 && age == biggest_age
5225  && slot->offset < lowest_offset)) {
5226 
5227  /* Found an i/o request */
5228  consecutive_ios[0] = slot;
5229 
5230  n_consecutive = 1;
5231 
5232  biggest_age = age;
5233  lowest_offset = slot->offset;
5234  }
5235  }
5236  }
5237 
5238  if (n_consecutive == 0) {
5239  /* There were no old requests. Look for an i/o request at the
5240  lowest offset in the array (we ignore the high 32 bits of the
5241  offset in these heuristics) */
5242 
5243  lowest_offset = IB_UINT64_MAX;
5244 
5245  for (ulint i = 0; i < n; i++) {
5246  os_aio_slot_t* slot;
5247 
5248  slot = os_aio_array_get_nth_slot(
5249  array, i + segment * n);
5250 
5251  if (slot->reserved && slot->offset < lowest_offset) {
5252 
5253  /* Found an i/o request */
5254  consecutive_ios[0] = slot;
5255 
5256  n_consecutive = 1;
5257 
5258  lowest_offset = slot->offset;
5259  }
5260  }
5261  }
5262 
5263  if (n_consecutive == 0) {
5264 
5265  /* No i/o requested at the moment */
5266 
5267  goto wait_for_io;
5268  }
5269 
5270  /* if n_consecutive != 0, then we have assigned
5271  something valid to consecutive_ios[0] */
5272  ut_ad(n_consecutive != 0);
5273  ut_ad(consecutive_ios[0] != NULL);
5274 
5275  aio_slot = consecutive_ios[0];
5276 
5277  /* Check if there are several consecutive blocks to read or write */
5278 
5279 consecutive_loop:
5280  for (ulint i = 0; i < n; i++) {
5281  os_aio_slot_t* slot;
5282 
5283  slot = os_aio_array_get_nth_slot(array, i + segment * n);
5284 
5285  if (slot->reserved
5286  && slot != aio_slot
5287  && slot->offset == aio_slot->offset + aio_slot->len
5288  && slot->type == aio_slot->type
5289  && slot->file == aio_slot->file) {
5290 
5291  /* Found a consecutive i/o request */
5292 
5293  consecutive_ios[n_consecutive] = slot;
5294  n_consecutive++;
5295 
5296  aio_slot = slot;
5297 
5298  if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
5299 
5300  goto consecutive_loop;
5301  } else {
5302  break;
5303  }
5304  }
5305  }
5306 
5307  srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
5308 
5309  /* We have now collected n_consecutive i/o requests in the array;
5310  allocate a single buffer which can hold all data, and perform the
5311  i/o */
5312 
5313  total_len = 0;
5314  aio_slot = consecutive_ios[0];
5315 
5316  for (ulint i = 0; i < n_consecutive; i++) {
5317  total_len += consecutive_ios[i]->len;
5318  }
5319 
5320  if (n_consecutive == 1) {
5321  /* We can use the buffer of the i/o request */
5322  combined_buf = aio_slot->buf;
5323  combined_buf2 = NULL;
5324  } else {
5325  combined_buf2 = static_cast<byte*>(
5326  ut_malloc(total_len + UNIV_PAGE_SIZE));
5327 
5328  ut_a(combined_buf2);
5329 
5330  combined_buf = static_cast<byte*>(
5331  ut_align(combined_buf2, UNIV_PAGE_SIZE));
5332  }
5333 
5334  /* We release the array mutex for the time of the i/o: NOTE that
5335  this assumes that there is just one i/o-handler thread serving
5336  a single segment of slots! */
5337 
5338  os_mutex_exit(array->mutex);
5339 
5340  if (aio_slot->type == OS_FILE_WRITE && n_consecutive > 1) {
5341  /* Copy the buffers to the combined buffer */
5342  offs = 0;
5343 
5344  for (ulint i = 0; i < n_consecutive; i++) {
5345 
5346  ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
5347  consecutive_ios[i]->len);
5348 
5349  offs += consecutive_ios[i]->len;
5350  }
5351  }
5352 
5353  srv_set_io_thread_op_info(global_segment, "doing file i/o");
5354 
5355  /* Do the i/o with ordinary, synchronous i/o functions: */
5356  if (aio_slot->type == OS_FILE_WRITE) {
5358  ret = os_file_write(
5359  aio_slot->name, aio_slot->file, combined_buf,
5360  aio_slot->offset, total_len);
5361  } else {
5362  ret = os_file_read(
5363  aio_slot->file, combined_buf,
5364  aio_slot->offset, total_len);
5365  }
5366 
5367  ut_a(ret);
5368  srv_set_io_thread_op_info(global_segment, "file i/o done");
5369 
5370  if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) {
5371  /* Copy the combined buffer to individual buffers */
5372  offs = 0;
5373 
5374  for (ulint i = 0; i < n_consecutive; i++) {
5375 
5376  ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
5377  consecutive_ios[i]->len);
5378  offs += consecutive_ios[i]->len;
5379  }
5380  }
5381 
5382  if (combined_buf2) {
5383  ut_free(combined_buf2);
5384  }
5385 
5386  os_mutex_enter(array->mutex);
5387 
5388  /* Mark the i/os done in slots */
5389 
5390  for (ulint i = 0; i < n_consecutive; i++) {
5391  consecutive_ios[i]->io_already_done = TRUE;
5392  }
5393 
5394  /* We return the messages for the first slot now, and if there were
5395  several slots, the messages will be returned with subsequent calls
5396  of this function */
5397 
5398 slot_io_done:
5399 
5400  ut_a(aio_slot->reserved);
5401 
5402  *message1 = aio_slot->message1;
5403  *message2 = aio_slot->message2;
5404 
5405  *type = aio_slot->type;
5406 
5407  os_mutex_exit(array->mutex);
5408 
5409  os_aio_array_free_slot(array, aio_slot);
5410 
5411  return(ret);
5412 
5413 wait_for_io:
5414  srv_set_io_thread_op_info(global_segment, "resetting wait event");
5415 
5416  /* We wait here until there again can be i/os in the segment
5417  of this thread */
5418 
5419  os_event_reset(os_aio_segment_wait_events[global_segment]);
5420 
5421  os_mutex_exit(array->mutex);
5422 
5423 recommended_sleep:
5424  srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
5425 
5426  os_event_wait(os_aio_segment_wait_events[global_segment]);
5427 
5428  goto restart;
5429 }
5430 
5431 /**********************************************************************/
5434 static
5435 bool
5436 os_aio_array_validate(
5437 /*==================*/
5438  os_aio_array_t* array)
5439 {
5440  ulint i;
5441  ulint n_reserved = 0;
5442 
5443  os_mutex_enter(array->mutex);
5444 
5445  ut_a(array->n_slots > 0);
5446  ut_a(array->n_segments > 0);
5447 
5448  for (i = 0; i < array->n_slots; i++) {
5449  os_aio_slot_t* slot;
5450 
5451  slot = os_aio_array_get_nth_slot(array, i);
5452 
5453  if (slot->reserved) {
5454  n_reserved++;
5455  ut_a(slot->len > 0);
5456  }
5457  }
5458 
5459  ut_a(array->n_reserved == n_reserved);
5460 
5461  os_mutex_exit(array->mutex);
5462 
5463  return(true);
5464 }
5465 
5466 /**********************************************************************/
5469 UNIV_INTERN
5470 ibool
5471 os_aio_validate(void)
5472 /*=================*/
5473 {
5474  os_aio_array_validate(os_aio_read_array);
5475 
5476  if (os_aio_write_array != 0) {
5477  os_aio_array_validate(os_aio_write_array);
5478  }
5479 
5480  if (os_aio_ibuf_array != 0) {
5481  os_aio_array_validate(os_aio_ibuf_array);
5482  }
5483 
5484  if (os_aio_log_array != 0) {
5485  os_aio_array_validate(os_aio_log_array);
5486  }
5487 
5488  if (os_aio_sync_array != 0) {
5489  os_aio_array_validate(os_aio_sync_array);
5490  }
5491 
5492  return(TRUE);
5493 }
5494 
5495 /**********************************************************************/
5500 static
5501 void
5502 os_aio_print_segment_info(
5503 /*======================*/
5504  FILE* file,
5505  ulint* n_seg,
5506  os_aio_array_t* array)
5507 {
5508  ulint i;
5509 
5510  ut_ad(array);
5511  ut_ad(n_seg);
5512  ut_ad(array->n_segments > 0);
5513 
5514  if (array->n_segments == 1) {
5515  return;
5516  }
5517 
5518  fprintf(file, " [");
5519  for (i = 0; i < array->n_segments; i++) {
5520  if (i != 0) {
5521  fprintf(file, ", ");
5522  }
5523 
5524  fprintf(file, "%lu", n_seg[i]);
5525  }
5526  fprintf(file, "] ");
5527 }
5528 
5529 /**********************************************************************/
5531 UNIV_INTERN
5532 void
5533 os_aio_print_array(
5534 /*==============*/
5535  FILE* file,
5536  os_aio_array_t* array)
5537 {
5538  ulint n_reserved = 0;
5539  ulint n_res_seg[SRV_MAX_N_IO_THREADS];
5540 
5541  os_mutex_enter(array->mutex);
5542 
5543  ut_a(array->n_slots > 0);
5544  ut_a(array->n_segments > 0);
5545 
5546  memset(n_res_seg, 0x0, sizeof(n_res_seg));
5547 
5548  for (ulint i = 0; i < array->n_slots; ++i) {
5549  os_aio_slot_t* slot;
5550  ulint seg_no;
5551 
5552  slot = os_aio_array_get_nth_slot(array, i);
5553 
5554  seg_no = (i * array->n_segments) / array->n_slots;
5555 
5556  if (slot->reserved) {
5557  ++n_reserved;
5558  ++n_res_seg[seg_no];
5559 
5560  ut_a(slot->len > 0);
5561  }
5562  }
5563 
5564  ut_a(array->n_reserved == n_reserved);
5565 
5566  fprintf(file, " %lu", (ulong) n_reserved);
5567 
5568  os_aio_print_segment_info(file, n_res_seg, array);
5569 
5570  os_mutex_exit(array->mutex);
5571 }
5572 
5573 /**********************************************************************/
5575 UNIV_INTERN
5576 void
5577 os_aio_print(
5578 /*=========*/
5579  FILE* file)
5580 {
5581  time_t current_time;
5582  double time_elapsed;
5583  double avg_bytes_read;
5584 
5585  for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
5586  fprintf(file, "I/O thread %lu state: %s (%s)",
5587  (ulong) i,
5588  srv_io_thread_op_info[i],
5589  srv_io_thread_function[i]);
5590 
5591 #ifndef __WIN__
5592  if (os_aio_segment_wait_events[i]->is_set) {
5593  fprintf(file, " ev set");
5594  }
5595 #endif /* __WIN__ */
5596 
5597  fprintf(file, "\n");
5598  }
5599 
5600  fputs("Pending normal aio reads:", file);
5601 
5602  os_aio_print_array(file, os_aio_read_array);
5603 
5604  if (os_aio_write_array != 0) {
5605  fputs(", aio writes:", file);
5606  os_aio_print_array(file, os_aio_write_array);
5607  }
5608 
5609  if (os_aio_ibuf_array != 0) {
5610  fputs(",\n ibuf aio reads:", file);
5611  os_aio_print_array(file, os_aio_ibuf_array);
5612  }
5613 
5614  if (os_aio_log_array != 0) {
5615  fputs(", log i/o's:", file);
5616  os_aio_print_array(file, os_aio_log_array);
5617  }
5618 
5619  if (os_aio_sync_array != 0) {
5620  fputs(", sync i/o's:", file);
5621  os_aio_print_array(file, os_aio_sync_array);
5622  }
5623 
5624  putc('\n', file);
5625  current_time = ut_time();
5626  time_elapsed = 0.001 + difftime(current_time, os_last_printout);
5627 
5628  fprintf(file,
5629  "Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
5630  "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
5631  (ulong) fil_n_pending_log_flushes,
5633  (ulong) os_n_file_reads,
5634  (ulong) os_n_file_writes,
5635  (ulong) os_n_fsyncs);
5636 
5638  fprintf(file,
5639  "%lu pending preads, %lu pending pwrites\n",
5640  (ulong) os_file_n_pending_preads,
5641  (ulong) os_file_n_pending_pwrites);
5642  }
5643 
5644  if (os_n_file_reads == os_n_file_reads_old) {
5645  avg_bytes_read = 0.0;
5646  } else {
5647  avg_bytes_read = (double) os_bytes_read_since_printout
5648  / (os_n_file_reads - os_n_file_reads_old);
5649  }
5650 
5651  fprintf(file,
5652  "%.2f reads/s, %lu avg bytes/read,"
5653  " %.2f writes/s, %.2f fsyncs/s\n",
5654  (os_n_file_reads - os_n_file_reads_old)
5655  / time_elapsed,
5656  (ulong) avg_bytes_read,
5657  (os_n_file_writes - os_n_file_writes_old)
5658  / time_elapsed,
5659  (os_n_fsyncs - os_n_fsyncs_old)
5660  / time_elapsed);
5661 
5662  os_n_file_reads_old = os_n_file_reads;
5663  os_n_file_writes_old = os_n_file_writes;
5664  os_n_fsyncs_old = os_n_fsyncs;
5665  os_bytes_read_since_printout = 0;
5666 
5667  os_last_printout = current_time;
5668 }
5669 
5670 /**********************************************************************/
5672 UNIV_INTERN
5673 void
5675 /*======================*/
5676 {
5677  os_n_file_reads_old = os_n_file_reads;
5678  os_n_file_writes_old = os_n_file_writes;
5679  os_n_fsyncs_old = os_n_fsyncs;
5680  os_bytes_read_since_printout = 0;
5681 
5682  os_last_printout = time(NULL);
5683 }
5684 
5685 #ifdef UNIV_DEBUG
5686 /**********************************************************************/
5690 UNIV_INTERN
5691 ibool
5692 os_aio_all_slots_free(void)
5693 /*=======================*/
5694 {
5695  os_aio_array_t* array;
5696  ulint n_res = 0;
5697 
5698  array = os_aio_read_array;
5699 
5700  os_mutex_enter(array->mutex);
5701 
5702  n_res += array->n_reserved;
5703 
5704  os_mutex_exit(array->mutex);
5705 
5706  if (!srv_read_only_mode) {
5707  ut_a(os_aio_write_array == 0);
5708 
5709  array = os_aio_write_array;
5710 
5711  os_mutex_enter(array->mutex);
5712 
5713  n_res += array->n_reserved;
5714 
5715  os_mutex_exit(array->mutex);
5716 
5717  ut_a(os_aio_ibuf_array == 0);
5718 
5719  array = os_aio_ibuf_array;
5720 
5721  os_mutex_enter(array->mutex);
5722 
5723  n_res += array->n_reserved;
5724 
5725  os_mutex_exit(array->mutex);
5726  }
5727 
5728  ut_a(os_aio_log_array == 0);
5729 
5730  array = os_aio_log_array;
5731 
5732  os_mutex_enter(array->mutex);
5733 
5734  n_res += array->n_reserved;
5735 
5736  os_mutex_exit(array->mutex);
5737 
5738  array = os_aio_sync_array;
5739 
5740  os_mutex_enter(array->mutex);
5741 
5742  n_res += array->n_reserved;
5743 
5744  os_mutex_exit(array->mutex);
5745 
5746  if (n_res == 0) {
5747 
5748  return(TRUE);
5749  }
5750 
5751  return(FALSE);
5752 }
5753 #endif /* UNIV_DEBUG */
5754 
5755 #endif /* !UNIV_HOTBACKUP */