MySQL 5.6.14 Source Code Document
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
pgman.hpp
1 /*
2  Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
3 
4  This program is free software; you can redistribute it and/or modify
5  it under the terms of the GNU General Public License as published by
6  the Free Software Foundation; version 2 of the License.
7 
8  This program is distributed in the hope that it will be useful,
9  but WITHOUT ANY WARRANTY; without even the implied warranty of
10  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11  GNU General Public License for more details.
12 
13  You should have received a copy of the GNU General Public License
14  along with this program; if not, write to the Free Software
15  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17 
18 #ifndef PGMAN_H
19 #define PGMAN_H
20 
21 #include <SimulatedBlock.hpp>
22 
23 #include <DLCHashTable.hpp>
24 #include <DLCFifoList.hpp>
25 #include <NodeBitmask.hpp>
26 #include <signaldata/LCP.hpp>
27 #include "lgman.hpp"
28 
29 #include <NdbOut.hpp>
30 #include <OutputStream.hpp>
31 
32 /*
33  * PGMAN
34  *
35  * PAGE ENTRIES AND REQUESTS
36  *
37  * Central structure is "page entry". It corresponds to a disk page
38  * identified by file and page number (file_no, page_no).
39  *
40  * A page entry is created by first request for the disk page.
41  * Subsequent requests are queued under the same page entry.
42  *
43  * There is a limited number of in-memory "cache pages", also called
44  * "buffer pages" or "real pages". These are used by the more numerous
45  * page entries to buffer the disk pages.
46  *
47  * A new or non-resident page entry must first be "bound" to an
48  * available cache page. Next the disk page must be "mapped" to the
49  * cache page. If the page is empty (never written) it is considered
50  * mapped trivially. Otherwise the cache page must be updated via
51  * "pagein" from disk. A bound and mapped page is called "resident".
52  *
53  * Updating a resident cache page makes it "dirty". A background
54  * clean-up process makes dirty pages "clean" via "pageout" to disk.
55  * Write ahead logging (WAL) of the page is done first i.e. UNDO log is
56  * flushed up to the page log sequence number (LSN) by calling a LGMAN
57  * method. The reason for this is obvious but not relevant to PGMAN.
58  *
59  * A local check point (LCP) periodically performs a complete pageout of
60  * dirty pages. It must iterate over a list which will cover all pages
61  * which had been dirty since LCP start.
62  *
63  * A clean page is a candidate ("victim") for being "unmapped" and
64  * "evicted" from the cache, to allow another page to become resident.
65  * This process is called "page replacement".
66  *
67  * PAGE REPLACEMENT
68  *
69  * Page replacement uses the LIRS algorithm (Jiang-Zhang).
70  *
71  * The "recency" of a page is the time between now and the last request
72  * for the page. The "inter-reference recency" (IRR) of a page is the
73  * time between the last 2 requests for the page. "Time" is advanced by
74  * request for any page.
75  *
76  * Page entries are divided into "hot" ("lir") and "cold" ("hir"). Here
77  * lir/hir refers to low/high IRR. Hot pages are always resident but
78  * cold pages need not be.
79  *
80  * Number of hot pages is limited to slightly less than number of cache
81  * pages. Until this number is reached, all used cache pages are hot.
82  * Then the algorithm described next is applied. The algorithm avoids
83  * storing any of the actual recency values.
84  *
85  * Primary data structure is the "stack". It contains all hot entries
86  * and recently referenced cold entries (resident or not). The stack is
87  * in recency order with most recent (lowest recency) entry on top.
88  * Entries which are less recent than the least recent hot page are
89  * removed ("stack pruning"). So the bottom page is always hot.
90  *
91  * The cold entries on the stack are undergoing a "trial period". If
92  * they are referenced soon again (see IRR), they become hot. Otherwise
93  * they fall off the bottom of the stack.
94  *
95  * Secondary data structure is the "queue". It contains all resident
96  * cold pages (on stack or not). When a hot page is removed from the
97  * stack it is added to the end of the queue. When page replacement
98  * needs a page it removes it from the front of the queue.
99  *
100  * Page requests cause the input entry to be inserted and updated in
101  * LIRS lists. Remember that an entry can be present on both stack and
102  * queue. The rules per type of input entry are:
103  *
104  * 1. Hot. Move input entry to stack top. If input entry was at stack
105  * bottom, do stack pruning.
106  *
107  * 2. Cold resident. Move input entry to stack top. Then:
108  *
109  * 2a. If input entry was on stack, change it to hot, remove it from
110  * queue, change stack bottom entry to cold and move the bottom entry to
111  * queue end, and do stack pruning.
112  *
113  * 2b. If input entry was on queue only, leave it cold but move it to
114  * end of queue.
115  *
116  * 3. Cold non-resident. Remove entry at queue front and evict it from
117  * the cache. If the evicted entry was on stack, it remains as unbound
118  * entry on stack, to continue its trial period. Map input entry to the
119  * freed cache page. Move input entry to stack top. Then:
120  *
121  * 3a. If input entry was on stack, change it to hot, change stack
122  * bottom entry to cold and move the bottom entry to queue end, and do
123  * stack pruning.
124  *
125  * 3b. If input entry was new, leave it cold but move it to end of
126  * queue.
127  *
128  * LIRS CHANGES
129  *
130  * In LIRS the 'resident' requirement is changed as follows:
131  *
132  * Stack entries, including hot ones, can have any state. Unbound stack
133  * entries are created by new requests and by pages evicted from queue
134  * front which are still on stack.
135  *
136  * Queue entries must be bound. They become resident and evictable
137  * within a finite time. A page is "evictable" if it is mapped, clean,
138  * and has no requests.
139  *
140  * An unbound entry which should be on queue is added there at bind
141  * time. Such entries are created when an unbound entry with open
142  * requests is popped (hot) or pruned (cold) from the stack. This can
143  * happen if the cache is too small.
144  *
145  * CLEANUP PROCESS
146  *
147  * LIRS (and related algorithms) do not address dirty pages. From above
148  * it is obvious that the clean-up process should process dirty queue
149  * entries proceeding from front to end. This also favors pages with
150  * lower LSN numbers which minimizes amount of WAL to write.
151  *
152  * In fact the clean-up process holds a permanent pointer into the queue
153  * where all entries strictly towards the front are clean. For such an
154  * entry to become dirty it must be referenced again which moves it to
155  * queue end and past the clean-up pointer. (In practice, until this
156  * works, cleanup recycles back to queue front).
157  *
158  * PAGE LISTS
159  *
160  * Page entries are put on a number of lists.
161  *
162  * 1. Hash table on (file_no, page_no). Used for fast lookup and for
163  * LCP to iterate over.
164  *
165  * The other lists are doubly-linked FIFOs. In general entries are
166  * added to the end (last entry) and processed from the front (first
167  * entry). When used as stack, end is top and front is bottom.
168  *
169  * 2. The LIRS stack and queue. These control page replacement.
170  *
171  * 3. Page entries are divided into disjoint "sublists" based on page
172  * "state" i.e. the set of page properties. Some sublists drive page
173  * processing and have next entry to process at the front.
174  *
175  * Current sublists are as follows. Those that drive processing are
176  * marked with a plus (+).
177  *
178  * SL_BIND + waiting for available buffer page
179  * SL_MAP + waiting to start pagein from disk
180  * SL_MAP_IO - above in i/o wait (the pagein)
181  * SL_CALLBACK + request done, waiting to invoke callbacks
182  * SL_CALLBACK_IO - above in i/o wait (pageout by cleanup)
183  * SL_BUSY - being written to by PGMAN client
184  * SL_LOCKED - permanently locked to cache
185  * SL_OTHER - default sublist
186  *
187  * PAGE PROCESSING
188  *
189  * Page processing uses a number independent continueB loops.
190  *
191  * 1. The "stats loop". Started at node start. Checks lists in debug
192  * mode. In the future could gather statistics and adjust parameters
193  * based on load. Continues via delay signal.
194  *
195  * 2. The "busy loop". Started by page request. Each loop does bind,
196  * map, and callback of a number of entries. Continues via no-delay
197  * signal until nothing to do.
198  *
199  * 3. The "cleanup loop". Started at node start. Each loop starts
200  * pageout of a number of dirty queue entries. Continues via delay
201  * signal.
202  *
203  * 4. The "LCP loop". Started periodically by NDB. Each loop starts
204  * pageout of a number of hash list entries. Continues via delay signal
205  * until done.
206  *
207  * SPECIAL CASES
208  *
209  * LOCKED pages are not put on stack or queue. They are flushed to disk
210  * by LCP but not by clean-up.
211  *
212  * A TUP scan is likely to access a page repeatedly within a short time.
213  * This can make the page hot when it should not be. Such "correlated
214  * requests" are handled by a request flag which modifies default LIRS
215  * processing. [fill in details later]
216  *
217  * Also PK operations make 2 rapid page references. The 2nd one is for
218  * commit. This too should be handled as a correlated request.
219  *
220  * CLIENT TSMAN
221  *
222  * TSMAN reads "meta" pages such as extent headers. There are currently
223  * "locked" forever in PGMAN cache.
224  *
225  * CLIENT DBTUP
226  *
227  * DBTUP works with copy pages (or UNDO buffers) in memory. The real
228  * page is updated only between page request with COMMIT_REQ flag and
229  * a subsequent LSN update. These need not occur in same timeslice
230  * since DBTUP may need to flush UNDO log in-between.
231  *
232  * The page is "busy" if any transaction is between COMMIT_REQ and LSN
233  * update. A busy page must be locked in buffer cache. No pageout of
234  * a busy page can be started by clean-up or LCP.
235  */
236 
237 class Pgman : public SimulatedBlock
238 {
239 public:
240  Pgman(Block_context& ctx, Uint32 instanceNumber = 0);
241  virtual ~Pgman();
242  BLOCK_DEFINES(Pgman);
243 
244 private:
245  friend class Page_cache_client;
246  friend class PgmanProxy;
247 
248  struct Page_entry; // CC
249  friend struct Page_entry;
250 
251  struct Page_request {
252  enum Flags {
253  OP_MASK = 0x000F // 4 bits for TUP operation
254  ,LOCK_PAGE = 0x0020 // lock page in memory
255  ,EMPTY_PAGE = 0x0040 // empty (new) page
256  ,ALLOC_REQ = 0x0080 // part of alloc
257  ,COMMIT_REQ = 0x0100 // part of commit
258  ,DIRTY_REQ = 0x0200 // make page dirty wo/ update_lsn
259  ,UNLOCK_PAGE = 0x0400
260  ,CORR_REQ = 0x0800 // correlated request (no LIRS update)
261 #ifdef ERROR_INSERT
262  ,DELAY_REQ = 0x1000 // Force request to be delayed
263 #endif
264  };
265 
266  Uint16 m_block; // includes instance
267  Uint16 m_flags;
268  SimulatedBlock::Callback m_callback;
269 
270 #ifdef ERROR_INSERT
271  Uint64 m_delay_until_time;
272 #endif
273  Uint32 nextList;
274  Uint32 m_magic;
275  };
276 
280 
281  struct Page_entry_stack_ptr {
282  Uint32 nextList;
283  Uint32 prevList;
284  };
285 
286  struct Page_entry_queue_ptr {
287  Uint32 nextList;
288  Uint32 prevList;
289  };
290 
291  struct Page_entry_sublist_ptr {
292  Uint32 nextList;
293  Uint32 prevList;
294  };
295 
296  typedef Uint16 Page_state;
297 
298  struct Page_entry : Page_entry_stack_ptr,
299  Page_entry_queue_ptr,
300  Page_entry_sublist_ptr {
301  Page_entry() {}
302  Page_entry(Uint32 file_no, Uint32 page_no);
303 
304  enum State {
305  NO_STATE = 0x0000
306  ,REQUEST = 0x0001 // has outstanding request
307  ,EMPTY = 0x0002 // empty (never written) page
308  ,BOUND = 0x0004 // m_real_page_ptr assigned
309  ,MAPPED = 0x0008 // bound, and empty or paged in
310  ,DIRTY = 0x0010 // page is modified
311  ,USED = 0x0020 // used by some tx (not set currently)
312  ,BUSY = 0x0040 // page is being written to
313  ,LOCKED = 0x0080 // locked in cache (forever)
314  ,PAGEIN = 0x0100 // paging in
315  ,PAGEOUT = 0x0200 // paging out
316  ,LOGSYNC = 0x0400 // undo WAL as part of pageout
317  ,LCP = 0x1000 // page is LCP flushed
318  ,HOT = 0x2000 // page is hot
319  ,ONSTACK = 0x4000 // page is on LIRS stack
320  ,ONQUEUE = 0x8000 // page is on LIRS queue
321  };
322 
323  enum Sublist {
324  SL_BIND = 0
325  ,SL_MAP = 1
326  ,SL_MAP_IO = 2
327  ,SL_CALLBACK = 3
328  ,SL_CALLBACK_IO = 4
329  ,SL_BUSY = 5
330  ,SL_LOCKED = 6
331  ,SL_IDLE = 7
332  ,SL_OTHER = 8
333  ,SUBLIST_COUNT = 9
334  };
335 
336  Uint16 m_file_no; // disk page address set at seize
337  Page_state m_state; // flags (0 for new entry)
338 
339  Uint32 m_page_no;
340  Uint32 m_real_page_i;
341  Uint64 m_lsn;
342 
343  Uint32 m_last_lcp;
344  Uint32 m_dirty_count;
345  Uint32 m_copy_page_i;
346  union {
347  Uint32 m_busy_count; // non-zero means BUSY
348  Uint32 nextPool;
349  };
350 
351  Page_request_list::Head m_requests;
352 
353  Uint32 nextHash;
354  Uint32 prevHash;
355 
356  Uint32 hashValue() const { return m_file_no << 16 | m_page_no; }
357  bool equal(const Page_entry& obj) const {
358  return
359  m_file_no == obj.m_file_no && m_page_no == obj.m_page_no;
360  }
361 
362 #ifdef VM_TRACE
363  Pgman* m_this;
364 #endif
365  };
366 
371 
372  class Dbtup *c_tup;
373  class Lgman *c_lgman;
374  class Tsman *c_tsman;
375 
376  // loop status
377  bool m_stats_loop_on;
378  bool m_busy_loop_on;
379  bool m_cleanup_loop_on;
380 
381  // LCP variables
382  enum LCP_STATE
383  {
384  LS_LCP_OFF = 0
385  ,LS_LCP_ON = 1
386  ,LS_LCP_MAX_LCP_OUTSTANDING = 2
387  ,LS_LCP_LOCKED = 3
388  } m_lcp_state;
389  Uint32 m_last_lcp;
390  Uint32 m_last_lcp_complete;
391  Uint32 m_lcp_curr_bucket;
392  Uint32 m_lcp_outstanding; // remaining i/o waits
393  EndLcpReq m_end_lcp_req;
394 
395  // clean-up variables
396  Ptr<Page_entry> m_cleanup_ptr;
397 
398  // file map
399  typedef DataBuffer<15> File_map;
400  File_map m_file_map;
401  File_map::DataBufferPool m_data_buffer_pool;
402 
403  // page entries and requests
404  Page_request_pool m_page_request_pool;
405  ArrayPool<Page_entry> m_page_entry_pool;
406  Page_hashlist m_page_hashlist;
407  Page_stack m_page_stack;
408  Page_queue m_page_queue;
409  Page_sublist* m_page_sublist[Page_entry::SUBLIST_COUNT];
410 
411  // configuration
412  struct Param {
413  Param();
414  Uint32 m_max_pages; // max number of cache pages
415  Uint32 m_lirs_stack_mult; // in m_max_pages (around 3-10)
416  Uint32 m_max_hot_pages; // max hot cache pages (up to 99%)
417  Uint32 m_max_loop_count; // limit purely local loops
418  Uint32 m_max_io_waits;
419  Uint32 m_stats_loop_delay;
420  Uint32 m_cleanup_loop_delay;
421  Uint32 m_lcp_loop_delay;
422  } m_param;
423 
424  // runtime sizes and statistics
425  struct Stats {
426  Stats() :
427  m_num_pages(0),
428  m_num_hot_pages(0),
429  m_current_io_waits(0),
430  m_page_hits(0),
431  m_page_faults(0),
432  m_pages_written(0),
433  m_pages_written_lcp(0),
434  m_pages_read(0),
435  m_log_waits(0),
436  m_page_requests_direct_return(0),
437  m_page_requests_wait_q(0),
438  m_page_requests_wait_io(0)
439  {}
440  Uint32 m_num_pages; // current number of cache pages
441  Uint32 m_num_hot_pages;
442  Uint32 m_current_io_waits;
443  Uint64 m_page_hits;
444  Uint64 m_page_faults;
445  Uint64 m_pages_written;
446  Uint64 m_pages_written_lcp;
447  Uint64 m_pages_read;
448  Uint64 m_log_waits; // wait for undo WAL to flush the log recs
449  Uint64 m_page_requests_direct_return;
450  Uint64 m_page_requests_wait_q;
451  Uint64 m_page_requests_wait_io;
452  } m_stats;
453 
454  enum CallbackIndex {
455  // lgman
456  LOGSYNC_CALLBACK = 1,
457  COUNT_CALLBACKS = 2
458  };
459  CallbackEntry m_callbackEntry[COUNT_CALLBACKS];
460  CallbackTable m_callbackTable;
461 
462 protected:
463  void execSTTOR(Signal* signal);
464  void sendSTTORRY(Signal*);
465  void execREAD_CONFIG_REQ(Signal* signal);
466  void execCONTINUEB(Signal* signal);
467 
468  void execLCP_FRAG_ORD(Signal*);
469  void execEND_LCP_REQ(Signal*);
470  void execRELEASE_PAGES_REQ(Signal*);
471 
472  void execFSREADCONF(Signal*);
473  void execFSREADREF(Signal*);
474  void execFSWRITECONF(Signal*);
475  void execFSWRITEREF(Signal*);
476 
477  void execDUMP_STATE_ORD(Signal* signal);
478 
479  void execDATA_FILE_ORD(Signal*);
480 
481  void execDBINFO_SCANREQ(Signal*);
482 
483 private:
484  static Uint32 get_sublist_no(Page_state state);
485  void set_page_state(Ptr<Page_entry> ptr, Page_state new_state);
486 
487  bool seize_cache_page(Ptr<GlobalPage>& gptr);
488  void release_cache_page(Uint32 i);
489 
490  bool find_page_entry(Ptr<Page_entry>&, Uint32 file_no, Uint32 page_no);
491  Uint32 seize_page_entry(Ptr<Page_entry>&, Uint32 file_no, Uint32 page_no);
492  bool get_page_entry(Ptr<Page_entry>&, Uint32 file_no, Uint32 page_no);
493  void release_page_entry(Ptr<Page_entry>&);
494 
495  void lirs_stack_prune();
496  void lirs_stack_pop();
497  void lirs_reference(Ptr<Page_entry> ptr);
498 
499  void do_stats_loop(Signal*);
500  void do_busy_loop(Signal*, bool direct = false);
501  void do_cleanup_loop(Signal*);
502  void do_lcp_loop(Signal*);
503 
504  bool process_bind(Signal*);
505  bool process_bind(Signal*, Ptr<Page_entry> ptr);
506  bool process_map(Signal*);
507  bool process_map(Signal*, Ptr<Page_entry> ptr);
508  bool process_callback(Signal*);
509  bool process_callback(Signal*, Ptr<Page_entry> ptr);
510 
511  bool process_cleanup(Signal*);
512  void move_cleanup_ptr(Ptr<Page_entry> ptr);
513 
514  LCP_STATE process_lcp(Signal*);
515  void process_lcp_locked(Signal* signal, Ptr<Page_entry> ptr);
516  void process_lcp_locked_fswriteconf(Signal* signal, Ptr<Page_entry> ptr);
517 
518  void pagein(Signal*, Ptr<Page_entry>);
519  void fsreadreq(Signal*, Ptr<Page_entry>);
520  void fsreadconf(Signal*, Ptr<Page_entry>);
521  void pageout(Signal*, Ptr<Page_entry>);
522  void logsync_callback(Signal*, Uint32 ptrI, Uint32 res);
523  void fswritereq(Signal*, Ptr<Page_entry>);
524  void fswriteconf(Signal*, Ptr<Page_entry>);
525 
526  int get_page_no_lirs(Signal*, Ptr<Page_entry>, Page_request page_req);
527  int get_page(Signal*, Ptr<Page_entry>, Page_request page_req);
528  void update_lsn(Ptr<Page_entry>, Uint32 block, Uint64 lsn);
529  Uint32 create_data_file();
530  Uint32 alloc_data_file(Uint32 file_no);
531  void map_file_no(Uint32 file_no, Uint32 fd);
532  void free_data_file(Uint32 file_no, Uint32 fd = RNIL);
533  int drop_page(Ptr<Page_entry>);
534 
535 #ifdef VM_TRACE
536  bool debugFlag; // not yet in use in 7.0
537  bool debugSummaryFlag; // loop summary to signal log even if ! debugFlag
538  void verify_page_entry(Ptr<Page_entry> ptr);
539  void verify_page_lists();
540  void verify_all();
541  bool dump_page_lists(Uint32 ptrI = RNIL);
542 #endif
543  static const char* get_sublist_name(Uint32 list_no);
544  friend class NdbOut& operator<<(NdbOut&, Ptr<Page_request>);
545  friend class NdbOut& operator<<(NdbOut&, Ptr<Page_entry>);
546 };
547 
548 class NdbOut& operator<<(NdbOut&, Ptr<Pgman::Page_request>);
549 class NdbOut& operator<<(NdbOut&, Ptr<Pgman::Page_entry>);
550 
552 {
553  friend class PgmanProxy;
554  Uint32 m_block; // includes instance
555  class PgmanProxy* m_pgman_proxy; // set if we go via proxy
556  Pgman* m_pgman;
557  DEBUG_OUT_DEFINES(PGMAN);
558 
559 public:
561 
562  struct Request {
563  Local_key m_page;
564  SimulatedBlock::Callback m_callback;
565 
566 #ifdef ERROR_INSERT
567  Uint64 m_delay_until_time;
568 #endif
569  };
570 
571  Ptr<GlobalPage> m_ptr; // TODO remove
572 
573  enum RequestFlags {
574  LOCK_PAGE = Pgman::Page_request::LOCK_PAGE
575  ,EMPTY_PAGE = Pgman::Page_request::EMPTY_PAGE
576  ,ALLOC_REQ = Pgman::Page_request::ALLOC_REQ
577  ,COMMIT_REQ = Pgman::Page_request::COMMIT_REQ
578  ,DIRTY_REQ = Pgman::Page_request::DIRTY_REQ
579  ,UNLOCK_PAGE = Pgman::Page_request::UNLOCK_PAGE
580  ,CORR_REQ = Pgman::Page_request::CORR_REQ
581 #ifdef ERROR_INSERT
582  ,DELAY_REQ = Pgman::Page_request::DELAY_REQ
583 #endif
584  };
585 
594  int get_page(Signal*, Request&, Uint32 flags);
595 
596  void update_lsn(Local_key, Uint64 lsn);
597 
605  int drop_page(Local_key, Uint32 page_id);
606 
610  Uint32 create_data_file(Signal*);
611 
615  Uint32 alloc_data_file(Signal*, Uint32 file_no);
616 
620  void map_file_no(Signal*, Uint32 m_file_no, Uint32 m_fd);
621 
625  void free_data_file(Signal*, Uint32 file_no, Uint32 fd = RNIL);
626 };
627 
628 #endif