MySQL 5.6.14 Source Code Document
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
semisync_master.h
1 /* Copyright (C) 2007 Google Inc.
2  Copyright (c) 2008 MySQL AB, 2009 Sun Microsystems, Inc.
3  Use is subject to license terms.
4 
5  This program is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation; version 2 of the License.
8 
9  This program is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  GNU General Public License for more details.
13 
14  You should have received a copy of the GNU General Public License
15  along with this program; if not, write to the Free Software
16  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
17 
18 
19 #ifndef SEMISYNC_MASTER_H
20 #define SEMISYNC_MASTER_H
21 
22 #include "semisync.h"
23 
24 #ifdef HAVE_PSI_INTERFACE
25 extern PSI_mutex_key key_ss_mutex_LOCK_binlog_;
26 extern PSI_cond_key key_ss_cond_COND_binlog_send_;
27 #endif
28 
29 extern PSI_stage_info stage_waiting_for_semi_sync_ack_from_slave;
30 
/* One tracked binlog position: a (file name, offset) pair that can be linked
 * into a sorted list and into a hash-collision chain at the same time. */
31 struct TranxNode {
32  char log_name_[FN_REFLEN]; /* binlog file name of the position */
33  my_off_t log_pos_; /* offset inside log_name_ */
34  struct TranxNode *next_; /* the next node in the sorted list */
35  struct TranxNode *hash_next_; /* the next node during hash collision */
36 };
37 
71 #define BLOCK_TRANX_NODES 16
73 {
74 public:
82  TranxNodeAllocator(uint reserved_nodes) :
83  reserved_blocks(reserved_nodes/BLOCK_TRANX_NODES +
84  (reserved_nodes%BLOCK_TRANX_NODES > 1 ? 2 : 1)),
85  first_block(NULL), last_block(NULL),
86  current_block(NULL), last_node(-1), block_num(0) {}
87 
/* NOTE(review): the declaration line for this body is missing from this
 * listing; since it releases every Block in the list, it is presumably the
 * destructor. */
89  {
90  Block *block= first_block;
91  while (block != NULL)
92  {
93  Block *next= block->next; /* read the link before the Block is freed */
94  free_block(block);
95  block= next;
96  }
97  }
98 
/* NOTE(review): the declaration line for this body is missing from this
 * listing. The body hands out the next unused TranxNode slot with all of its
 * fields reset, and returns a pointer to it. It returns NULL when a new Block
 * was needed and allocate_block() failed. */
108  {
109  TranxNode *trx_node;
110  Block *block= current_block; /* saved so a failed allocation can be rolled back */
111 
112  if (last_node == BLOCK_TRANX_NODES-1) /* current Block is full */
113  {
114  current_block= current_block->next; /* may become NULL if no Block follows */
115  last_node= -1;
116  }
117 
118  if (current_block == NULL && allocate_block()) /* non-zero from allocate_block() means failure */
119  {
120  current_block= block; /* restore the Block that was current on entry */
121  if (current_block)
122  last_node= BLOCK_TRANX_NODES-1; /* that Block was full, mark it full again */
123  return NULL;
124  }
125 
126  trx_node= &(current_block->nodes[++last_node]); /* claim the next slot */
127  trx_node->log_name_[0] = '\0';
128  trx_node->log_pos_= 0;
129  trx_node->next_= 0;
130  trx_node->hash_next_= 0;
131  return trx_node;
132  }
133 
/* NOTE(review): the declaration line for this body is missing from this
 * listing. The body rewinds the allocator to the first Block so every slot
 * counts as unused again, lets free_blocks() release surplus Blocks, and
 * always returns 0. */
140  {
141  current_block= first_block;
142  last_node= -1; /* -1: no slot of current_block has been handed out yet */
143  free_blocks();
144  return 0;
145  }
146 
/* NOTE(review): the declaration line for this body is missing from this
 * listing; the body reads a parameter named `node`. It searches the Blocks
 * from first_block up to and including current_block for the one whose nodes[]
 * array contains `node`. When that Block is not already first, the Blocks in
 * front of it are moved to the rear of the list and free_blocks() is called.
 * Returns 0 when the Block is found, 1 (after DBUG_ASSERT) otherwise. */
156  {
157  Block *block;
158  Block *prev_block= NULL;
159 
160  block= first_block;
161  while (block != current_block->next) /* stop after current_block has been examined */
162  {
163  /* Find the Block containing the given node.
       NOTE(review): the upper bound uses >=, so the one-past-the-end address
       of nodes[] is accepted too; `>` would be the exact bound. */
164  if (&(block->nodes[0]) <= node && &(block->nodes[BLOCK_TRANX_NODES]) >= node)
165  {
166  /* All Blocks before the given node are put into the rear */
167  if (first_block != block)
168  {
169  last_block->next= first_block; /* chain the old head after the old tail */
170  first_block= block; /* the Block holding node becomes the new head */
171  last_block= prev_block; /* the Block just before it becomes the new tail */
172  last_block->next= NULL;
173  free_blocks();
174  }
175  return 0;
176  }
177  prev_block= block;
178  block= block->next;
179  }
180 
181  /* Node not found: this should never happen */
182  DBUG_ASSERT(0);
183  return 1;
184  }
185 
186 private:
187  uint reserved_blocks;
188 
/* A chunk of BLOCK_TRANX_NODES TranxNode slots; Blocks are chained into a
 * singly linked list through `next`. */
197  struct Block {
198  Block *next; /* following Block in the list, NULL at the tail */
199  TranxNode nodes[BLOCK_TRANX_NODES]; /* the node slots handed out by the allocator */
200  };
201 
205  Block *first_block;
209  Block *last_block;
210 
216  Block *current_block;
217 
222  int last_node;
223 
227  uint block_num;
228 
232  int allocate_block()
233  {
234  Block *block= (Block *)my_malloc(sizeof(Block), MYF(0));
235  if (block)
236  {
237  block->next= NULL;
238 
239  if (first_block == NULL)
240  first_block= block;
241  else
242  last_block->next= block;
243 
244  /* New Block is always put into the rear */
245  last_block= block;
246  /* New Block is always the current_block */
247  current_block= block;
248  ++block_num;
249  return 0;
250  }
251  return 1;
252  }
253 
258  void free_block(Block *block)
259  {
260  my_free(block);
261  --block_num;
262  }
263 
264 
272  void free_blocks()
273  {
274  if (current_block == NULL || current_block->next == NULL)
275  return;
276 
277  /* One free Block is always kept behind the current block */
278  Block *block= current_block->next->next;
279  while (block_num > reserved_blocks && block != NULL)
280  {
281  Block *next= block->next;
282  free_block(block);
283  block= next;
284  }
285  current_block->next->next= block;
286  if (block == NULL)
287  last_block= current_block->next;
288  }
289 };
290 
300  :public Trace {
301 private:
302 
303  TranxNodeAllocator allocator_;
304  /* These two record the active transaction list in sort order. */
305  TranxNode *trx_front_, *trx_rear_;
306 
307  TranxNode **trx_htb_; /* A hash table on active transactions. */
308 
309  int num_entries_; /* maximum hash table entries */
310  mysql_mutex_t *lock_; /* mutex lock */
311 
312  inline void assert_lock_owner();
313 
314  inline unsigned int calc_hash(const unsigned char *key,unsigned int length);
315  unsigned int get_hash_value(const char *log_file_name, my_off_t log_file_pos);
316 
317  int compare(const char *log_file_name1, my_off_t log_file_pos1,
318  const TranxNode *node2) {
319  return compare(log_file_name1, log_file_pos1,
320  node2->log_name_, node2->log_pos_);
321  }
322  int compare(const TranxNode *node1,
323  const char *log_file_name2, my_off_t log_file_pos2) {
324  return compare(node1->log_name_, node1->log_pos_,
325  log_file_name2, log_file_pos2);
326  }
327  int compare(const TranxNode *node1, const TranxNode *node2) {
328  return compare(node1->log_name_, node1->log_pos_,
329  node2->log_name_, node2->log_pos_);
330  }
331 
332 public:
333  ActiveTranx(mysql_mutex_t *lock, unsigned long trace_level);
334  ~ActiveTranx();
335 
336  /* Insert an active transaction node with the specified position.
337  *
338  * Return:
339  * 0: success; non-zero: error
340  */
341  int insert_tranx_node(const char *log_file_name, my_off_t log_file_pos);
342 
343  /* Clear the active transaction nodes until(inclusive) the specified
344  * position.
345  * If log_file_name is NULL, everything will be cleared: the sorted
346  * list and the hash table will be reset to empty.
347  *
348  * Return:
349  * 0: success; non-zero: error
350  */
351  int clear_active_tranx_nodes(const char *log_file_name,
352  my_off_t log_file_pos);
353 
354  /* Given a position, check to see whether the position is an active
355  * transaction's ending position by probing the hash table.
356  */
357  bool is_tranx_end_pos(const char *log_file_name, my_off_t log_file_pos);
358 
359  /* Given two binlog positions, compare which one is bigger based on
360  * (file_name, file_position).
361  */
362  static int compare(const char *log_file_name1, my_off_t log_file_pos1,
363  const char *log_file_name2, my_off_t log_file_pos2);
364 
365 };
366 
371  :public ReplSemiSyncBase {
372  private:
373  ActiveTranx *active_tranxs_; /* active transaction list: the list will
374  be cleared when semi-sync switches off. */
375 
376  /* True when initObject has been called */
377  bool init_done_;
378 
379  /* This cond variable is signaled when enough binlog has been sent to slave,
380  * so that a waiting trx can return the 'ok' to the client for a commit.
381  */
382  mysql_cond_t COND_binlog_send_;
383 
384  /* Mutex that protects the following state variables and the active
385  * transaction list.
386  * Under no circumstances can we acquire mysql_bin_log.LOCK_log if we are
387  * already holding LOCK_binlog_ because it can cause deadlocks.
388  */
389  mysql_mutex_t LOCK_binlog_;
390 
391  /* This is set to true when reply_file_name_ contains meaningful data. */
392  bool reply_file_name_inited_;
393 
394  /* The binlog name up to which we have received replies from any slaves. */
395  char reply_file_name_[FN_REFLEN];
396 
397  /* The position in that file up to which we have the reply from any slaves. */
398  my_off_t reply_file_pos_;
399 
400  /* This is set to true when we know the 'smallest' wait position. */
401  bool wait_file_name_inited_;
402 
403  /* NULL, or the 'smallest' filename that a transaction is waiting for
404  * slave replies.
405  */
406  char wait_file_name_[FN_REFLEN];
407 
408  /* The smallest position in that file that a trx is waiting for: the trx
409  * can proceed and send an 'ok' to the client when the master has got the
410  * reply from the slave indicating that it already got the binlog events.
411  */
412  my_off_t wait_file_pos_;
413 
414  /* This is set to true when we know the 'largest' transaction commit
415  * position in the binlog file.
416  * We always maintain the position no matter whether semi-sync is switched
417  * on or switched off. When a transaction wait timeout occurs, semi-sync will
418  * switch off. Binlog-dump thread can use the three fields to detect when
419  * slaves catch up on replication so that semi-sync can switch on again.
420  */
421  bool commit_file_name_inited_;
422 
423  /* The 'largest' binlog filename that a commit transaction is seeing. */
424  char commit_file_name_[FN_REFLEN];
425 
426  /* The 'largest' position in that file that a commit transaction is seeing. */
427  my_off_t commit_file_pos_;
428 
429  /* All global variables which can be set by parameters. */
430  volatile bool master_enabled_; /* semi-sync is enabled on the master */
431  unsigned long wait_timeout_; /* timeout period(ms) during tranx wait */
432 
433  bool state_; /* whether semi-sync is switched */
434 
435  void lock();
436  void unlock();
437  void cond_broadcast();
438  int cond_timewait(struct timespec *wait_time);
439 
440  /* Is semi-sync replication on? */
441  bool is_on() {
442  return (state_);
443  }
444 
445  void set_master_enabled(bool enabled) {
446  master_enabled_ = enabled;
447  }
448 
449  /* Switch semi-sync off because of timeout in transaction waiting. */
450  int switch_off();
451 
452  /* Switch semi-sync on when slaves catch up. */
453  int try_switch_on(int server_id,
454  const char *log_file_name, my_off_t log_file_pos);
455 
456  public:
459 
460  bool getMasterEnabled() {
461  return master_enabled_;
462  }
463  void setTraceLevel(unsigned long trace_level) {
464  trace_level_ = trace_level;
465  if (active_tranxs_)
466  active_tranxs_->trace_level_ = trace_level;
467  }
468 
469  /* Set the transaction wait timeout period, in milliseconds. */
470  void setWaitTimeout(unsigned long wait_timeout) {
471  wait_timeout_ = wait_timeout;
472  }
473 
474  /* Initialize this class after MySQL parameters are initialized. this
475  * function should be called once at bootstrap time.
476  */
477  int initObject();
478 
479  /* Enable the object to enable semi-sync replication inside the master. */
480  int enableMaster();
481 
482  /* Disable the object to disable semi-sync replication inside the master. */
483  int disableMaster();
484 
485  /* Add a semi-sync replication slave */
486  void add_slave();
487 
488  /* Remove a semi-sync replication slave */
489  void remove_slave();
490 
491  /* Has the slave served by the current thread requested semi-sync? */
492  bool is_semi_sync_slave();
493 
494  /* In semi-sync replication, reports up to which binlog position we have
495  * received replies from the slave indicating that it already get the events
496  * or that was skipped in the master.
497  *
498  * Input:
499  * server_id - (IN) master server id number
500  * log_file_name - (IN) binlog file name
501  * end_offset - (IN) the offset in the binlog file up to which we have
502  * the replies from the slave or that was skipped
503  * skipped_event - (IN) if the event was skipped
504  *
505  * Return:
506  * 0: success; non-zero: error
507  */
508  int reportReplyBinlog(uint32 server_id,
509  const char* log_file_name,
510  my_off_t end_offset,
511  bool skipped_event= false);
512 
513  /* Commit a transaction in the final step. This function is called from
514  * InnoDB before returning from the low commit. If semi-sync is switched on,
515  * the function will wait to see whether the binlog-dump thread gets the reply
516  * for the events of the transaction. Remember that this is not a direct wait;
517  * instead, it waits to see whether the binlog-dump thread has reached the
518  * point. If the wait times out, semi-sync status will be switched off and
519  * all other transactions will not wait either.
520  *
521  * Input: (the transaction events' ending binlog position)
522  * trx_wait_binlog_name - (IN) ending position's file name
523  * trx_wait_binlog_pos - (IN) ending position's file offset
524  *
525  * Return:
526  * 0: success; non-zero: error
527  */
528  int commitTrx(const char* trx_wait_binlog_name,
529  my_off_t trx_wait_binlog_pos);
530 
531  /* Reserve space in the replication event packet header:
532  * . slave semi-sync off: 1 byte - (0)
533  * . slave semi-sync on: 3 bytes - (0, 0xef, 0/1)
534  *
535  * Input:
536  * header - (IN) the header buffer
537  * size - (IN) size of the header buffer
538  *
539  * Return:
540  * size of the bytes reserved for header
541  */
542  int reserveSyncHeader(unsigned char *header, unsigned long size);
543 
544  /* Update the sync bit in the packet header to indicate to the slave whether
545  * the master will wait for the reply of the event. If semi-sync is switched
546  * off and we detect that the slave is catching up, we switch semi-sync on.
547  *
548  * Input:
549  * packet - (IN) the packet containing the replication event
550  * log_file_name - (IN) the event ending position's file name
551  * log_file_pos - (IN) the event ending position's file offset
552  * server_id - (IN) master server id number
553  *
554  * Return:
555  * 0: success; non-zero: error
556  */
557  int updateSyncHeader(unsigned char *packet,
558  const char *log_file_name,
559  my_off_t log_file_pos,
560  uint32 server_id);
561 
562  /* Called when a transaction finished writing binlog events.
563  * . update the 'largest' transactions' binlog event position
564  * . insert the ending position in the active transaction list if
565  * semi-sync is on
566  *
567  * Input: (the transaction events' ending binlog position)
568  * log_file_name - (IN) transaction ending position's file name
569  * log_file_pos - (IN) transaction ending position's file offset
570  *
571  * Return:
572  * 0: success; non-zero: error
573  */
574  int writeTranxInBinlog(const char* log_file_name, my_off_t log_file_pos);
575 
576  /* Read the slave's reply so that we know how much progress the slave makes
577  * on receive replication events.
578  *
579  * Input:
580  * net - (IN) the connection to master
581  * server_id - (IN) master server id number
582  * event_buf - (IN) pointer to the event packet
583  *
584  * Return:
585  * 0: success; non-zero: error
586  */
587  int readSlaveReply(NET *net, uint32 server_id, const char *event_buf);
588 
589  /* In semi-sync replication, this method simulates the reception of
590  * a reply and executes reportReplyBinlog directly when a transaction
591  * is skipped in the master.
592  *
593  * Input:
594  * event_buf - (IN) pointer to the event packet
595  * server_id - (IN) master server id number
596  * log_file_name - (IN) the event ending position's file name
597  * log_file_pos - (IN) the event ending position's file offset
598  *
599  * Return:
600  * 0: success; non-zero: error
601  */
602  int skipSlaveReply(const char *event_buf, uint32 server_id,
603  const char* log_file_name, my_off_t log_file_pos);
604 
605  /* Export internal statistics for semi-sync replication. */
606  void setExportStats();
607 
608  /* 'reset master' command is issued from the user and semi-sync needs to
609  * go off for that.
610  */
611  int resetMaster();
612 };
613 
614 /* System and status variables for the master component */
615 extern char rpl_semi_sync_master_enabled;
616 extern char rpl_semi_sync_master_status;
617 extern unsigned long rpl_semi_sync_master_clients;
618 extern unsigned long rpl_semi_sync_master_timeout;
619 extern unsigned long rpl_semi_sync_master_trace_level;
620 extern unsigned long rpl_semi_sync_master_yes_transactions;
621 extern unsigned long rpl_semi_sync_master_no_transactions;
622 extern unsigned long rpl_semi_sync_master_off_times;
623 extern unsigned long rpl_semi_sync_master_wait_timeouts;
624 extern unsigned long rpl_semi_sync_master_timefunc_fails;
625 extern unsigned long rpl_semi_sync_master_num_timeouts;
626 extern unsigned long rpl_semi_sync_master_wait_sessions;
627 extern unsigned long rpl_semi_sync_master_wait_pos_backtraverse;
628 extern unsigned long rpl_semi_sync_master_avg_trx_wait_time;
629 extern unsigned long rpl_semi_sync_master_avg_net_wait_time;
630 extern unsigned long long rpl_semi_sync_master_net_wait_num;
631 extern unsigned long long rpl_semi_sync_master_trx_wait_num;
632 extern unsigned long long rpl_semi_sync_master_net_wait_time;
633 extern unsigned long long rpl_semi_sync_master_trx_wait_time;
634 
635 /*
636  This indicates whether we should keep waiting if no semi-sync slave
637  is available.
638  0 : stop waiting if no available semi-sync slave is detected.
639  1 (default) : keep waiting until timeout even if no semi-sync slave is available.
640 */
641 extern char rpl_semi_sync_master_wait_no_slave;
642 
643 #endif /* SEMISYNC_MASTER_H */