1/* Copyright (C) 2007 Google Inc.
2 Copyright (c) 2008 MySQL AB, 2009 Sun Microsystems, Inc.
3 Use is subject to license terms.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; version 2 of the License.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
17
18
19#ifndef SEMISYNC_MASTER_H
20#define SEMISYNC_MASTER_H
21
22#include "semisync.h"
23#include "semisync_master_ack_receiver.h"
24
/*
  Instrumentation keys, declared only when the PSI interface is compiled in.
  NOTE(review): presumably these are the keys registered for the
  LOCK_binlog mutex and COND_binlog_send condition variable of
  Repl_semi_sync_master -- confirm against the definitions.
*/
#if defined(HAVE_PSI_INTERFACE)
extern PSI_mutex_key key_LOCK_binlog;
extern PSI_cond_key key_COND_binlog_send;
#endif /* HAVE_PSI_INTERFACE */
29
30struct Tranx_node {
31 char log_name[FN_REFLEN];
32 my_off_t log_pos;
33 struct Tranx_node *next; /* the next node in the sorted list */
34 struct Tranx_node *hash_next; /* the next node during hash collision */
35};
36
37/**
38 @class Tranx_node_allocator
39
40 This class provides memory allocating and freeing methods for
41 Tranx_node. The main target is performance.
42
43 @section ALLOCATE How to allocate a node
44 The pointer of the first node after 'last_node' in current_block is
45 returned. current_block will move to the next free Block when all nodes of
46 it are in use. A new Block is allocated and is put into the rear of the
47 Block link table if no Block is free.
48
49 The list starts up empty (ie, there is no allocated Block).
50
  After some nodes are freed, there are probably some free nodes before
  the sequence of allocated nodes, but we do not reuse them. It is better
  to keep the allocated nodes in sequence, because that makes allocating
  and freeing Tranx_nodes more efficient.
55
56 @section FREENODE How to free nodes
57 There are two methods for freeing nodes. They are free_all_nodes and
58 free_nodes_before.
59
60 'A Block is free' means all of its nodes are free.
61 @subsection free_nodes_before
  As all allocated nodes are in sequence, 'before one node' means all
  nodes before the given node in the same Block plus all Blocks before the
  Block which contains the given node. As such, all Blocks before the given
  one ('node') are free Blocks and are moved to the rear of the Block link
  table. The Block containing the given 'node', however, is not, because at
  least the given 'node' is still in use. This wastes at most one Block, but
  it is more efficient.
69 */
70#define BLOCK_TRANX_NODES 16
71class Tranx_node_allocator
72{
73public:
74 /**
75 @param reserved_nodes
76 The number of reserved Tranx_nodes. It is used to set 'reserved_blocks'
77 which can contain at least 'reserved_nodes' number of Tranx_nodes. When
78 freeing memory, we will reserve at least reserved_blocks of Blocks not
79 freed.
80 */
81 Tranx_node_allocator(uint reserved_nodes) :
82 reserved_blocks(reserved_nodes/BLOCK_TRANX_NODES +
83 (reserved_nodes%BLOCK_TRANX_NODES > 1 ? 2 : 1)),
84 first_block(NULL), last_block(NULL),
85 current_block(NULL), last_node(-1), block_num(0) {}
86
87 ~Tranx_node_allocator()
88 {
89 Block *block= first_block;
90 while (block != NULL)
91 {
92 Block *next= block->next;
93 free_block(block);
94 block= next;
95 }
96 }
97
98 /**
99 The pointer of the first node after 'last_node' in current_block is
100 returned. current_block will move to the next free Block when all nodes of
101 it are in use. A new Block is allocated and is put into the rear of the
102 Block link table if no Block is free.
103
104 @return Return a Tranx_node *, or NULL if an error occurred.
105 */
106 Tranx_node *allocate_node()
107 {
108 Tranx_node *trx_node;
109 Block *block= current_block;
110
111 if (last_node == BLOCK_TRANX_NODES-1)
112 {
113 current_block= current_block->next;
114 last_node= -1;
115 }
116
117 if (current_block == NULL && allocate_block())
118 {
119 current_block= block;
120 if (current_block)
121 last_node= BLOCK_TRANX_NODES-1;
122 return NULL;
123 }
124
125 trx_node= &(current_block->nodes[++last_node]);
126 trx_node->log_name[0] = '\0';
127 trx_node->log_pos= 0;
128 trx_node->next= 0;
129 trx_node->hash_next= 0;
130 return trx_node;
131 }
132
133 /**
134 All nodes are freed.
135
136 @return Return 0, or 1 if an error occurred.
137 */
138 int free_all_nodes()
139 {
140 current_block= first_block;
141 last_node= -1;
142 free_blocks();
143 return 0;
144 }
145
146 /**
147 All Blocks before the given 'node' are free Block and moved into the rear
148 of the Block link table.
149
150 @param node All nodes before 'node' will be freed
151
152 @return Return 0, or 1 if an error occurred.
153 */
154 int free_nodes_before(Tranx_node* node)
155 {
156 Block *block;
157 Block *prev_block= NULL;
158
159 block= first_block;
160 while (block != current_block->next)
161 {
162 /* Find the Block containing the given node */
163 if (&(block->nodes[0]) <= node && &(block->nodes[BLOCK_TRANX_NODES]) >= node)
164 {
165 /* All Blocks before the given node are put into the rear */
166 if (first_block != block)
167 {
168 last_block->next= first_block;
169 first_block= block;
170 last_block= prev_block;
171 last_block->next= NULL;
172 free_blocks();
173 }
174 return 0;
175 }
176 prev_block= block;
177 block= block->next;
178 }
179
180 /* Node does not find should never happen */
181 DBUG_ASSERT(0);
182 return 1;
183 }
184
185private:
186 uint reserved_blocks;
187
188 /**
189 A sequence memory which contains BLOCK_TRANX_NODES Tranx_nodes.
190
191 BLOCK_TRANX_NODES The number of Tranx_nodes which are in a Block.
192
193 next Every Block has a 'next' pointer which points to the next Block.
194 These linking Blocks constitute a Block link table.
195 */
196 struct Block {
197 Block *next;
198 Tranx_node nodes[BLOCK_TRANX_NODES];
199 };
200
201 /**
202 The 'first_block' is the head of the Block link table;
203 */
204 Block *first_block;
205 /**
206 The 'last_block' is the rear of the Block link table;
207 */
208 Block *last_block;
209
210 /**
211 current_block always points the Block in the Block link table in
212 which the last allocated node is. The Blocks before it are all in use
213 and the Blocks after it are all free.
214 */
215 Block *current_block;
216
217 /**
218 It always points to the last node which has been allocated in the
219 current_block.
220 */
221 int last_node;
222
223 /**
224 How many Blocks are in the Block link table.
225 */
226 uint block_num;
227
228 /**
229 Allocate a block and then assign it to current_block.
230 */
231 int allocate_block()
232 {
233 Block *block= (Block *)my_malloc(sizeof(Block), MYF(0));
234 if (block)
235 {
236 block->next= NULL;
237
238 if (first_block == NULL)
239 first_block= block;
240 else
241 last_block->next= block;
242
243 /* New Block is always put into the rear */
244 last_block= block;
245 /* New Block is always the current_block */
246 current_block= block;
247 ++block_num;
248 return 0;
249 }
250 return 1;
251 }
252
253 /**
254 Free a given Block.
255 @param block The Block will be freed.
256 */
257 void free_block(Block *block)
258 {
259 my_free(block);
260 --block_num;
261 }
262
263
264 /**
265 If there are some free Blocks and the total number of the Blocks in the
266 Block link table is larger than the 'reserved_blocks', Some free Blocks
267 will be freed until the total number of the Blocks is equal to the
268 'reserved_blocks' or there is only one free Block behind the
269 'current_block'.
270 */
271 void free_blocks()
272 {
273 if (current_block == NULL || current_block->next == NULL)
274 return;
275
276 /* One free Block is always kept behind the current block */
277 Block *block= current_block->next->next;
278 while (block_num > reserved_blocks && block != NULL)
279 {
280 Block *next= block->next;
281 free_block(block);
282 block= next;
283 }
284 current_block->next->next= block;
285 if (block == NULL)
286 last_block= current_block->next;
287 }
288};
289
290/**
291 This class manages memory for active transaction list.
292
293 We record each active transaction with a Tranx_node, each session
294 can have only one open transaction. Because of EVENT, the total
295 active transaction nodes can exceed the maximum allowed
296 connections.
297*/
298class Active_tranx
299 :public Trace {
300private:
301
302 Tranx_node_allocator m_allocator;
303 /* These two record the active transaction list in sort order. */
304 Tranx_node *m_trx_front, *m_trx_rear;
305
306 Tranx_node **m_trx_htb; /* A hash table on active transactions. */
307
308 int m_num_entries; /* maximum hash table entries */
309 mysql_mutex_t *m_lock; /* mutex lock */
310
311 inline void assert_lock_owner();
312
313 inline unsigned int calc_hash(const unsigned char *key, size_t length);
314 unsigned int get_hash_value(const char *log_file_name, my_off_t log_file_pos);
315
316 int compare(const char *log_file_name1, my_off_t log_file_pos1,
317 const Tranx_node *node2) {
318 return compare(log_file_name1, log_file_pos1,
319 node2->log_name, node2->log_pos);
320 }
321 int compare(const Tranx_node *node1,
322 const char *log_file_name2, my_off_t log_file_pos2) {
323 return compare(node1->log_name, node1->log_pos,
324 log_file_name2, log_file_pos2);
325 }
326 int compare(const Tranx_node *node1, const Tranx_node *node2) {
327 return compare(node1->log_name, node1->log_pos,
328 node2->log_name, node2->log_pos);
329 }
330
331public:
332 Active_tranx(mysql_mutex_t *lock, unsigned long trace_level);
333 ~Active_tranx();
334
335 /* Insert an active transaction node with the specified position.
336 *
337 * Return:
338 * 0: success; non-zero: error
339 */
340 int insert_tranx_node(const char *log_file_name, my_off_t log_file_pos);
341
342 /* Clear the active transaction nodes until(inclusive) the specified
343 * position.
344 * If log_file_name is NULL, everything will be cleared: the sorted
345 * list and the hash table will be reset to empty.
346 *
347 * Return:
348 * 0: success; non-zero: error
349 */
350 int clear_active_tranx_nodes(const char *log_file_name,
351 my_off_t log_file_pos);
352
353 /* Given a position, check to see whether the position is an active
354 * transaction's ending position by probing the hash table.
355 */
356 bool is_tranx_end_pos(const char *log_file_name, my_off_t log_file_pos);
357
358 /* Given two binlog positions, compare which one is bigger based on
359 * (file_name, file_position).
360 */
361 static int compare(const char *log_file_name1, my_off_t log_file_pos1,
362 const char *log_file_name2, my_off_t log_file_pos2);
363
364};
365
366/**
367 The extension class for the master of semi-synchronous replication
368*/
369class Repl_semi_sync_master
370 :public Repl_semi_sync_base {
371 private:
372 Active_tranx *m_active_tranxs; /* active transaction list: the list will
373 be cleared when semi-sync switches off. */
374
375 /* True when init_object has been called */
376 bool m_init_done;
377
378 /* This cond variable is signaled when enough binlog has been sent to slave,
379 * so that a waiting trx can return the 'ok' to the client for a commit.
380 */
381 mysql_cond_t COND_binlog_send;
382
383 /* Mutex that protects the following state variables and the active
384 * transaction list.
385 * Under no cirumstances we can acquire mysql_bin_log.LOCK_log if we are
386 * already holding m_LOCK_binlog because it can cause deadlocks.
387 */
388 mysql_mutex_t LOCK_binlog;
389
390 /* This is set to true when m_reply_file_name contains meaningful data. */
391 bool m_reply_file_name_inited;
392
393 /* The binlog name up to which we have received replies from any slaves. */
394 char m_reply_file_name[FN_REFLEN];
395
396 /* The position in that file up to which we have the reply from any slaves. */
397 my_off_t m_reply_file_pos;
398
399 /* This is set to true when we know the 'smallest' wait position. */
400 bool m_wait_file_name_inited;
401
402 /* NULL, or the 'smallest' filename that a transaction is waiting for
403 * slave replies.
404 */
405 char m_wait_file_name[FN_REFLEN];
406
407 /* The smallest position in that file that a trx is waiting for: the trx
408 * can proceed and send an 'ok' to the client when the master has got the
409 * reply from the slave indicating that it already got the binlog events.
410 */
411 my_off_t m_wait_file_pos;
412
413 /* This is set to true when we know the 'largest' transaction commit
414 * position in the binlog file.
415 * We always maintain the position no matter whether semi-sync is switched
416 * on switched off. When a transaction wait timeout occurs, semi-sync will
417 * switch off. Binlog-dump thread can use the three fields to detect when
418 * slaves catch up on replication so that semi-sync can switch on again.
419 */
420 bool m_commit_file_name_inited;
421
422 /* The 'largest' binlog filename that a commit transaction is seeing. */
423 char m_commit_file_name[FN_REFLEN];
424
425 /* The 'largest' position in that file that a commit transaction is seeing. */
426 my_off_t m_commit_file_pos;
427
428 /* All global variables which can be set by parameters. */
429 volatile bool m_master_enabled; /* semi-sync is enabled on the master */
430 unsigned long m_wait_timeout; /* timeout period(ms) during tranx wait */
431
432 bool m_state; /* whether semi-sync is switched */
433
434 /*Waiting for ACK before/after innodb commit*/
435 ulong m_wait_point;
436
437 void lock();
438 void unlock();
439 void cond_broadcast();
440 int cond_timewait(struct timespec *wait_time);
441
442 /* Is semi-sync replication on? */
443 bool is_on() {
444 return (m_state);
445 }
446
447 void set_master_enabled(bool enabled) {
448 m_master_enabled = enabled;
449 }
450
451 /* Switch semi-sync off because of timeout in transaction waiting. */
452 int switch_off();
453
454 /* Switch semi-sync on when slaves catch up. */
455 int try_switch_on(int server_id,
456 const char *log_file_name, my_off_t log_file_pos);
457
458 public:
459 Repl_semi_sync_master();
460 ~Repl_semi_sync_master() {}
461
462 void cleanup();
463
464 bool get_master_enabled() {
465 return m_master_enabled;
466 }
467 void set_trace_level(unsigned long trace_level) {
468 m_trace_level = trace_level;
469 if (m_active_tranxs)
470 m_active_tranxs->m_trace_level = trace_level;
471 }
472
473 /* Set the transaction wait timeout period, in milliseconds. */
474 void set_wait_timeout(unsigned long wait_timeout) {
475 m_wait_timeout = wait_timeout;
476 }
477
478 /*set the ACK point, after binlog sync or after transaction commit*/
479 void set_wait_point(unsigned long ack_point)
480 {
481 m_wait_point = ack_point;
482 }
483
484 ulong wait_point() //no cover line
485 {
486 return m_wait_point; //no cover line
487 }
488
489 /* Initialize this class after MySQL parameters are initialized. this
490 * function should be called once at bootstrap time.
491 */
492 int init_object();
493
494 /* Enable the object to enable semi-sync replication inside the master. */
495 int enable_master();
496
497 /* Enable the object to enable semi-sync replication inside the master. */
498 int disable_master();
499
500 /* Add a semi-sync replication slave */
501 void add_slave();
502
503 /* Remove a semi-sync replication slave */
504 void remove_slave();
505
506 /* It parses a reply packet and call report_reply_binlog to handle it. */
507 int report_reply_packet(uint32 server_id, const uchar *packet,
508 ulong packet_len);
509
510 /* In semi-sync replication, reports up to which binlog position we have
511 * received replies from the slave indicating that it already get the events.
512 *
513 * Input:
514 * server_id - (IN) master server id number
515 * log_file_name - (IN) binlog file name
516 * end_offset - (IN) the offset in the binlog file up to which we have
517 * the replies from the slave
518 *
519 * Return:
520 * 0: success; non-zero: error
521 */
522 int report_reply_binlog(uint32 server_id,
523 const char* log_file_name,
524 my_off_t end_offset);
525
526 /* Commit a transaction in the final step. This function is called from
527 * InnoDB before returning from the low commit. If semi-sync is switch on,
528 * the function will wait to see whether binlog-dump thread get the reply for
529 * the events of the transaction. Remember that this is not a direct wait,
530 * instead, it waits to see whether the binlog-dump thread has reached the
531 * point. If the wait times out, semi-sync status will be switched off and
532 * all other transaction would not wait either.
533 *
534 * Input: (the transaction events' ending binlog position)
535 * trx_wait_binlog_name - (IN) ending position's file name
536 * trx_wait_binlog_pos - (IN) ending position's file offset
537 *
538 * Return:
539 * 0: success; non-zero: error
540 */
541 int commit_trx(const char* trx_wait_binlog_name,
542 my_off_t trx_wait_binlog_pos);
543
544 /*Wait for ACK after writing/sync binlog to file*/
545 int wait_after_sync(const char* log_file, my_off_t log_pos);
546
547 /*Wait for ACK after commting the transaction*/
548 int wait_after_commit(THD* thd, bool all);
549
550 /*Wait after the transaction is rollback*/
551 int wait_after_rollback(THD *thd, bool all);
552 /*Store the current binlog position in m_active_tranxs. This position should
553 * be acked by slave*/
554 int report_binlog_update(THD *thd, const char *log_file,my_off_t log_pos);
555
556 int dump_start(THD* thd,
557 const char *log_file,
558 my_off_t log_pos);
559
560 void dump_end(THD* thd);
561
562 /* Reserve space in the replication event packet header:
563 * . slave semi-sync off: 1 byte - (0)
564 * . slave semi-sync on: 3 byte - (0, 0xef, 0/1}
565 *
566 * Input:
567 * packet - (IN) the header buffer
568 *
569 * Return:
570 * size of the bytes reserved for header
571 */
572 int reserve_sync_header(String* packet);
573
574 /* Update the sync bit in the packet header to indicate to the slave whether
575 * the master will wait for the reply of the event. If semi-sync is switched
576 * off and we detect that the slave is catching up, we switch semi-sync on.
577 *
578 * Input:
579 * THD - (IN) current dump thread
580 * packet - (IN) the packet containing the replication event
581 * log_file_name - (IN) the event ending position's file name
582 * log_file_pos - (IN) the event ending position's file offset
583 * need_sync - (IN) identify if flush_net is needed to call.
584 * server_id - (IN) master server id number
585 *
586 * Return:
587 * 0: success; non-zero: error
588 */
589 int update_sync_header(THD* thd, unsigned char *packet,
590 const char *log_file_name,
591 my_off_t log_file_pos,
592 bool* need_sync);
593
594 /* Called when a transaction finished writing binlog events.
595 * . update the 'largest' transactions' binlog event position
596 * . insert the ending position in the active transaction list if
597 * semi-sync is on
598 *
599 * Input: (the transaction events' ending binlog position)
600 * log_file_name - (IN) transaction ending position's file name
601 * log_file_pos - (IN) transaction ending position's file offset
602 *
603 * Return:
604 * 0: success; non-zero: error
605 */
606 int write_tranx_in_binlog(const char* log_file_name, my_off_t log_file_pos);
607
608 /* Read the slave's reply so that we know how much progress the slave makes
609 * on receive replication events.
610 */
611 int flush_net(THD* thd, const char *event_buf);
612
613 /* Export internal statistics for semi-sync replication. */
614 void set_export_stats();
615
616 /* 'reset master' command is issued from the user and semi-sync need to
617 * go off for that.
618 */
619 int after_reset_master();
620
621 /*called before reset master*/
622 int before_reset_master();
623
624 void check_and_switch();
625};
626
/*
  Possible ACK wait points.
  NOTE(review): presumably these are the values stored through
  Repl_semi_sync_master::set_wait_point() -- confirm against the callers.
*/
enum rpl_semi_sync_master_wait_point_t {
  SEMI_SYNC_MASTER_WAIT_POINT_AFTER_BINLOG_SYNC= 0,
  SEMI_SYNC_MASTER_WAIT_POINT_AFTER_STORAGE_COMMIT= 1,
};
631
632extern Repl_semi_sync_master repl_semisync_master;
633extern Ack_receiver ack_receiver;
634
635/* System and status variables for the master component */
636extern my_bool rpl_semi_sync_master_enabled;
637extern my_bool rpl_semi_sync_master_status;
638extern ulong rpl_semi_sync_master_wait_point;
639extern ulong rpl_semi_sync_master_clients;
640extern ulong rpl_semi_sync_master_timeout;
641extern ulong rpl_semi_sync_master_trace_level;
642extern ulong rpl_semi_sync_master_yes_transactions;
643extern ulong rpl_semi_sync_master_no_transactions;
644extern ulong rpl_semi_sync_master_off_times;
645extern ulong rpl_semi_sync_master_wait_timeouts;
646extern ulong rpl_semi_sync_master_timefunc_fails;
647extern ulong rpl_semi_sync_master_num_timeouts;
648extern ulong rpl_semi_sync_master_wait_sessions;
649extern ulong rpl_semi_sync_master_wait_pos_backtraverse;
650extern ulong rpl_semi_sync_master_avg_trx_wait_time;
651extern ulong rpl_semi_sync_master_avg_net_wait_time;
652extern ulonglong rpl_semi_sync_master_net_wait_num;
653extern ulonglong rpl_semi_sync_master_trx_wait_num;
654extern ulonglong rpl_semi_sync_master_net_wait_time;
655extern ulonglong rpl_semi_sync_master_trx_wait_time;
656extern unsigned long long rpl_semi_sync_master_request_ack;
657extern unsigned long long rpl_semi_sync_master_get_ack;
658
659/*
660 This indicates whether we should keep waiting if no semi-sync slave
661 is available.
662 0 : stop waiting if detected no avaialable semi-sync slave.
663 1 (default) : keep waiting until timeout even no available semi-sync slave.
664*/
665extern char rpl_semi_sync_master_wait_no_slave;
666extern Repl_semi_sync_master repl_semisync_master;
667
668extern PSI_stage_info stage_waiting_for_semi_sync_ack_from_slave;
669extern PSI_stage_info stage_reading_semi_sync_ack;
670extern PSI_stage_info stage_waiting_for_semi_sync_slave;
671
672void semi_sync_master_deinit();
673
674#endif /* SEMISYNC_MASTER_H */
675