semisync_master.h source code [MariaDB/sql/semisync_master.h]

/* Copyright (C) 2007 Google Inc.
   Copyright (c) 2008 MySQL AB, 2009 Sun Microsystems, Inc.
   Use is subject to license terms.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
17
18
19	#ifndef SEMISYNC_MASTER_H
20	#define SEMISYNC_MASTER_H
21
22	#include "semisync.h"
23	#include "semisync_master_ack_receiver.h"
24
25	#ifdef HAVE_PSI_INTERFACE
26	extern PSI_mutex_key key_LOCK_binlog;
27	extern PSI_cond_key key_COND_binlog_send;
28	#endif
29
30	struct Tranx_node {
31	char log_name[FN_REFLEN];
32	my_off_t log_pos;
33	struct Tranx_node next; /* the next node in the sorted list /
34	struct Tranx_node hash_next; /* the next node during hash collision /
35	};
36
37	/**
38	@class Tranx_node_allocator
39
40	This class provides memory allocating and freeing methods for
41	Tranx_node. The main target is performance.
42
43	@section ALLOCATE How to allocate a node
44	The pointer of the first node after 'last_node' in current_block is
45	returned. current_block will move to the next free Block when all nodes of
46	it are in use. A new Block is allocated and is put into the rear of the
47	Block link table if no Block is free.
48
49	The list starts up empty (ie, there is no allocated Block).
50
51	After some nodes are freed, there probably are some free nodes before
52	the sequence of the allocated nodes, but we do not reuse it. It is better
53	to keep the allocated nodes are in the sequence, for it is more efficient
54	for allocating and freeing Tranx_node.
55
56	@section FREENODE How to free nodes
57	There are two methods for freeing nodes. They are free_all_nodes and
58	free_nodes_before.
59
60	'A Block is free' means all of its nodes are free.
61	@subsection free_nodes_before
62	As all allocated nodes are in the sequence, 'Before one node' means all
63	nodes before given node in the same Block and all Blocks before the Block
64	which containing the given node. As such, all Blocks before the given one
65	('node') are free Block and moved into the rear of the Block link table.
66	The Block containing the given 'node', however, is not. For at least the
67	given 'node' is still in use. This will waste at most one Block, but it is
68	more efficient.
69	*/
70	#define BLOCK_TRANX_NODES 16
71	class Tranx_node_allocator
72	{
73	public:
74	/**
75	@param reserved_nodes
76	The number of reserved Tranx_nodes. It is used to set 'reserved_blocks'
77	which can contain at least 'reserved_nodes' number of Tranx_nodes. When
78	freeing memory, we will reserve at least reserved_blocks of Blocks not
79	freed.
80	*/
81	Tranx_node_allocator(uint reserved_nodes) :
82	reserved_blocks(reserved_nodes/BLOCK_TRANX_NODES +
83	(reserved_nodes%BLOCK_TRANX_NODES > `1` ? `2` : `1`)),
84	first_block(NULL), last_block(NULL),
85	current_block(NULL), last_node(-`1`), block_num(`0`) {}
86
87	~Tranx_node_allocator()
88	{
89	Block *block= first_block;
90	while (block != NULL)
91	{
92	Block *next= block->next;
93	free_block(block);
94	block= next;
95	}
96	}
97
98	/**
99	The pointer of the first node after 'last_node' in current_block is
100	returned. current_block will move to the next free Block when all nodes of
101	it are in use. A new Block is allocated and is put into the rear of the
102	Block link table if no Block is free.
103
104	@return Return a Tranx_node , or NULL if an error occurred.*
105	*/
106	Tranx_node *allocate_node()
107	{
108	Tranx_node *trx_node;
109	Block *block= current_block;
110
111	if (last_node == BLOCK_TRANX_NODES-`1`)
112	{
113	current_block= current_block->next;
114	last_node= -`1`;
115	}
116
117	if (current_block == NULL && allocate_block())
118	{
119	current_block= block;
120	if (current_block)
121	last_node= BLOCK_TRANX_NODES-`1`;
122	return NULL;
123	}
124
125	trx_node= &(current_block->nodes[++last_node]);
126	trx_node->log_name[`0`] = `'\0'`;
127	trx_node->log_pos= `0`;
128	trx_node->next= `0`;
129	trx_node->hash_next= `0`;
130	return trx_node;
131	}
132
133	/**
134	All nodes are freed.
135
136	@return Return 0, or 1 if an error occurred.
137	*/
138	int free_all_nodes()
139	{
140	current_block= first_block;
141	last_node= -`1`;
142	free_blocks();
143	return `0`;
144	}
145
146	/**
147	All Blocks before the given 'node' are free Block and moved into the rear
148	of the Block link table.
149
150	@param node All nodes before 'node' will be freed
151
152	@return Return 0, or 1 if an error occurred.
153	*/
154	int free_nodes_before(Tranx_node* node)
155	{
156	Block *block;
157	Block *prev_block= NULL;
158
159	block= first_block;
160	while (block != current_block->next)
161	{
162	/ Find the Block containing the given node /
163	if (&(block->nodes[`0`]) <= node && &(block->nodes[BLOCK_TRANX_NODES]) >= node)
164	{
165	/ All Blocks before the given node are put into the rear /
166	if (first_block != block)
167	{
168	last_block->next= first_block;
169	first_block= block;
170	last_block= prev_block;
171	last_block->next= NULL;
172	free_blocks();
173	}
174	return `0`;
175	}
176	prev_block= block;
177	block= block->next;
178	}
179
180	/ Node does not find should never happen /
181	DBUG_ASSERT(`0`);
182	return `1`;
183	}
184
185	private:
186	uint reserved_blocks;
187
188	/**
189	A sequence memory which contains BLOCK_TRANX_NODES Tranx_nodes.
190
191	BLOCK_TRANX_NODES The number of Tranx_nodes which are in a Block.
192
193	next Every Block has a 'next' pointer which points to the next Block.
194	These linking Blocks constitute a Block link table.
195	*/
196	struct Block {
197	Block *next;
198	Tranx_node nodes[BLOCK_TRANX_NODES];
199	};
200
201	/**
202	The 'first_block' is the head of the Block link table;
203	*/
204	Block *first_block;
205	/**
206	The 'last_block' is the rear of the Block link table;
207	*/
208	Block *last_block;
209
210	/**
211	current_block always points the Block in the Block link table in
212	which the last allocated node is. The Blocks before it are all in use
213	and the Blocks after it are all free.
214	*/
215	Block *current_block;
216
217	/**
218	It always points to the last node which has been allocated in the
219	current_block.
220	*/
221	int last_node;
222
223	/**
224	How many Blocks are in the Block link table.
225	*/
226	uint block_num;
227
228	/**
229	Allocate a block and then assign it to current_block.
230	*/
231	int allocate_block()
232	{
233	Block block= (Block )my_malloc(sizeof(Block), MYF(`0`));
234	if (block)
235	{
236	block->next= NULL;
237
238	if (first_block == NULL)
239	first_block= block;
240	else
241	last_block->next= block;
242
243	/ New Block is always put into the rear /
244	last_block= block;
245	/ New Block is always the current_block /
246	current_block= block;
247	++block_num;
248	return `0`;
249	}
250	return `1`;
251	}
252
253	/**
254	Free a given Block.
255	@param block The Block will be freed.
256	*/
257	void free_block(Block *block)
258	{
259	my_free(block);
260	--block_num;
261	}
262
263
264	/**
265	If there are some free Blocks and the total number of the Blocks in the
266	Block link table is larger than the 'reserved_blocks', Some free Blocks
267	will be freed until the total number of the Blocks is equal to the
268	'reserved_blocks' or there is only one free Block behind the
269	'current_block'.
270	*/
271	void free_blocks()
272	{
273	if (current_block == NULL \|\| current_block->next == NULL)
274	return;
275
276	/ One free Block is always kept behind the current block /
277	Block *block= current_block->next->next;
278	while (block_num > reserved_blocks && block != NULL)
279	{
280	Block *next= block->next;
281	free_block(block);
282	block= next;
283	}
284	current_block->next->next= block;
285	if (block == NULL)
286	last_block= current_block->next;
287	}
288	};
289
290	/**
291	This class manages memory for active transaction list.
292
293	We record each active transaction with a Tranx_node, each session
294	can have only one open transaction. Because of EVENT, the total
295	active transaction nodes can exceed the maximum allowed
296	connections.
297	*/
298	class Active_tranx
299	:public Trace {
300	private:
301
302	Tranx_node_allocator m_allocator;
303	/ These two record the active transaction list in sort order. /
304	Tranx_node m_trx_front, m_trx_rear;
305
306	Tranx_node *m_trx_htb; /* A hash table on active transactions. /
307
308	int m_num_entries; / maximum hash table entries /
309	mysql_mutex_t m_lock; /* mutex lock /
310
311	inline void assert_lock_owner();
312
313	inline unsigned int calc_hash(const unsigned char *key, size_t length);
314	unsigned int get_hash_value(const char *log_file_name, my_off_t log_file_pos);
315
316	int compare(const char *log_file_name1, my_off_t log_file_pos1,
317	const Tranx_node *node2) {
318	return compare(log_file_name1, log_file_pos1,
319	node2->log_name, node2->log_pos);
320	}
321	int compare(const Tranx_node *node1,
322	const char *log_file_name2, my_off_t log_file_pos2) {
323	return compare(node1->log_name, node1->log_pos,
324	log_file_name2, log_file_pos2);
325	}
326	int compare(const Tranx_node node1, const* Tranx_node *node2) {
327	return compare(node1->log_name, node1->log_pos,
328	node2->log_name, node2->log_pos);
329	}
330
331	public:
332	Active_tranx(mysql_mutex_t lock, unsigned* long trace_level);
333	~Active_tranx();
334
335	/ Insert an active transaction node with the specified position.*
336	*
337	* Return:
338	* 0: success; non-zero: error
339	*/
340	int insert_tranx_node(const char *log_file_name, my_off_t log_file_pos);
341
342	/ Clear the active transaction nodes until(inclusive) the specified*
343	* position.
344	* If log_file_name is NULL, everything will be cleared: the sorted
345	* list and the hash table will be reset to empty.
346	*
347	* Return:
348	* 0: success; non-zero: error
349	*/
350	int clear_active_tranx_nodes(const char *log_file_name,
351	my_off_t log_file_pos);
352
353	/ Given a position, check to see whether the position is an active*
354	* transaction's ending position by probing the hash table.
355	*/
356	bool is_tranx_end_pos(const char *log_file_name, my_off_t log_file_pos);
357
358	/ Given two binlog positions, compare which one is bigger based on*
359	* (file_name, file_position).
360	*/
361	static int compare(const char *log_file_name1, my_off_t log_file_pos1,
362	const char *log_file_name2, my_off_t log_file_pos2);
363
364	};
365
366	/**
367	The extension class for the master of semi-synchronous replication
368	*/
369	class Repl_semi_sync_master
370	:public Repl_semi_sync_base {
371	private:
372	Active_tranx m_active_tranxs; /* active transaction list: the list will*
373	be cleared when semi-sync switches off. /*
374
375	/ True when init_object has been called /
376	bool m_init_done;
377
378	/ This cond variable is signaled when enough binlog has been sent to slave,*
379	* so that a waiting trx can return the 'ok' to the client for a commit.
380	*/
381	mysql_cond_t COND_binlog_send;
382
383	/ Mutex that protects the following state variables and the active*
384	* transaction list.
385	* Under no cirumstances we can acquire mysql_bin_log.LOCK_log if we are
386	* already holding m_LOCK_binlog because it can cause deadlocks.
387	*/
388	mysql_mutex_t LOCK_binlog;
389
390	/ This is set to true when m_reply_file_name contains meaningful data. /
391	bool m_reply_file_name_inited;
392
393	/ The binlog name up to which we have received replies from any slaves. /
394	char m_reply_file_name[FN_REFLEN];
395
396	/ The position in that file up to which we have the reply from any slaves. /
397	my_off_t m_reply_file_pos;
398
399	/ This is set to true when we know the 'smallest' wait position. /
400	bool m_wait_file_name_inited;
401
402	/ NULL, or the 'smallest' filename that a transaction is waiting for*
403	* slave replies.
404	*/
405	char m_wait_file_name[FN_REFLEN];
406
407	/ The smallest position in that file that a trx is waiting for: the trx*
408	* can proceed and send an 'ok' to the client when the master has got the
409	* reply from the slave indicating that it already got the binlog events.
410	*/
411	my_off_t m_wait_file_pos;
412
413	/ This is set to true when we know the 'largest' transaction commit*
414	* position in the binlog file.
415	* We always maintain the position no matter whether semi-sync is switched
416	* on switched off. When a transaction wait timeout occurs, semi-sync will
417	* switch off. Binlog-dump thread can use the three fields to detect when
418	* slaves catch up on replication so that semi-sync can switch on again.
419	*/
420	bool m_commit_file_name_inited;
421
422	/ The 'largest' binlog filename that a commit transaction is seeing. /
423	char m_commit_file_name[FN_REFLEN];
424
425	/ The 'largest' position in that file that a commit transaction is seeing. /
426	my_off_t m_commit_file_pos;
427
428	/ All global variables which can be set by parameters. /
429	volatile bool m_master_enabled; / semi-sync is enabled on the master /
430	unsigned long m_wait_timeout; / timeout period(ms) during tranx wait /
431
432	bool m_state; / whether semi-sync is switched /
433
434	/Waiting for ACK before/after innodb commit/
435	ulong m_wait_point;
436
437	void lock();
438	void unlock();
439	void cond_broadcast();
440	int cond_timewait(struct timespec *wait_time);
441
442	/ Is semi-sync replication on? /
443	bool is_on() {
444	return (m_state);
445	}
446
447	void set_master_enabled(bool enabled) {
448	m_master_enabled = enabled;
449	}
450
451	/ Switch semi-sync off because of timeout in transaction waiting. /
452	int switch_off();
453
454	/ Switch semi-sync on when slaves catch up. /
455	int try_switch_on(int server_id,
456	const char *log_file_name, my_off_t log_file_pos);
457
458	public:
459	Repl_semi_sync_master();
460	~Repl_semi_sync_master() {}
461
462	void cleanup();
463
464	bool get_master_enabled() {
465	return m_master_enabled;
466	}
467	void set_trace_level(unsigned long trace_level) {
468	m_trace_level = trace_level;
469	if (m_active_tranxs)
470	m_active_tranxs->m_trace_level = trace_level;
471	}
472
473	/ Set the transaction wait timeout period, in milliseconds. /
474	void set_wait_timeout(unsigned long wait_timeout) {
475	m_wait_timeout = wait_timeout;
476	}
477
478	/set the ACK point, after binlog sync or after transaction commit/
479	void set_wait_point(unsigned long ack_point)
480	{
481	m_wait_point = ack_point;
482	}
483
484	ulong wait_point() //no cover line
485	{
486	return m_wait_point; //no cover line
487	}
488
489	/ Initialize this class after MySQL parameters are initialized. this*
490	* function should be called once at bootstrap time.
491	*/
492	int init_object();
493
494	/ Enable the object to enable semi-sync replication inside the master. /
495	int enable_master();
496
497	/ Enable the object to enable semi-sync replication inside the master. /
498	int disable_master();
499
500	/ Add a semi-sync replication slave /
501	void add_slave();
502
503	/ Remove a semi-sync replication slave /
504	void remove_slave();
505
506	/ It parses a reply packet and call report_reply_binlog to handle it. /
507	int report_reply_packet(uint32 server_id, const uchar *packet,
508	ulong packet_len);
509
510	/ In semi-sync replication, reports up to which binlog position we have*
511	* received replies from the slave indicating that it already get the events.
512	*
513	* Input:
514	* server_id - (IN) master server id number
515	* log_file_name - (IN) binlog file name
516	* end_offset - (IN) the offset in the binlog file up to which we have
517	* the replies from the slave
518	*
519	* Return:
520	* 0: success; non-zero: error
521	*/
522	int report_reply_binlog(uint32 server_id,
523	const char* log_file_name,
524	my_off_t end_offset);
525
526	/ Commit a transaction in the final step. This function is called from*
527	* InnoDB before returning from the low commit. If semi-sync is switch on,
528	* the function will wait to see whether binlog-dump thread get the reply for
529	* the events of the transaction. Remember that this is not a direct wait,
530	* instead, it waits to see whether the binlog-dump thread has reached the
531	* point. If the wait times out, semi-sync status will be switched off and
532	* all other transaction would not wait either.
533	*
534	* Input: (the transaction events' ending binlog position)
535	* trx_wait_binlog_name - (IN) ending position's file name
536	* trx_wait_binlog_pos - (IN) ending position's file offset
537	*
538	* Return:
539	* 0: success; non-zero: error
540	*/
541	int commit_trx(const char* trx_wait_binlog_name,
542	my_off_t trx_wait_binlog_pos);
543
544	/Wait for ACK after writing/sync binlog to file/
545	int wait_after_sync(const char* log_file, my_off_t log_pos);
546
547	/Wait for ACK after commting the transaction/
548	int wait_after_commit(THD* thd, bool all);
549
550	/Wait after the transaction is rollback/
551	int wait_after_rollback(THD thd, bool* all);
552	/Store the current binlog position in m_active_tranxs. This position should*
553	* be acked by slave*/
554	int report_binlog_update(THD thd, const* char *log_file,my_off_t log_pos);
555
556	int dump_start(THD* thd,
557	const char *log_file,
558	my_off_t log_pos);
559
560	void dump_end(THD* thd);
561
562	/ Reserve space in the replication event packet header:*
563	* . slave semi-sync off: 1 byte - (0)
564	* . slave semi-sync on: 3 byte - (0, 0xef, 0/1}
565	*
566	* Input:
567	* packet - (IN) the header buffer
568	*
569	* Return:
570	* size of the bytes reserved for header
571	*/
572	int reserve_sync_header(String* packet);
573
574	/ Update the sync bit in the packet header to indicate to the slave whether*
575	* the master will wait for the reply of the event. If semi-sync is switched
576	* off and we detect that the slave is catching up, we switch semi-sync on.
577	*
578	* Input:
579	* THD - (IN) current dump thread
580	* packet - (IN) the packet containing the replication event
581	* log_file_name - (IN) the event ending position's file name
582	* log_file_pos - (IN) the event ending position's file offset
583	* need_sync - (IN) identify if flush_net is needed to call.
584	* server_id - (IN) master server id number
585	*
586	* Return:
587	* 0: success; non-zero: error
588	*/
589	int update_sync_header(THD* thd, unsigned char *packet,
590	const char *log_file_name,
591	my_off_t log_file_pos,
592	bool* need_sync);
593
594	/ Called when a transaction finished writing binlog events.*
595	* . update the 'largest' transactions' binlog event position
596	* . insert the ending position in the active transaction list if
597	* semi-sync is on
598	*
599	* Input: (the transaction events' ending binlog position)
600	* log_file_name - (IN) transaction ending position's file name
601	* log_file_pos - (IN) transaction ending position's file offset
602	*
603	* Return:
604	* 0: success; non-zero: error
605	*/
606	int write_tranx_in_binlog(const char* log_file_name, my_off_t log_file_pos);
607
608	/ Read the slave's reply so that we know how much progress the slave makes*
609	* on receive replication events.
610	*/
611	int flush_net(THD* thd, const char *event_buf);
612
613	/ Export internal statistics for semi-sync replication. /
614	void set_export_stats();
615
616	/ 'reset master' command is issued from the user and semi-sync need to*
617	* go off for that.
618	*/
619	int after_reset_master();
620
621	/called before reset master/
622	int before_reset_master();
623
624	void check_and_switch();
625	};
626
/*
  Points at which the master can wait for the slave ACK (presumably the
  values passed to Repl_semi_sync_master::set_wait_point() - confirm
  against the callers).
*/
enum rpl_semi_sync_master_wait_point_t {
  /* wait after the binlog has been synced */
  SEMI_SYNC_MASTER_WAIT_POINT_AFTER_BINLOG_SYNC= 0,
  /* wait after the storage engine commit */
  SEMI_SYNC_MASTER_WAIT_POINT_AFTER_STORAGE_COMMIT= 1
};
631
632	extern Repl_semi_sync_master repl_semisync_master;
633	extern Ack_receiver ack_receiver;
634
635	/ System and status variables for the master component /
636	extern my_bool rpl_semi_sync_master_enabled;
637	extern my_bool rpl_semi_sync_master_status;
638	extern ulong rpl_semi_sync_master_wait_point;
639	extern ulong rpl_semi_sync_master_clients;
640	extern ulong rpl_semi_sync_master_timeout;
641	extern ulong rpl_semi_sync_master_trace_level;
642	extern ulong rpl_semi_sync_master_yes_transactions;
643	extern ulong rpl_semi_sync_master_no_transactions;
644	extern ulong rpl_semi_sync_master_off_times;
645	extern ulong rpl_semi_sync_master_wait_timeouts;
646	extern ulong rpl_semi_sync_master_timefunc_fails;
647	extern ulong rpl_semi_sync_master_num_timeouts;
648	extern ulong rpl_semi_sync_master_wait_sessions;
649	extern ulong rpl_semi_sync_master_wait_pos_backtraverse;
650	extern ulong rpl_semi_sync_master_avg_trx_wait_time;
651	extern ulong rpl_semi_sync_master_avg_net_wait_time;
652	extern ulonglong rpl_semi_sync_master_net_wait_num;
653	extern ulonglong rpl_semi_sync_master_trx_wait_num;
654	extern ulonglong rpl_semi_sync_master_net_wait_time;
655	extern ulonglong rpl_semi_sync_master_trx_wait_time;
656	extern unsigned long long rpl_semi_sync_master_request_ack;
657	extern unsigned long long rpl_semi_sync_master_get_ack;
658
659	/*
660	This indicates whether we should keep waiting if no semi-sync slave
661	is available.
662	0 : stop waiting if detected no avaialable semi-sync slave.
663	1 (default) : keep waiting until timeout even no available semi-sync slave.
664	*/
665	extern char rpl_semi_sync_master_wait_no_slave;
666	extern Repl_semi_sync_master repl_semisync_master;
667
668	extern PSI_stage_info stage_waiting_for_semi_sync_ack_from_slave;
669	extern PSI_stage_info stage_reading_semi_sync_ack;
670	extern PSI_stage_info stage_waiting_for_semi_sync_slave;
671
672	void semi_sync_master_deinit();
673
674	#endif /* SEMISYNC_MASTER_H */
675

Browse the source code of MariaDB/sql/semisync_master.h