trx0sys.h source code [MariaDB/storage/innobase/include/trx0sys.h]

1	/*****************************************************************************
2
3	Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
4	Copyright (c) 2017, 2018, MariaDB Corporation.
5
6	This program is free software; you can redistribute it and/or modify it under
7	the terms of the GNU General Public License as published by the Free Software
8	Foundation; version 2 of the License.
9
10	This program is distributed in the hope that it will be useful, but WITHOUT
11	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12	FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14	You should have received a copy of the GNU General Public License along with
15	this program; if not, write to the Free Software Foundation, Inc.,
16	51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
17
18	*****************************************************************************/
19
20	/************************************************//**
21	@file include/trx0sys.h
22	Transaction system
23
24	Created 3/26/1996 Heikki Tuuri
25	*******************************************************/
26
27	#ifndef trx0sys_h
28	#define trx0sys_h
29
30	#include "univ.i"
31
32	#include "buf0buf.h"
33	#include "fil0fil.h"
34	#include "trx0types.h"
35	#include "mem0mem.h"
36	#include "mtr0mtr.h"
37	#include "ut0byte.h"
38	#include "ut0lst.h"
39	#include "read0types.h"
40	#include "page0types.h"
41	#include "ut0mutex.h"
42	#include "trx0trx.h"
43	#ifdef WITH_WSREP
44	#include "trx0xa.h"
45	#endif /* WITH_WSREP */
46
47	typedef UT_LIST_BASE_NODE_T(trx_t) trx_ut_list_t;
48
49	/* Checks if a page address is the trx sys header page.*
50	@param[in] page_id page id
51	@return true if trx sys header page /*
52	inline
53	bool
54	trx_sys_hdr_page(const page_id_t& page_id)
55	{
56	return(page_id.space() == TRX_SYS_SPACE
57	&& page_id.page_no() == TRX_SYS_PAGE_NO);
58	}
59
/***************************************************************//**
Creates and initializes the transaction system at the database creation. */
62	void
63	trx_sys_create_sys_pages(void);
/*==========================*/
/** Find an available rollback segment.
@param[in]	sys_header	TRX_SYS page
@return an unallocated rollback segment slot in the TRX_SYS header
@retval ULINT_UNDEFINED if not found */
69	ulint
70	trx_sys_rseg_find_free(const buf_block_t* sys_header);
71	/* Request the TRX_SYS page.*
72	@param[in] rw whether to lock the page for writing
73	@return the TRX_SYS page
74	@retval NULL if the page cannot be read /*
75	inline
76	buf_block_t*
77	trx_sysf_get(mtr_t* mtr, bool rw = true)
78	{
79	buf_block_t* block = buf_page_get(
80	page_id_t (TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
81	univ_page_size, rw ? RW_X_LATCH : RW_S_LATCH, mtr);
82	if (block) {
83	buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
84	}
85	return block;
86	}
87
88	#ifdef UNIV_DEBUG
/** Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
90	extern uint trx_rseg_n_slots_debug;
91	#endif
92
93	/* Write DB_TRX_ID.*
94	@param[out] db_trx_id the DB_TRX_ID field to be written to
95	@param[in] id transaction ID /*
96	UNIV_INLINE
97	void
98	trx_write_trx_id(byte* db_trx_id, trx_id_t id)
99	{
100	compile_time_assert(DATA_TRX_ID_LEN == `6`);
101	ut_ad(id);
102	mach_write_to_6(db_trx_id, id);
103	}
104
105	/* Read a transaction identifier.*
106	@return id /*
107	inline
108	trx_id_t
109	trx_read_trx_id(const byte* ptr)
110	{
111	compile_time_assert(DATA_TRX_ID_LEN == `6`);
112	return(mach_read_from_6(ptr));
113	}
114
115	#ifdef UNIV_DEBUG
116	/* Check that the DB_TRX_ID in a record is valid.*
117	@param[in] db_trx_id the DB_TRX_ID column to validate
118	@param[in] trx_id the id of the ALTER TABLE transaction /*
119	inline bool trx_id_check(const void* db_trx_id, trx_id_t trx_id)
120	{
121	trx_id_t id = trx_read_trx_id(static_cast<const byte*>(db_trx_id));
122	ut_ad(id == `0` \|\| id > trx_id);
123	return true;
124	}
125	#endif
126
127	/***************************************************************//**
128	Updates the offset information about the end of the MySQL binlog entry
129	which corresponds to the transaction just being committed. In a MySQL
130	replication slave updates the latest master binlog position up to which
131	replication has proceeded. /*
132	void
133	trx_sys_update_mysql_binlog_offset(
134	/===============================/
135	const char* file_name,/!< in: MySQL log file name /
136	int64_t offset, /!< in: position in that log file /
137	buf_block_t* sys_header, /!< in,out: trx sys header /
138	mtr_t* mtr); /!< in,out: mini-transaction /
/** Display the MySQL binlog offset info if it is present in the trx
system header. */
141	void
142	trx_sys_print_mysql_binlog_offset();
143
/** Create the rollback segments.
@return	whether the creation succeeded */
146	bool
147	trx_sys_create_rsegs();
148
/** The automatically created system rollback segment has this id */
#define TRX_SYS_SYSTEM_RSEG_ID	0

/** The offset of the transaction system header on the page */
#define	TRX_SYS		FSEG_PAGE_DATA

/** Transaction system header */
/*------------------------------------------------------------- @{ */
/** In old versions of InnoDB, this persisted the value of
trx_sys.get_max_trx_id(). Starting with MariaDB 10.3.5,
the field TRX_RSEG_MAX_TRX_ID in rollback segment header pages
and the fields TRX_UNDO_TRX_ID, TRX_UNDO_TRX_NO in undo log pages
are used instead. The field only exists for the purpose of upgrading
from older MySQL or MariaDB versions. */
#define	TRX_SYS_TRX_ID_STORE	0
#define TRX_SYS_FSEG_HEADER	8	/*!< segment header for the
					tablespace segment the trx
					system is created into */
#define	TRX_SYS_RSEGS		(8 + FSEG_HEADER_SIZE)
					/*!< the start of the array of
					rollback segment specification
					slots */
/*------------------------------------------------------------- @} */

/** The number of rollback segments; rollback segment id must fit in
the 7 bits reserved for it in DB_ROLL_PTR. */
#define	TRX_SYS_N_RSEGS			128
/** Maximum number of undo tablespaces (not counting the system tablespace) */
#define TRX_SYS_MAX_UNDO_SPACES		(TRX_SYS_N_RSEGS - 1)

/** Rollback segment specification slot offsets */

/** the tablespace ID of an undo log header; starting with
MySQL/InnoDB 5.1.7, this is FIL_NULL if the slot is unused */
#define	TRX_SYS_RSEG_SPACE	0
/** the page number of an undo log header, or FIL_NULL if unused */
#define	TRX_SYS_RSEG_PAGE_NO	4
/** Size of a rollback segment specification slot */
#define TRX_SYS_RSEG_SLOT_SIZE	8
188
189	/* Read the tablespace ID of a rollback segment slot.*
190	@param[in] sys_header TRX_SYS page
191	@param[in] rseg_id rollback segment identifier
192	@return undo tablespace id /*
193	inline
194	uint32_t
195	trx_sysf_rseg_get_space(const buf_block_t* sys_header, ulint rseg_id)
196	{
197	ut_ad(rseg_id < TRX_SYS_N_RSEGS);
198	return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE
199	+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE
200	+ sys_header->frame);
201	}
202
203	/* Read the page number of a rollback segment slot.*
204	@param[in] sys_header TRX_SYS page
205	@param[in] rseg_id rollback segment identifier
206	@return undo page number /*
207	inline
208	uint32_t
209	trx_sysf_rseg_get_page_no(const buf_block_t* sys_header, ulint rseg_id)
210	{
211	ut_ad(rseg_id < TRX_SYS_N_RSEGS);
212	return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO
213	+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE
214	+ sys_header->frame);
215	}
216
/** Maximum length of MySQL binlog file name, in bytes.
(Used before MariaDB 10.3.5.) */
#define TRX_SYS_MYSQL_LOG_NAME_LEN	512
/** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */
#define TRX_SYS_MYSQL_LOG_MAGIC_N	873422344
222
223	#if UNIV_PAGE_SIZE_MIN < 4096
224	# error "UNIV_PAGE_SIZE_MIN < 4096"
225	#endif
/** The offset of the MySQL binlog offset info in the trx system header */
#define TRX_SYS_MYSQL_LOG_INFO		(srv_page_size - 1000)
#define	TRX_SYS_MYSQL_LOG_MAGIC_N_FLD	0	/*!< magic number which is
						TRX_SYS_MYSQL_LOG_MAGIC_N
						if we have valid data in the
						MySQL binlog info */
#define TRX_SYS_MYSQL_LOG_OFFSET	4	/*!< the 64-bit offset
						within that file */
#define TRX_SYS_MYSQL_LOG_NAME		12	/*!< MySQL log file name */
235
/** Memory map TRX_SYS_PAGE_NO = 5 when srv_page_size = 4096
237
238	0...37 FIL_HEADER
239	38...45 TRX_SYS_TRX_ID_STORE
240	46...55 TRX_SYS_FSEG_HEADER (FSEG_HEADER_SIZE == 10)
241	56 TRX_SYS_RSEGS
242	56...59 TRX_SYS_RSEG_SPACE for slot 0
243	60...63 TRX_SYS_RSEG_PAGE_NO for slot 0
244	64...67 TRX_SYS_RSEG_SPACE for slot 1
245	68...71 TRX_SYS_RSEG_PAGE_NO for slot 1
246	....
247	594..597 TRX_SYS_RSEG_SPACE for slot 72
248	598..601 TRX_SYS_RSEG_PAGE_NO for slot 72
249	...
250	...1063 TRX_SYS_RSEG_PAGE_NO for slot 126
251
252	(srv_page_size-3500 WSREP ::: FAIL would overwrite undo tablespace
253	space_id, page_no pairs :::)
254	596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD
255	600 TRX_SYS_WSREP_XID_FORMAT
256	604 TRX_SYS_WSREP_XID_GTRID_LEN
257	608 TRX_SYS_WSREP_XID_BQUAL_LEN
258	612 TRX_SYS_WSREP_XID_DATA (len = 128)
259	739 TRX_SYS_WSREP_XID_DATA_END
260
261	FIXED WSREP XID info offsets for 4k page size 10.0.32-galera
262	(srv_page_size-2500)
263	1596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD
264	1600 TRX_SYS_WSREP_XID_FORMAT
265	1604 TRX_SYS_WSREP_XID_GTRID_LEN
266	1608 TRX_SYS_WSREP_XID_BQUAL_LEN
267	1612 TRX_SYS_WSREP_XID_DATA (len = 128)
268	1739 TRX_SYS_WSREP_XID_DATA_END
269
270	(srv_page_size - 2000 MYSQL MASTER LOG)
271	2096 TRX_SYS_MYSQL_MASTER_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
272	2100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH
273	2104 TRX_SYS_MYSQL_LOG_OFFSET_LOW
274	2108 TRX_SYS_MYSQL_LOG_NAME
275
276	(srv_page_size - 1000 MYSQL LOG)
277	3096 TRX_SYS_MYSQL_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
278	3100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH
279	3104 TRX_SYS_MYSQL_LOG_OFFSET_LOW
280	3108 TRX_SYS_MYSQL_LOG_NAME
281
282	(srv_page_size - 200 DOUBLEWRITE)
283	3896 TRX_SYS_DOUBLEWRITE TRX_SYS_DOUBLEWRITE_FSEG
284	3906 TRX_SYS_DOUBLEWRITE_MAGIC
285	3910 TRX_SYS_DOUBLEWRITE_BLOCK1
286	3914 TRX_SYS_DOUBLEWRITE_BLOCK2
287	3918 TRX_SYS_DOUBLEWRITE_REPEAT
288	3930 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N
289
290	(srv_page_size - 8, TAILER)
291	4088..4096 FIL_TAILER
292
293	*/
294	#ifdef WITH_WSREP
/** The offset to WSREP XID headers (used before MariaDB 10.3.5) */
#define TRX_SYS_WSREP_XID_INFO std::max(srv_page_size - 3500, 1596UL)
#define TRX_SYS_WSREP_XID_MAGIC_N_FLD 0
#define TRX_SYS_WSREP_XID_MAGIC_N 0x77737265

/** XID field: formatID, gtrid_len, bqual_len, xid_data */
#define TRX_SYS_WSREP_XID_LEN        (4 + 4 + 4 + XIDDATASIZE)
#define TRX_SYS_WSREP_XID_FORMAT     4
#define TRX_SYS_WSREP_XID_GTRID_LEN  8
#define TRX_SYS_WSREP_XID_BQUAL_LEN 12
#define TRX_SYS_WSREP_XID_DATA      16
306	#endif /* WITH_WSREP*/
307
/** Doublewrite buffer */
/* @{ */
/** The offset of the doublewrite buffer header on the trx system header page */
#define TRX_SYS_DOUBLEWRITE		(srv_page_size - 200)
/*-------------------------------------------------------------*/
#define TRX_SYS_DOUBLEWRITE_FSEG	0	/*!< fseg header of the fseg
						containing the doublewrite
						buffer */
#define TRX_SYS_DOUBLEWRITE_MAGIC	FSEG_HEADER_SIZE
						/*!< 4-byte magic number which
						shows if we already have
						created the doublewrite
						buffer */
#define TRX_SYS_DOUBLEWRITE_BLOCK1	(4 + FSEG_HEADER_SIZE)
						/*!< page number of the
						first page in the first
						sequence of 64
						(= FSP_EXTENT_SIZE) consecutive
						pages in the doublewrite
						buffer */
#define TRX_SYS_DOUBLEWRITE_BLOCK2	(8 + FSEG_HEADER_SIZE)
						/*!< page number of the
						first page in the second
						sequence of 64 consecutive
						pages in the doublewrite
						buffer */
#define TRX_SYS_DOUBLEWRITE_REPEAT	12	/*!< we repeat
						TRX_SYS_DOUBLEWRITE_MAGIC,
						TRX_SYS_DOUBLEWRITE_BLOCK1,
						TRX_SYS_DOUBLEWRITE_BLOCK2
						so that if the trx sys
						header is half-written
						to disk, we still may
						be able to recover the
						information */
/** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
we must reset the doublewrite buffer, because starting from 4.1.x the
space id of a data page is stored into
FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE)

/*-------------------------------------------------------------*/
/** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */
#define TRX_SYS_DOUBLEWRITE_MAGIC_N	536853855
/** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */
#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N 1783657386

/** Size of the doublewrite block in pages */
#define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE	FSP_EXTENT_SIZE
/* @} */
358
359	trx_t* current_trx();
360
361	struct rw_trx_hash_element_t
362	{
363	rw_trx_hash_element_t(): trx(`0`)
364	{
365	mutex_create(LATCH_ID_RW_TRX_HASH_ELEMENT, &mutex);
366	}
367
368
369	~rw_trx_hash_element_t()
370	{
371	mutex_free(&mutex);
372	}
373
374
375	trx_id_t id; / lf_hash_init() relies on this to be first in the struct /
376	trx_id_t no;
377	trx_t *trx;
378	ib_mutex_t mutex;
379	};
380
381
382	/**
383	Wrapper around LF_HASH to store set of in memory read-write transactions.
384	*/
385
386	class rw_trx_hash_t
387	{
388	LF_HASH hash;
389
390
391	/**
392	Constructor callback for lock-free allocator.
393
394	Object is just allocated and is not yet accessible via rw_trx_hash by
395	concurrent threads. Object can be reused multiple times before it is freed.
396	Every time object is being reused initializer() callback is called.
397	*/
398
399	static void rw_trx_hash_constructor(uchar *arg)
400	{
401	new(arg + LF_HASH_OVERHEAD) rw_trx_hash_element_t ();
402	}
403
404
405	/**
406	Destructor callback for lock-free allocator.
407
408	Object is about to be freed and is not accessible via rw_trx_hash by
409	concurrent threads.
410	*/
411
412	static void rw_trx_hash_destructor(uchar *arg)
413	{
414	reinterpret_cast<rw_trx_hash_element_t*>
415	(arg + LF_HASH_OVERHEAD)->~rw_trx_hash_element_t();
416	}
417
418
419	/**
420	Destructor callback for lock-free allocator.
421
422	This destructor is used at shutdown. It frees remaining transaction
423	objects.
424
425	XA PREPARED transactions may remain if they haven't been committed or
426	rolled back. ACTIVE transactions may remain if startup was interrupted or
427	server is running in read-only mode or for certain srv_force_recovery
428	levels.
429	*/
430
431	static void rw_trx_hash_shutdown_destructor(uchar *arg)
432	{
433	rw_trx_hash_element_t *element=
434	reinterpret_cast<rw_trx_hash_element_t*>(arg + LF_HASH_OVERHEAD);
435	if (trx_t *trx= element->trx)
436	{
437	ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED) \|\|
438	(trx_state_eq(trx, TRX_STATE_ACTIVE) &&
439	(!srv_was_started \|\|
440	srv_read_only_mode \|\|
441	srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO)));
442	trx_free_at_shutdown(trx);
443	}
444	element->~rw_trx_hash_element_t();
445	}
446
447
448	/**
449	Initializer callback for lock-free hash.
450
451	Object is not yet accessible via rw_trx_hash by concurrent threads, but is
452	about to become such. Object id can be changed only by this callback and
453	remains the same until all pins to this object are released.
454
455	Object trx can be changed to 0 by erase() under object mutex protection,
456	which indicates it is about to be removed from lock-free hash and become
457	not accessible by concurrent threads.
458	*/
459
460	static void rw_trx_hash_initializer(LF_HASH *,
461	rw_trx_hash_element_t *element,
462	trx_t *trx)
463	{
464	ut_ad(element->trx == `0`);
465	element->trx= trx;
466	element->id= trx->id;
467	element->no= TRX_ID_MAX;
468	trx->rw_trx_hash_element= element;
469	}
470
471
472	/**
473	Gets LF_HASH pins.
474
475	Pins are used to protect object from being destroyed or reused. They are
476	normally stored in trx object for quick access. If caller doesn't have trx
477	available, we try to get it using currnet_trx(). If caller doesn't have trx
478	at all, temporary pins are allocated.
479	*/
480
481	LF_PINS get_pins(trx_t trx)
482	{
483	if (!trx->rw_trx_hash_pins)
484	{
485	trx->rw_trx_hash_pins= lf_hash_get_pins(&hash);
486	ut_a(trx->rw_trx_hash_pins);
487	}
488	return trx->rw_trx_hash_pins;
489	}
490
491
492	struct eliminate_duplicates_arg
493	{
494	trx_ids_t ids;
495	my_hash_walk_action action;
496	void *argument;
497	eliminate_duplicates_arg(size_t size, my_hash_walk_action act, void* arg):
498	action(act), argument(arg) { ids.reserve(size); }
499	};
500
501
502	static my_bool eliminate_duplicates(rw_trx_hash_element_t *element,
503	eliminate_duplicates_arg *arg)
504	{
505	for (trx_ids_t::iterator it= arg->ids.begin(); it != arg->ids.end(); it ++)
506	{
507	if (*it == element->id)
508	return `0`;
509	}
510	arg->ids.push_back(element->id);
511	return arg->action(element, arg->argument);
512	}
513
514
515	#ifdef UNIV_DEBUG
516	static void validate_element(trx_t *trx)
517	{
518	ut_ad(!trx->read_only \|\| !trx->rsegs.m_redo.rseg);
519	ut_ad(!trx_is_autocommit_non_locking(trx));
520	mutex_enter(&trx->mutex);
521	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) \|\|
522	trx_state_eq(trx, TRX_STATE_PREPARED));
523	mutex_exit(&trx->mutex);
524	}
525
526
527	struct debug_iterator_arg
528	{
529	my_hash_walk_action action;
530	void *argument;
531	};
532
533
534	static my_bool debug_iterator(rw_trx_hash_element_t *element,
535	debug_iterator_arg *arg)
536	{
537	mutex_enter(&element->mutex);
538	if (element->trx)
539	validate_element(element->trx);
540	mutex_exit(&element->mutex);
541	return arg->action(element, arg->argument);
542	}
543	#endif
544
545
546	public:
547	void init()
548	{
549	lf_hash_init(&hash, sizeof(rw_trx_hash_element_t), LF_HASH_UNIQUE, `0`,
550	sizeof(trx_id_t), `0`, &my_charset_bin);
551	hash.alloc.constructor= rw_trx_hash_constructor;
552	hash.alloc.destructor= rw_trx_hash_destructor;
553	hash.initializer=
554	reinterpret_cast<lf_hash_initializer>(rw_trx_hash_initializer);
555	}
556
557
558	void destroy()
559	{
560	hash.alloc.destructor= rw_trx_hash_shutdown_destructor;
561	lf_hash_destroy(&hash);
562	}
563
564
565	/**
566	Releases LF_HASH pins.
567
568	Must be called by thread that owns trx_t object when the latter is being
569	"detached" from thread (e.g. released to the pool by trx_free()). Can be
570	called earlier if thread is expected not to use rw_trx_hash.
571
572	Since pins are not allowed to be transferred to another thread,
573	initialisation thread calls this for recovered transactions.
574	*/
575
576	void put_pins(trx_t *trx)
577	{
578	if (trx->rw_trx_hash_pins)
579	{
580	lf_hash_put_pins(trx->rw_trx_hash_pins);
581	trx->rw_trx_hash_pins= `0`;
582	}
583	}
584
585
586	/**
587	Finds trx object in lock-free hash with given id.
588
589	Only ACTIVE or PREPARED trx objects may participate in hash. Nevertheless
590	the transaction may get committed before this method returns.
591
592	With do_ref_count == false the caller may dereference returned trx pointer
593	only if lock_sys.mutex was acquired before calling find().
594
595	With do_ref_count == true caller may dereference trx even if it is not
596	holding lock_sys.mutex. Caller is responsible for calling
597	trx->release_reference() when it is done playing with trx.
598
599	Ideally this method should get caller rw_trx_hash_pins along with trx
600	object as a parameter, similar to insert() and erase(). However most
601	callers lose trx early in their call chains and it is not that easy to pass
602	them through.
603
604	So we take more expensive approach: get trx through current_thd()->ha_data.
605	Some threads don't have trx attached to THD, and at least server
606	initialisation thread, fts_optimize_thread, srv_master_thread,
607	dict_stats_thread, srv_monitor_thread, btr_defragment_thread don't even
608	have THD at all. For such cases we allocate pins only for duration of
609	search and free them immediately.
610
611	This has negative performance impact and should be fixed eventually (by
612	passing caller_trx as a parameter). Still stream of DML is more or less Ok.
613
614	@return
615	@retval 0 not found
616	@retval pointer to trx
617	*/
618
619	trx_t find(trx_t caller_trx, trx_id_t trx_id, bool do_ref_count= false)
620	{
621	/*
622	In MariaDB 10.3, purge will reset DB_TRX_ID to 0
623	when the history is lost. Read/write transactions will
624	always have a nonzero trx_t::id; there the value 0 is
625	reserved for transactions that did not write or lock
626	anything yet.
627	*/
628	if (!trx_id)
629	return NULL;
630	if (caller_trx && caller_trx->id == trx_id)
631	{
632	if (do_ref_count)
633	caller_trx->reference();
634	return caller_trx;
635	}
636
637	trx_t *trx= `0`;
638	LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash);
639	ut_a(pins);
640
641	rw_trx_hash_element_t element= reinterpret_cast<rw_trx_hash_element_t>
642	(lf_hash_search(&hash, pins, reinterpret_cast<const void*>(&trx_id),
643	sizeof(trx_id_t)));
644	if (element)
645	{
646	mutex_enter(&element->mutex);
647	lf_hash_search_unpin(pins);
648	if ((trx= element->trx))
649	{
650	if (do_ref_count)
651	trx->reference();
652	ut_d(validate_element(trx));
653	}
654	mutex_exit(&element->mutex);
655	}
656	if (!caller_trx)
657	lf_hash_put_pins(pins);
658	return trx;
659	}
660
661
662	/**
663	Inserts trx to lock-free hash.
664
665	Object becomes accessible via rw_trx_hash.
666	*/
667
668	void insert(trx_t *trx)
669	{
670	ut_d(validate_element(trx));
671	int res= lf_hash_insert(&hash, get_pins(trx),
672	reinterpret_cast<void*>(trx));
673	ut_a(res == `0`);
674	}
675
676
677	/**
678	Removes trx from lock-free hash.
679
680	Object becomes not accessible via rw_trx_hash. But it still can be pinned
681	by concurrent find(), which is supposed to release it immediately after
682	it sees object trx is 0.
683	*/
684
685	void erase(trx_t *trx)
686	{
687	ut_d(validate_element(trx));
688	mutex_enter(&trx->rw_trx_hash_element->mutex);
689	trx->rw_trx_hash_element->trx= `0`;
690	mutex_exit(&trx->rw_trx_hash_element->mutex);
691	int res= lf_hash_delete(&hash, get_pins(trx),
692	reinterpret_cast<const void*>(&trx->id),
693	sizeof(trx_id_t));
694	ut_a(res == `0`);
695	}
696
697
698	/**
699	Returns the number of elements in the hash.
700
701	The number is exact only if hash is protected against concurrent
702	modifications (e.g. single threaded startup or hash is protected
703	by some mutex). Otherwise the number may be used as a hint only,
704	because it may change even before this method returns.
705	*/
706
707	uint32_t size()
708	{
709	return uint32_t(my_atomic_load32_explicit(&hash.count,
710	MY_MEMORY_ORDER_RELAXED));
711	}
712
713
714	/**
715	Iterates the hash.
716
717	@param caller_trx used to get/set pins
718	@param action called for every element in hash
719	@param argument opque argument passed to action
720
721	May return the same element multiple times if hash is under contention.
722	If caller doesn't like to see the same transaction multiple times, it has
723	to call iterate_no_dups() instead.
724
725	May return element with committed transaction. If caller doesn't like to
726	see committed transactions, it has to skip those under element mutex:
727
728	mutex_enter(&element->mutex);
729	if (trx_t trx= element->trx)
730	{
731	// trx is protected against commit in this branch
732	}
733	mutex_exit(&element->mutex);
734
735	May miss concurrently inserted transactions.
736
737	@return
738	@retval 0 iteration completed successfully
739	@retval 1 iteration was interrupted (action returned 1)
740	*/
741
742	int iterate(trx_t caller_trx, my_hash_walk_action action, void* *argument)
743	{
744	LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash);
745	ut_a(pins);
746	#ifdef UNIV_DEBUG
747	debug_iterator_arg debug_arg= { action, argument };
748	action= reinterpret_cast<my_hash_walk_action>(debug_iterator);
749	argument= &debug_arg;
750	#endif
751	int res= lf_hash_iterate(&hash, pins, action, argument);
752	if (!caller_trx)
753	lf_hash_put_pins(pins);
754	return res;
755	}
756
757
758	int iterate(my_hash_walk_action action, void *argument)
759	{
760	return iterate(current_trx(), action, argument);
761	}
762
763
764	/**
765	Iterates the hash and eliminates duplicate elements.
766
767	@sa iterate()
768	*/
769
770	int iterate_no_dups(trx_t *caller_trx, my_hash_walk_action action,
771	void *argument)
772	{
773	eliminate_duplicates_arg arg(size() + `32`, action, argument);
774	return iterate(caller_trx, reinterpret_cast<my_hash_walk_action>
775	(eliminate_duplicates), &arg);
776	}
777
778
779	int iterate_no_dups(my_hash_walk_action action, void *argument)
780	{
781	return iterate_no_dups(current_trx(), action, argument);
782	}
783	};
784
785
/** The transaction system central memory data structure. */
787	class trx_sys_t
788	{
789	/**
790	The smallest number not yet assigned as a transaction id or transaction
791	number. Accessed and updated with atomic operations.
792	*/
793	MY_ALIGNED(CACHE_LINE_SIZE) trx_id_t m_max_trx_id;
794
795
796	/**
797	Solves race conditions between register_rw() and snapshot_ids() as well as
798	race condition between assign_new_trx_no() and snapshot_ids().
799
800	@sa register_rw()
801	@sa assign_new_trx_no()
802	@sa snapshot_ids()
803	*/
804	MY_ALIGNED(CACHE_LINE_SIZE) trx_id_t m_rw_trx_hash_version;
805
806
807	/**
808	TRX_RSEG_HISTORY list length (number of committed transactions to purge)
809	*/
810	MY_ALIGNED(CACHE_LINE_SIZE) int32 rseg_history_len;
811
812	bool m_initialised;
813
814	public:
/** Mutex protecting trx_list. */
816	MY_ALIGNED(CACHE_LINE_SIZE) mutable TrxSysMutex mutex;
817
/** List of all transactions. */
819	MY_ALIGNED(CACHE_LINE_SIZE) trx_ut_list_t trx_list;
820
821	MY_ALIGNED(CACHE_LINE_SIZE)
/** Temporary rollback segments */
823	trx_rseg_t* temp_rsegs[TRX_SYS_N_RSEGS];
824
825	MY_ALIGNED(CACHE_LINE_SIZE)
826	trx_rseg_t* rseg_array[TRX_SYS_N_RSEGS];
					/*!< Pointer array to rollback
					segments; NULL if slot not in use;
					created and destroyed in
					single-threaded mode; not protected
					by any mutex, because it is read-only
					during multi-threaded operation */
833
834	/**
835	Lock-free hash of in memory read-write transactions.
836	Works faster when it is on it's own cache line (tested).
837	*/
838
839	MY_ALIGNED(CACHE_LINE_SIZE) rw_trx_hash_t rw_trx_hash;
840
841
842	#ifdef WITH_WSREP
/** Latest recovered XID during startup */
844	XID recovered_wsrep_xid;
845	#endif
/** Latest recovered binlog offset */
847	uint64_t recovered_binlog_offset;
/** Latest recovered binlog file name */
849	char recovered_binlog_filename[TRX_SYS_MYSQL_LOG_NAME_LEN];
850
851
852	/**
853	Constructor.
854
855	Some members may require late initialisation, thus we just mark object as
856	uninitialised. Real initialisation happens in create().
857	*/
858
859	trx_sys_t(): m_initialised(false) {}
860
861
862	/**
863	Returns the minimum trx id in rw trx list.
864
865	This is the smallest id for which the trx can possibly be active. (But, you
866	must look at the trx->state to find out if the minimum trx id transaction
867	itself is active, or already committed.)
868
869	@return the minimum trx id, or m_max_trx_id if the trx list is empty
870	*/
871
872	trx_id_t get_min_trx_id()
873	{
874	trx_id_t id= get_max_trx_id();
875	rw_trx_hash.iterate(reinterpret_cast<my_hash_walk_action>
876	(get_min_trx_id_callback), &id);
877	return id;
878	}
879
880
881	/**
882	Determines the maximum transaction id.
883
884	@return maximum currently allocated trx id; will be stale after the
885	next call to trx_sys.get_new_trx_id()
886	*/
887
888	trx_id_t get_max_trx_id()
889	{
890	return static_cast<trx_id_t>
891	(my_atomic_load64_explicit(reinterpret_cast<int64*>(&m_max_trx_id),
892	MY_MEMORY_ORDER_RELAXED));
893	}
894
895
896	/**
897	Allocates a new transaction id.
898	@return new, allocated trx id
899	*/
900
901	trx_id_t get_new_trx_id()
902	{
903	trx_id_t id= get_new_trx_id_no_refresh();
904	refresh_rw_trx_hash_version();
905	return id;
906	}
907
908
909	/**
910	Allocates and assigns new transaction serialisation number.
911
912	There's a gap between m_max_trx_id increment and transaction serialisation
913	number becoming visible through rw_trx_hash. While we're in this gap
914	concurrent thread may come and do MVCC snapshot without seeing allocated
915	but not yet assigned serialisation number. Then at some point purge thread
916	may clone this view. As a result it won't see newly allocated serialisation
917	number and may remove "unnecessary" history data of this transaction from
918	rollback segments.
919
920	m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has
921	to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively
922	means that all transaction serialisation numbers up to m_max_trx_id are
923	available through rw_trx_hash.
924
925	We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so
926	that m_rw_trx_hash_version increment happens after
927	trx->rw_trx_hash_element->no becomes visible through rw_trx_hash.
928
929	@param trx transaction
930	*/
931	void assign_new_trx_no(trx_t *trx)
932	{
933	trx->no= get_new_trx_id_no_refresh();
934	my_atomic_store64_explicit(reinterpret_cast<int64*>
935	(&trx->rw_trx_hash_element->no),
936	trx->no, MY_MEMORY_ORDER_RELAXED);
937	refresh_rw_trx_hash_version();
938	}
939
940
941	/**
942	Takes MVCC snapshot.
943
944	To reduce malloc probablility we reserver rw_trx_hash.size() + 32 elements
945	in ids.
946
947	For details about get_rw_trx_hash_version() != get_max_trx_id() spin
948	@sa register_rw() and @sa assign_new_trx_no().
949
950	We rely on get_rw_trx_hash_version() to issue ACQUIRE memory barrier so
951	that loading of m_rw_trx_hash_version happens before accessing rw_trx_hash.
952
953	To optimise snapshot creation rw_trx_hash.iterate() is being used instead
954	of rw_trx_hash.iterate_no_dups(). It means that some transaction
955	identifiers may appear multiple times in ids.
956
957	@param[in,out] caller_trx used to get access to rw_trx_hash_pins
958	@param[out] ids array to store registered transaction identifiers
959	@param[out] max_trx_id variable to store m_max_trx_id value
960	@param[out] mix_trx_no variable to store min(trx->no) value
961	*/
962
963	void snapshot_ids(trx_t caller_trx, trx_ids_t ids, trx_id_t *max_trx_id,
964	trx_id_t *min_trx_no)
965	{
966	ut_ad(!mutex_own(&mutex));
967	snapshot_ids_arg arg(ids);
968
969	while ((arg.m_id= get_rw_trx_hash_version()) != get_max_trx_id())
970	ut_delay(`1`);
971	arg.m_no= arg.m_id;
972
973	ids->clear();
974	ids->reserve(rw_trx_hash.size() + `32`);
975	rw_trx_hash.iterate(caller_trx,
976	reinterpret_cast<my_hash_walk_action>(copy_one_id),
977	&arg);
978
979	*max_trx_id= arg.m_id;
980	*min_trx_no= arg.m_no;
981	}
982
983
984	/* Initialiser for m_max_trx_id and m_rw_trx_hash_version. /
985	void init_max_trx_id(trx_id_t value)
986	{
987	m_max_trx_id= m_rw_trx_hash_version= value;
988	}
989
990
991	bool is_initialised() { return m_initialised; }
992
993
/** Initialise the transaction subsystem. */
995	void create();
996
/** Close the transaction subsystem on shutdown. */
998	void close();
999
/** @return total number of active (non-prepared) transactions */
1001	ulint any_active_transactions();
1002
1003
1004	/**
1005	Registers read-write transaction.
1006
1007	Transaction becomes visible to MVCC.
1008
1009	There's a gap between m_max_trx_id increment and transaction becoming
1010	visible through rw_trx_hash. While we're in this gap concurrent thread may
1011	come and do MVCC snapshot. As a result concurrent read view will be able to
1012	observe records owned by this transaction even before it was committed.
1013
1014	m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has
1015	to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively
1016	means that all transactions up to m_max_trx_id are available through
1017	rw_trx_hash.
1018
1019	We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so
1020	that m_rw_trx_hash_version increment happens after transaction becomes
1021	visible through rw_trx_hash.
1022	*/
1023
1024	void register_rw(trx_t *trx)
1025	{
1026	trx->id= get_new_trx_id_no_refresh();
1027	rw_trx_hash.insert(trx);
1028	refresh_rw_trx_hash_version();
1029	}
1030
1031
1032	/**
1033	Deregisters read-write transaction.
1034
1035	Transaction is removed from rw_trx_hash, which releases all implicit locks.
1036	MVCC snapshot won't see this transaction anymore.
1037	*/
1038
1039	void deregister_rw(trx_t *trx)
1040	{
1041	rw_trx_hash.erase(trx);
1042	}
1043
1044
1045	bool is_registered(trx_t *caller_trx, trx_id_t id)
1046	{
1047	return rw_trx_hash.find(caller_trx, id);
1048	}
1049
1050
1051	trx_t find(trx_t caller_trx, trx_id_t id)
1052	{
1053	return rw_trx_hash.find(caller_trx, id, true);
1054	}
1055
1056
1057	/**
1058	Registers transaction in trx_sys.
1059
1060	@param trx transaction
1061	*/
1062	void register_trx(trx_t *trx)
1063	{
1064	mutex_enter(&mutex);
1065	UT_LIST_ADD_FIRST(trx_list, trx);
1066	mutex_exit(&mutex);
1067	}
1068
1069
1070	/**
1071	Deregisters transaction in trx_sys.
1072
1073	@param trx transaction
1074	*/
1075	void deregister_trx(trx_t *trx)
1076	{
1077	mutex_enter(&mutex);
1078	UT_LIST_REMOVE(trx_list, trx);
1079	mutex_exit(&mutex);
1080	}
1081
1082
/**
  Clones the oldest view and stores it in view.

  No need to call ReadView::close(). The caller owns the view that is passed
  in. This function is called by purge thread to determine whether it should
  purge the delete marked record or not.
*/
void clone_oldest_view();
1091
1092
1093	/* @return the number of active views /
1094	size_t view_count() const
1095	{
1096	size_t count= `0`;
1097
1098	mutex_enter(&mutex);
1099	for (const trx_t *trx= UT_LIST_GET_FIRST(trx_list); trx;
1100	trx= UT_LIST_GET_NEXT(trx_list, trx))
1101	{
1102	if (trx->read_view.get_state() == READ_VIEW_STATE_OPEN)
1103	++count;
1104	}
1105	mutex_exit(&mutex);
1106	return count;
1107	}
1108
1109	/* @return number of committed transactions waiting for purge /
1110	ulint history_size() const
1111	{
1112	return uint32(my_atomic_load32(&const_cast<trx_sys_t>(this*)
1113	->rseg_history_len));
1114	}
1115	/* Add to the TRX_RSEG_HISTORY length (on database startup). /
1116	void history_add(int32 len)
1117	{
1118	my_atomic_add32(&rseg_history_len, len);
1119	}
1120	/* Register a committed transaction. /
1121	void history_insert() { history_add(`1`); }
1122	/* Note that a committed transaction was purged. /
1123	void history_remove() { history_add(-`1`); }
1124
1125	private:
1126	static my_bool get_min_trx_id_callback(rw_trx_hash_element_t *element,
1127	trx_id_t *id)
1128	{
1129	if (element->id < *id)
1130	{
1131	mutex_enter(&element->mutex);
1132	/ We don't care about read-only transactions here. /
1133	if (element->trx && element->trx->rsegs.m_redo.rseg)
1134	*id= element->id;
1135	mutex_exit(&element->mutex);
1136	}
1137	return `0`;
1138	}
1139
1140
1141	struct snapshot_ids_arg
1142	{
1143	snapshot_ids_arg(trx_ids_t *ids): m_ids(ids) {}
1144	trx_ids_t *m_ids;
1145	trx_id_t m_id;
1146	trx_id_t m_no;
1147	};
1148
1149
1150	static my_bool copy_one_id(rw_trx_hash_element_t *element,
1151	snapshot_ids_arg *arg)
1152	{
1153	if (element->id < arg->m_id)
1154	{
1155	trx_id_t no= static_cast<trx_id_t>(my_atomic_load64_explicit(
1156	reinterpret_cast<int64*>(&element->no), MY_MEMORY_ORDER_RELAXED));
1157	arg->m_ids->push_back(element->id);
1158	if (no < arg->m_no)
1159	arg->m_no= no;
1160	}
1161	return `0`;
1162	}
1163
1164
1165	/* Getter for m_rw_trx_hash_version, must issue ACQUIRE memory barrier. /
1166	trx_id_t get_rw_trx_hash_version()
1167	{
1168	return static_cast<trx_id_t>
1169	(my_atomic_load64_explicit(reinterpret_cast<int64*>
1170	(&m_rw_trx_hash_version),
1171	MY_MEMORY_ORDER_ACQUIRE));
1172	}
1173
1174
1175	/* Increments m_rw_trx_hash_version, must issue RELEASE memory barrier. /
1176	void refresh_rw_trx_hash_version()
1177	{
1178	my_atomic_add64_explicit(reinterpret_cast<int64*>(&m_rw_trx_hash_version),
1179	`1`, MY_MEMORY_ORDER_RELEASE);
1180	}
1181
1182
1183	/**
1184	Allocates new transaction id without refreshing rw_trx_hash version.
1185
1186	This method is extracted for exclusive use by register_rw() and
1187	assign_new_trx_no() where new id must be allocated atomically with
1188	payload of these methods from MVCC snapshot point of view.
1189
1190	@sa get_new_trx_id()
1191	@sa assign_new_trx_no()
1192
1193	@return new transaction id
1194	*/
1195
1196	trx_id_t get_new_trx_id_no_refresh()
1197	{
1198	return static_cast<trx_id_t>(my_atomic_add64_explicit(
1199	reinterpret_cast<int64*>(&m_max_trx_id), `1`, MY_MEMORY_ORDER_RELAXED));
1200	}
1201	};
1202
1203
1204	/* The transaction system /
1205	extern trx_sys_t trx_sys;
1206
1207	#endif
1208

Browse the source code of MariaDB/storage/innobase/include/trx0sys.h