1/*****************************************************************************
2
3Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
4Copyright (c) 2017, 2018, MariaDB Corporation.
5
6This program is free software; you can redistribute it and/or modify it under
7the terms of the GNU General Public License as published by the Free Software
8Foundation; version 2 of the License.
9
10This program is distributed in the hope that it will be useful, but WITHOUT
11ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14You should have received a copy of the GNU General Public License along with
15this program; if not, write to the Free Software Foundation, Inc.,
1651 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
17
18*****************************************************************************/
19
20/**************************************************//**
21@file include/trx0sys.h
22Transaction system
23
24Created 3/26/1996 Heikki Tuuri
25*******************************************************/
26
27#ifndef trx0sys_h
28#define trx0sys_h
29
30#include "univ.i"
31
32#include "buf0buf.h"
33#include "fil0fil.h"
34#include "trx0types.h"
35#include "mem0mem.h"
36#include "mtr0mtr.h"
37#include "ut0byte.h"
38#include "ut0lst.h"
39#include "read0types.h"
40#include "page0types.h"
41#include "ut0mutex.h"
42#include "trx0trx.h"
43#ifdef WITH_WSREP
44#include "trx0xa.h"
45#endif /* WITH_WSREP */
46
47typedef UT_LIST_BASE_NODE_T(trx_t) trx_ut_list_t;
48
49/** Checks if a page address is the trx sys header page.
50@param[in] page_id page id
51@return true if trx sys header page */
52inline
53bool
54trx_sys_hdr_page(const page_id_t& page_id)
55{
56 return(page_id.space() == TRX_SYS_SPACE
57 && page_id.page_no() == TRX_SYS_PAGE_NO);
58}
59
60/*****************************************************************//**
61Creates and initializes the transaction system at the database creation. */
62void
63trx_sys_create_sys_pages(void);
64/*==========================*/
/** Find an available rollback segment.
@param[in]	sys_header	the TRX_SYS header page
@return an unallocated rollback segment slot in the TRX_SYS header
@retval ULINT_UNDEFINED if not found */
ulint
trx_sys_rseg_find_free(const buf_block_t* sys_header);
71/** Request the TRX_SYS page.
72@param[in] rw whether to lock the page for writing
73@return the TRX_SYS page
74@retval NULL if the page cannot be read */
75inline
76buf_block_t*
77trx_sysf_get(mtr_t* mtr, bool rw = true)
78{
79 buf_block_t* block = buf_page_get(
80 page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
81 univ_page_size, rw ? RW_X_LATCH : RW_S_LATCH, mtr);
82 if (block) {
83 buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
84 }
85 return block;
86}
87
88#ifdef UNIV_DEBUG
89/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
90extern uint trx_rseg_n_slots_debug;
91#endif
92
93/** Write DB_TRX_ID.
94@param[out] db_trx_id the DB_TRX_ID field to be written to
95@param[in] id transaction ID */
96UNIV_INLINE
97void
98trx_write_trx_id(byte* db_trx_id, trx_id_t id)
99{
100 compile_time_assert(DATA_TRX_ID_LEN == 6);
101 ut_ad(id);
102 mach_write_to_6(db_trx_id, id);
103}
104
105/** Read a transaction identifier.
106@return id */
107inline
108trx_id_t
109trx_read_trx_id(const byte* ptr)
110{
111 compile_time_assert(DATA_TRX_ID_LEN == 6);
112 return(mach_read_from_6(ptr));
113}
114
115#ifdef UNIV_DEBUG
116/** Check that the DB_TRX_ID in a record is valid.
117@param[in] db_trx_id the DB_TRX_ID column to validate
118@param[in] trx_id the id of the ALTER TABLE transaction */
119inline bool trx_id_check(const void* db_trx_id, trx_id_t trx_id)
120{
121 trx_id_t id = trx_read_trx_id(static_cast<const byte*>(db_trx_id));
122 ut_ad(id == 0 || id > trx_id);
123 return true;
124}
125#endif
126
127/*****************************************************************//**
128Updates the offset information about the end of the MySQL binlog entry
129which corresponds to the transaction just being committed. In a MySQL
130replication slave updates the latest master binlog position up to which
131replication has proceeded. */
132void
133trx_sys_update_mysql_binlog_offset(
134/*===============================*/
135 const char* file_name,/*!< in: MySQL log file name */
136 int64_t offset, /*!< in: position in that log file */
137 buf_block_t* sys_header, /*!< in,out: trx sys header */
138 mtr_t* mtr); /*!< in,out: mini-transaction */
139/** Display the MySQL binlog offset info if it is present in the trx
140system header. */
141void
142trx_sys_print_mysql_binlog_offset();
143
144/** Create the rollback segments.
145@return whether the creation succeeded */
146bool
147trx_sys_create_rsegs();
148
149/** The automatically created system rollback segment has this id */
150#define TRX_SYS_SYSTEM_RSEG_ID 0
151
152/** The offset of the transaction system header on the page */
153#define TRX_SYS FSEG_PAGE_DATA
154
155/** Transaction system header */
156/*------------------------------------------------------------- @{ */
157/** In old versions of InnoDB, this persisted the value of
158trx_sys.get_max_trx_id(). Starting with MariaDB 10.3.5,
159the field TRX_RSEG_MAX_TRX_ID in rollback segment header pages
160and the fields TRX_UNDO_TRX_ID, TRX_UNDO_TRX_NO in undo log pages
161are used instead. The field only exists for the purpose of upgrading
162from older MySQL or MariaDB versions. */
163#define TRX_SYS_TRX_ID_STORE 0
164#define TRX_SYS_FSEG_HEADER 8 /*!< segment header for the
165 tablespace segment the trx
166 system is created into */
167#define TRX_SYS_RSEGS (8 + FSEG_HEADER_SIZE)
168 /*!< the start of the array of
169 rollback segment specification
170 slots */
171/*------------------------------------------------------------- @} */
172
173/** The number of rollback segments; rollback segment id must fit in
174the 7 bits reserved for it in DB_ROLL_PTR. */
175#define TRX_SYS_N_RSEGS 128
176/** Maximum number of undo tablespaces (not counting the system tablespace) */
177#define TRX_SYS_MAX_UNDO_SPACES (TRX_SYS_N_RSEGS - 1)
178
179/* Rollback segment specification slot offsets */
180
181/** the tablespace ID of an undo log header; starting with
182MySQL/InnoDB 5.1.7, this is FIL_NULL if the slot is unused */
183#define TRX_SYS_RSEG_SPACE 0
184/** the page number of an undo log header, or FIL_NULL if unused */
185#define TRX_SYS_RSEG_PAGE_NO 4
186/** Size of a rollback segment specification slot */
187#define TRX_SYS_RSEG_SLOT_SIZE 8
188
189/** Read the tablespace ID of a rollback segment slot.
190@param[in] sys_header TRX_SYS page
191@param[in] rseg_id rollback segment identifier
192@return undo tablespace id */
193inline
194uint32_t
195trx_sysf_rseg_get_space(const buf_block_t* sys_header, ulint rseg_id)
196{
197 ut_ad(rseg_id < TRX_SYS_N_RSEGS);
198 return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE
199 + rseg_id * TRX_SYS_RSEG_SLOT_SIZE
200 + sys_header->frame);
201}
202
203/** Read the page number of a rollback segment slot.
204@param[in] sys_header TRX_SYS page
205@param[in] rseg_id rollback segment identifier
206@return undo page number */
207inline
208uint32_t
209trx_sysf_rseg_get_page_no(const buf_block_t* sys_header, ulint rseg_id)
210{
211 ut_ad(rseg_id < TRX_SYS_N_RSEGS);
212 return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO
213 + rseg_id * TRX_SYS_RSEG_SLOT_SIZE
214 + sys_header->frame);
215}
216
217/** Maximum length of MySQL binlog file name, in bytes.
218(Used before MariaDB 10.3.5.) */
219#define TRX_SYS_MYSQL_LOG_NAME_LEN 512
220/** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */
221#define TRX_SYS_MYSQL_LOG_MAGIC_N 873422344
222
223#if UNIV_PAGE_SIZE_MIN < 4096
224# error "UNIV_PAGE_SIZE_MIN < 4096"
225#endif
226/** The offset of the MySQL binlog offset info in the trx system header */
227#define TRX_SYS_MYSQL_LOG_INFO (srv_page_size - 1000)
228#define TRX_SYS_MYSQL_LOG_MAGIC_N_FLD 0 /*!< magic number which is
229 TRX_SYS_MYSQL_LOG_MAGIC_N
230 if we have valid data in the
231 MySQL binlog info */
232#define TRX_SYS_MYSQL_LOG_OFFSET 4 /*!< the 64-bit offset
233 within that file */
234#define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */
235
236/** Memory map TRX_SYS_PAGE_NO = 5 when srv_page_size = 4096
237
2380...37 FIL_HEADER
23938...45 TRX_SYS_TRX_ID_STORE
24046...55 TRX_SYS_FSEG_HEADER (FSEG_HEADER_SIZE == 10)
24156 TRX_SYS_RSEGS
242 56...59 TRX_SYS_RSEG_SPACE for slot 0
243 60...63 TRX_SYS_RSEG_PAGE_NO for slot 0
244 64...67 TRX_SYS_RSEG_SPACE for slot 1
245 68...71 TRX_SYS_RSEG_PAGE_NO for slot 1
246....
 592..595 TRX_SYS_RSEG_SPACE for slot 67
 596..599 TRX_SYS_RSEG_PAGE_NO for slot 67
...
 1076..1079 TRX_SYS_RSEG_PAGE_NO for slot 127
251
252(srv_page_size-3500 WSREP ::: FAIL would overwrite undo tablespace
253space_id, page_no pairs :::)
254596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD
255600 TRX_SYS_WSREP_XID_FORMAT
256604 TRX_SYS_WSREP_XID_GTRID_LEN
257608 TRX_SYS_WSREP_XID_BQUAL_LEN
258612 TRX_SYS_WSREP_XID_DATA (len = 128)
259739 TRX_SYS_WSREP_XID_DATA_END
260
261FIXED WSREP XID info offsets for 4k page size 10.0.32-galera
262(srv_page_size-2500)
2631596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD
2641600 TRX_SYS_WSREP_XID_FORMAT
2651604 TRX_SYS_WSREP_XID_GTRID_LEN
2661608 TRX_SYS_WSREP_XID_BQUAL_LEN
2671612 TRX_SYS_WSREP_XID_DATA (len = 128)
2681739 TRX_SYS_WSREP_XID_DATA_END
269
270(srv_page_size - 2000 MYSQL MASTER LOG)
2712096 TRX_SYS_MYSQL_MASTER_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
2722100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH
2732104 TRX_SYS_MYSQL_LOG_OFFSET_LOW
2742108 TRX_SYS_MYSQL_LOG_NAME
275
276(srv_page_size - 1000 MYSQL LOG)
2773096 TRX_SYS_MYSQL_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
2783100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH
2793104 TRX_SYS_MYSQL_LOG_OFFSET_LOW
2803108 TRX_SYS_MYSQL_LOG_NAME
281
282(srv_page_size - 200 DOUBLEWRITE)
2833896 TRX_SYS_DOUBLEWRITE TRX_SYS_DOUBLEWRITE_FSEG
2843906 TRX_SYS_DOUBLEWRITE_MAGIC
2853910 TRX_SYS_DOUBLEWRITE_BLOCK1
2863914 TRX_SYS_DOUBLEWRITE_BLOCK2
2873918 TRX_SYS_DOUBLEWRITE_REPEAT
3930 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED
289
290(srv_page_size - 8, TAILER)
4088..4095 FIL_TAILER
292
293*/
294#ifdef WITH_WSREP
295/** The offset to WSREP XID headers (used before MariaDB 10.3.5) */
296#define TRX_SYS_WSREP_XID_INFO std::max(srv_page_size - 3500, 1596UL)
297#define TRX_SYS_WSREP_XID_MAGIC_N_FLD 0
298#define TRX_SYS_WSREP_XID_MAGIC_N 0x77737265
299
300/** XID field: formatID, gtrid_len, bqual_len, xid_data */
301#define TRX_SYS_WSREP_XID_LEN (4 + 4 + 4 + XIDDATASIZE)
302#define TRX_SYS_WSREP_XID_FORMAT 4
303#define TRX_SYS_WSREP_XID_GTRID_LEN 8
304#define TRX_SYS_WSREP_XID_BQUAL_LEN 12
305#define TRX_SYS_WSREP_XID_DATA 16
306#endif /* WITH_WSREP*/
307
308/** Doublewrite buffer */
309/* @{ */
310/** The offset of the doublewrite buffer header on the trx system header page */
311#define TRX_SYS_DOUBLEWRITE (srv_page_size - 200)
312/*-------------------------------------------------------------*/
313#define TRX_SYS_DOUBLEWRITE_FSEG 0 /*!< fseg header of the fseg
314 containing the doublewrite
315 buffer */
316#define TRX_SYS_DOUBLEWRITE_MAGIC FSEG_HEADER_SIZE
317 /*!< 4-byte magic number which
318 shows if we already have
319 created the doublewrite
320 buffer */
321#define TRX_SYS_DOUBLEWRITE_BLOCK1 (4 + FSEG_HEADER_SIZE)
322 /*!< page number of the
323 first page in the first
324 sequence of 64
325 (= FSP_EXTENT_SIZE) consecutive
326 pages in the doublewrite
327 buffer */
328#define TRX_SYS_DOUBLEWRITE_BLOCK2 (8 + FSEG_HEADER_SIZE)
329 /*!< page number of the
330 first page in the second
331 sequence of 64 consecutive
332 pages in the doublewrite
333 buffer */
334#define TRX_SYS_DOUBLEWRITE_REPEAT 12 /*!< we repeat
335 TRX_SYS_DOUBLEWRITE_MAGIC,
336 TRX_SYS_DOUBLEWRITE_BLOCK1,
337 TRX_SYS_DOUBLEWRITE_BLOCK2
338 so that if the trx sys
339 header is half-written
340 to disk, we still may
341 be able to recover the
342 information */
343/** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
344we must reset the doublewrite buffer, because starting from 4.1.x the
345space id of a data page is stored into
346FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
347#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE)
348
349/*-------------------------------------------------------------*/
350/** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */
351#define TRX_SYS_DOUBLEWRITE_MAGIC_N 536853855
352/** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */
353#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N 1783657386
354
355/** Size of the doublewrite block in pages */
356#define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE
357/* @} */
358
359trx_t* current_trx();
360
361struct rw_trx_hash_element_t
362{
363 rw_trx_hash_element_t(): trx(0)
364 {
365 mutex_create(LATCH_ID_RW_TRX_HASH_ELEMENT, &mutex);
366 }
367
368
369 ~rw_trx_hash_element_t()
370 {
371 mutex_free(&mutex);
372 }
373
374
375 trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */
376 trx_id_t no;
377 trx_t *trx;
378 ib_mutex_t mutex;
379};
380
381
382/**
383 Wrapper around LF_HASH to store set of in memory read-write transactions.
384*/
385
386class rw_trx_hash_t
387{
388 LF_HASH hash;
389
390
391 /**
392 Constructor callback for lock-free allocator.
393
394 Object is just allocated and is not yet accessible via rw_trx_hash by
395 concurrent threads. Object can be reused multiple times before it is freed.
396 Every time object is being reused initializer() callback is called.
397 */
398
399 static void rw_trx_hash_constructor(uchar *arg)
400 {
401 new(arg + LF_HASH_OVERHEAD) rw_trx_hash_element_t();
402 }
403
404
405 /**
406 Destructor callback for lock-free allocator.
407
408 Object is about to be freed and is not accessible via rw_trx_hash by
409 concurrent threads.
410 */
411
412 static void rw_trx_hash_destructor(uchar *arg)
413 {
414 reinterpret_cast<rw_trx_hash_element_t*>
415 (arg + LF_HASH_OVERHEAD)->~rw_trx_hash_element_t();
416 }
417
418
419 /**
420 Destructor callback for lock-free allocator.
421
422 This destructor is used at shutdown. It frees remaining transaction
423 objects.
424
425 XA PREPARED transactions may remain if they haven't been committed or
426 rolled back. ACTIVE transactions may remain if startup was interrupted or
427 server is running in read-only mode or for certain srv_force_recovery
428 levels.
429 */
430
431 static void rw_trx_hash_shutdown_destructor(uchar *arg)
432 {
433 rw_trx_hash_element_t *element=
434 reinterpret_cast<rw_trx_hash_element_t*>(arg + LF_HASH_OVERHEAD);
435 if (trx_t *trx= element->trx)
436 {
437 ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED) ||
438 (trx_state_eq(trx, TRX_STATE_ACTIVE) &&
439 (!srv_was_started ||
440 srv_read_only_mode ||
441 srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO)));
442 trx_free_at_shutdown(trx);
443 }
444 element->~rw_trx_hash_element_t();
445 }
446
447
448 /**
449 Initializer callback for lock-free hash.
450
451 Object is not yet accessible via rw_trx_hash by concurrent threads, but is
452 about to become such. Object id can be changed only by this callback and
453 remains the same until all pins to this object are released.
454
455 Object trx can be changed to 0 by erase() under object mutex protection,
456 which indicates it is about to be removed from lock-free hash and become
457 not accessible by concurrent threads.
458 */
459
460 static void rw_trx_hash_initializer(LF_HASH *,
461 rw_trx_hash_element_t *element,
462 trx_t *trx)
463 {
464 ut_ad(element->trx == 0);
465 element->trx= trx;
466 element->id= trx->id;
467 element->no= TRX_ID_MAX;
468 trx->rw_trx_hash_element= element;
469 }
470
471
472 /**
473 Gets LF_HASH pins.
474
475 Pins are used to protect object from being destroyed or reused. They are
476 normally stored in trx object for quick access. If caller doesn't have trx
477 available, we try to get it using currnet_trx(). If caller doesn't have trx
478 at all, temporary pins are allocated.
479 */
480
481 LF_PINS *get_pins(trx_t *trx)
482 {
483 if (!trx->rw_trx_hash_pins)
484 {
485 trx->rw_trx_hash_pins= lf_hash_get_pins(&hash);
486 ut_a(trx->rw_trx_hash_pins);
487 }
488 return trx->rw_trx_hash_pins;
489 }
490
491
492 struct eliminate_duplicates_arg
493 {
494 trx_ids_t ids;
495 my_hash_walk_action action;
496 void *argument;
497 eliminate_duplicates_arg(size_t size, my_hash_walk_action act, void* arg):
498 action(act), argument(arg) { ids.reserve(size); }
499 };
500
501
502 static my_bool eliminate_duplicates(rw_trx_hash_element_t *element,
503 eliminate_duplicates_arg *arg)
504 {
505 for (trx_ids_t::iterator it= arg->ids.begin(); it != arg->ids.end(); it++)
506 {
507 if (*it == element->id)
508 return 0;
509 }
510 arg->ids.push_back(element->id);
511 return arg->action(element, arg->argument);
512 }
513
514
515#ifdef UNIV_DEBUG
516 static void validate_element(trx_t *trx)
517 {
518 ut_ad(!trx->read_only || !trx->rsegs.m_redo.rseg);
519 ut_ad(!trx_is_autocommit_non_locking(trx));
520 mutex_enter(&trx->mutex);
521 ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
522 trx_state_eq(trx, TRX_STATE_PREPARED));
523 mutex_exit(&trx->mutex);
524 }
525
526
527 struct debug_iterator_arg
528 {
529 my_hash_walk_action action;
530 void *argument;
531 };
532
533
534 static my_bool debug_iterator(rw_trx_hash_element_t *element,
535 debug_iterator_arg *arg)
536 {
537 mutex_enter(&element->mutex);
538 if (element->trx)
539 validate_element(element->trx);
540 mutex_exit(&element->mutex);
541 return arg->action(element, arg->argument);
542 }
543#endif
544
545
546public:
547 void init()
548 {
549 lf_hash_init(&hash, sizeof(rw_trx_hash_element_t), LF_HASH_UNIQUE, 0,
550 sizeof(trx_id_t), 0, &my_charset_bin);
551 hash.alloc.constructor= rw_trx_hash_constructor;
552 hash.alloc.destructor= rw_trx_hash_destructor;
553 hash.initializer=
554 reinterpret_cast<lf_hash_initializer>(rw_trx_hash_initializer);
555 }
556
557
558 void destroy()
559 {
560 hash.alloc.destructor= rw_trx_hash_shutdown_destructor;
561 lf_hash_destroy(&hash);
562 }
563
564
565 /**
566 Releases LF_HASH pins.
567
568 Must be called by thread that owns trx_t object when the latter is being
569 "detached" from thread (e.g. released to the pool by trx_free()). Can be
570 called earlier if thread is expected not to use rw_trx_hash.
571
572 Since pins are not allowed to be transferred to another thread,
573 initialisation thread calls this for recovered transactions.
574 */
575
576 void put_pins(trx_t *trx)
577 {
578 if (trx->rw_trx_hash_pins)
579 {
580 lf_hash_put_pins(trx->rw_trx_hash_pins);
581 trx->rw_trx_hash_pins= 0;
582 }
583 }
584
585
586 /**
587 Finds trx object in lock-free hash with given id.
588
589 Only ACTIVE or PREPARED trx objects may participate in hash. Nevertheless
590 the transaction may get committed before this method returns.
591
592 With do_ref_count == false the caller may dereference returned trx pointer
593 only if lock_sys.mutex was acquired before calling find().
594
595 With do_ref_count == true caller may dereference trx even if it is not
596 holding lock_sys.mutex. Caller is responsible for calling
597 trx->release_reference() when it is done playing with trx.
598
599 Ideally this method should get caller rw_trx_hash_pins along with trx
600 object as a parameter, similar to insert() and erase(). However most
601 callers lose trx early in their call chains and it is not that easy to pass
602 them through.
603
604 So we take more expensive approach: get trx through current_thd()->ha_data.
605 Some threads don't have trx attached to THD, and at least server
606 initialisation thread, fts_optimize_thread, srv_master_thread,
607 dict_stats_thread, srv_monitor_thread, btr_defragment_thread don't even
608 have THD at all. For such cases we allocate pins only for duration of
609 search and free them immediately.
610
611 This has negative performance impact and should be fixed eventually (by
612 passing caller_trx as a parameter). Still stream of DML is more or less Ok.
613
614 @return
615 @retval 0 not found
616 @retval pointer to trx
617 */
618
619 trx_t *find(trx_t *caller_trx, trx_id_t trx_id, bool do_ref_count= false)
620 {
621 /*
622 In MariaDB 10.3, purge will reset DB_TRX_ID to 0
623 when the history is lost. Read/write transactions will
624 always have a nonzero trx_t::id; there the value 0 is
625 reserved for transactions that did not write or lock
626 anything yet.
627 */
628 if (!trx_id)
629 return NULL;
630 if (caller_trx && caller_trx->id == trx_id)
631 {
632 if (do_ref_count)
633 caller_trx->reference();
634 return caller_trx;
635 }
636
637 trx_t *trx= 0;
638 LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash);
639 ut_a(pins);
640
641 rw_trx_hash_element_t *element= reinterpret_cast<rw_trx_hash_element_t*>
642 (lf_hash_search(&hash, pins, reinterpret_cast<const void*>(&trx_id),
643 sizeof(trx_id_t)));
644 if (element)
645 {
646 mutex_enter(&element->mutex);
647 lf_hash_search_unpin(pins);
648 if ((trx= element->trx))
649 {
650 if (do_ref_count)
651 trx->reference();
652 ut_d(validate_element(trx));
653 }
654 mutex_exit(&element->mutex);
655 }
656 if (!caller_trx)
657 lf_hash_put_pins(pins);
658 return trx;
659 }
660
661
662 /**
663 Inserts trx to lock-free hash.
664
665 Object becomes accessible via rw_trx_hash.
666 */
667
668 void insert(trx_t *trx)
669 {
670 ut_d(validate_element(trx));
671 int res= lf_hash_insert(&hash, get_pins(trx),
672 reinterpret_cast<void*>(trx));
673 ut_a(res == 0);
674 }
675
676
677 /**
678 Removes trx from lock-free hash.
679
680 Object becomes not accessible via rw_trx_hash. But it still can be pinned
681 by concurrent find(), which is supposed to release it immediately after
682 it sees object trx is 0.
683 */
684
685 void erase(trx_t *trx)
686 {
687 ut_d(validate_element(trx));
688 mutex_enter(&trx->rw_trx_hash_element->mutex);
689 trx->rw_trx_hash_element->trx= 0;
690 mutex_exit(&trx->rw_trx_hash_element->mutex);
691 int res= lf_hash_delete(&hash, get_pins(trx),
692 reinterpret_cast<const void*>(&trx->id),
693 sizeof(trx_id_t));
694 ut_a(res == 0);
695 }
696
697
698 /**
699 Returns the number of elements in the hash.
700
701 The number is exact only if hash is protected against concurrent
702 modifications (e.g. single threaded startup or hash is protected
703 by some mutex). Otherwise the number may be used as a hint only,
704 because it may change even before this method returns.
705 */
706
707 uint32_t size()
708 {
709 return uint32_t(my_atomic_load32_explicit(&hash.count,
710 MY_MEMORY_ORDER_RELAXED));
711 }
712
713
714 /**
715 Iterates the hash.
716
717 @param caller_trx used to get/set pins
718 @param action called for every element in hash
719 @param argument opque argument passed to action
720
721 May return the same element multiple times if hash is under contention.
722 If caller doesn't like to see the same transaction multiple times, it has
723 to call iterate_no_dups() instead.
724
725 May return element with committed transaction. If caller doesn't like to
726 see committed transactions, it has to skip those under element mutex:
727
728 mutex_enter(&element->mutex);
729 if (trx_t trx= element->trx)
730 {
731 // trx is protected against commit in this branch
732 }
733 mutex_exit(&element->mutex);
734
735 May miss concurrently inserted transactions.
736
737 @return
738 @retval 0 iteration completed successfully
739 @retval 1 iteration was interrupted (action returned 1)
740 */
741
742 int iterate(trx_t *caller_trx, my_hash_walk_action action, void *argument)
743 {
744 LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash);
745 ut_a(pins);
746#ifdef UNIV_DEBUG
747 debug_iterator_arg debug_arg= { action, argument };
748 action= reinterpret_cast<my_hash_walk_action>(debug_iterator);
749 argument= &debug_arg;
750#endif
751 int res= lf_hash_iterate(&hash, pins, action, argument);
752 if (!caller_trx)
753 lf_hash_put_pins(pins);
754 return res;
755 }
756
757
758 int iterate(my_hash_walk_action action, void *argument)
759 {
760 return iterate(current_trx(), action, argument);
761 }
762
763
764 /**
765 Iterates the hash and eliminates duplicate elements.
766
767 @sa iterate()
768 */
769
770 int iterate_no_dups(trx_t *caller_trx, my_hash_walk_action action,
771 void *argument)
772 {
773 eliminate_duplicates_arg arg(size() + 32, action, argument);
774 return iterate(caller_trx, reinterpret_cast<my_hash_walk_action>
775 (eliminate_duplicates), &arg);
776 }
777
778
779 int iterate_no_dups(my_hash_walk_action action, void *argument)
780 {
781 return iterate_no_dups(current_trx(), action, argument);
782 }
783};
784
785
786/** The transaction system central memory data structure. */
787class trx_sys_t
788{
789 /**
790 The smallest number not yet assigned as a transaction id or transaction
791 number. Accessed and updated with atomic operations.
792 */
793 MY_ALIGNED(CACHE_LINE_SIZE) trx_id_t m_max_trx_id;
794
795
796 /**
797 Solves race conditions between register_rw() and snapshot_ids() as well as
798 race condition between assign_new_trx_no() and snapshot_ids().
799
800 @sa register_rw()
801 @sa assign_new_trx_no()
802 @sa snapshot_ids()
803 */
804 MY_ALIGNED(CACHE_LINE_SIZE) trx_id_t m_rw_trx_hash_version;
805
806
807 /**
808 TRX_RSEG_HISTORY list length (number of committed transactions to purge)
809 */
810 MY_ALIGNED(CACHE_LINE_SIZE) int32 rseg_history_len;
811
812 bool m_initialised;
813
814public:
815 /** Mutex protecting trx_list. */
816 MY_ALIGNED(CACHE_LINE_SIZE) mutable TrxSysMutex mutex;
817
818 /** List of all transactions. */
819 MY_ALIGNED(CACHE_LINE_SIZE) trx_ut_list_t trx_list;
820
821 MY_ALIGNED(CACHE_LINE_SIZE)
822 /** Temporary rollback segments */
823 trx_rseg_t* temp_rsegs[TRX_SYS_N_RSEGS];
824
825 MY_ALIGNED(CACHE_LINE_SIZE)
826 trx_rseg_t* rseg_array[TRX_SYS_N_RSEGS];
827 /*!< Pointer array to rollback
828 segments; NULL if slot not in use;
829 created and destroyed in
830 single-threaded mode; not protected
831 by any mutex, because it is read-only
832 during multi-threaded operation */
833
 /**
 Lock-free hash of in memory read-write transactions.
 Works faster when it is on its own cache line (tested).
 */
838
839 MY_ALIGNED(CACHE_LINE_SIZE) rw_trx_hash_t rw_trx_hash;
840
841
842#ifdef WITH_WSREP
843 /** Latest recovered XID during startup */
844 XID recovered_wsrep_xid;
845#endif
846 /** Latest recovered binlog offset */
847 uint64_t recovered_binlog_offset;
 /** Latest recovered binlog file name */
849 char recovered_binlog_filename[TRX_SYS_MYSQL_LOG_NAME_LEN];
850
851
852 /**
853 Constructor.
854
855 Some members may require late initialisation, thus we just mark object as
856 uninitialised. Real initialisation happens in create().
857 */
858
859 trx_sys_t(): m_initialised(false) {}
860
861
862 /**
863 Returns the minimum trx id in rw trx list.
864
865 This is the smallest id for which the trx can possibly be active. (But, you
866 must look at the trx->state to find out if the minimum trx id transaction
867 itself is active, or already committed.)
868
869 @return the minimum trx id, or m_max_trx_id if the trx list is empty
870 */
871
872 trx_id_t get_min_trx_id()
873 {
874 trx_id_t id= get_max_trx_id();
875 rw_trx_hash.iterate(reinterpret_cast<my_hash_walk_action>
876 (get_min_trx_id_callback), &id);
877 return id;
878 }
879
880
881 /**
882 Determines the maximum transaction id.
883
884 @return maximum currently allocated trx id; will be stale after the
885 next call to trx_sys.get_new_trx_id()
886 */
887
888 trx_id_t get_max_trx_id()
889 {
890 return static_cast<trx_id_t>
891 (my_atomic_load64_explicit(reinterpret_cast<int64*>(&m_max_trx_id),
892 MY_MEMORY_ORDER_RELAXED));
893 }
894
895
896 /**
897 Allocates a new transaction id.
898 @return new, allocated trx id
899 */
900
901 trx_id_t get_new_trx_id()
902 {
903 trx_id_t id= get_new_trx_id_no_refresh();
904 refresh_rw_trx_hash_version();
905 return id;
906 }
907
908
909 /**
910 Allocates and assigns new transaction serialisation number.
911
912 There's a gap between m_max_trx_id increment and transaction serialisation
913 number becoming visible through rw_trx_hash. While we're in this gap
914 concurrent thread may come and do MVCC snapshot without seeing allocated
915 but not yet assigned serialisation number. Then at some point purge thread
916 may clone this view. As a result it won't see newly allocated serialisation
917 number and may remove "unnecessary" history data of this transaction from
918 rollback segments.
919
920 m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has
921 to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively
922 means that all transaction serialisation numbers up to m_max_trx_id are
923 available through rw_trx_hash.
924
925 We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so
926 that m_rw_trx_hash_version increment happens after
927 trx->rw_trx_hash_element->no becomes visible through rw_trx_hash.
928
929 @param trx transaction
930 */
931 void assign_new_trx_no(trx_t *trx)
932 {
933 trx->no= get_new_trx_id_no_refresh();
934 my_atomic_store64_explicit(reinterpret_cast<int64*>
935 (&trx->rw_trx_hash_element->no),
936 trx->no, MY_MEMORY_ORDER_RELAXED);
937 refresh_rw_trx_hash_version();
938 }
939
940
 /**
   Takes MVCC snapshot.

   To reduce malloc probability we reserve rw_trx_hash.size() + 32 elements
   in ids.

   For details about get_rw_trx_hash_version() != get_max_trx_id() spin
   @sa register_rw() and @sa assign_new_trx_no().

   We rely on get_rw_trx_hash_version() to issue ACQUIRE memory barrier so
   that loading of m_rw_trx_hash_version happens before accessing rw_trx_hash.

   To optimise snapshot creation rw_trx_hash.iterate() is being used instead
   of rw_trx_hash.iterate_no_dups(). It means that some transaction
   identifiers may appear multiple times in ids.

   Must not be called while holding trx_sys_t::mutex (debug assertion).

   @param[in,out] caller_trx used to get access to rw_trx_hash_pins
   @param[out]    ids        array to store registered transaction identifiers
   @param[out]    max_trx_id variable to store m_max_trx_id value
   @param[out]    min_trx_no variable to store min(trx->no) value
 */

 void snapshot_ids(trx_t *caller_trx, trx_ids_t *ids, trx_id_t *max_trx_id,
                   trx_id_t *min_trx_no)
 {
   ut_ad(!mutex_own(&mutex));
   snapshot_ids_arg arg(ids);

   /* Spin until the hash version has caught up with m_max_trx_id. */
   while ((arg.m_id= get_rw_trx_hash_version()) != get_max_trx_id())
     ut_delay(1);
   /* Starting value for the minimum; presumably lowered by copy_one_id()
   (defined elsewhere) during the iteration - TODO confirm. */
   arg.m_no= arg.m_id;

   ids->clear();
   ids->reserve(rw_trx_hash.size() + 32);
   rw_trx_hash.iterate(caller_trx,
                       reinterpret_cast<my_hash_walk_action>(copy_one_id),
                       &arg);

   *max_trx_id= arg.m_id;
   *min_trx_no= arg.m_no;
 }
982
983
984 /** Initialiser for m_max_trx_id and m_rw_trx_hash_version. */
985 void init_max_trx_id(trx_id_t value)
986 {
987 m_max_trx_id= m_rw_trx_hash_version= value;
988 }
989
990
991 bool is_initialised() { return m_initialised; }
992
993
994 /** Initialise the transaction subsystem. */
995 void create();
996
997 /** Close the transaction subsystem on shutdown. */
998 void close();
999
1000 /** @return total number of active (non-prepared) transactions */
1001 ulint any_active_transactions();
1002
1003
 /**
   Registers read-write transaction.

   Transaction becomes visible to MVCC.

   There's a gap between m_max_trx_id increment and transaction becoming
   visible through rw_trx_hash. While we're in this gap concurrent thread may
   come and do MVCC snapshot. As a result concurrent read view will be able to
   observe records owned by this transaction even before it was committed.

   m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has
   to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively
   means that all transactions up to m_max_trx_id are available through
   rw_trx_hash.

   We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so
   that m_rw_trx_hash_version increment happens after transaction becomes
   visible through rw_trx_hash.

   @param trx transaction that is assigned a new id and inserted into
              rw_trx_hash
 */

 void register_rw(trx_t *trx)
 {
   /* The order matters: allocate the id, make the transaction visible
   through rw_trx_hash, and only then refresh the version. */
   trx->id= get_new_trx_id_no_refresh();
   rw_trx_hash.insert(trx);
   refresh_rw_trx_hash_version();
 }
1030
1031
1032 /**
1033 Deregisters read-write transaction.
1034
1035 Transaction is removed from rw_trx_hash, which releases all implicit locks.
1036 MVCC snapshot won't see this transaction anymore.
1037 */
1038
1039 void deregister_rw(trx_t *trx)
1040 {
1041 rw_trx_hash.erase(trx);
1042 }
1043
1044
1045 bool is_registered(trx_t *caller_trx, trx_id_t id)
1046 {
1047 return rw_trx_hash.find(caller_trx, id);
1048 }
1049
1050
1051 trx_t *find(trx_t *caller_trx, trx_id_t id)
1052 {
1053 return rw_trx_hash.find(caller_trx, id, true);
1054 }
1055
1056
1057 /**
1058 Registers transaction in trx_sys.
1059
1060 @param trx transaction
1061 */
1062 void register_trx(trx_t *trx)
1063 {
1064 mutex_enter(&mutex);
1065 UT_LIST_ADD_FIRST(trx_list, trx);
1066 mutex_exit(&mutex);
1067 }
1068
1069
1070 /**
1071 Deregisters transaction in trx_sys.
1072
1073 @param trx transaction
1074 */
1075 void deregister_trx(trx_t *trx)
1076 {
1077 mutex_enter(&mutex);
1078 UT_LIST_REMOVE(trx_list, trx);
1079 mutex_exit(&mutex);
1080 }
1081
1082
1083 /**
1084 Clones the oldest view and stores it in view.
1085
1086 No need to call ReadView::close(). The caller owns the view that is passed
1087 in. This function is called by purge thread to determine whether it should
1088 purge the delete marked record or not.
1089 */
1090 void clone_oldest_view();
1091
1092
1093 /** @return the number of active views */
1094 size_t view_count() const
1095 {
1096 size_t count= 0;
1097
1098 mutex_enter(&mutex);
1099 for (const trx_t *trx= UT_LIST_GET_FIRST(trx_list); trx;
1100 trx= UT_LIST_GET_NEXT(trx_list, trx))
1101 {
1102 if (trx->read_view.get_state() == READ_VIEW_STATE_OPEN)
1103 ++count;
1104 }
1105 mutex_exit(&mutex);
1106 return count;
1107 }
1108
1109 /** @return number of committed transactions waiting for purge */
1110 ulint history_size() const
1111 {
1112 return uint32(my_atomic_load32(&const_cast<trx_sys_t*>(this)
1113 ->rseg_history_len));
1114 }
1115 /** Add to the TRX_RSEG_HISTORY length (on database startup). */
1116 void history_add(int32 len)
1117 {
1118 my_atomic_add32(&rseg_history_len, len);
1119 }
1120 /** Register a committed transaction. */
1121 void history_insert() { history_add(1); }
1122 /** Note that a committed transaction was purged. */
1123 void history_remove() { history_add(-1); }
1124
1125private:
1126 static my_bool get_min_trx_id_callback(rw_trx_hash_element_t *element,
1127 trx_id_t *id)
1128 {
1129 if (element->id < *id)
1130 {
1131 mutex_enter(&element->mutex);
1132 /* We don't care about read-only transactions here. */
1133 if (element->trx && element->trx->rsegs.m_redo.rseg)
1134 *id= element->id;
1135 mutex_exit(&element->mutex);
1136 }
1137 return 0;
1138 }
1139
1140
1141 struct snapshot_ids_arg
1142 {
1143 snapshot_ids_arg(trx_ids_t *ids): m_ids(ids) {}
1144 trx_ids_t *m_ids;
1145 trx_id_t m_id;
1146 trx_id_t m_no;
1147 };
1148
1149
1150 static my_bool copy_one_id(rw_trx_hash_element_t *element,
1151 snapshot_ids_arg *arg)
1152 {
1153 if (element->id < arg->m_id)
1154 {
1155 trx_id_t no= static_cast<trx_id_t>(my_atomic_load64_explicit(
1156 reinterpret_cast<int64*>(&element->no), MY_MEMORY_ORDER_RELAXED));
1157 arg->m_ids->push_back(element->id);
1158 if (no < arg->m_no)
1159 arg->m_no= no;
1160 }
1161 return 0;
1162 }
1163
1164
1165 /** Getter for m_rw_trx_hash_version, must issue ACQUIRE memory barrier. */
1166 trx_id_t get_rw_trx_hash_version()
1167 {
1168 return static_cast<trx_id_t>
1169 (my_atomic_load64_explicit(reinterpret_cast<int64*>
1170 (&m_rw_trx_hash_version),
1171 MY_MEMORY_ORDER_ACQUIRE));
1172 }
1173
1174
1175 /** Increments m_rw_trx_hash_version, must issue RELEASE memory barrier. */
1176 void refresh_rw_trx_hash_version()
1177 {
1178 my_atomic_add64_explicit(reinterpret_cast<int64*>(&m_rw_trx_hash_version),
1179 1, MY_MEMORY_ORDER_RELEASE);
1180 }
1181
1182
1183 /**
1184 Allocates new transaction id without refreshing rw_trx_hash version.
1185
1186 This method is extracted for exclusive use by register_rw() and
1187 assign_new_trx_no() where new id must be allocated atomically with
1188 payload of these methods from MVCC snapshot point of view.
1189
1190 @sa get_new_trx_id()
1191 @sa assign_new_trx_no()
1192
1193 @return new transaction id
1194 */
1195
1196 trx_id_t get_new_trx_id_no_refresh()
1197 {
1198 return static_cast<trx_id_t>(my_atomic_add64_explicit(
1199 reinterpret_cast<int64*>(&m_max_trx_id), 1, MY_MEMORY_ORDER_RELAXED));
1200 }
1201};
1202
1203
1204/** The transaction system */
1205extern trx_sys_t trx_sys;
1206
1207#endif
1208