1 | /***************************************************************************** |
2 | |
3 | Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. |
4 | Copyright (c) 2018, MariaDB Corporation. |
5 | |
6 | This program is free software; you can redistribute it and/or modify it under |
7 | the terms of the GNU General Public License as published by the Free Software |
8 | Foundation; version 2 of the License. |
9 | |
10 | This program is distributed in the hope that it will be useful, but WITHOUT |
11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU General Public License along with |
15 | this program; if not, write to the Free Software Foundation, Inc., |
16 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
17 | |
18 | *****************************************************************************/ |
19 | |
20 | /**************************************************//** |
21 | @file read/read0read.cc |
22 | Cursor read |
23 | |
24 | Created 2/16/1997 Heikki Tuuri |
25 | *******************************************************/ |
26 | |
27 | #include "read0types.h" |
28 | |
29 | #include "srv0srv.h" |
30 | #include "trx0sys.h" |
31 | #include "trx0purge.h" |
32 | |
33 | /* |
34 | ------------------------------------------------------------------------------- |
35 | FACT A: Cursor read view on a secondary index sees only committed versions |
36 | ------- |
37 | of the records in the secondary index or those versions of rows created |
38 | by transaction which created a cursor before cursor was created even |
39 | if transaction which created the cursor has changed that clustered index page. |
40 | |
41 | PROOF: We must show that read goes always to the clustered index record |
42 | to see that record is visible in the cursor read view. Consider e.g. |
43 | following table and SQL-clauses: |
44 | |
45 | create table t1(a int not null, b int, primary key(a), index(b)); |
46 | insert into t1 values (1,1),(2,2); |
47 | commit; |
48 | |
49 | Now consider that we have a cursor for a query |
50 | |
51 | select b from t1 where b >= 1; |
52 | |
53 | This query will use secondary key on the table t1. Now after the first fetch |
54 | on this cursor if we do a update: |
55 | |
56 | update t1 set b = 5 where b = 2; |
57 | |
58 | Now second fetch of the cursor should not see record (2,5) instead it should |
59 | see record (2,2). |
60 | |
We should also show that if we execute delete from t1 where b = 5; we can
still see record (2,2).
63 | |
64 | When we access a secondary key record maximum transaction id is fetched |
65 | from this record and this trx_id is compared to up_limit_id in the view. |
If trx_id in the record is greater than or equal to up_limit_id in the view,
the clustered record is accessed. Because the trx_id of the creating
transaction was stored, when this view was created, in the list of
trx_ids not seen by this read view, a previous version of the
record is requested to be built. This is built using the clustered record.
If the secondary key record is delete-marked, its corresponding
clustered record can already have been purged only if the record's
trx_id < low_limit_no. Purge can't remove any record deleted by a
74 | transaction which was active when cursor was created. But, we still |
75 | may have a deleted secondary key record but no clustered record. But, |
76 | this is not a problem because this case is handled in |
77 | row_sel_get_clust_rec() function which is called |
78 | whenever we note that this read view does not see trx_id in the |
79 | record. Thus, we see correct version. Q. E. D. |
80 | |
81 | ------------------------------------------------------------------------------- |
82 | FACT B: Cursor read view on a clustered index sees only committed versions |
83 | ------- |
84 | of the records in the clustered index or those versions of rows created |
85 | by transaction which created a cursor before cursor was created even |
86 | if transaction which created the cursor has changed that clustered index page. |
87 | |
PROOF: Consider e.g. the following table and SQL-clauses:
89 | |
90 | create table t1(a int not null, b int, primary key(a)); |
insert into t1 (a) values (1),(2);
92 | commit; |
93 | |
94 | Now consider that we have a cursor for a query |
95 | |
96 | select a from t1 where a >= 1; |
97 | |
98 | This query will use clustered key on the table t1. Now after the first fetch |
99 | on this cursor if we do a update: |
100 | |
101 | update t1 set a = 5 where a = 2; |
102 | |
103 | Now second fetch of the cursor should not see record (5) instead it should |
104 | see record (2). |
105 | |
We should also show that if we execute delete from t1 where a = 5; after
the cursor is opened, we can still see record (2).
108 | |
109 | When accessing clustered record we always check if this read view sees |
trx_id stored in the clustered record. By default we don't see any changes
if record trx_id >= low_limit_id, i.e. the change was made by a transaction
which started after the transaction which created the cursor. If the row
113 | was changed by the future transaction a previous version of the |
114 | clustered record is created. Thus we see only committed version in |
115 | this case. We see all changes made by committed transactions i.e. |
116 | record trx_id < up_limit_id. In this case we don't need to do anything, |
117 | we already see correct version of the record. We don't see any changes |
118 | made by active transaction except creating transaction. We have stored |
119 | trx_id of creating transaction to list of trx_ids when this view was |
120 | created. Thus we can easily see if this record was changed by the |
121 | creating transaction. Because we already have clustered record we can |
122 | access roll_ptr. Using this roll_ptr we can fetch undo record. |
We can now check that undo_no of the undo record is less than undo_no of the
transaction which created the view when the cursor was created. We see this
125 | clustered record only in case when record undo_no is less than undo_no |
126 | in the view. If this is not true we build based on undo_rec previous |
127 | version of the record. This record is found because purge can't remove |
128 | records accessed by active transaction. Thus we see correct version. Q. E. D. |
129 | ------------------------------------------------------------------------------- |
130 | FACT C: Purge does not remove any delete-marked row that is visible |
131 | ------- |
132 | in any cursor read view. |
133 | |
134 | PROOF: We know that: |
135 | 1: Currently active read views in trx_sys_t::view_list are ordered by |
136 | ReadView::low_limit_no in descending order, that is, |
137 | newest read view first. |
138 | |
139 | 2: Purge clones the oldest read view and uses that to determine whether there |
140 | are any active transactions that can see the to be purged records. |
141 | |
142 | Therefore any joining or active transaction will not have a view older |
143 | than the purge view, according to 1. |
144 | |
145 | When purge needs to remove a delete-marked row from a secondary index, |
146 | it will first check that the DB_TRX_ID value of the corresponding |
147 | record in the clustered index is older than the purge view. It will |
148 | also check if there is a newer version of the row (clustered index |
149 | record) that is not delete-marked in the secondary index. If such a |
150 | row exists and is collation-equal to the delete-marked secondary index |
151 | record then purge will not remove the secondary index record. |
152 | |
153 | Delete-marked clustered index records will be removed by |
154 | row_purge_remove_clust_if_poss(), unless the clustered index record |
155 | (and its DB_ROLL_PTR) has been updated. Every new version of the |
156 | clustered index record will update DB_ROLL_PTR, pointing to a new UNDO |
157 | log entry that allows the old version to be reconstructed. The |
158 | DB_ROLL_PTR in the oldest remaining version in the old-version chain |
159 | may be pointing to garbage (an undo log record discarded by purge), |
160 | but it will never be dereferenced, because the purge view is older |
161 | than any active transaction. |
162 | |
163 | For details see: row_vers_old_has_index_entry() and row_purge_poss_sec() |
164 | |
165 | Some additional issues: |
166 | |
167 | What if trx_sys.view_list == NULL and some transaction T1 and Purge both |
168 | try to open read_view at same time. Only one can acquire trx_sys.mutex. |
169 | In which order will the views be opened? Should it matter? If no, why? |
170 | |
171 | The order does not matter. No new transactions can be created and no running |
172 | RW transaction can commit or rollback (or free views). AC-NL-RO transactions |
173 | will mark their views as closed but not actually free their views. |
174 | */ |
175 | |
176 | |
177 | /** |
178 | Creates a snapshot where exactly the transactions serialized before this |
179 | point in time are seen in the view. |
180 | |
181 | @param[in,out] trx transaction |
182 | */ |
183 | inline void ReadView::snapshot(trx_t *trx) |
184 | { |
185 | trx_sys.snapshot_ids(trx, &m_ids, &m_low_limit_id, &m_low_limit_no); |
186 | std::sort(m_ids.begin(), m_ids.end()); |
187 | m_up_limit_id= m_ids.empty() ? m_low_limit_id : m_ids.front(); |
188 | ut_ad(m_up_limit_id <= m_low_limit_id); |
189 | } |
190 | |
191 | |
192 | /** |
193 | Opens a read view where exactly the transactions serialized before this |
194 | point in time are seen in the view. |
195 | |
196 | View becomes visible to purge thread. |
197 | |
198 | @param[in,out] trx transaction |
199 | */ |
200 | void ReadView::open(trx_t *trx) |
201 | { |
202 | ut_ad(this == &trx->read_view); |
203 | switch (m_state) |
204 | { |
205 | case READ_VIEW_STATE_OPEN: |
206 | ut_ad(!srv_read_only_mode); |
207 | return; |
208 | case READ_VIEW_STATE_CLOSED: |
209 | if (srv_read_only_mode) |
210 | return; |
211 | /* |
212 | Reuse closed view if there were no read-write transactions since (and at) |
213 | its creation time. |
214 | |
215 | Original comment states: there is an inherent race here between purge |
216 | and this thread. |
217 | |
218 | To avoid this race we should've checked trx_sys.get_max_trx_id() and |
219 | set state to READ_VIEW_STATE_OPEN atomically under trx_sys.mutex |
220 | protection. But we're cutting edges to achieve great scalability. |
221 | |
222 | There're at least two types of concurrent threads interested in this |
223 | value: purge coordinator thread (see trx_sys_t::clone_oldest_view()) and |
224 | InnoDB monitor thread (see lock_trx_print_wait_and_mvcc_state()). |
225 | |
226 | What bad things can happen because we allow this race? |
227 | |
228 | Speculative execution may reorder state change before get_max_trx_id(). |
229 | In this case purge thread has short gap to clone outdated view. Which is |
230 | probably not that bad: it just won't be able to purge things that it was |
231 | actually allowed to purge for a short while. |
232 | |
233 | This thread may as well get suspended after trx_sys.get_max_trx_id() and |
234 | before state is set to READ_VIEW_STATE_OPEN. New read-write transaction |
235 | may get started, committed and purged meanwhile. It is acceptable as |
236 | well, since this view doesn't see it. |
237 | */ |
238 | if (trx_is_autocommit_non_locking(trx) && m_ids.empty() && |
239 | m_low_limit_id == trx_sys.get_max_trx_id()) |
240 | goto reopen; |
241 | |
242 | /* |
243 | Can't reuse view, take new snapshot. |
244 | |
245 | Alas this empty critical section is simplest way to make sure concurrent |
246 | purge thread completed snapshot copy. Of course purge thread may come |
247 | again and try to copy once again after we release this mutex, but in |
248 | this case it is guaranteed to see READ_VIEW_STATE_REGISTERED and thus |
249 | it'll skip this view. |
250 | |
251 | This critical section can be replaced with new state, which purge thread |
252 | would set to inform us to wait until it completes snapshot. However it'd |
253 | complicate m_state even further. |
254 | */ |
255 | mutex_enter(&trx_sys.mutex); |
256 | mutex_exit(&trx_sys.mutex); |
257 | my_atomic_store32_explicit(&m_state, READ_VIEW_STATE_SNAPSHOT, |
258 | MY_MEMORY_ORDER_RELAXED); |
259 | break; |
260 | default: |
261 | ut_ad(0); |
262 | } |
263 | |
264 | snapshot(trx); |
265 | reopen: |
266 | m_creator_trx_id= trx->id; |
267 | my_atomic_store32_explicit(&m_state, READ_VIEW_STATE_OPEN, |
268 | MY_MEMORY_ORDER_RELEASE); |
269 | } |
270 | |
271 | |
272 | /** |
273 | Clones the oldest view and stores it in view. |
274 | |
275 | No need to call ReadView::close(). The caller owns the view that is passed |
276 | in. This function is called by purge thread to determine whether it should |
277 | purge the delete marked record or not. |
278 | */ |
279 | void trx_sys_t::clone_oldest_view() |
280 | { |
281 | purge_sys.view.snapshot(0); |
282 | mutex_enter(&mutex); |
283 | /* Find oldest view. */ |
284 | for (const trx_t *trx= UT_LIST_GET_FIRST(trx_list); trx; |
285 | trx= UT_LIST_GET_NEXT(trx_list, trx)) |
286 | { |
287 | int32_t state; |
288 | |
289 | while ((state= trx->read_view.get_state()) == READ_VIEW_STATE_SNAPSHOT) |
290 | ut_delay(1); |
291 | |
292 | if (state == READ_VIEW_STATE_OPEN) |
293 | purge_sys.view.copy(trx->read_view); |
294 | } |
295 | mutex_exit(&mutex); |
296 | } |
297 | |