1 | /* Copyright (C) 2007-2008 Michael Widenius |
2 | |
3 | This program is free software; you can redistribute it and/or modify |
4 | it under the terms of the GNU General Public License as published by |
5 | the Free Software Foundation; version 2 of the License. |
6 | |
7 | This program is distributed in the hope that it will be useful, |
8 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
10 | GNU General Public License for more details. |
11 | |
12 | You should have received a copy of the GNU General Public License |
13 | along with this program; if not, write to the Free Software |
14 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */ |
15 | |
16 | /* |
17 | Storage of records in block |
18 | |
19 | Some clarifications about the abbrev used: |
20 | |
  NULL fields -> Fields that may contain a NULL value.
22 | Not null fields -> Fields that may not contain a NULL value. |
23 | Critical fields -> Fields that can't be null and can't be dropped without |
24 | causing a table reorganization. |
25 | |
26 | |
27 | Maria will have a LSN at start of each page (excluding the bitmap pages) |
28 | |
29 | The different page types that are in a data file are: |
30 | |
31 | Bitmap pages Map of free pages in the next extent (8192 page size |
32 | gives us 256M of mapped pages / bitmap) |
33 | Head page Start of rows are stored on this page. |
34 | A rowid always points to a head page |
35 | Blob page This page is totally filled with data from one blob or by |
36 | a set of long VARCHAR/CHAR fields |
37 | Tail page This contains the last part from different rows, blobs |
38 | or varchar fields. |
39 | |
40 | The data file starts with a bitmap page, followed by as many data |
41 | pages as the bitmap can cover. After this there is a new bitmap page |
42 | and more data pages etc. |
43 | |
44 | For information about the bitmap page, see ma_bitmap.c |
45 | |
46 | Structure of data and tail page: |
47 | |
48 | The page has a row directory at end of page to allow us to do deletes |
49 | without having to reorganize the page. It also allows us to later store |
50 | some more bytes after each row to allow them to grow without having to move |
51 | around other rows. |
52 | |
53 | Page header: |
54 | |
55 | LSN 7 bytes Log position for last page change |
56 | PAGE_TYPE 1 uchar 1 for head / 2 for tail / 3 for blob |
57 | DIR_COUNT 1 uchar Number of row/tail entries on page |
  FREE_DIR_LINK 1 uchar Pointer to first free directory entry or 255 if no
  empty space 2 bytes Empty space on page
60 | |
61 | The most significant bit in PAGE_TYPE is set to 1 if the data on the page |
62 | can be compacted to get more space. (PAGE_CAN_BE_COMPACTED) |
63 | |
64 | Row data |
65 | |
66 | Row directory of NO entries, that consist of the following for each row |
67 | (in reverse order; i.e., first record is stored last): |
68 | |
69 | Position 2 bytes Position of row on page |
70 | Length 2 bytes Length of entry |
71 | |
72 | For Position and Length, the 1 most significant bit of the position and |
73 | the 1 most significant bit of the length could be used for some states of |
74 | the row (in other words, we should try to keep these reserved) |
75 | |
76 | Position is 0 if the entry is not used. In this case length[0] points |
77 | to a previous free entry (255 if no previous entry) and length[1] |
78 | to the next free entry (or 255 if last free entry). This works because |
79 | the directory entry 255 can never be marked free (if the first directory |
  entry is freed, the directory is shrunk).
81 | |
82 | checksum 4 bytes Reserved for full page read testing and live backup. |
83 | |
84 | ---------------- |
85 | |
86 | Structure of blob pages: |
87 | |
88 | LSN 7 bytes Log position for last page change |
89 | PAGE_TYPE 1 uchar 3 |
90 | |
91 | data |
92 | |
93 | ----------------- |
94 | |
95 | Row data structure: |
96 | |
97 | Flag 1 uchar Marker of which header field exists |
98 | TRANSID 6 bytes TRANSID of changing transaction |
99 | (optional, added on insert and first |
100 | update/delete) |
101 | VER_PTR 7 bytes Pointer to older version in log |
102 | (undo record) |
103 | (optional, added after first |
104 | update/delete) |
105 | DELETE_TRANSID 6 bytes (optional). TRANSID of original row. |
106 | Added on delete. |
107 | Nulls_extended 1 uchar To allow us to add new DEFAULT NULL |
108 | fields (optional, added after first |
109 | change of row after alter table) |
110 | Number of ROW_EXTENT's 1-3 uchar Length encoded, optional |
111 | This is the number of extents the |
112 | row is split into |
113 | First row_extent 7 uchar Pointer to first row extent (optional) |
114 | |
115 | Total length of length array 1-3 uchar Only used if we have |
116 | char/varchar/blob fields. |
117 | Row checksum 1 uchar Only if table created with checksums |
118 | Null_bits .. One bit for each NULL field (a field that may |
119 | have the value NULL) |
120 | Empty_bits .. One bit for each field that may be 'empty'. |
121 | (Both for null and not null fields). |
122 | This bit is 1 if the value for the field is |
123 | 0 or empty string. |
124 | |
125 | field_offsets 2 byte/offset |
126 | For each 32'th field, there is one offset |
127 | that points to where the field information |
128 | starts in the block. This is to provide |
129 | fast access to later field in the row |
130 | when we only need to return a small |
131 | set of fields. |
132 | TODO: Implement this. |
133 | |
134 | Things marked above as 'optional' will only be present if the |
135 | corresponding bit is set in 'Flag' field. Flag gives us a way to |
136 | get more space on a page when doing page compaction as we don't need |
137 | to store TRANSID that have committed before the smallest running |
138 | transaction we have in memory. |
139 | |
140 | Data in the following order: |
141 | (Field order is precalculated when table is created) |
142 | |
143 | Critical fixed length, not null, fields. (Note, these can't be dropped) |
144 | Fixed length, null fields |
145 | |
146 | Length array, 1-4 uchar per field for all CHAR/VARCHAR/BLOB fields. |
147 | Number of bytes used in length array per entry is depending on max length |
148 | for field. |
149 | |
150 | ROW_EXTENT's |
151 | CHAR data (space stripped) |
152 | VARCHAR data |
153 | BLOB data |
154 | |
155 | Fields marked in null_bits or empty_bits are not stored in data part or |
156 | length array. |
157 | |
158 | If row doesn't fit into the given block, then the first EXTENT will be |
159 | stored last on the row. This is done so that we don't break any field |
160 | data in the middle. |
161 | |
162 | We first try to store the full row into one block. If that's not possible |
163 | we move out each big blob into their own extents. If this is not enough we |
164 | move out a concatenation of all varchars to their own extent. |
165 | |
166 | Each blob and the concatenated char/varchar fields are stored the following |
167 | way: |
168 | - Store the parts in as many full-contiguous pages as possible. |
169 | - The last part, that doesn't fill a full page, is stored in tail page. |
170 | |
171 | When doing an insert of a new row, we don't have to have |
172 | VER_PTR in the row. This will make rows that are not changed stored |
173 | efficiently. On update and delete we would add TRANSID (if it was an old |
174 | committed row) and VER_PTR to |
175 | the row. On row page compaction we can easily detect rows where |
176 | TRANSID was committed before the longest running transaction |
177 | started and we can then delete TRANSID and VER_PTR from the row to |
178 | gain more space. |
179 | |
180 | If a row is deleted in Maria, we change TRANSID to the deleting |
181 | transaction's id, change VER_PTR to point to the undo record for the delete, |
182 | and add DELETE_TRANSID (the id of the transaction which last |
183 | inserted/updated the row before its deletion). DELETE_TRANSID allows an old |
184 | transaction to avoid reading the log to know if it can see the last version |
185 | before delete (in other words it reduces the probability of having to follow |
186 | VER_PTR). TODO: depending on a compilation option, evaluate the performance |
187 | impact of not storing DELETE_TRANSID (which would make the row smaller). |
188 | |
189 | Description of the different parts: |
190 | |
191 | Flag is coded as: |
192 | |
193 | Description bit |
194 | TRANS_ID_exists 0 |
195 | VER_PTR_exists 1 |
196 | Row is deleted 2 (Means that DELETE_TRANSID exists) |
197 | Nulls_extended_exists 3 |
198 | Row is split 7 This means that 'Number_of_row_extents' exists |
199 | |
200 | Nulls_extended is the number of new DEFAULT NULL fields in the row |
201 | compared to the number of DEFAULT NULL fields when the first version |
202 | of the table was created. If Nulls_extended doesn't exist in the row, |
203 | we know it's 0 as this must be one of the original rows from when the |
  table was created first time. This coding allows us to add 255*8 =
  2040 new fields without requiring a full alter table.
206 | |
207 | Empty_bits is used to allow us to store 0, 0.0, empty string, empty |
208 | varstring and empty blob efficiently. (This is very good for data |
209 | warehousing where NULL's are often regarded as evil). Having this |
210 | bitmap also allows us to drop information of a field during a future |
211 | delete if field was deleted with ALTER TABLE DROP COLUMN. To be able |
212 | to handle DROP COLUMN, we must store in the index header the fields |
213 | that has been dropped. When unpacking a row we will ignore dropped |
214 | fields. When storing a row, we will mark a dropped field either with a |
215 | null in the null bit map or in the empty_bits and not store any data |
216 | for it. |
217 | TODO: Add code for handling dropped fields. |
218 | |
219 | |
  A ROW EXTENT is a range of pages. One ROW_EXTENT is coded as:
221 | |
222 | START_PAGE 5 bytes |
223 | PAGE_COUNT 2 bytes. Bit 16 is set if this is a tail page. |
224 | Bit 15 is to set if this is start of a new |
225 | blob extent. |
226 | |
227 | With 8K pages, we can cover 256M in one extent. This coding gives us a |
228 | maximum file size of 2^40*8192 = 8192 tera |
229 | |
230 | As an example of ROW_EXTENT handling, assume a row with one integer |
231 | field (value 5), two big VARCHAR fields (size 250 and 8192*3), and 2 |
232 | big BLOB fields that we have updated. |
233 | |
234 | The record format for storing this into an empty file would be: |
235 | |
236 | Page 1: |
237 | |
238 | 00 00 00 00 00 00 00 LSN |
239 | 01 Only one row in page |
240 | FF No free dir entry |
241 | xx xx Empty space on page |
242 | |
243 | 10 Flag: row split, VER_PTR exists |
244 | 01 00 00 00 00 00 TRANSID 1 |
245 | 00 00 00 00 00 01 00 VER_PTR to first block in LOG file 1 |
246 | 5 Number of row extents |
247 | 02 00 00 00 00 03 00 VARCHAR's are stored in full pages 2,3,4 |
248 | 0 No null fields |
249 | 0 No empty fields |
250 | 05 00 00 00 00 00 80 Tail page for VARCHAR, rowid 0 |
251 | 06 00 00 00 00 80 00 First blob, stored at page 6-133 |
252 | 05 00 00 00 00 01 80 Tail of first blob (896 bytes) at page 5 |
253 | 86 00 00 00 00 80 00 Second blob, stored at page 134-262 |
254 | 05 00 00 00 00 02 80 Tail of second blob (896 bytes) at page 5 |
255 | 05 00 5 integer |
256 | FA Length of first varchar field (size 250) |
257 | 00 60 Length of second varchar field (size 8192*3) |
258 | 00 60 10 First medium BLOB, 1M |
259 | 01 00 10 00 Second BLOB, 1M |
260 | xx xx xx xx xx xx Varchars are stored here until end of page |
261 | |
262 | ..... until end of page |
263 | |
264 | 09 00 F4 1F Start position 9, length 8180 |
265 | xx xx xx xx Checksum |
266 | |
267 | A data page is allowed to have a wrong CRC and header as long as it is |
268 | marked empty in the bitmap and its directory's count is 0. |
269 | */ |
270 | |
271 | #include "maria_def.h" |
272 | #include "ma_blockrec.h" |
273 | #include "trnman.h" |
274 | #include "ma_key_recover.h" |
275 | #include "ma_recovery_util.h" |
276 | #include <lf.h> |
277 | |
278 | /* |
279 | Struct for having a cursor over a set of extent. |
280 | This is used to loop over all extents for a row when reading |
281 | the row data. It's also used to store the tail positions for |
282 | a read row to be used by a later update/delete command. |
283 | */ |
284 | |
285 | typedef struct st_maria_extent_cursor |
286 | { |
287 | /* |
288 | Pointer to packed uchar array of extents for the row. |
289 | Format is described above in the header |
290 | */ |
291 | uchar *extent; |
292 | /* Where data starts on page; Only for debugging */ |
293 | uchar *data_start; |
294 | /* Position to all tails in the row. Updated when reading a row */ |
295 | MARIA_RECORD_POS *tail_positions; |
296 | /* Current page */ |
297 | pgcache_page_no_t page; |
298 | /* How many pages in the page region */ |
299 | uint page_count; |
300 | /* What kind of lock to use for tail pages */ |
301 | enum pagecache_page_lock lock_for_tail_pages; |
302 | /* Total number of extents (i.e., entries in the 'extent' slot) */ |
303 | uint extent_count; |
304 | /* <> 0 if current extent is a tail page; Set while using cursor */ |
305 | uint tail; |
306 | /* Position for tail on tail page */ |
307 | uint tail_row_nr; |
308 | /* |
309 | == 1 if we are working on the first extent (i.e., the one that is stored in |
310 | the row header, not an extent that is stored as part of the row data). |
311 | */ |
312 | my_bool first_extent; |
313 | } MARIA_EXTENT_CURSOR; |
314 | |
315 | |
316 | /** |
317 | @brief Structure for passing down info to write_hook_for_clr_end(). |
  This hook needs to know the variation of the live checksum caused by the
319 | current operation to update state.checksum under log's mutex, |
320 | needs to know the transaction's previous undo_lsn to set |
321 | trn->undo_lsn under log mutex, and needs to know the type of UNDO being |
322 | undone now to modify state.records under log mutex. |
323 | */ |
324 | |
/**
  Store the row checksum in the record buffer, if the table uses checksums,
  and account for the extra stored bytes in the row length.

  S: share, D: checksum delta variable (out; always assigned, 0 when the
  table has no checksum function), E: expression computing the checksum,
  P: pointer into the record where the checksum is stored, L: length
  variable to increase by HA_CHECKSUM_STORE_SIZE.
*/
#define store_checksum_in_rec(S,D,E,P,L) do \
  { \
    D= 0; \
    if ((S)->calc_checksum != NULL) \
    { \
      D= (E); \
      ha_checksum_store(P, D); \
      L+= HA_CHECKSUM_STORE_SIZE; \
    } \
  } while (0)
336 | |
337 | |
338 | static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails); |
339 | static my_bool delete_head_or_tail(MARIA_HA *info, |
340 | pgcache_page_no_t page, uint record_number, |
341 | my_bool head, my_bool from_update); |
342 | #ifndef DBUG_OFF |
343 | static void _ma_print_directory(MARIA_SHARE *share, |
344 | FILE *file, uchar *buff, uint block_size); |
345 | #endif |
346 | static uchar *store_page_range(MARIA_SHARE *share, |
347 | uchar *to, MARIA_BITMAP_BLOCK *block, |
348 | ulong length, |
349 | uint *tot_ranges); |
350 | static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record, |
351 | LEX_CUSTRING *log_parts, |
352 | uint *log_parts_count); |
353 | static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec, |
354 | const uchar *newrec, |
355 | LEX_CUSTRING *log_parts, |
356 | uint *log_parts_count); |
357 | |
358 | /**************************************************************************** |
359 | Initialization |
360 | ****************************************************************************/ |
361 | |
362 | /* |
363 | Initialize data needed for block structures |
364 | */ |
365 | |
366 | |
367 | /* Size of the different header elements for a row */ |
368 | |
369 | static uchar []= |
370 | { |
371 | TRANSID_SIZE, |
372 | VERPTR_SIZE, |
373 | TRANSID_SIZE, /* Delete transid */ |
374 | 1 /* Null extends */ |
375 | }; |
376 | |
377 | /* |
378 | Calculate array of all used headers |
379 | |
380 | Used to speed up: |
381 | |
382 | size= 1; |
383 | if (flag & 1) |
384 | size+= TRANSID_SIZE; |
385 | if (flag & 2) |
386 | size+= VERPTR_SIZE; |
387 | if (flag & 4) |
388 | size+= TRANSID_SIZE |
389 | if (flag & 8) |
390 | size+= 1; |
391 | |
392 | NOTES |
393 | This is called only once at startup of Maria |
394 | */ |
395 | |
396 | static uchar [1 << array_elements(header_sizes)]; |
397 | #define (array_elements(total_header_size) -1) |
398 | |
399 | void _ma_init_block_record_data(void) |
400 | { |
401 | uint i; |
402 | bzero(total_header_size, sizeof(total_header_size)); |
403 | total_header_size[0]= FLAG_SIZE; /* Flag uchar */ |
404 | for (i= 1; i < array_elements(total_header_size); i++) |
405 | { |
406 | uint size= FLAG_SIZE, j, bit; |
407 | for (j= 0; (bit= (1 << j)) <= i; j++) |
408 | { |
409 | if (i & bit) |
410 | size+= header_sizes[j]; |
411 | } |
412 | total_header_size[i]= size; |
413 | } |
414 | } |
415 | |
416 | |
417 | my_bool _ma_once_init_block_record(MARIA_SHARE *share, File data_file) |
418 | { |
419 | my_bool res; |
420 | pgcache_page_no_t last_page; |
421 | |
422 | /* |
423 | First calculate the max file length with can have with a pointer of size |
424 | rec_reflength. |
425 | |
426 | The 'rec_reflength - 1' is because one byte is used for row |
427 | position withing the page. |
428 | The /2 comes from _ma_transaction_recpos_to_keypos() where we use |
429 | the lowest bit to mark if there is a transid following the rownr. |
430 | */ |
431 | last_page= ((ulonglong) 1 << ((share->base.rec_reflength-1)*8))/2; |
432 | if (!last_page) /* Overflow; set max size */ |
433 | last_page= ~(pgcache_page_no_t) 0; |
434 | |
435 | res= _ma_bitmap_init(share, data_file, &last_page); |
436 | share->base.max_data_file_length= _ma_safe_mul(last_page + 1, |
437 | share->block_size); |
438 | #if SIZEOF_OFF_T == 4 |
439 | set_if_smaller(share->base.max_data_file_length, INT_MAX32); |
440 | #endif |
441 | return res; |
442 | } |
443 | |
444 | |
/*
  Free the share-level resources set up by _ma_once_init_block_record():
  end the bitmap, flush/sync/close the bitmap file and de-assign the
  share's id from the transaction log.

  RETURN
    0  ok
    1  error (bitmap end, flush, sync or close failed)
*/

my_bool _ma_once_end_block_record(MARIA_SHARE *share)
{
  int res= _ma_bitmap_end(share);
  if (share->bitmap.file.file >= 0)
  {
    /* Temporary or to-be-deleted tables don't need their pages on disk */
    if (flush_pagecache_blocks(share->pagecache, &share->bitmap.file,
                               ((share->temporary || share->deleting) ?
                                FLUSH_IGNORE_CHANGED :
                                FLUSH_RELEASE)))
      res= 1;
    /*
      File must be synced as it is going out of the maria_open_list and so
      becoming unknown to Checkpoint.
    */
    if (share->now_transactional &&
        mysql_file_sync(share->bitmap.file.file, MYF(MY_WME)))
      res= 1;
    if (mysql_file_close(share->bitmap.file.file, MYF(MY_WME)))
      res= 1;
    /*
      Trivial assignment to guard against multiple invocations
      (May happen if file are closed but we want to keep the maria object
      around a bit longer)
    */
    share->bitmap.file.file= -1;
  }
  if (share->id != 0)
  {
    /*
      We de-assign the id even though index has not been flushed, this is ok
      as close_lock serializes us with a Checkpoint looking at our share.
    */
    translog_deassign_id_from_share(share);
  }
  return res;
}
481 | |
482 | |
483 | /* Init info->cur_row structure */ |
484 | |
/*
  Per-handler initialization of the block record format.

  Allocates all buffers needed for reading/writing rows (empty bits,
  field/blob length arrays, tail positions, log parts) in a single
  my_multi_malloc() call, plus the bitmap-block dynamic array and the
  initial extent buffer. On any failure everything allocated so far is
  released through _ma_end_block_record().

  RETURN
    0  ok
    1  out of memory
*/

my_bool _ma_init_block_record(MARIA_HA *info)
{
  MARIA_ROW *row= &info->cur_row, *new_row= &info->new_row;
  MARIA_SHARE *share= info->s;
  uint default_extents;
  DBUG_ENTER("_ma_init_block_record" );

  /*
    All row buffers for both cur_row and new_row are carved from one
    allocation; row->empty_bits is the head pointer that is later freed
    in _ma_end_block_record().
  */
  if (!my_multi_malloc(MY_WME,
                       &row->empty_bits, share->base.pack_bytes,
                       &row->field_lengths,
                       share->base.max_field_lengths + 2,
                       &row->blob_lengths, sizeof(ulong) * share->base.blobs,
                       &row->null_field_lengths, (sizeof(uint) *
                                                  (share->base.fields -
                                                   share->base.blobs +
                                                   EXTRA_LENGTH_FIELDS)),
                       &row->tail_positions, (sizeof(MARIA_RECORD_POS) *
                                              (share->base.blobs + 2)),
                       &new_row->empty_bits, share->base.pack_bytes,
                       &new_row->field_lengths,
                       share->base.max_field_lengths + 2,
                       &new_row->blob_lengths,
                       sizeof(ulong) * share->base.blobs,
                       &new_row->null_field_lengths, (sizeof(uint) *
                                                      (share->base.fields -
                                                       share->base.blobs +
                                                       EXTRA_LENGTH_FIELDS)),
                       &info->log_row_parts,
                       sizeof(*info->log_row_parts) *
                       (TRANSLOG_INTERNAL_PARTS + 3 +
                        share->base.fields + 3),
                       &info->update_field_data,
                       (share->base.fields * 4 +
                        share->base.max_field_lengths + 1 + 4),
                       NullS, 0))
    DBUG_RETURN(1);
  /* Skip over bytes used to store length of field length for logging */
  row->field_lengths+= 2;
  new_row->field_lengths+= 2;

  /* Reserve some initial space to avoid mallocs during execution */
  default_extents= (ELEMENTS_RESERVED_FOR_MAIN_PART + 1 +
                    (AVERAGE_BLOB_SIZE /
                     FULL_PAGE_SIZE(share) /
                     BLOB_SEGMENT_MIN_SIZE));

  if (my_init_dynamic_array(&info->bitmap_blocks,
                            sizeof(MARIA_BITMAP_BLOCK), default_extents,
                            64, MYF(0)))
    goto err;
  info->cur_row.extents_buffer_length= default_extents * ROW_EXTENT_SIZE;
  if (!(info->cur_row.extents= my_malloc(info->cur_row.extents_buffer_length,
                                         MYF(MY_WME))))
    goto err;

  info->row_base_length= share->base_length;
  info->row_flag= share->base.default_row_flag;

  /*
    We need to reserve 'EXTRA_LENGTH_FIELDS' number of parts in
    null_field_lengths to allow splitting of rows in 'find_where_to_split_row'
  */
  row->null_field_lengths+= EXTRA_LENGTH_FIELDS;
  new_row->null_field_lengths+= EXTRA_LENGTH_FIELDS;

  DBUG_RETURN(0);

err:
  /* Safe even when only part of the allocations succeeded */
  _ma_end_block_record(info);
  DBUG_RETURN(1);
}
556 | |
557 | |
/*
  Free everything allocated by _ma_init_block_record().

  NOTE(review): freeing cur_row.empty_bits presumably releases the whole
  my_multi_malloc() block (it is the first pointer passed there) — all the
  other row buffers come from the same allocation; confirm against
  my_multi_malloc() semantics. my_free(NULL) is a no-op, so this is safe
  to call after a partial initialization.
*/

void _ma_end_block_record(MARIA_HA *info)
{
  DBUG_ENTER("_ma_end_block_record" );
  my_free(info->cur_row.empty_bits);
  delete_dynamic(&info->bitmap_blocks);
  my_free(info->cur_row.extents);
  my_free(info->blob_buff);
  /*
    The data file is closed, when needed, in ma_once_end_block_record().
    The following protects us from doing an extra, not allowed, close
    in maria_close()
  */
  info->dfile.file= -1;
  DBUG_VOID_RETURN;
}
573 | |
574 | |
575 | /**************************************************************************** |
576 | Helper functions |
577 | ****************************************************************************/ |
578 | |
579 | /* |
  Return the next unused position on the page after a directory entry.
581 | |
582 | SYNOPSIS |
583 | start_of_next_entry() |
584 | dir Directory entry to be used. This can not be the |
585 | the last entry on the page! |
586 | |
587 | RETURN |
588 | # Position in page where next entry starts. |
589 | Everything between the '*dir' and this are free to be used. |
590 | */ |
591 | |
592 | static inline uint start_of_next_entry(uchar *dir) |
593 | { |
594 | uchar *prev; |
595 | /* |
596 | Find previous used entry. (There is always a previous entry as |
597 | the directory never starts with a deleted entry) |
598 | */ |
599 | for (prev= dir - DIR_ENTRY_SIZE ; |
600 | prev[0] == 0 && prev[1] == 0 ; |
601 | prev-= DIR_ENTRY_SIZE) |
602 | {} |
603 | return (uint) uint2korr(prev); |
604 | } |
605 | |
606 | |
607 | /* |
608 | Return the offset where the previous entry ends (before on page) |
609 | |
610 | SYNOPSIS |
611 | end_of_previous_entry() |
612 | dir Address for current directory entry |
613 | end Address to last directory entry |
614 | |
615 | RETURN |
616 | # Position where previous entry ends (smallest address on page) |
617 | Everything between # and current entry are free to be used. |
618 | */ |
619 | |
620 | |
621 | static inline uint end_of_previous_entry(MARIA_SHARE *share, |
622 | uchar *dir, uchar *end) |
623 | { |
624 | uchar *pos; |
625 | for (pos= dir + DIR_ENTRY_SIZE ; pos < end ; pos+= DIR_ENTRY_SIZE) |
626 | { |
627 | uint offset; |
628 | if ((offset= uint2korr(pos))) |
629 | return offset + uint2korr(pos+2); |
630 | } |
631 | return PAGE_HEADER_SIZE(share); |
632 | } |
633 | |
634 | |
635 | #ifndef DBUG_OFF |
636 | |
/*
  Debug helper: dump the row directory of a page to 'file' as
  "position:length" pairs, one per directory entry. Deleted entries
  (position == 0) are printed with length 0.
*/

static void _ma_print_directory(MARIA_SHARE *share,
                                FILE *file, uchar *buff, uint block_size)
{
  uint max_entry= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET], row= 0;
  uint end_of_prev_row= PAGE_HEADER_SIZE(share);
  uchar *dir, *end;

  /* Directory grows backwards from end of page: entry 0 has the highest
     address, so we iterate from 'end' (entry 0) down towards 'dir' */
  dir= dir_entry_pos(buff, block_size, max_entry-1);
  end= dir_entry_pos(buff, block_size, 0);

  DBUG_LOCK_FILE;       /* If using DBUG_FILE */
  fprintf(file,"Directory dump (pos:length):\n" );

  for (row= 1; dir <= end ; end-= DIR_ENTRY_SIZE, row++)
  {
    uint offset= uint2korr(end);
    uint length= uint2korr(end+2);
    fprintf(file, "  %4u:%4u" , offset, offset ? length : 0);
    if (!(row % (80/12)))                       /* Line break every 6 entries */
      fputc('\n', file);
    if (offset)
    {
      /* Used entries must appear in increasing offset order */
      DBUG_ASSERT(offset >= end_of_prev_row);
      end_of_prev_row= offset + length;
    }
  }
  fputc('\n', file);
  fflush(file);
  DBUG_UNLOCK_FILE;
}
667 | |
668 | |
/*
  Debug helper: verify the invariants of a page's row directory.

  Checks that used entries are in increasing offset order without
  overlaps, that the summed empty space matches 'real_empty_size' (or the
  EMPTY_SPACE_OFFSET header field when real_empty_size == (uint) -1),
  that every row is at least 'min_row_length' long, and that the free-entry
  list is consistent (doubly linked, covering exactly the deleted entries).
*/

static void check_directory(MARIA_SHARE *share,
                            uchar *buff, uint block_size, uint min_row_length,
                            uint real_empty_size)
{
  uchar *dir, *end;
  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
  uint start_of_dir, deleted;
  uint end_of_prev_row= PAGE_HEADER_SIZE(share);
  uint empty_size_on_page;
  uint empty_size;
  uchar free_entry, prev_free_entry;

  dir= dir_entry_pos(buff, block_size, max_entry-1);
  start_of_dir= (uint) (dir - buff);
  end= dir_entry_pos(buff, block_size, 0);
  deleted= empty_size= 0;

  /* (uint) -1 means: trust the empty-space count stored in the header */
  empty_size_on_page= (real_empty_size != (uint) -1 ? real_empty_size :
                       uint2korr(buff + EMPTY_SPACE_OFFSET));

  /* Ensure that all rows are in increasing order and no overlaps */
  for (; dir <= end ; end-= DIR_ENTRY_SIZE)
  {
    uint offset= uint2korr(end);
    uint length= uint2korr(end+2);
    if (offset)
    {
      DBUG_ASSERT(offset >= end_of_prev_row);
      DBUG_ASSERT(!length || length >= min_row_length);
      empty_size+= offset - end_of_prev_row;    /* Gap before this row */
      end_of_prev_row= offset + length;
    }
    else
      deleted++;                                /* Entry on the free list */
  }
  empty_size+= start_of_dir - end_of_prev_row;  /* Gap before the directory */
  DBUG_ASSERT(end_of_prev_row <= start_of_dir);
  DBUG_ASSERT(empty_size == empty_size_on_page);

  /* check free links */
  free_entry= buff[DIR_FREE_OFFSET];
  prev_free_entry= END_OF_DIR_FREE_LIST;
  while (free_entry != END_OF_DIR_FREE_LIST)
  {
    uchar *dir= dir_entry_pos(buff, block_size, free_entry);
    /* Free entries have position 0; length bytes hold prev/next links */
    DBUG_ASSERT(dir[0] == 0 && dir[1] == 0);
    DBUG_ASSERT(dir[2] == prev_free_entry);
    prev_free_entry= free_entry;
    free_entry= dir[3];
    deleted--;
  }
  /* Every deleted entry must be reachable through the free list */
  DBUG_ASSERT(deleted == 0);
}
722 | #else |
723 | #define check_directory(A,B,C,D,E) |
724 | #endif /* DBUG_OFF */ |
725 | |
726 | |
727 | /** |
728 | @brief Calculate if there is enough entries on the page |
729 | */ |
730 | |
731 | static my_bool enough_free_entries(uchar *buff, uint block_size, |
732 | uint wanted_entries) |
733 | { |
734 | uint entries= (uint) buff[DIR_COUNT_OFFSET]; |
735 | uint needed_free_entries, free_entry; |
736 | |
737 | if (entries + wanted_entries <= MAX_ROWS_PER_PAGE) |
738 | return 1; |
739 | |
740 | /* Check if enough free entries in free list */ |
741 | needed_free_entries= entries + wanted_entries - MAX_ROWS_PER_PAGE; |
742 | |
743 | free_entry= (uint) buff[DIR_FREE_OFFSET]; |
744 | while (free_entry != END_OF_DIR_FREE_LIST) |
745 | { |
746 | uchar *dir; |
747 | if (!--needed_free_entries) |
748 | return 1; |
749 | dir= dir_entry_pos(buff, block_size, free_entry); |
750 | free_entry= dir[3]; |
751 | } |
752 | return 0; /* Not enough entries */ |
753 | } |
754 | |
755 | |
756 | /** |
757 | @brief Check if there is room for more rows on page |
758 | |
759 | @fn enough_free_entries_on_page |
760 | |
761 | @return 0 Directory is full |
762 | @return 1 There is room for more entries on the page |
763 | */ |
764 | |
765 | my_bool enough_free_entries_on_page(MARIA_SHARE *share, |
766 | uchar *page_buff) |
767 | { |
768 | enum en_page_type page_type; |
769 | page_type= (enum en_page_type) (page_buff[PAGE_TYPE_OFFSET] & |
770 | ~(uchar) PAGE_CAN_BE_COMPACTED); |
771 | |
772 | if (page_type == HEAD_PAGE) |
773 | { |
774 | uint row_count= (uint) page_buff[DIR_COUNT_OFFSET]; |
775 | return !(row_count == MAX_ROWS_PER_PAGE && |
776 | page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST); |
777 | } |
778 | return enough_free_entries(page_buff, share->block_size, |
779 | 1 + share->base.blobs); |
780 | } |
781 | |
782 | |
783 | /** |
784 | @brief Extend a record area to fit a given size block |
785 | |
786 | @fn extend_area_on_page() |
787 | @param info Handler |
788 | @param buff Page buffer |
789 | @param dir Pointer to dir entry in buffer |
790 | @param rownr Row number we working on |
791 | @param block_size Block size of buffer |
792 | @param request_length How much data we want to put at [dir] |
793 | @param empty_space Total empty space in buffer |
794 | This is updated with length after dir |
795 | is allocated and current block freed |
796 | @param head_page 1 if head page, 0 for tail page |
797 | |
798 | @implementation |
799 | The logic is as follows (same as in _ma_update_block_record()) |
800 | - If new data fits in old block, use old block. |
801 | - Extend block with empty space before block. If enough, use it. |
802 | - Extend block with empty space after block. If enough, use it. |
803 | - Use _ma_compact_block_page() to get all empty space at dir. |
804 | |
805 | @note |
806 | The given directory entry is set to rec length. |
807 | empty_space doesn't include the new directory entry |
808 | |
809 | |
810 | @return |
811 | @retval 0 ok |
812 | @retval ret_offset Pointer to store offset to found area |
813 | @retval ret_length Pointer to store length of found area |
814 | @retval [dir] rec_offset is store here too |
815 | |
816 | @retval 1 error (wrong info in block) |
817 | */ |
818 | |
static my_bool extend_area_on_page(MARIA_HA *info,
                                   uchar *buff, uchar *dir,
                                   uint rownr,
                                   uint request_length,
                                   uint *empty_space, uint *ret_offset,
                                   uint *ret_length,
                                   my_bool head_page)
{
  uint rec_offset, length, org_rec_length;
  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
  MARIA_SHARE *share= info->s;
  uint block_size= share->block_size;
  DBUG_ENTER("extend_area_on_page" );

  /*
    We can't check for min length here as we may have called
    extend_directory() to create a new (empty) entry just before
  */
  check_directory(share, buff, block_size, 0, *empty_space);

  /* An offset of 0 in a directory entry marks it as free (unused) */
  rec_offset= uint2korr(dir);
  if (rec_offset)
  {
    /* Extending old row; Mark current space as 'free' */
    length= org_rec_length= uint2korr(dir + 2);
    DBUG_PRINT("info" , ("rec_offset: %u length: %u request_length: %u "
                        "empty_space: %u" ,
                        rec_offset, org_rec_length, request_length,
                        *empty_space));

    *empty_space+= org_rec_length;
  }
  else
  {
    /*
      Reusing free directory entry; Free it from the directory list.
      dir[2] is the back link and dir[3] the forward link of the free list
      (see the free-list handling elsewhere in this file).
    */
    if (dir[2] == END_OF_DIR_FREE_LIST)
      buff[DIR_FREE_OFFSET]= dir[3];
    else
    {
      uchar *prev_dir= dir_entry_pos(buff, block_size, (uint) dir[2]);
      DBUG_ASSERT(uint2korr(prev_dir) == 0 && prev_dir[3] == (uchar) rownr);
      prev_dir[3]= dir[3];
    }
    if (dir[3] != END_OF_DIR_FREE_LIST)
    {
      uchar *next_dir= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT(uint2korr(next_dir) == 0 && next_dir[2] == (uchar) rownr);
      next_dir[2]= dir[2];
    }
    rec_offset= start_of_next_entry(dir);
    length= 0;
  }
  if (length < request_length)
  {
    uint old_rec_offset;
    /*
      New data did not fit in old position.
      Find first possible position where to put new data.
    */
    old_rec_offset= rec_offset;
    rec_offset= end_of_previous_entry(share,
                                      dir, buff + block_size -
                                      PAGE_SUFFIX_SIZE);
    /* Add the gap between the previous entry and the old position */
    length+= (uint) (old_rec_offset - rec_offset);
    DBUG_ASSERT(old_rec_offset);
    /*
      'length' is 0 if we are doing an insert into a not allocated block.
      This can only happen during "REDO of INSERT" or "UNDO of DELETE."
    */
    if (length < request_length)
    {
      /*
        Did not fit in current block + empty space. Extend with
        empty space after block.
      */
      if (rownr == max_entry - 1)
      {
        /* Last entry; Everything is free between this and directory */
        length= ((block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE * max_entry) -
                 rec_offset);
      }
      else
        length= start_of_next_entry(dir) - rec_offset;
      DBUG_ASSERT((int) length >= 0);
      if (length < request_length)
      {
        /* Not enough continuous space, compact page to get more */
        int2store(dir, rec_offset);
        /* Reset length, as this may be a deleted block */
        int2store(dir+2, 0);
        /* Compaction rewrites this dir entry; re-read offset/length after */
        _ma_compact_block_page(share,
                               buff, rownr, 1,
                               head_page ? info->trn->min_read_from: 0,
                               head_page ? share->base.min_block_length : 0);
        rec_offset= uint2korr(dir);
        length= uint2korr(dir+2);
        if (length < request_length)
        {
          DBUG_PRINT("error" , ("Not enough space: "
                               "length: %u request_length: %u" ,
                               length, request_length));
          _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
          DBUG_RETURN(1);                      /* Error in block */
        }
        *empty_space= length;                  /* All space is here */
      }
    }
  }
  /* Record the found area in the directory entry and the out parameters */
  int2store(dir, rec_offset);
  int2store(dir + 2, length);
  *ret_offset= rec_offset;
  *ret_length= length;

  check_directory(share,
                  buff, block_size,
                  head_page ? share->base.min_block_length : 0,
                  *empty_space - length);
  DBUG_RETURN(0);
}
938 | |
939 | |
940 | /** |
941 | @brief Copy not changed fields from 'from' to 'to' |
942 | |
943 | @notes |
944 | Assumption is that most fields are not changed! |
945 | (Which is why we don't test if all bits are set for some bytes in bitmap) |
946 | */ |
947 | |
948 | void copy_not_changed_fields(MARIA_HA *info, MY_BITMAP *changed_fields, |
949 | uchar *to, uchar *from) |
950 | { |
951 | MARIA_COLUMNDEF *column, *end_column; |
952 | uchar *bitmap= (uchar*) changed_fields->bitmap; |
953 | MARIA_SHARE *share= info->s; |
954 | uint bit= 1; |
955 | |
956 | for (column= share->columndef, end_column= column+ share->base.fields; |
957 | column < end_column; column++) |
958 | { |
959 | if (!(*bitmap & bit)) |
960 | { |
961 | uint field_length= column->length; |
962 | if (column->type == FIELD_VARCHAR) |
963 | { |
964 | if (column->fill_length == 1) |
965 | field_length= (uint) from[column->offset] + 1; |
966 | else |
967 | field_length= uint2korr(from + column->offset) + 2; |
968 | } |
969 | memcpy(to + column->offset, from + column->offset, field_length); |
970 | } |
971 | if ((bit= (bit << 1)) == 256) |
972 | { |
973 | bitmap++; |
974 | bit= 1; |
975 | } |
976 | } |
977 | } |
978 | |
#ifdef NOT_YET_NEEDED
/*
  Calculate the amount of empty space on a page.

  @param buff        Page buffer
  @param block_size  Size of the page

  @return Number of free bytes (whole page for unallocated pages,
          the stored EMPTY_SPACE value for head/tail pages, 0 for blob pages)
*/

static uint empty_space_on_page(uchar *buff, uint block_size)
{
  enum en_page_type page_type;          /* BUGFIX: was 'enum en_page_type;',
                                           which declared no object */
  page_type= (enum en_page_type) (buff[PAGE_TYPE_OFFSET] &
                                  ~(uchar) PAGE_CAN_BE_COMPACTED);
  if (page_type == UNALLOCATED_PAGE)
    return block_size;
  if ((uint) page_type <= TAIL_PAGE)
    return uint2korr(buff+EMPTY_SPACE_OFFSET);
  return 0;                                     /* Blob page */
}
#endif
994 | |
995 | |
996 | /* |
997 | @brief Ensure we have space for new directory entries |
998 | |
999 | @fn make_space_for_directory() |
1000 | @param info Handler |
1001 | @param buff Page buffer |
1002 | @param max_entry Number of current entries in directory |
1003 | @param count Number of new entries to be added to directory |
1004 | @param first_dir First directory entry on page |
1005 | @param empty_space Total empty space in buffer. It's updated |
1006 | to reflect the new empty space |
1007 | @param first_pos Store position to last data byte on page here |
1008 | @param head_page 1 if head page, 0 for tail page. |
1009 | |
1010 | @note |
1011 | This function is inline as the argument passing is the biggest |
1012 | part of the function |
1013 | |
1014 | @return |
1015 | @retval 0 ok |
1016 | @retval 1 error (No data on page, fatal error) |
1017 | */ |
1018 | |
static inline my_bool
make_space_for_directory(MARIA_HA *info,
                         uchar *buff, uint max_entry,
                         uint count, uchar *first_dir, uint *empty_space,
                         uint *first_pos,
                         my_bool head_page)
{
  /* Bytes required to hold 'count' new directory entries */
  uint length_needed= DIR_ENTRY_SIZE * count;
  MARIA_SHARE *share= info->s;

  /*
    The following is only false in the case where an UNDO is used to
    reinsert a row on a previously not used page
  */
  if (likely(max_entry))
  {
    /* Check if there is place for the directory entry on the page */
    /* first_pos = end of last data byte: offset + length of first entry */
    *first_pos= uint2korr(first_dir) + uint2korr(first_dir + 2);

    if ((uint) (first_dir - buff) < *first_pos + length_needed)
    {
      /* Create place for directory */
      _ma_compact_block_page(share,
                             buff, max_entry - 1, 0,
                             head_page ? info->trn->min_read_from : 0,
                             head_page ? share->base.min_block_length : 0);
      /* Compaction moved rows; recompute end of data and free space */
      *first_pos= (uint2korr(first_dir) + uint2korr(first_dir + 2));
      *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
      if (*empty_space < length_needed)
      {
        /*
          We should always have space, as we only come here for
          UNDO of DELETE (in which case we know the row was on the
          page before) or if the bitmap told us there was space on page
        */
        DBUG_ASSERT(!maria_assert_if_crashed_table);
        return(1);
      }
    }
  }
  else
    *first_pos= PAGE_HEADER_SIZE(share);

  /* Reduce directory entry size from free space size */
  (*empty_space)-= length_needed;
  buff[DIR_COUNT_OFFSET]= (uchar) (max_entry + count);
  return(0);
}
1067 | |
1068 | |
1069 | /* |
1070 | Find free position in directory |
1071 | |
1072 | SYNOPSIS |
1073 | find_free_position() |
1074 | info Handler |
1075 | buff Page |
1076 | block_size Size of page |
1077 | res_rownr Store index to free position here |
1078 | res_length Store length of found segment here |
1079 | empty_space Store length of empty space on disk here. This is |
1080 | all empty space, including the found block. |
1081 | @param head_page 1 if head page, 0 for tail page. |
1082 | |
1083 | NOTES |
1084 | If there is a free directory entry (entry with position == 0), |
1085 | then use it and change it to be the size of the empty block |
1086 | after the previous entry. This guarantees that all row entries |
1087 | are stored on disk in inverse directory order, which makes life easier for |
1088 | '_ma_compact_block_page()' and to know if there is free space after any |
1089 | block. |
1090 | |
1091 | If there is no free entry (entry with position == 0), then we create |
1092 | a new one. If there is not space for the directory entry (because |
    the last block overlaps with the directory), we compact the page.
1094 | |
1095 | We will update the offset and the length of the found dir entry to |
1096 | match the position and empty space found. |
1097 | |
1098 | buff[EMPTY_SPACE_OFFSET] is NOT updated but left up to the caller |
1099 | |
  See start of file for description of how free directory entries are linked
1101 | |
1102 | RETURN |
1103 | 0 Error (directory full or last block goes over directory) |
1104 | # Pointer to directory entry on page |
1105 | */ |
1106 | |
static uchar *find_free_position(MARIA_HA *info,
                                 uchar *buff, uint block_size, uint *res_rownr,
                                 uint *res_length, uint *empty_space,
                                 my_bool head_page)
{
  uint max_entry, free_entry;
  uint length, first_pos;
  uchar *dir, *first_dir;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("find_free_position" );

  max_entry= (uint) buff[DIR_COUNT_OFFSET];
  free_entry= (uint) buff[DIR_FREE_OFFSET];   /* Head of free-entry list */
  *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);

  DBUG_PRINT("info" , ("max_entry: %u free_entry: %u" , max_entry, free_entry));

  first_dir= dir_entry_pos(buff, block_size, max_entry - 1);

  /* Search after first free position */
  if (free_entry != END_OF_DIR_FREE_LIST)
  {
    if (free_entry >= max_entry)
      DBUG_RETURN(0);                     /* Consistency error */
    dir= dir_entry_pos(buff, block_size, free_entry);
    /* List head must have offset 0 (free) and no back link */
    DBUG_ASSERT(uint2korr(dir) == 0 && dir[2] == END_OF_DIR_FREE_LIST);
    /* Relink free list */
    if ((buff[DIR_FREE_OFFSET]= dir[3]) != END_OF_DIR_FREE_LIST)
    {
      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT((uint) next_entry[2] == free_entry &&
                  uint2korr(next_entry) == 0);
      next_entry[2]= END_OF_DIR_FREE_LIST;  /* Backlink */
    }

    /* Free segment spans from end of previous entry to start of next one */
    first_pos= end_of_previous_entry(share,
                                     dir, buff + block_size -
                                     PAGE_SUFFIX_SIZE);
    length= start_of_next_entry(dir) - first_pos;
    int2store(dir, first_pos);                /* Update dir entry */
    int2store(dir + 2, 0);
    *res_rownr= free_entry;
    *res_length= length;

    check_directory(share, buff, block_size,
                    head_page ? share->base.min_block_length : 0, (uint) -1);
    DBUG_RETURN(dir);
  }
  /* No free places in dir; create a new one */

  /* Check if there is place for the directory entry */
  if (max_entry == MAX_ROWS_PER_PAGE)
    DBUG_RETURN(0);

  if (make_space_for_directory(info, buff, max_entry, 1,
                               first_dir, empty_space, &first_pos, head_page))
    DBUG_RETURN(0);

  /* New entry goes just below (lower address than) the current first one */
  dir= first_dir - DIR_ENTRY_SIZE;
  length= (uint) (dir - buff - first_pos);
  DBUG_ASSERT(length <= *empty_space);
  int2store(dir, first_pos);
  int2store(dir + 2, 0);                      /* Max length of region */
  *res_rownr= max_entry;
  *res_length= length;

  check_directory(share,
                  buff, block_size,
                  head_page ? share->base.min_block_length : 0,
                  *empty_space);
  DBUG_RETURN(dir);
}
1179 | |
1180 | |
1181 | /** |
1182 | @brief Enlarge page directory to hold more entries |
1183 | |
1184 | @fn extend_directory() |
1185 | @param info Handler |
1186 | @param buff Page buffer |
1187 | @param block_size Block size |
1188 | @param max_entry Number of directory entries on page |
1189 | @param new_entry Position for new entry |
1190 | @param empty_space Total empty space in buffer. It's updated |
1191 | to reflect the new empty space |
1192 | @param head_page 1 if head page, 0 for tail page. |
1193 | |
1194 | @note |
1195 | This is only called on UNDO when we want to expand the directory |
1196 | to be able to re-insert row in a given position |
1197 | |
1198 | The new directory entry will be set to cover the maximum possible space |
1199 | |
1200 | @return |
1201 | @retval 0 ok |
1202 | @retval 1 error (No data on page, fatal error) |
1203 | */ |
1204 | |
static my_bool extend_directory(MARIA_HA *info, uchar *buff, uint block_size,
                                uint max_entry, uint new_entry,
                                uint *empty_space, my_bool head_page)
{
  uint length, first_pos;
  uchar *dir, *first_dir;
  DBUG_ENTER("extend_directory" );

  /*
    Note that if max_entry is 0, then first_dir will point to
    an illegal directory entry. This is ok, as in this case we will
    not access anything through first_dir.
  */
  first_dir= dir_entry_pos(buff, block_size, max_entry) + DIR_ENTRY_SIZE;

  if (make_space_for_directory(info, buff, max_entry,
                               new_entry - max_entry + 1,
                               first_dir, empty_space, &first_pos, head_page))
    DBUG_RETURN(1);

  /* Set the new directory entry to cover the max possible length */
  dir= first_dir - DIR_ENTRY_SIZE * (new_entry - max_entry + 1);
  length= (uint) (dir - buff - first_pos);
  int2store(dir, first_pos);
  int2store(dir+2, length);
  *empty_space-= length;

  if (new_entry-- > max_entry)
  {
    /* Link all row entries between new_entry and max_entry into free list */
    uint free_entry= (uint) buff[DIR_FREE_OFFSET];
    uint prev_entry= END_OF_DIR_FREE_LIST;
    buff[DIR_FREE_OFFSET]= new_entry;
    do
    {
      /*
        Each freed entry: offset 0 (free marker), dir[2] = back link,
        dir[3] = forward link to the next lower entry number
      */
      dir+= DIR_ENTRY_SIZE;
      dir[0]= dir[1]= 0;
      dir[2]= (uchar) prev_entry;
      /*
        NOTE(review): the cast binds to new_entry only, so this is
        (uchar) new_entry - 1 computed as int; the store into the uchar
        dir[3] truncates to the same byte as (uchar) (new_entry - 1)
      */
      dir[3]= (uchar) new_entry-1;
      prev_entry= new_entry;
    } while (new_entry-- > max_entry);
    /* Splice the old free list after the newly created chain */
    if ((dir[3]= free_entry) != END_OF_DIR_FREE_LIST)
    {
      /* Relink next entry to point to newly freed entry */
      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT(uint2korr(next_entry) == 0 &&
                  next_entry[2] == END_OF_DIR_FREE_LIST);
      next_entry[2]= max_entry;
    }
  }

  check_directory(info->s,
                  buff, block_size,
                  head_page ? MY_MIN(info->s->base.min_block_length, length) :
                  0, *empty_space);
  DBUG_RETURN(0);
}
1262 | |
1263 | |
1264 | /**************************************************************************** |
1265 | Updating records |
1266 | ****************************************************************************/ |
1267 | |
1268 | /* |
1269 | Calculate length of all the different field parts |
1270 | |
1271 | SYNOPSIS |
1272 | calc_record_size() |
1273 | info Maria handler |
1274 | record Row to store |
1275 | row Store statistics about row here |
1276 | |
1277 | NOTES |
1278 | The statistics is used to find out how much space a row will need |
1279 | and also where we can split a row when we need to split it into several |
1280 | extents. |
1281 | */ |
1282 | |
static void calc_record_size(MARIA_HA *info, const uchar *record,
                             MARIA_ROW *row)
{
  MARIA_SHARE *share= info->s;
  uchar *field_length_data;
  MARIA_COLUMNDEF *column, *end_column;
  uint *null_field_lengths= row->null_field_lengths;
  ulong *blob_lengths= row->blob_lengths;
  DBUG_ENTER("calc_record_size" );

  row->normal_length= row->char_length= row->varchar_length=
    row->blob_length= row->extents_count= 0;

  /* Create empty bitmap and calculate length of each varlength/char field */
  bzero(row->empty_bits, share->base.pack_bytes);
  field_length_data= row->field_lengths;
  /* Fixed not-null fields need no length info; start after them */
  for (column= share->columndef + share->base.fixed_not_null_fields,
       end_column= share->columndef + share->base.fields;
       column < end_column; column++, null_field_lengths++)
  {
    if ((record[column->null_pos] & column->null_bit))
    {
      /* NULL field: takes no data space */
      if (column->type != FIELD_BLOB)
        *null_field_lengths= 0;
      else
        *blob_lengths++= 0;
      continue;
    }
    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
      DBUG_ASSERT(column->empty_bit == 0);
      /* fall through */
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
      row->normal_length+= column->length;
      *null_field_lengths= column->length;
      break;
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      /* All-zero value is stored as just an empty bit */
      if (memcmp(record+ column->offset, maria_zero_string,
                 column->length) == 0)
      {
        row->empty_bits[column->empty_pos] |= column->empty_bit;
        *null_field_lengths= 0;
      }
      else
      {
        row->normal_length+= column->length;
        *null_field_lengths= column->length;
      }
      break;
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      /* Strip trailing spaces; store the remaining length */
      const uchar *pos, *end;
      for (pos= record + column->offset, end= pos + column->length;
           end > pos && end[-1] == ' '; end--)
        ;
      if (pos == end)                           /* If empty string */
      {
        row->empty_bits[column->empty_pos]|= column->empty_bit;
        *null_field_lengths= 0;
      }
      else
      {
        uint length= (uint) (end - pos);
        /* Length prefix is 1 byte for short CHAR, 2 bytes otherwise */
        if (column->length <= 255)
          *field_length_data++= (uchar) length;
        else
        {
          int2store(field_length_data, length);
          field_length_data+= 2;
        }
        row->char_length+= length;
        *null_field_lengths= length;
      }
      break;
    }
    case FIELD_VARCHAR:
    {
      uint length, field_length_data_length;
      const uchar *field_pos= record + column->offset;

      /* 256 is correct as this includes the length uchar */
      field_length_data[0]= field_pos[0];
      if (column->length <= 256)
      {
        length= (uint) (uchar) *field_pos;
        field_length_data_length= 1;
      }
      else
      {
        length= uint2korr(field_pos);
        field_length_data[1]= field_pos[1];
        field_length_data_length= 2;
      }
      *null_field_lengths= length;
      if (!length)
      {
        row->empty_bits[column->empty_pos]|= column->empty_bit;
        break;
      }
      row->varchar_length+= length;
      /* NOTE(review): redundant — already assigned before the !length test */
      *null_field_lengths= length;
      field_length_data+= field_length_data_length;
      break;
    }
    case FIELD_BLOB:
    {
      const uchar *field_pos= record + column->offset;
      /* Blob column stores length bytes followed by a pointer to the data */
      uint size_length= column->length - portable_sizeof_char_ptr;
      ulong blob_length= _ma_calc_blob_length(size_length, field_pos);

      *blob_lengths++= blob_length;
      if (!blob_length)
        row->empty_bits[column->empty_pos]|= column->empty_bit;
      else
      {
        row->blob_length+= blob_length;
        memcpy(field_length_data, field_pos, size_length);
        field_length_data+= size_length;
      }
      break;
    }
    default:
      DBUG_ASSERT(0);
    }
  }
  row->field_lengths_length= (uint) (field_length_data - row->field_lengths);
  /*
    - info->row_base_length is base information we must have on a page in first
      extent:
      - flag byte (1) + is_nulls_extended (0 | 1) + null_bytes + pack_bytes +
        table_checksum (0 | 1)
    - row->min_length is minimum amount of data we must store on
      a page. bitmap code will ensure we get at least this much +
      total number of extents and one extent information
    - fixed_not_null_fields_length is length of fixed length fields that can't
      be compacted
    - head_length is the amount of data for the head page
      (ie, all fields except blobs)
  */
  row->min_length=   (info->row_base_length +
                      (share->base.max_field_lengths ?
                       size_to_store_key_length(row->field_lengths_length) :
                       0));
  row->head_length= (row->min_length +
                     share->base.fixed_not_null_fields_length +
                     row->field_lengths_length +
                     row->normal_length +
                     row->char_length + row->varchar_length);
  row->total_length= (row->head_length + row->blob_length);
  if (row->total_length < share->base.min_block_length)
    row->total_length= share->base.min_block_length;
  DBUG_PRINT("exit" , ("head_length: %lu  total_length: %lu" ,
                      (ulong) row->head_length, (ulong) row->total_length));
  DBUG_VOID_RETURN;
}
1440 | |
1441 | |
1442 | /** |
1443 | Compact page by removing all space between rows |
1444 | |
1445 | Moves up all rows to start of page. Moves blocks that are directly after |
1446 | each other with one memmove. |
1447 | |
1448 | @note if rownr is the last row in the page, and extend_block is false, |
1449 | caller has to make sure to update bitmap page afterwards to reflect freed |
1450 | space. |
1451 | |
1452 | @param buff Page to compact |
1453 | @param block_size Size of page |
1454 | @param rownr Put empty data after this row |
1455 | @param extend_block If 1, extend the block at 'rownr' to cover the |
1456 | whole block. |
1457 | @param min_read_from If <> 0, remove all trid's that are less than this |
1458 | */ |
1459 | |
void _ma_compact_block_page(MARIA_SHARE *share,
                            uchar *buff, uint rownr,
                            my_bool extend_block, TrID min_read_from,
                            uint min_row_length)
{
  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
  uint page_pos, next_free_pos, start_of_found_block, diff, end_of_found_block;
  uint freed_size= 0;
  uint block_size= share->block_size;
  uchar *dir, *end;
  DBUG_ENTER("_ma_compact_block_page" );
  DBUG_PRINT("enter" , ("rownr: %u  min_read_from: %lu" , rownr,
                       (ulong) min_read_from));
  DBUG_ASSERT(max_entry > 0 &&
              max_entry < (block_size - PAGE_HEADER_SIZE(share) -
                           PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE);

  /* Move all entries before and including rownr up to start of page */
  dir= dir_entry_pos(buff, block_size, rownr);
  end= dir_entry_pos(buff, block_size, 0);
  page_pos= next_free_pos= start_of_found_block= PAGE_HEADER_SIZE(share);
  diff= 0;
  /*
    Walk dir entries from row 0 towards rownr (entries are stored in
    inverse order, so 'end' moves downwards in memory)
  */
  for (; dir <= end ; end-= DIR_ENTRY_SIZE)
  {
    uint offset= uint2korr(end);

    if (offset)
    {
      uint row_length= uint2korr(end + 2);
      DBUG_ASSERT(offset >= page_pos);
      DBUG_ASSERT(buff + offset + row_length <= dir);
      DBUG_ASSERT(row_length >= min_row_length || row_length == 0);

      /* Row length can be zero if row is to be deleted */
      if (min_read_from && row_length && (buff[offset] & ROW_FLAG_TRANSID))
      {
        TrID transid= transid_korr(buff+offset+1);
        if (transid < min_read_from)
        {
          /* Remove transid from row by moving the start point of the row up */
          buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID;
          offset+= TRANSID_SIZE;
          freed_size+= TRANSID_SIZE;
          row_length-= TRANSID_SIZE;
          int2store(end+2, row_length);
        }
      }

      if (offset != next_free_pos)
      {
        uint length= (next_free_pos - start_of_found_block);
        /*
          There was empty space before this and prev block
          Check if we have to move previous block up to page start
        */
        if (page_pos != start_of_found_block)
        {
          /* move up previous block */
          memmove(buff + page_pos, buff + start_of_found_block, length);
        }
        page_pos+= length;
        /* next continuous block starts here */
        start_of_found_block= offset;
        diff= offset - page_pos;
      }
      int2store(end, offset - diff);            /* correct current pos */
      next_free_pos= offset + row_length;

      if (unlikely(row_length < min_row_length) && row_length)
      {
        /*
          This can only happen in the case we compacted transid and
          the row become 'too short'

          Move the current row down to it's right place and extend it
          with 0.
        */
        uint row_diff= min_row_length - row_length;
        uint length= (next_free_pos - start_of_found_block);

        DBUG_ASSERT(page_pos != start_of_found_block);
        bmove(buff + page_pos, buff + start_of_found_block, length);
        bzero(buff+ page_pos + length, row_diff);
        page_pos+= min_row_length;
        int2store(end+2, min_row_length);
        /* The padding bytes are no longer free space */
        freed_size-= row_diff;
        next_free_pos= start_of_found_block= page_pos;
        diff= 0;
      }
    }
  }
  /* Flush the last pending run of contiguous rows up to page start */
  if (page_pos != start_of_found_block)
  {
    uint length= (next_free_pos - start_of_found_block);
    memmove(buff + page_pos, buff + start_of_found_block, length);
  }
  start_of_found_block= uint2korr(dir);

  if (rownr != max_entry - 1)
  {
    /* Move all entries after rownr to end of page */
    uint rownr_length;

    DBUG_ASSERT(extend_block);                  /* Should always be true */
    next_free_pos= end_of_found_block= page_pos=
      block_size - DIR_ENTRY_SIZE * max_entry - PAGE_SUFFIX_SIZE;
    diff= 0;
    /* End points to entry before 'rownr' */
    for (dir= buff + end_of_found_block ; dir <= end ; dir+= DIR_ENTRY_SIZE)
    {
      uint offset= uint2korr(dir);
      uint row_length;
      uint row_end;
      if (!offset)
        continue;
      row_length= uint2korr(dir + 2);
      row_end= offset + row_length;
      DBUG_ASSERT(offset >= start_of_found_block &&
                  row_end <= next_free_pos && row_length >= min_row_length);

      if (min_read_from && (buff[offset] & ROW_FLAG_TRANSID))
      {
        TrID transid= transid_korr(buff + offset+1);
        if (transid < min_read_from)
        {
          /* Remove transid from row */
          buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID;
          offset+= TRANSID_SIZE;
          row_length-= TRANSID_SIZE;
          int2store(dir+2, row_length);
        }
        if (unlikely(row_length < min_row_length))
        {
          /*
            This can only happen in the case we compacted transid and
            the row become 'too short'
          */
          uint row_diff= min_row_length - row_length;
          if (next_free_pos < row_end + row_diff)
          {
            /*
              Not enough space for extending next block with enough
              end 0's. Move current data down to get place for them
            */
            uint move_down= row_diff - (next_free_pos - row_end);
            bmove(buff + offset - move_down, buff + offset, row_length);
            offset-= move_down;
          }
          /*
            Extend the next block with 0, which will be part of current
            row when the blocks are joined together later
          */
          bzero(buff + next_free_pos - row_diff, row_diff);
          next_free_pos-= row_diff;
          int2store(dir+2, min_row_length);
        }
        row_end= offset + row_length;
      }

      if (row_end != next_free_pos)
      {
        uint length= (end_of_found_block - next_free_pos);
        if (page_pos != end_of_found_block)
        {
          /* move next block down */
          memmove(buff + page_pos - length, buff + next_free_pos, length);
        }
        page_pos-= length;
        /* next continuous block starts here */
        end_of_found_block= row_end;
        diff= page_pos - row_end;
      }
      int2store(dir, offset + diff);            /* correct current pos */
      next_free_pos= offset;
    }
    /* Flush the last pending run of contiguous rows down to page end */
    if (page_pos != end_of_found_block)
    {
      uint length= (end_of_found_block - next_free_pos);
      memmove(buff + page_pos - length, buff + next_free_pos, length);
      next_free_pos= page_pos- length;
    }

    /* Extend rownr block to cover hole */
    rownr_length= next_free_pos - start_of_found_block;
    int2store(dir+2, rownr_length);
    DBUG_ASSERT(rownr_length >= min_row_length);
  }
  else
  {
    if (extend_block)
    {
      /* Extend last block to cover whole page */
      uint length= ((uint) (dir - buff) - start_of_found_block);
      int2store(dir+2, length);
      DBUG_ASSERT(length >= min_row_length);
    }
    else
    {
      /* Add length gained from freed transaction id's to this page */
      uint length= uint2korr(buff+ EMPTY_SPACE_OFFSET) + freed_size;
      int2store(buff + EMPTY_SPACE_OFFSET, length);
    }
    buff[PAGE_TYPE_OFFSET]&= ~(uchar) PAGE_CAN_BE_COMPACTED;
  }
  check_directory(share, buff, block_size, min_row_length,
                  extend_block ? 0 : (uint) -1);
  DBUG_EXECUTE("directory" , _ma_print_directory(share,
                                                 DBUG_FILE, buff, block_size););
  DBUG_VOID_RETURN;
}
1670 | |
1671 | |
1672 | /* |
1673 | Create an empty tail or head page |
1674 | |
1675 | SYNOPSIS |
1676 | make_empty_page() |
1677 | buff Page buffer |
1678 | block_size Block size |
1679 | page_type HEAD_PAGE or TAIL_PAGE |
    create_dir_entry	TRUE if we should create a directory entry
1681 | |
1682 | NOTES |
1683 | EMPTY_SPACE is not updated |
1684 | */ |
1685 | |
1686 | static void make_empty_page(MARIA_HA *info, uchar *buff, uint page_type, |
1687 | my_bool create_dir_entry) |
1688 | { |
1689 | uint block_size= info->s->block_size; |
1690 | DBUG_ENTER("make_empty_page" ); |
1691 | |
1692 | bzero(buff, PAGE_HEADER_SIZE(info->s)); |
1693 | |
1694 | #if !defined(DONT_ZERO_PAGE_BLOCKS) || defined(HAVE_valgrind) |
1695 | /* |
1696 | We zero the rest of the block to avoid getting old memory information |
1697 | to disk and to allow the file to be compressed better if archived. |
1698 | The code does not assume the block is zeroed. |
1699 | */ |
1700 | if (page_type != BLOB_PAGE) |
1701 | bzero(buff+ PAGE_HEADER_SIZE(info->s), |
1702 | block_size - PAGE_HEADER_SIZE(info->s)); |
1703 | #endif |
1704 | buff[PAGE_TYPE_OFFSET]= (uchar) page_type; |
1705 | buff[DIR_COUNT_OFFSET]= (int) create_dir_entry; |
1706 | buff[DIR_FREE_OFFSET]= END_OF_DIR_FREE_LIST; |
1707 | if (create_dir_entry) |
1708 | { |
1709 | /* Create directory entry to point to start of page with size 0 */ |
1710 | buff+= block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE; |
1711 | int2store(buff, PAGE_HEADER_SIZE(info->s)); |
1712 | int2store(buff+2, 0); |
1713 | } |
1714 | DBUG_VOID_RETURN; |
1715 | } |
1716 | |
1717 | |
1718 | /* |
1719 | Read or initialize new head or tail page |
1720 | |
1721 | SYNOPSIS |
1722 | get_head_or_tail_page() |
1723 | info Maria handler |
1724 | block Block to read |
1725 | buff Suggest this buffer to key cache |
1726 | length Minimum space needed |
1727 | page_type HEAD_PAGE || TAIL_PAGE |
1728 | res Store result position here |
1729 | |
1730 | NOTES |
    We don't decrement buff[EMPTY_SPACE_OFFSET] by the allocated data
1732 | as we don't know how much data the caller will actually use. |
1733 | |
1734 | res->empty_space is set to length of empty space |
1735 | |
1736 | RETURN |
1737 | 0 ok All slots in 'res' are updated |
1738 | 1 error my_errno is set |
1739 | */ |
1740 | |
/* Result of finding/allocating a row position on a head or tail page */
struct st_row_pos_info
{
  uchar *buff;                                  /* page buffer */
  uchar *data;                                  /* Place for data */
  uchar *dir;                                   /* Directory */
  uint length;                                  /* Length for data */
  uint rownr;                                   /* Offset in directory */
  uint empty_space;                             /* Space left on page */
};
1750 | |
1751 | |
static my_bool get_head_or_tail_page(MARIA_HA *info,
                                     const MARIA_BITMAP_BLOCK *block,
                                     uchar *buff, uint length, uint page_type,
                                     enum pagecache_page_lock lock,
                                     struct st_row_pos_info *res)
{
  uint block_size;
  MARIA_PINNED_PAGE page_link;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("get_head_or_tail_page" );
  DBUG_PRINT("enter" , ("page_type: %u length: %u" , page_type, length));

  block_size= share->block_size;
  if (block->org_bitmap_value == 0) /* Empty block */
  {
    /* New page; everything between page header and directory is usable */
    make_empty_page(info, buff, page_type, 1);
    res->buff= buff;
    res->empty_space= res->length= (block_size - PAGE_OVERHEAD_SIZE(share));
    res->data= (buff + PAGE_HEADER_SIZE(share));
    res->dir= res->data + res->length;
    res->rownr= 0;
    DBUG_ASSERT(length <= res->length);
  }
  else
  {
    uchar *dir;
    /* Read old page; the page is pinned and tracked in info->pinned_pages */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    res->buff= pagecache_read(share->pagecache, &info->dfile,
                              block->page, 0, 0, share->page_type,
                              lock, &page_link.link);
    page_link.changed= res->buff != 0;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
    if (!page_link.changed)
      goto crashed;                             /* Read error */

    DBUG_ASSERT((uint) (res->buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
                page_type);
    if (!(dir= find_free_position(info, res->buff, block_size, &res->rownr,
                                  &res->length, &res->empty_space,
                                  page_type == HEAD_PAGE)))
      goto crashed;

    if (res->length < length)
    {
      if (res->empty_space + res->length >= length)
      {
        /*
          Not enough contiguous space at the found position, but enough
          total free space on the page: compact the page to merge holes.
        */
        _ma_compact_block_page(share,
                               res->buff, res->rownr, 1,
                               (page_type == HEAD_PAGE ?
                                info->trn->min_read_from : 0),
                               (page_type == HEAD_PAGE ?
                                share->base.min_block_length :
                                0));
        /* All empty space is now after the current position */
        dir= dir_entry_pos(res->buff, block_size, res->rownr);
        res->length= res->empty_space= uint2korr(dir+2);
      }
      if (res->length < length)
      {
        DBUG_PRINT("error" , ("length: %u res->length: %u empty_space: %u" ,
                             length, res->length, res->empty_space));
        goto crashed; /* Wrong bitmap information */
      }
    }
    res->dir= dir;
    res->data= res->buff + uint2korr(dir);
  }
  DBUG_RETURN(0);

crashed:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD); /* File crashed */
  DBUG_RETURN(1);
}
1828 | |
1829 | |
1830 | /* |
1831 | @brief Create room for a head or tail row on a given page at given position |
1832 | |
1833 | @fn get_rowpos_in_head_or_tail_page() |
1834 | @param info Maria handler |
1835 | @param block Block to read |
1836 | @param buff Suggest this buffer to key cache |
1837 | @param length Minimum space needed |
1838 | @param page_type HEAD_PAGE || TAIL_PAGE |
1839 | @param rownr Rownr to use |
1840 | @param res Store result position here |
1841 | |
1842 | @note |
    This is essentially the same as get_head_or_tail_page, with the difference
    that the caller specifies at what position the row should be put.
    This is used when restoring a row to its original position as
    part of UNDO DELETE or UNDO UPDATE
1847 | |
1848 | @return |
1849 | @retval 0 ok All slots in 'res' are updated |
1850 | @retval 1 error my_errno is set |
1851 | */ |
1852 | |
static my_bool get_rowpos_in_head_or_tail_page(MARIA_HA *info,
                                               const MARIA_BITMAP_BLOCK *block,
                                               uchar *buff, uint length,
                                               uint page_type,
                                               enum pagecache_page_lock lock,
                                               uint rownr,
                                               struct st_row_pos_info *res)
{
  MARIA_PINNED_PAGE page_link;
  MARIA_SHARE *share= info->s;
  uchar *dir;
  uint block_size= share->block_size;
  uint max_entry, max_length, rec_offset;
  DBUG_ENTER("get_rowpos_in_head_or_tail_page" );

  if (block->org_bitmap_value == 0) /* Empty block */
  {
    /* New page; no directory entry is created here (rownr is given) */
    make_empty_page(info, buff, page_type, 0);
    res->empty_space= block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE;
  }
  else
  {
    /* Read the existing page; page stays pinned in info->pinned_pages */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    buff= pagecache_read(share->pagecache, &info->dfile,
                         block->page, 0, 0, share->page_type,
                         lock, &page_link.link);
    page_link.changed= buff != 0;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
    if (!page_link.changed) /* Read error */
      goto err;
    /* Assert in debug builds; the check below also guards release builds */
    DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
                (uchar) page_type);
    if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != (uchar) page_type)
      goto err;
    res->empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
  }

  max_entry= (uint) buff[DIR_COUNT_OFFSET];
  if (max_entry <= rownr)
  {
    /* Directory is too small to address rownr; grow it */
    if (extend_directory(info, buff, block_size,
                         max_entry, rownr, &res->empty_space,
                         page_type == HEAD_PAGE))
      goto err;
  }

  /*
    The following dir entry is unused in case of insert / update but
    not in case of undo_update / undo_delete
  */
  dir= dir_entry_pos(buff, block_size, rownr);

  /* Make room for 'length' bytes at the position described by 'dir' */
  if (extend_area_on_page(info, buff, dir, rownr, length,
                          &res->empty_space, &rec_offset, &max_length,
                          page_type == HEAD_PAGE))
    goto err;

  res->buff= buff;
  res->rownr= rownr;
  res->dir= dir;
  res->data= buff + rec_offset;
  res->length= length;
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD); /* File crashed */
  DBUG_RETURN(1);
}
1923 | |
1924 | |
1925 | /* |
1926 | Write tail for head data or blob |
1927 | |
1928 | SYNOPSIS |
1929 | write_tail() |
1930 | info Maria handler |
1931 | block Block to tail page |
1932 | row_part Data to write to page |
1933 | length Length of data |
1934 | |
1935 | NOTES |
1936 | block->page_count is updated to the directory offset for the tail |
1937 | so that we can store the position in the row extent information |
1938 | |
1939 | RETURN |
1940 | 0 ok |
1941 | block->page_count is set to point (dir entry + TAIL_BIT) |
1942 | |
1943 | 1 error; In this case my_errno is set to the error |
1944 | */ |
1945 | |
static my_bool write_tail(MARIA_HA *info,
                          MARIA_BITMAP_BLOCK *block,
                          uchar *row_part, uint org_length)
{
  MARIA_SHARE *share= info->s;
  MARIA_PINNED_PAGE page_link;
  uint block_size= share->block_size, empty_space, length= org_length;
  struct st_row_pos_info row_pos;
  my_off_t position;
  my_bool res, block_is_read;
  DBUG_ENTER("write_tail" );
  DBUG_PRINT("enter" , ("page: %lu length: %u" ,
                       (ulong) block->page, length));

  info->keyread_buff_used= 1;
  /*
    Don't allocate smaller block than MIN_TAIL_SIZE (we want to give rows
    some place to grow in the future)
  */
  if (length < MIN_TAIL_SIZE)
    length= MIN_TAIL_SIZE;

  if (block->page_count == TAIL_PAGE_COUNT_MARKER)
  {
    /*
      Create new tail
      page will be pinned & locked by get_head_or_tail_page
    */
    if (get_head_or_tail_page(info, block, info->keyread_buff, length,
                              TAIL_PAGE, PAGECACHE_LOCK_WRITE,
                              &row_pos))
      DBUG_RETURN(1);
  }
  else
  {
    /* Write tail on predefined row position */
    if (get_rowpos_in_head_or_tail_page(info, block, info->keyread_buff,
                                        length, TAIL_PAGE,
                                        PAGECACHE_LOCK_WRITE,
                                        block->page_count & ~TAIL_BIT,
                                        &row_pos))
      DBUG_RETURN(1);
  }
  DBUG_PRINT("info" , ("tailid: %lu (%lu:%u)" ,
                      (ulong) ma_recordpos(block->page, row_pos.rownr),
                      (ulong) block->page, row_pos.rownr));

  /* Non-zero org_bitmap_value means the page was read (and pinned) above */
  block_is_read= block->org_bitmap_value != 0;

  /* Copy only the real data; any MIN_TAIL_SIZE padding stays as-is */
  memcpy(row_pos.data, row_part, org_length);

  if (share->now_transactional)
  {
    /* Log changes in tail block */
    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
    LSN lsn;

    /*
      Log REDO changes of tail page
      Note that we have to log length, not org_length, to be sure that
      REDO, which doesn't use write_tail, also creates a block of at least
      MIN_TAIL_SIZE
    */
    page_store(log_data + FILEID_STORE_SIZE, block->page);
    dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
                 row_pos.rownr);
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
    log_array[TRANSLOG_INTERNAL_PARTS + 1].str= row_pos.data;
    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length;
    if (translog_write_record(&lsn,
                              (block_is_read ? LOGREC_REDO_INSERT_ROW_TAIL :
                               LOGREC_REDO_NEW_ROW_TAIL),
                              info->trn, info,
                              (translog_size_t) (sizeof(log_data) + length),
                              TRANSLOG_INTERNAL_PARTS + 2, log_array,
                              log_data, NULL))
      DBUG_RETURN(1);
  }

  /* Store the (possibly rounded-up) length in the directory entry */
  int2store(row_pos.dir + 2, length);
  empty_space= row_pos.empty_space - length;
  int2store(row_pos.buff + EMPTY_SPACE_OFFSET, empty_space);
  block->page_count= row_pos.rownr + TAIL_BIT;
  /*
    If there is less directory entries free than number of possible tails
    we can write for a row, we mark the page full to ensure that we don't
    during _ma_bitmap_find_place() allocate more entries on the tail page
    than it can hold
  */
  block->empty_space= (enough_free_entries(row_pos.buff, share->block_size,
                                           1 + share->base.blobs) ?
                       empty_space : 0);
  /* Keep BLOCKUSED_USE_ORG_BITMAP */
  block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL;

  if (block_is_read)
  {
    /* Current page link is last element in pinned_pages */
    MARIA_PINNED_PAGE *page_link;
    page_link= dynamic_element(&info->pinned_pages,
                               info->pinned_pages.elements-1,
                               MARIA_PINNED_PAGE*);
    /* Downgrade write lock to read lock; the page stays pinned */
    pagecache_unlock_by_link(share->pagecache, page_link->link,
                             PAGECACHE_LOCK_WRITE_TO_READ,
                             PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 1, FALSE);
    DBUG_ASSERT(page_link->changed);
    page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK;
    res= 0;
  }
  else
  {
    /* New page: write it to the pagecache, pinned with a read lock */
    if (!(res= pagecache_write(share->pagecache,
                               &info->dfile, block->page, 0,
                               row_pos.buff,share->page_type,
                               PAGECACHE_LOCK_READ,
                               PAGECACHE_PIN,
                               PAGECACHE_WRITE_DELAY, &page_link.link,
                               LSN_IMPOSSIBLE)))
    {
      DBUG_ASSERT(page_link.link);
      page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
      page_link.changed= 1;
      push_dynamic(&info->pinned_pages, (void*) &page_link);
    }

    /* Increase data file size, if extended */
    position= (my_off_t) block->page * block_size;
    if (share->state.state.data_file_length <= position)
    {
      /*
        We are modifying a state member before writing the UNDO; this is a WAL
        violation. But for data_file_length this is ok, as long as we change
        data_file_length after writing any log record (FILE_ID/REDO/UNDO) (see
        collect_tables()).
      */
      _ma_set_share_data_file_length(share, position + block_size);
    }
  }
  DBUG_RETURN(res);
}
2089 | |
2090 | |
2091 | /* |
2092 | Write full pages |
2093 | |
2094 | SYNOPSIS |
2095 | write_full_pages() |
2096 | info Maria handler |
2097 | lsn LSN for the undo record |
2098 | block Where to write data |
2099 | data Data to write |
2100 | length Length of data |
2101 | |
2102 | NOTES |
    Logging of the changes to the full pages is done in the caller
    write_block_record().
2105 | |
2106 | RETURN |
2107 | 0 ok |
2108 | 1 error on write |
2109 | */ |
2110 | |
static my_bool write_full_pages(MARIA_HA *info,
                                LSN lsn,
                                MARIA_BITMAP_BLOCK *block,
                                uchar *data, ulong length)
{
  pgcache_page_no_t page;
  MARIA_SHARE *share= info->s;
  uint block_size= share->block_size;
  uint data_size= FULL_PAGE_SIZE(share);
  uchar *buff= info->keyread_buff;
  uint page_count, sub_blocks;
  my_off_t position, max_position;
  DBUG_ENTER("write_full_pages" );
  DBUG_PRINT("enter" , ("length: %lu page: %lu page_count: %lu" ,
                       (ulong) length, (ulong) block->page,
                       (ulong) block->page_count));
  DBUG_ASSERT((block->page_count & TAIL_BIT) == 0);

  info->keyread_buff_used= 1;
  page= block->page;
  page_count= block->page_count;
  sub_blocks= block->sub_blocks;

  max_position= (my_off_t) (page + page_count) * block_size;

  /* Increase data file size, if extended */

  for (; length; data+= data_size)
  {
    uint copy_length;
    if (!page_count--)
    {
      /* Current extent exhausted; advance to the next one */
      if (!--sub_blocks)
      {
        /* More data left than the allocated extents can hold: corruption */
        _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
        DBUG_RETURN(1);
      }

      block++;
      page= block->page;
      page_count= block->page_count - 1;
      DBUG_PRINT("info" , ("page: %lu page_count: %lu" ,
                          (ulong) block->page, (ulong) block->page_count));

      position= (page + page_count + 1) * block_size;
      set_if_bigger(max_position, position);
    }
    /* Build the page header: LSN + page type, remaining header zeroed */
    lsn_store(buff, lsn);
    buff[PAGE_TYPE_OFFSET]= (uchar) BLOB_PAGE;
    bzero(buff + LSN_SIZE + PAGE_TYPE_SIZE,
          FULL_PAGE_HEADER_SIZE(share) - (LSN_SIZE + PAGE_TYPE_SIZE));
    copy_length= MY_MIN(data_size, length);
    memcpy(buff + FULL_PAGE_HEADER_SIZE(share), data, copy_length);
    length-= copy_length;

    /*
      Zero out old information from the block. This removes possible
      sensitive information from the block and also makes the file
      easier to compress and easier to compare after recovery.
    */
    if (copy_length != data_size)
      bzero(buff + block_size - PAGE_SUFFIX_SIZE - (data_size - copy_length),
            (data_size - copy_length) + PAGE_SUFFIX_SIZE);

    if (pagecache_write(share->pagecache,
                        &info->dfile, page, 0,
                        buff, share->page_type,
                        PAGECACHE_LOCK_LEFT_UNLOCKED,
                        PAGECACHE_PIN_LEFT_UNPINNED,
                        PAGECACHE_WRITE_DELAY,
                        0, info->trn->rec_lsn))
      DBUG_RETURN(1);
    page++;
    DBUG_ASSERT(block->used & BLOCKUSED_USED);
  }
  /* Grow the recorded data file length to cover all written pages */
  if (share->state.state.data_file_length < max_position)
    _ma_set_share_data_file_length(share, max_position);
  DBUG_RETURN(0);
}
2190 | |
2191 | |
2192 | /* |
2193 | Store ranges of full pages in compact format for logging |
2194 | |
2195 | SYNOPSIS |
2196 | store_page_range() |
2197 | to Store data here |
2198 | block Where pages are to be written |
2199 | length Length of data to be written |
2200 | Normally this is full pages, except for the last |
2201 | tail block that may only partly fit the last page. |
2202 | tot_ranges Add here the number of ranges used |
2203 | |
2204 | NOTES |
2205 | The format of one entry is: |
2206 | |
2207 | Ranges SUB_RANGE_SIZE |
2208 | Empty bytes at end of last byte BLOCK_FILLER_SIZE |
2209 | For each range |
2210 | Page number PAGE_STORE_SIZE |
2211 | Number of pages PAGERANGE_STORE_SIZE |
2212 | |
2213 | RETURN |
2214 | # end position for 'to' |
2215 | */ |
2216 | |
2217 | static uchar *store_page_range(MARIA_SHARE *share, |
2218 | uchar *to, MARIA_BITMAP_BLOCK *block, |
2219 | ulong length, |
2220 | uint *tot_ranges) |
2221 | { |
2222 | uint data_size= FULL_PAGE_SIZE(share); |
2223 | ulong pages_left= (length + data_size -1) / data_size; |
2224 | uint page_count, ranges, empty_space; |
2225 | uchar *to_start; |
2226 | DBUG_ENTER("store_page_range" ); |
2227 | |
2228 | to_start= to; |
2229 | to+= SUB_RANGE_SIZE; |
2230 | |
2231 | /* Store number of unused bytes at last page */ |
2232 | empty_space= (uint) (pages_left * data_size - length); |
2233 | int2store(to, empty_space); |
2234 | to+= BLOCK_FILLER_SIZE; |
2235 | |
2236 | ranges= 0; |
2237 | do |
2238 | { |
2239 | pgcache_page_no_t page; |
2240 | page= block->page; |
2241 | page_count= block->page_count; |
2242 | block++; |
2243 | if (page_count > pages_left) |
2244 | page_count= pages_left; |
2245 | |
2246 | page_store(to, page); |
2247 | to+= PAGE_STORE_SIZE; |
2248 | pagerange_store(to, page_count); |
2249 | to+= PAGERANGE_STORE_SIZE; |
2250 | ranges++; |
2251 | } while ((pages_left-= page_count)); |
2252 | /* Store number of ranges for this block */ |
2253 | int2store(to_start, ranges); |
2254 | (*tot_ranges)+= ranges; |
2255 | |
2256 | DBUG_RETURN(to); |
2257 | } |
2258 | |
2259 | |
2260 | /* |
2261 | Store packed extent data |
2262 | |
2263 | SYNOPSIS |
2264 | store_extent_info() |
2265 | to Store first packed data here |
2266 | row_extents_second_part Store rest here |
2267 | first_block First block to store |
2268 | count Number of blocks |
2269 | |
2270 | NOTES |
2271 | We don't have to store the position for the head block |
2272 | |
    We have to set the START_EXTENT_BIT for every extent where the
    blob will be stored on a page of its own. We need this in the
    UNDO phase to generate MARIA_BITMAP_BLOCK's for undo-delete and
    undo-update.
2277 | */ |
2278 | |
2279 | static void store_extent_info(uchar *to, |
2280 | uchar *row_extents_second_part, |
2281 | MARIA_BITMAP_BLOCK *first_block, |
2282 | uint count) |
2283 | { |
2284 | MARIA_BITMAP_BLOCK *block, *end_block; |
2285 | uint copy_length; |
2286 | my_bool first_found= 0; |
2287 | DBUG_ENTER("store_extent_info" ); |
2288 | DBUG_PRINT("enter" , ("count: %u" , count)); |
2289 | |
2290 | for (block= first_block, end_block= first_block+count ; |
2291 | block < end_block; block++) |
2292 | { |
2293 | /* The following is only false for marker (unused) blocks */ |
2294 | if (likely(block->used & BLOCKUSED_USED)) |
2295 | { |
2296 | uint page_count= block->page_count; |
2297 | DBUG_ASSERT(page_count != 0); |
2298 | page_store(to, block->page); |
2299 | if (block->sub_blocks) |
2300 | { |
2301 | /* |
2302 | Set a bit so that we later know that this was the first block |
2303 | for a blob |
2304 | */ |
2305 | page_count|= START_EXTENT_BIT; |
2306 | } |
2307 | pagerange_store(to + PAGE_STORE_SIZE, page_count); |
2308 | DBUG_DUMP("extent" , to, ROW_EXTENT_SIZE); |
2309 | to+= ROW_EXTENT_SIZE; |
2310 | if (!first_found) |
2311 | { |
2312 | first_found= 1; |
2313 | to= row_extents_second_part; |
2314 | } |
2315 | } |
2316 | } |
2317 | copy_length= (count - 1) * ROW_EXTENT_SIZE; |
2318 | /* |
2319 | In some unlikely cases we have allocated to many blocks. Clear this |
2320 | data. |
2321 | */ |
2322 | bzero(to, (size_t) (row_extents_second_part + copy_length - to)); |
2323 | DBUG_VOID_RETURN; |
2324 | } |
2325 | |
2326 | |
2327 | /** |
2328 | @brief |
2329 | Convert extent info read from file to MARIA_BITMAP_BLOCKS suitable |
2330 | for write_block_record |
2331 | |
2332 | @note |
2333 | In case of blobs, this function marks all the blob pages in the bitmap |
2334 | as full pages. The bitmap bits for other pages will be marked |
2335 | when write_block_record() calls _ma_bitmap_release_unused(). |
2336 | |
2337 | This function will be removed in Maria 2.0 when we instead of delete rows |
2338 | mark them as deleted and only remove them after commit. |
2339 | |
2340 | @return |
2341 | @retval 0 ok |
2342 | @retval 1 Error (out of memory or disk error changing bitmap) or |
2343 | wrong information in extent information |
2344 | */ |
2345 | |
static my_bool extent_to_bitmap_blocks(MARIA_HA *info,
                                       MARIA_BITMAP_BLOCKS *blocks,
                                       pgcache_page_no_t head_page,
                                       uint extent_count,
                                       const uchar *extent_info)
{
  MARIA_BITMAP_BLOCK *block, *start_block;
  MARIA_SHARE *share= info->s;
  uint i, tail_page;
  DBUG_ENTER("extent_to_bitmap_blocks" );

  /* +2: room for the head block entry plus one spare */
  if (allocate_dynamic(&info->bitmap_blocks, extent_count + 2))
    DBUG_RETURN(1);
  block= blocks->block= dynamic_element(&info->bitmap_blocks, 0,
                                        MARIA_BITMAP_BLOCK*);
  blocks->count= extent_count + 1;
  blocks->tail_page_skipped= blocks->page_skipped= 0;
  /* The first block always describes the head page */
  block->page= head_page;
  block->page_count= 1;
  block->used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP;
  /* Impossible value, will force storage of real value */
  block->org_bitmap_value= 255;

  start_block= block++;
  for (i=0 ;
       i++ < extent_count ;
       block++, extent_info+= ROW_EXTENT_SIZE)
  {
    uint page_count= uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE);
    if (page_count & START_EXTENT_BIT)
    {
      /* This extent starts a new blob: close the previous group */
      page_count&= ~START_EXTENT_BIT;
      start_block->sub_blocks= (uint) (block - start_block);
      start_block= block;
    }
    block->page= page_korr(extent_info);
    block->page_count= page_count;
    block->sub_blocks= 0;
    if (block->page_count == 0)
    {
      /* Extent allocated but not used by write_block_record() */
      DBUG_ASSERT(block->page == 0);
      /* This is the last block */
      blocks->count= i;
      break;
    }
    if ((tail_page= page_count & TAIL_BIT))
      page_count= 1;

    /* Check if wrong data */
    if (block->page == 0 || page_count == 0 ||
        (block->page + page_count) * share->block_size >
        share->state.state.data_file_length)
    {
      DBUG_PRINT("error" , ("page: %lu page_count: %u tail: %u length: %ld data_length: %ld" ,
                           (ulong) block->page,
                           (block->page_count & ~TAIL_BIT),
                           (uint) MY_TEST(block->page_count & TAIL_BIT),
                           (ulong) ((block->page + (page_count & ~TAIL_BIT)) *
                                    share->block_size),
                           (ulong) share->state.state.data_file_length));
      DBUG_RETURN(1);
    }
    if (tail_page)
    {
      /* Tail pages keep their original bitmap pattern */
      block->org_bitmap_value= _ma_bitmap_get_page_bits(info, &share->bitmap,
                                                        block->page);
      block->used= (BLOCKUSED_TAIL | BLOCKUSED_USED |
                    BLOCKUSED_USE_ORG_BITMAP);
    }
    else
    {
      /* Mark all blob pages of this extent as full in the bitmap */
      my_bool res;
      mysql_mutex_lock(&share->bitmap.bitmap_lock);
      res= _ma_bitmap_set_full_page_bits(info, &share->bitmap,
                                         block->page, page_count);
      mysql_mutex_unlock(&share->bitmap.bitmap_lock);
      if (res)
        DBUG_RETURN(1);
      block->used= BLOCKUSED_USED;
    }
  }
  start_block->sub_blocks= (uint) (block - start_block);
  DBUG_RETURN(0);
}
2431 | |
2432 | |
2433 | /* |
2434 | Free regions of pages with logging |
2435 | |
2436 | NOTES |
2437 | We are removing filler events and tail page events from |
2438 | row->extents to get smaller log. |
2439 | |
2440 | RETURN |
2441 | 0 ok |
2442 | 1 error |
2443 | */ |
2444 | |
static my_bool free_full_pages(MARIA_HA *info, MARIA_ROW *row)
{
  uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE];
  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
  LSN lsn;
  size_t extents_length;
  uchar *extents= row->extents;
  DBUG_ENTER("free_full_pages" );

  if (info->s->now_transactional)
  {
    /* Compact events by removing filler and tail events */
    uchar *new_block= 0;
    uchar *end, *to, *compact_extent_info;
    my_bool res;
    uint extents_count;

    if (!(compact_extent_info= my_alloca(row->extents_count *
                                         ROW_EXTENT_SIZE)))
      DBUG_RETURN(1);

    to= compact_extent_info;
    for (end= extents + row->extents_count * ROW_EXTENT_SIZE ;
         extents < end ;
         extents+= ROW_EXTENT_SIZE)
    {
      uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
      page_count&= ~START_EXTENT_BIT;
      if (! (page_count & TAIL_BIT) && page_count != 0)
      {
        /* Found correct extent */
        if (!new_block)
          new_block= extents; /* First extent in range */
        continue;
      }
      /* Found extent to remove, copy everything found so far */
      if (new_block)
      {
        size_t length= (size_t) (extents - new_block);
        memcpy(to, new_block, length);
        to+= length;
        new_block= 0;
      }
    }
    /* Copy any trailing run of kept extents */
    if (new_block)
    {
      size_t length= (size_t) (extents - new_block);
      memcpy(to, new_block, length);
      to+= length;
    }

    if (!unlikely(extents_length= (uint) (to - compact_extent_info)))
    {
      /*
        No ranges. This happens in the rare case when we have allocated a
        place for a blob on a tail page but it fit into the main page.
      */
      my_afree(compact_extent_info);
      DBUG_RETURN(0);
    }
    extents_count= (uint) (extents_length / ROW_EXTENT_SIZE);
    pagerange_store(log_data + FILEID_STORE_SIZE, extents_count);
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
    log_array[TRANSLOG_INTERNAL_PARTS + 1].str= compact_extent_info;
    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= extents_length;
    res= translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS, info->trn,
                               info,
                               (translog_size_t) (sizeof(log_data) +
                                                  extents_length),
                               TRANSLOG_INTERNAL_PARTS + 2, log_array,
                               log_data, NULL);
    my_afree(compact_extent_info);
    if (res)
      DBUG_RETURN(1);
  }

  /* Free the pages in the bitmap (uses the original, uncompacted list) */
  DBUG_RETURN(_ma_bitmap_free_full_pages(info, row->extents,
                                         row->extents_count));
}
2525 | |
2526 | |
2527 | /* |
2528 | Free one page range |
2529 | |
2530 | NOTES |
2531 | This is very similar to free_full_pages() |
2532 | |
2533 | RETURN |
2534 | 0 ok |
2535 | 1 error |
2536 | */ |
2537 | |
static my_bool free_full_page_range(MARIA_HA *info, pgcache_page_no_t page,
                                    uint count)
{
  my_bool res= 0;
  uint delete_count;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("free_full_page_range" );

  delete_count= count;
  if (share->state.state.data_file_length ==
      (page + count) * share->block_size)
  {
    /*
      Don't delete last page from pagecache as this will make the file
      shorter than expected if the last operation extended the file
    */
    delete_count--;
  }
  if (delete_count &&
      pagecache_delete_pages(share->pagecache, &info->dfile,
                             page, delete_count, PAGECACHE_LOCK_WRITE, 1))
    res= 1;

  if (share->now_transactional)
  {
    LSN lsn;
    /** @todo unify log_data's shape with delete_head_or_tail() */
    uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
                   ROW_EXTENT_SIZE];
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
    DBUG_ASSERT(info->trn->rec_lsn);
    /* Log a single extent: [page, count] */
    pagerange_store(log_data + FILEID_STORE_SIZE, 1);
    page_store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
               page);
    int2store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
              PAGE_STORE_SIZE, count);
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);

    if (translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS,
                              info->trn, info,
                              (translog_size_t) sizeof(log_data),
                              TRANSLOG_INTERNAL_PARTS + 1, log_array,
                              log_data, NULL))
      res= 1;
  }
  /* Mark the whole range as free in the bitmap */
  mysql_mutex_lock(&share->bitmap.bitmap_lock);
  if (_ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, count))
    res= 1;
  mysql_mutex_unlock(&share->bitmap.bitmap_lock);
  DBUG_RETURN(res);
}
2590 | |
2591 | |
2592 | /** |
2593 | @brief Write a record to a (set of) pages |
2594 | |
2595 | @fn write_block_record() |
2596 | @param info Maria handler |
2597 | @param old_record Original record in case of update; NULL in case of |
2598 | insert |
2599 | @param record Record we should write |
2600 | @param row Statistics about record (calculated by |
2601 | calc_record_size()) |
2602 | @param map_blocks On which pages the record should be stored |
2603 | @param row_pos Position on head page where to put head part of |
2604 | record |
2605 | @param undo_lsn <> LSN_ERROR if we are executing an UNDO |
2606 | @param old_record_checksum Checksum of old_record: ignored if table does |
2607 | not have live checksum; otherwise if |
2608 | old_record==NULL it must be 0. |
2609 | |
2610 | @note |
2611 | On return all pinned pages are released. |
2612 | |
2613 | [page_buff + EMPTY_SPACE_OFFSET] is set to |
2614 | row_pos->empty_space - head_length |
2615 | |
2616 | @return Operation status |
2617 | @retval 0 OK |
2618 | @retval 1 Error |
2619 | */ |
2620 | |
static my_bool write_block_record(MARIA_HA *info,
                                  const uchar *old_record,
                                  const uchar *record,
                                  MARIA_ROW *row,
                                  MARIA_BITMAP_BLOCKS *bitmap_blocks,
                                  my_bool head_block_is_read,
                                  struct st_row_pos_info *row_pos,
                                  LSN undo_lsn,
                                  ha_checksum old_record_checksum)
{
  uchar *data, *end_of_data, *tmp_data_used, *tmp_data;
  uchar *UNINIT_VAR(row_extents_first_part), *UNINIT_VAR(row_extents_second_part);
  uchar *field_length_data;
  uchar *page_buff;
  MARIA_BITMAP_BLOCK *block, *head_block;
  MARIA_SHARE *share= info->s;
  MARIA_COLUMNDEF *column, *end_column;
  MARIA_PINNED_PAGE page_link;
  uint block_size, flag, head_length;
  ulong *blob_lengths;
  my_bool row_extents_in_use, blob_full_pages_exists;
  LSN lsn;
  my_off_t position;
  uint save_my_errno;
  DBUG_ENTER("write_block_record");

  /* First block in bitmap_blocks always describes the head page */
  head_block= bitmap_blocks->block;
  block_size= share->block_size;

  page_buff= row_pos->buff;
  /* Position on head page where we should store the head part */
  data= row_pos->data;
  end_of_data= data + row_pos->length;

  /* Write header */
  flag= info->row_flag;
  row_extents_in_use= 0;
  if (unlikely(row->total_length > row_pos->length))
  {
    /* Row doesn't fit on the head page alone; need extent(s) */
    DBUG_ASSERT(bitmap_blocks->count > 1);
    if (bitmap_blocks->count <= 1)
      goto crashed;                             /* Wrong in bitmap */
    flag|= ROW_FLAG_EXTENTS;
    row_extents_in_use= 1;
  }
  /* For now we have only a minimum header */
  *data++= (uchar) flag;
  if (flag & ROW_FLAG_TRANSID)
  {
    transid_store(data, info->trn->trid);
    data+= TRANSID_SIZE;
  }

  if (unlikely(flag & ROW_FLAG_NULLS_EXTENDED))
    *data++= (uchar) (share->base.null_bytes -
                      share->base.original_null_bytes);
  if (row_extents_in_use)
  {
    /*
      Store first extent in header.  Its contents are filled in later by
      store_extent_info(); here we only reserve the space and remember where.
    */
    store_key_length_inc(data, bitmap_blocks->count - 1);
    row_extents_first_part= data;
    data+= ROW_EXTENT_SIZE;
  }
  if (share->base.max_field_lengths)
    store_key_length_inc(data, row->field_lengths_length);
  if (share->calc_checksum)
  {
    *(data++)= (uchar) (row->checksum);         /* store least significant byte */
    DBUG_ASSERT(!((old_record_checksum != 0) && (old_record == NULL)));
  }
  /* Copy null bits and empty-field (pack) bits straight from the record */
  memcpy(data, record, share->base.null_bytes);
  data+= share->base.null_bytes;
  memcpy(data, row->empty_bits, share->base.pack_bytes);
  data+= share->base.pack_bytes;

  DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR ||
              (uint) (data - row_pos->data) == row->min_length);

  /*
    Allocate a buffer for the rest of the data (except blobs)

    To avoid double copying of data, we copy as many columns as fit into
    the page. The rest goes into info->packed_row.

    Using an extra buffer, instead of doing continuous writes to different
    pages, uses less code and we don't need to have to do a complex call
    for every data segment we want to store.
  */
  if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
                       row->head_length))
    DBUG_RETURN(1);

  tmp_data_used= 0;                 /* Either 0 or last used uchar in 'data' */
  tmp_data= data;

  if (row_extents_in_use)
  {
    /* Reserve space for the remaining extents (first one is in the header) */
    uint copy_length= (bitmap_blocks->count - 2) * ROW_EXTENT_SIZE;
    if (!tmp_data_used && tmp_data + copy_length > end_of_data)
    {
      tmp_data_used= tmp_data;
      tmp_data= info->rec_buff;
    }
    row_extents_second_part= tmp_data;
    /*
      We will copy the extents here when we have figured out the tail
      positions.
    */
    tmp_data+= copy_length;
  }

  /* Copy fields that have fixed lengths (primary key etc) */
  for (column= share->columndef,
         end_column= column + share->base.fixed_not_null_fields;
       column < end_column; column++)
  {
    if (!tmp_data_used && tmp_data + column->length > end_of_data)
    {
      tmp_data_used= tmp_data;
      tmp_data= info->rec_buff;
    }
    memcpy(tmp_data, record + column->offset, column->length);
    tmp_data+= column->length;
  }

  /* Copy length of data for variable length fields */
  if (!tmp_data_used && tmp_data + row->field_lengths_length > end_of_data)
  {
    tmp_data_used= tmp_data;
    tmp_data= info->rec_buff;
  }
  field_length_data= row->field_lengths;
  memcpy(tmp_data, field_length_data, row->field_lengths_length);
  tmp_data+= row->field_lengths_length;

  DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR ||
              (uint) (tmp_data - row_pos->data) == row->min_length +
              share->base.fixed_not_null_fields_length +
              row->field_lengths_length);

  /* Copy variable length fields and fields with null/zero */
  for (end_column= share->columndef + share->base.fields - share->base.blobs;
       column < end_column ;
       column++)
  {
    const uchar *field_pos;
    ulong length;
    /* Null or "empty" fields store no data bytes; skip them */
    if ((record[column->null_pos] & column->null_bit) ||
        (row->empty_bits[column->empty_pos] & column->empty_bit))
      continue;

    field_pos= record + column->offset;
    switch (column->type) {
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_SKIP_PRESPACE:
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      length= column->length;
      break;
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
      /* Char that is space filled; stored length was computed earlier */
      if (column->length <= 255)
        length= (uint) (uchar) *field_length_data++;
      else
      {
        length= uint2korr(field_length_data);
        field_length_data+= 2;
      }
      break;
    case FIELD_VARCHAR:
      /* Note: 256, not 255, as column->length counts the length prefix byte */
      if (column->length <= 256)
      {
        length= (uint) (uchar) *field_length_data++;
        field_pos++;                            /* Skip length uchar */
      }
      else
      {
        length= uint2korr(field_length_data);
        field_length_data+= 2;
        field_pos+= 2;
      }
      DBUG_ASSERT(length <= column->length);
      break;
    default:                                    /* Wrong data */
      DBUG_ASSERT(!maria_assert_if_crashed_table);
      length=0;
      break;
    }
    if (!tmp_data_used && tmp_data + length > end_of_data)
    {
      /* Data didn't fit in page; Change to use tmp buffer */
      tmp_data_used= tmp_data;
      tmp_data= info->rec_buff;
    }
    memcpy((char*) tmp_data, field_pos, length);
    tmp_data+= length;
  }

  block= head_block + head_block->sub_blocks;   /* Point to first blob data */

  end_column= column + share->base.blobs;
  blob_lengths= row->blob_lengths;
  if (!tmp_data_used)
  {
    /* Still room on page; Copy as many blobs we can into this page */
    data= tmp_data;
    for (; column < end_column &&
           *blob_lengths <= (ulong)(end_of_data - data);
         column++, blob_lengths++)
    {
      uchar *tmp_pos;
      uint length;
      if (!*blob_lengths)                       /* Null or "" */
        continue;
      length= column->length - portable_sizeof_char_ptr;
      /* Blob data pointer is stored at the end of the column in the record */
      memcpy(&tmp_pos, record + column->offset + length, sizeof(char*));
      memcpy(data, tmp_pos, *blob_lengths);
      data+= *blob_lengths;
      /*
        The following is not true when we want to insert data into original
        place. In this case we don't have any extra blocks allocated
      */
      if (likely(undo_lsn == LSN_ERROR))
      {
        /* Skip over tail page that was prepared for storing blob */
        block++;
        bitmap_blocks->tail_page_skipped= 1;
      }
    }
    if (head_block->sub_blocks > 1)
    {
      /* We have allocated pages that were not used */
      bitmap_blocks->page_skipped= 1;
    }
  }
  else
    data= tmp_data_used;                        /* Get last used on page */

  /* Update page directory */
  head_length= (uint) (data - row_pos->data);
  DBUG_PRINT("info", ("Used head length on page: %u header_length: %u",
                      head_length,
                      (uint) (flag & ROW_FLAG_TRANSID ? TRANSID_SIZE : 0)));
  if (head_length < share->base.min_block_length)
  {
    /* Extend row to be of size min_block_length */
    uint diff_length= share->base.min_block_length - head_length;
    bzero(data, diff_length);
    data+= diff_length;
    head_length= share->base.min_block_length;
  }
  DBUG_ASSERT(data <= end_of_data);
  /*
    If this is a redo entry (ie, undo_lsn != LSN_ERROR) then we should have
    written exactly head_length bytes (same as original record).
  */
  DBUG_ASSERT(undo_lsn == LSN_ERROR || head_length == row_pos->length);
  int2store(row_pos->dir + 2, head_length);
  /* update empty space at start of block */
  row_pos->empty_space-= head_length;
  int2store(page_buff + EMPTY_SPACE_OFFSET, row_pos->empty_space);
  /* Mark in bitmaps how the current page was actually used */
  head_block->empty_space= row_pos->empty_space;
  if (page_buff[DIR_COUNT_OFFSET] == MAX_ROWS_PER_PAGE &&
      page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST)
    head_block->empty_space= 0;                 /* Page is full */
  head_block->used|= BLOCKUSED_USED;

  check_directory(share,
                  page_buff, share->block_size, share->base.min_block_length,
                  (uint) -1);

  /*
    Now we have to write tail pages, as we need to store the position
    to them in the row extent header.

    We first write out all blob tails, to be able to store them in
    the current page or 'tmp_data'.

    Then we write the tail of the non-blob fields (The position to the
    tail page is stored either in row header, the extents in the head
    page or in the first full page of the non-blob data. It's never in
    the tail page of the non-blob data)
  */

  blob_full_pages_exists= 0;
  if (row_extents_in_use)
  {
    if (column != end_column)                   /* If blob fields */
    {
      MARIA_COLUMNDEF    *save_column=       column;
      MARIA_BITMAP_BLOCK *save_block=        block;
      MARIA_BITMAP_BLOCK *end_block;
      ulong              *save_blob_lengths= blob_lengths;

      for (; column < end_column; column++, blob_lengths++)
      {
        uchar *blob_pos;
        if (!*blob_lengths)                     /* Null or "" */
          continue;
        if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
        {
          uint length;
          length= column->length - portable_sizeof_char_ptr;
          memcpy(&blob_pos, record + column->offset + length, sizeof(char*));
          length= *blob_lengths % FULL_PAGE_SIZE(share);   /* tail size */
          if (length != *blob_lengths)
            blob_full_pages_exists= 1;
          /* Write the last (partial-page) part of the blob to its tail page */
          if (write_tail(info, block + block->sub_blocks-1,
                         blob_pos + *blob_lengths - length,
                         length))
            goto disk_err;
        }
        else
          blob_full_pages_exists= 1;

        for (end_block= block + block->sub_blocks; block < end_block; block++)
        {
          /*
            Set only a bit, to not cause bitmap code to believe a block is full
            when there is still a lot of entries in it.
          */
          block->used|= BLOCKUSED_USED;
        }
      }
      DBUG_ASSERT((undo_lsn == LSN_ERROR ||
                   block == bitmap_blocks->block + bitmap_blocks->count));
      column= save_column;
      block= save_block;
      blob_lengths= save_blob_lengths;
    }

    if (tmp_data_used)                          /* non blob data overflows */
    {
      MARIA_BITMAP_BLOCK *cur_block, *end_block, *last_head_block;
      MARIA_BITMAP_BLOCK *head_tail_block= 0;
      ulong length;
      ulong data_length= (ulong) (tmp_data - info->rec_buff);

#ifdef SANITY_CHECKS
      DBUG_ASSERT(head_block->sub_blocks != 1);
      if (head_block->sub_blocks == 1)
        goto crashed;                           /* no reserved full or tails */
#endif
      /*
        Find out where to write tail for non-blob fields.

        Problem here is that the bitmap code may have allocated more
        space than we need. We have to handle the following cases:

        - Bitmap code allocated a tail page we don't need.
        - The last full page allocated needs to be changed to a tail page
        (Because we were able to put more data on the head page than
        the bitmap allocation assumed)

        The reserved pages in bitmap_blocks for the main page has one of
        the following allocations:
        - Full pages, with following blocks:
          # * full pages
          empty page  ; To be used if we change last full to tail page. This
          has 'count' = 0.
          tail page  (optional, if last full page was part full)
        - One tail page
      */

      cur_block= head_block + 1;
      end_block= head_block + head_block->sub_blocks;
      /*
        Loop until we find a block bigger than we need or
        we find the empty page block.
      */
      while (data_length >= (length= (cur_block->page_count *
                                      FULL_PAGE_SIZE(share))) &&
             cur_block->page_count)
      {
#ifdef SANITY_CHECKS
        DBUG_ASSERT(!((cur_block == end_block) ||
                      (cur_block->used & BLOCKUSED_USED)));
        if ((cur_block == end_block) || (cur_block->used & BLOCKUSED_USED))
          goto crashed;
#endif
        data_length-= length;
        (cur_block++)->used|= BLOCKUSED_USED;
      }
      last_head_block= cur_block;
      if (data_length)
      {
        if (cur_block->page_count == 0)
        {
          /* Skip empty filler block */
          cur_block++;
        }
#ifdef SANITY_CHECKS
        DBUG_ASSERT(!(cur_block >= end_block));
        if ((cur_block >= end_block))
          goto crashed;
#endif
        if (cur_block->used & BLOCKUSED_TAIL)
        {
          DBUG_ASSERT(data_length < MAX_TAIL_SIZE(block_size));
          /* tail written to tail page */
          cur_block->used|= BLOCKUSED_USED;
          head_tail_block= cur_block;
        }
        else if (data_length > length - MAX_TAIL_SIZE(block_size))
        {
          /* tail written to full page */
          cur_block->used|= BLOCKUSED_USED;
          if ((cur_block != end_block - 1) &&
              (end_block[-1].used & BLOCKUSED_TAIL))
            bitmap_blocks->tail_page_skipped= 1;
        }
        else
        {
          /*
            cur_block is a full block, followed by an empty and optional
            tail block. Change cur_block to a tail block or split it
            into full blocks and tail blocks.

            TODO:
             If there is enough space on the following tail block, use
             this instead of creating a new tail block.
          */
          DBUG_ASSERT(cur_block[1].page_count == 0);
          if (cur_block->page_count == 1)
          {
            /* convert full block to tail block */
            cur_block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL;
            head_tail_block= cur_block;
          }
          else
          {
            DBUG_ASSERT(data_length < length - FULL_PAGE_SIZE(share));
            DBUG_PRINT("info", ("Splitting blocks into full and tail"));
            /* Last page of the run becomes the new tail block */
            cur_block[1].page= (cur_block->page + cur_block->page_count - 1);
            cur_block[1].page_count= 1;         /* Avoid DBUG_ASSERT */
            cur_block[1].used= BLOCKUSED_USED | BLOCKUSED_TAIL;
            cur_block->page_count--;
            cur_block->used|= BLOCKUSED_USED;
            last_head_block= head_tail_block= cur_block+1;
          }
          if (end_block[-1].used & BLOCKUSED_TAIL)
            bitmap_blocks->tail_page_skipped= 1;
        }
      }
      else
      {
        /* Must be an empty or tail page */
        DBUG_ASSERT(cur_block->page_count == 0 ||
                    cur_block->used & BLOCKUSED_TAIL);
        if (end_block[-1].used & BLOCKUSED_TAIL)
          bitmap_blocks->tail_page_skipped= 1;
      }

      /*
        Write all extents into page or tmp_data

        Note that we still don't have a correct position for the tail
        of the non-blob fields.
      */
      store_extent_info(row_extents_first_part,
                        row_extents_second_part,
                        head_block+1, bitmap_blocks->count - 1);
      if (head_tail_block)
      {
        ulong block_length= (ulong) (tmp_data - info->rec_buff);
        uchar *extent_data;

        length= (uint) (block_length % FULL_PAGE_SIZE(share));
        if (write_tail(info, head_tail_block,
                       info->rec_buff + block_length - length,
                       length))
          goto disk_err;
        tmp_data-= length;                      /* Remove the tail */
        if (tmp_data == info->rec_buff)
        {
          /* We have no full blocks to write for the head part */
          tmp_data_used= 0;
        }

        /* Store the tail position for the non-blob fields */
        if (head_tail_block == head_block + 1)
        {
          /*
            We had a head block + tail block, which means that the
            tail block is the first extent
          */
          extent_data= row_extents_first_part;
        }
        else
        {
          /*
            We have a head block + some full blocks + tail block
            last_head_block is pointing after the last used extent
            for the head block.
          */
          extent_data= row_extents_second_part +
            ((last_head_block - head_block) - 2) * ROW_EXTENT_SIZE;
        }
        /* Write information for tail block in the reserved space */
        page_store(extent_data, head_tail_block->page);
        pagerange_store(extent_data + PAGE_STORE_SIZE,
                        head_tail_block->page_count);
      }
    }
    else
      store_extent_info(row_extents_first_part,
                        row_extents_second_part,
                        head_block+1, bitmap_blocks->count - 1);
  }

  if (share->now_transactional)
  {
    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];

    /* Log REDO changes of head page */
    page_store(log_data + FILEID_STORE_SIZE, head_block->page);
    dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
                 row_pos->rownr);
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
    log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    row_pos->data;
    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= head_length;
    if (translog_write_record(&lsn,
                              head_block_is_read ?
                              LOGREC_REDO_INSERT_ROW_HEAD :
                              LOGREC_REDO_NEW_ROW_HEAD,
                              info->trn,
                              info,
                              (translog_size_t) (sizeof(log_data) +
                                                 head_length),
                              TRANSLOG_INTERNAL_PARTS + 2, log_array,
                              log_data, NULL))
      goto disk_err;
  }

#ifdef RECOVERY_EXTRA_DEBUG
  if (info->trn->undo_lsn != LSN_IMPOSSIBLE)
  {
    /* Stop right after the REDO; testing incomplete log record groups */
    DBUG_EXECUTE_IF("maria_flush_whole_log",
                    {
                      DBUG_PRINT("maria_flush_whole_log", ("now"));
                      translog_flush(translog_get_horizon());
                    });
    DBUG_EXECUTE_IF("maria_crash",
                    { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); });
  }
#endif

  if (head_block_is_read)
  {
    MARIA_PINNED_PAGE *page_link;
    /* Head page is always the first pinned page */
    page_link= dynamic_element(&info->pinned_pages, 0,
                               MARIA_PINNED_PAGE*);
    /* Page already existed; downgrade the write lock to a read lock */
    pagecache_unlock_by_link(share->pagecache, page_link->link,
                             PAGECACHE_LOCK_WRITE_TO_READ,
                             PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 1, FALSE);
    page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK;
    page_link->changed= 1;
  }
  else
  {
    /* New head page; write it through the page cache and keep it pinned */
    if (pagecache_write(share->pagecache,
                        &info->dfile, head_block->page, 0,
                        page_buff, share->page_type,
                        head_block_is_read ? PAGECACHE_LOCK_WRITE_TO_READ :
                        PAGECACHE_LOCK_READ,
                        head_block_is_read ? PAGECACHE_PIN_LEFT_PINNED :
                        PAGECACHE_PIN,
                        PAGECACHE_WRITE_DELAY, &page_link.link,
                        LSN_IMPOSSIBLE))
      goto disk_err;
    DBUG_ASSERT(page_link.link);
    page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
    page_link.changed= 1;
    push_dynamic(&info->pinned_pages, (void*) &page_link);

    /* Increase data file size, if extended */
    position= (my_off_t) head_block->page * block_size;
    if (share->state.state.data_file_length <= position)
      _ma_set_share_data_file_length(share, position + block_size);
  }

  if (share->now_transactional && (tmp_data_used || blob_full_pages_exists))
  {
    /*
      Log REDO writes for all full pages (head part and all blobs)
      We write all here to be able to generate the UNDO record early
      so that we can write the LSN for the UNDO record to all full pages.
    */
    uchar tmp_log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
                       (ROW_EXTENT_SIZE + BLOCK_FILLER_SIZE + SUB_RANGE_SIZE) *
                       ROW_EXTENTS_ON_STACK];
    uchar *log_data, *log_pos;
    LEX_CUSTRING tmp_log_array[TRANSLOG_INTERNAL_PARTS + 2 +
                               ROW_EXTENTS_ON_STACK];
    LEX_CUSTRING *log_array_pos, *log_array;
    int error;
    translog_size_t log_entry_length= 0;
    uint ext_length, extents= 0, sub_extents= 0;

    /* If few extents, then allocate things on stack to avoid a malloc call */
    if (bitmap_blocks->count < ROW_EXTENTS_ON_STACK)
    {
      log_array= tmp_log_array;
      log_data= tmp_log_data;
    }
    else
    {
      if (!my_multi_malloc(MY_WME, &log_array,
                           (uint) ((bitmap_blocks->count +
                                    TRANSLOG_INTERNAL_PARTS + 2) *
                                   sizeof(*log_array)),
                           &log_data, FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
                           bitmap_blocks->count * (ROW_EXTENT_SIZE +
                                                   BLOCK_FILLER_SIZE +
                                                   SUB_RANGE_SIZE),
                           NullS))
        goto disk_err;
    }
    /* Leave room for the extents and sub_extents counts, stored below */
    log_pos= log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE * 2;
    log_array_pos= log_array+ TRANSLOG_INTERNAL_PARTS+1;

    if (tmp_data_used)
    {
      /* Full head page */
      translog_size_t block_length= (translog_size_t) (tmp_data -
                                                       info->rec_buff);
      log_pos= store_page_range(share,
                                log_pos, head_block+1,
                                (ulong) block_length, &extents);
      log_array_pos->str= info->rec_buff;
      log_array_pos->length= block_length;
      log_entry_length+= block_length;
      log_array_pos++;
      sub_extents++;
    }
    if (blob_full_pages_exists)
    {
      MARIA_COLUMNDEF *tmp_column= column;
      ulong *tmp_blob_lengths= blob_lengths;
      MARIA_BITMAP_BLOCK *tmp_block= block;

      /* Full blob pages */
      for (; tmp_column < end_column; tmp_column++, tmp_blob_lengths++)
      {
        ulong blob_length;
        uint length;

        if (!*tmp_blob_lengths)                 /* Null or "" */
          continue;
        blob_length= *tmp_blob_lengths;
        length= tmp_column->length - portable_sizeof_char_ptr;
        /*
          If last part of blob was on tail page, change blob_length to
          reflect this
        */
        if (tmp_block[tmp_block->sub_blocks - 1].used & BLOCKUSED_TAIL)
          blob_length-= (blob_length % FULL_PAGE_SIZE(share));
        if (blob_length)
        {
          memcpy((void*) &log_array_pos->str,
                 record + tmp_column->offset + length,
                 sizeof(uchar*));
          log_array_pos->length= blob_length;
          log_entry_length+= blob_length;
          log_array_pos++;
          sub_extents++;

          log_pos= store_page_range(share,
                                    log_pos, tmp_block,
                                    blob_length, &extents);
        }
        tmp_block+= tmp_block->sub_blocks;
      }
    }

    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
    ext_length= (uint) (log_pos - log_data);
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= ext_length;
    pagerange_store(log_data+ FILEID_STORE_SIZE, extents);
    pagerange_store(log_data+ FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
                    sub_extents);

    log_entry_length+= ext_length;
    /* trn->rec_lsn is already set earlier in this function */
    error= translog_write_record(&lsn, LOGREC_REDO_INSERT_ROW_BLOBS,
                                 info->trn, info, log_entry_length,
                                 (uint) (log_array_pos - log_array),
                                 log_array, log_data, NULL);
    if (log_array != tmp_log_array)
      my_free(log_array);
    if (error)
      goto disk_err;
  }

  /* Write UNDO or CLR record */
  /* lsn stays LSN_IMPOSSIBLE for non-transactional tables */
  lsn= LSN_IMPOSSIBLE;
  if (share->now_transactional)
  {
    LEX_CUSTRING *log_array= info->log_row_parts;

    if (undo_lsn != LSN_ERROR)
    {
      /*
        Store if this CLR is about UNDO_DELETE or UNDO_UPDATE;
        in the first case, Recovery, when it sees the CLR_END in the
        REDO phase, may decrement the records' count.
      */
      if (_ma_write_clr(info, undo_lsn,
                        old_record ? LOGREC_UNDO_ROW_UPDATE :
                        LOGREC_UNDO_ROW_DELETE,
                        share->calc_checksum != 0,
                        row->checksum - old_record_checksum,
                        &lsn, (void*) 0))
        goto disk_err;
    }
    else
    {
      uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE +
                     PAGE_STORE_SIZE + DIRPOS_STORE_SIZE + 2 +
                     HA_CHECKSUM_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE +
                     ROW_EXTENT_SIZE];
      uchar *log_pos;
      ha_checksum checksum_delta;

      /* LOGREC_UNDO_ROW_INSERT & LOGREC_UNDO_ROW_UPDATE share same header */
      lsn_store(log_data, info->trn->undo_lsn);
      page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE,
                 head_block->page);
      dirpos_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
                   PAGE_STORE_SIZE,
                   row_pos->rownr);
      log_pos= (log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
                PAGE_STORE_SIZE + DIRPOS_STORE_SIZE);
      store_checksum_in_rec(share, checksum_delta,
                            row->checksum - old_record_checksum,
                            log_pos, log_pos);
      compile_time_assert(sizeof(ha_checksum) == HA_CHECKSUM_STORE_SIZE);

      log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
                                                             log_data);

      if (!old_record)
      {
        /* Store undo_lsn in case we are aborting the insert */
        row->orig_undo_lsn= info->trn->undo_lsn;
        /* Write UNDO log record for the INSERT */
        if (translog_write_record(&lsn, LOGREC_UNDO_ROW_INSERT,
                                  info->trn, info,
                                  (translog_size_t)
                                  log_array[TRANSLOG_INTERNAL_PARTS +
                                            0].length,
                                  TRANSLOG_INTERNAL_PARTS + 1,
                                  log_array,
                                  log_data + LSN_STORE_SIZE, &checksum_delta))
          goto disk_err;
      }
      else
      {
        /* Write UNDO log record for the UPDATE */
        size_t row_length, extents_length;
        uint row_parts_count, cur_head_length;

        /*
          Write head length and extents of the original row so that we
          during UNDO can put it back in the original position.
          We don't store size for TRANSID, as we don't write this during
          UNDO.
        */
        cur_head_length= (info->cur_row.head_length -
                          info->cur_row.header_length);
        int2store(log_pos, cur_head_length);
        pagerange_store(log_pos + 2, info->cur_row.extents_count);
        log_pos+= 2 + PAGERANGE_STORE_SIZE;
        log_array[TRANSLOG_INTERNAL_PARTS + 0].length+= (2 +
                                                         PAGERANGE_STORE_SIZE);
        info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str=
          info->cur_row.extents;
        info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length=
          extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE;

        row_length= fill_update_undo_parts(info, old_record, record,
                                           log_array +
                                           TRANSLOG_INTERNAL_PARTS + 2,
                                           &row_parts_count);
        if (translog_write_record(&lsn, LOGREC_UNDO_ROW_UPDATE, info->trn,
                                  info,
                                  (translog_size_t)
                                  (log_array[TRANSLOG_INTERNAL_PARTS +
                                             0].length + extents_length +
                                   row_length),
                                  TRANSLOG_INTERNAL_PARTS + 2 +
                                  row_parts_count,
                                  log_array,
                                  log_data + LSN_STORE_SIZE,
                                  &checksum_delta))
          goto disk_err;
      }
    }
  }
  /* Release not used space in used pages */
  if (_ma_bitmap_release_unused(info, bitmap_blocks))
    goto disk_err;
  _ma_unpin_all_pages(info, lsn);

  if (tmp_data_used)
  {
    /*
      Write data stored in info->rec_buff to pages
      This is the char/varchar data that didn't fit into the head page.
    */
    DBUG_ASSERT(bitmap_blocks->count != 0);
    if (write_full_pages(info, lsn, head_block + 1,
                         info->rec_buff, (ulong) (tmp_data - info->rec_buff)))
      goto disk_err;
  }

  /* Write rest of blobs (data, but no tails as they are already written) */
  for (; column < end_column; column++, blob_lengths++)
  {
    uchar *blob_pos;
    uint length;
    ulong blob_length;
    if (!*blob_lengths)                         /* Null or "" */
      continue;
    length= column->length - portable_sizeof_char_ptr;
    memcpy(&blob_pos, record + column->offset + length, sizeof(char*));
    /* remove tail part */
    blob_length= *blob_lengths;
    if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
      blob_length-= (blob_length % FULL_PAGE_SIZE(share));

    if (blob_length && write_full_pages(info, lsn, block,
                                        blob_pos, blob_length))
      goto disk_err;
    block+= block->sub_blocks;
  }

  _ma_finalize_row(info);
  DBUG_RETURN(0);

crashed:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  /* Something was wrong with data on page */
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);

disk_err:
  /**
     @todo RECOVERY we are going to let dirty pages go to disk while we have
     logged UNDO, this violates WAL. We must mark the table corrupted!

     @todo RECOVERY we have written some REDOs without a closing UNDO,
     it's possible that a next operation by this transaction succeeds and then
     Recovery would glue the "orphan REDOs" to the succeeded operation and
     execute the failed REDOs. We need some mark "abort this group" in the
     log, or mark the table corrupted (then user will repair it and thus REDOs
     will be skipped).

     @todo RECOVERY to not let write errors go unnoticed, pagecache_write()
     should take a MARIA_HA* in argument, and if it
     fails when flushing a page to disk it should call
     (*the_maria_ha->write_error_func)(the_maria_ha)
     and this hook will mark the table corrupted.
     Maybe hook should be stored in the pagecache's block structure, or in a
     hash "file->maria_ha*".

     @todo RECOVERY we should distinguish below between log write error and
     table write error. The former should stop Maria immediately, the latter
     should mark the table corrupted.
  */
  /*
    Unpin all pinned pages to not cause problems for disk cache. This is
    safe to call even if we already called _ma_unpin_all_pages() above.
  */
  save_my_errno= my_errno;
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  my_errno= save_my_errno;
  DBUG_RETURN(1);
}
3506 | |
3507 | |
3508 | /* |
3509 | @brief Write a record |
3510 | |
3511 | @fn allocate_and_write_block_record() |
3512 | @param info Maria handler |
3513 | @param record Record to write |
3514 | @param row Information about fields in 'record' |
3515 | @param undo_lsn <> LSN_ERROR if we are executing an UNDO |
3516 | |
3517 | @return |
3518 | @retval 0 ok |
3519 | @retval 1 Error |
3520 | */ |
3521 | |
3522 | static my_bool allocate_and_write_block_record(MARIA_HA *info, |
3523 | const uchar *record, |
3524 | MARIA_ROW *row, |
3525 | LSN undo_lsn) |
3526 | { |
3527 | struct st_row_pos_info row_pos; |
3528 | MARIA_BITMAP_BLOCKS *blocks= &row->insert_blocks; |
3529 | int save_my_errno; |
3530 | DBUG_ENTER("allocate_and_write_block_record" ); |
3531 | |
3532 | _ma_bitmap_flushable(info, 1); |
3533 | if (_ma_bitmap_find_place(info, row, blocks)) |
3534 | goto err; /* Error reading bitmap */ |
3535 | |
3536 | /* |
3537 | Sleep; a checkpoint will happen and should not send this over-allocated |
3538 | bitmap to disk but rather wait. |
3539 | */ |
3540 | DBUG_EXECUTE_IF("maria_over_alloc_bitmap" , sleep(10);); |
3541 | |
3542 | /* page will be pinned & locked by get_head_or_tail_page */ |
3543 | if (get_head_or_tail_page(info, blocks->block, info->buff, |
3544 | MY_MAX(row->space_on_head_page, |
3545 | info->s->base.min_block_length), |
3546 | HEAD_PAGE, |
3547 | PAGECACHE_LOCK_WRITE, &row_pos)) |
3548 | goto err; |
3549 | row->lastpos= ma_recordpos(blocks->block->page, row_pos.rownr); |
3550 | if (info->s->calc_checksum) |
3551 | { |
3552 | if (undo_lsn == LSN_ERROR) |
3553 | row->checksum= (info->s->calc_checksum)(info, record); |
3554 | else |
3555 | { |
3556 | /* _ma_apply_undo_row_delete() already set row's checksum. Verify it. */ |
3557 | DBUG_ASSERT(row->checksum == (info->s->calc_checksum)(info, record)); |
3558 | } |
3559 | } |
3560 | DBUG_PRINT("info" , ("rowid: %lu (%lu:%u) length: %u" , (ulong) row->lastpos, |
3561 | (ulong) ma_recordpos_to_page(row->lastpos), |
3562 | ma_recordpos_to_dir_entry(row->lastpos), |
3563 | row_pos.length)); |
3564 | if (write_block_record(info, (uchar*) 0, record, row, |
3565 | blocks, blocks->block->org_bitmap_value != 0, |
3566 | &row_pos, undo_lsn, 0)) |
3567 | goto err; |
3568 | /* Now let checkpoint happen but don't commit */ |
3569 | DBUG_EXECUTE_IF("maria_over_alloc_bitmap" , sleep(1000);); |
3570 | DBUG_RETURN(0); |
3571 | |
3572 | err: |
3573 | save_my_errno= my_errno; |
3574 | if (info->non_flushable_state) |
3575 | _ma_bitmap_flushable(info, -1); |
3576 | _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); |
3577 | my_errno= save_my_errno; |
3578 | DBUG_RETURN(1); |
3579 | } |
3580 | |
3581 | |
3582 | /* |
3583 | Write a record and return rowid for it |
3584 | |
3585 | SYNOPSIS |
3586 | _ma_write_init_block_record() |
3587 | info Maria handler |
3588 | record Record to write |
3589 | |
3590 | NOTES |
3591 | This is done BEFORE we write the keys to the row! |
3592 | |
3593 | RETURN |
3594 | HA_OFFSET_ERROR Something went wrong |
3595 | # Rowid for row |
3596 | */ |
3597 | |
3598 | MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info, |
3599 | const uchar *record) |
3600 | { |
3601 | DBUG_ENTER("_ma_write_init_block_record" ); |
3602 | |
3603 | calc_record_size(info, record, &info->cur_row); |
3604 | if (allocate_and_write_block_record(info, record, |
3605 | &info->cur_row, LSN_ERROR)) |
3606 | DBUG_RETURN(HA_OFFSET_ERROR); |
3607 | DBUG_RETURN(info->cur_row.lastpos); |
3608 | } |
3609 | |
3610 | |
3611 | /* |
3612 | Dummy function for (*info->s->write_record)() |
3613 | |
3614 | Nothing to do here, as we already wrote the record in |
3615 | _ma_write_init_block_record() |
3616 | */ |
3617 | |
3618 | my_bool _ma_write_block_record(MARIA_HA *info __attribute__ ((unused)), |
3619 | const uchar *record __attribute__ ((unused))) |
3620 | { |
3621 | return 0; /* Row already written */ |
3622 | } |
3623 | |
3624 | |
3625 | /** |
3626 | @brief Remove row written by _ma_write_block_record() and log undo |
3627 | |
3628 | @param info Maria handler |
3629 | |
3630 | @note |
3631 | This is called in case we got a duplicate unique key while |
3632 | writing keys. |
3633 | |
3634 | @return Operation status |
3635 | @retval 0 OK |
3636 | @retval 1 Error |
3637 | */ |
3638 | |
3639 | my_bool _ma_write_abort_block_record(MARIA_HA *info) |
3640 | { |
3641 | my_bool res= 0; |
3642 | MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks; |
3643 | MARIA_BITMAP_BLOCK *block, *end; |
3644 | LSN lsn= LSN_IMPOSSIBLE; |
3645 | MARIA_SHARE *share= info->s; |
3646 | DBUG_ENTER("_ma_write_abort_block_record" ); |
3647 | |
3648 | _ma_bitmap_lock(share); /* Lock bitmap from other insert threads */ |
3649 | if (delete_head_or_tail(info, |
3650 | ma_recordpos_to_page(info->cur_row.lastpos), |
3651 | ma_recordpos_to_dir_entry(info->cur_row.lastpos), 1, |
3652 | 0)) |
3653 | res= 1; |
3654 | for (block= blocks->block + 1, end= block + blocks->count - 1; block < end; |
3655 | block++) |
3656 | { |
3657 | if (block->used & BLOCKUSED_USED) |
3658 | { |
3659 | if (block->used & BLOCKUSED_TAIL) |
3660 | { |
3661 | /* |
3662 | block->page_count is set to the tail directory entry number in |
3663 | write_block_record() |
3664 | */ |
3665 | if (delete_head_or_tail(info, block->page, |
3666 | block->page_count & ~TAIL_BIT, |
3667 | 0, 0)) |
3668 | res= 1; |
3669 | } |
3670 | else |
3671 | { |
3672 | if (free_full_page_range(info, block->page, block->page_count)) |
3673 | res= 1; |
3674 | } |
3675 | } |
3676 | } |
3677 | _ma_bitmap_unlock(share); |
3678 | if (share->now_transactional) |
3679 | { |
3680 | if (_ma_write_clr(info, info->cur_row.orig_undo_lsn, |
3681 | LOGREC_UNDO_ROW_INSERT, |
3682 | share->calc_checksum != 0, |
3683 | (ha_checksum) 0 - info->cur_row.checksum, |
3684 | &lsn, (void*) 0)) |
3685 | res= 1; |
3686 | } |
3687 | _ma_unpin_all_pages_and_finalize_row(info, lsn); |
3688 | DBUG_RETURN(res); |
3689 | } |
3690 | |
3691 | |
3692 | /* |
3693 | Update a record |
3694 | |
3695 | NOTES |
3696 | For the moment, we assume that info->curr_row.extents is always updated |
3697 | when a row is read. In the future we may decide to read this on demand |
3698 | for rows split into many extents. |
3699 | */ |
3700 | |
3701 | static my_bool _ma_update_block_record2(MARIA_HA *info, |
3702 | MARIA_RECORD_POS record_pos, |
3703 | const uchar *oldrec, |
3704 | const uchar *record, |
3705 | LSN undo_lsn) |
3706 | { |
3707 | MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks; |
3708 | uchar *buff; |
3709 | MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row; |
3710 | MARIA_PINNED_PAGE page_link; |
3711 | uint rownr, org_empty_size, head_length; |
3712 | uint block_size= info->s->block_size; |
3713 | uint errpos __attribute__((unused)) = 0; |
3714 | uchar *dir; |
3715 | pgcache_page_no_t page; |
3716 | struct st_row_pos_info row_pos; |
3717 | my_bool res; |
3718 | ha_checksum old_checksum; |
3719 | MARIA_SHARE *share= info->s; |
3720 | DBUG_ENTER("_ma_update_block_record2" ); |
3721 | DBUG_PRINT("enter" , ("rowid: %lu" , (long) record_pos)); |
3722 | |
3723 | #ifdef ENABLE_IF_PROBLEM_WITH_UPDATE |
3724 | DBUG_DUMP("oldrec" , oldrec, share->base.reclength); |
3725 | DBUG_DUMP("newrec" , record, share->base.reclength); |
3726 | #endif |
3727 | |
3728 | /* |
3729 | Checksums of new and old rows were computed by callers already; new |
3730 | row's was put into cur_row, old row's was put into new_row. |
3731 | */ |
3732 | old_checksum= new_row->checksum; |
3733 | new_row->checksum= cur_row->checksum; |
3734 | calc_record_size(info, record, new_row); |
3735 | page= ma_recordpos_to_page(record_pos); |
3736 | |
3737 | _ma_bitmap_flushable(info, 1); |
3738 | buff= pagecache_read(share->pagecache, |
3739 | &info->dfile, (pgcache_page_no_t) page, 0, 0, |
3740 | share->page_type, |
3741 | PAGECACHE_LOCK_WRITE, &page_link.link); |
3742 | page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; |
3743 | page_link.changed= buff != 0; |
3744 | push_dynamic(&info->pinned_pages, (void*) &page_link); |
3745 | if (!buff) |
3746 | goto err; |
3747 | |
3748 | org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET); |
3749 | rownr= ma_recordpos_to_dir_entry(record_pos); |
3750 | dir= dir_entry_pos(buff, block_size, rownr); |
3751 | |
3752 | /* |
3753 | We can't use cur_row->head_length as the block may have been compacted |
3754 | since we read it. |
3755 | */ |
3756 | head_length= uint2korr(dir + 2); |
3757 | |
3758 | if ((org_empty_size + head_length) >= new_row->total_length) |
3759 | { |
3760 | uint rec_offset, length; |
3761 | MARIA_BITMAP_BLOCK block; |
3762 | |
3763 | DBUG_PRINT("info" , ("org_empty_size: %u org_length: %u new_length: %lu" , |
3764 | org_empty_size, head_length, |
3765 | new_row->total_length)); |
3766 | |
3767 | /* |
3768 | We can fit the new row in the same page as the original head part |
3769 | of the row |
3770 | */ |
3771 | block.org_bitmap_value= _ma_free_size_to_head_pattern(&share->bitmap, |
3772 | org_empty_size); |
3773 | if (extend_area_on_page(info, buff, dir, rownr, |
3774 | new_row->total_length, &org_empty_size, |
3775 | &rec_offset, &length, 1)) |
3776 | { |
3777 | errpos= 1; |
3778 | goto err; |
3779 | } |
3780 | |
3781 | row_pos.buff= buff; |
3782 | row_pos.rownr= rownr; |
3783 | row_pos.empty_space= org_empty_size; |
3784 | row_pos.dir= dir; |
3785 | row_pos.data= buff + rec_offset; |
3786 | row_pos.length= length; |
3787 | blocks->block= █ |
3788 | blocks->count= 1; |
3789 | block.page= page; |
3790 | block.sub_blocks= 1; |
3791 | block.used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP; |
3792 | block.empty_space= row_pos.empty_space; |
3793 | |
3794 | if (*cur_row->tail_positions && |
3795 | delete_tails(info, cur_row->tail_positions)) |
3796 | { |
3797 | errpos= 2; |
3798 | goto err; |
3799 | } |
3800 | if (cur_row->extents_count && free_full_pages(info, cur_row)) |
3801 | { |
3802 | errpos= 3; |
3803 | goto err; |
3804 | } |
3805 | res= write_block_record(info, oldrec, record, new_row, blocks, |
3806 | 1, &row_pos, undo_lsn, old_checksum); |
3807 | /* We can't update or delete this without re-reading it again */ |
3808 | info->update&= ~HA_STATE_AKTIV; |
3809 | DBUG_RETURN(res); |
3810 | } |
3811 | /* Delete old row */ |
3812 | if (*cur_row->tail_positions && |
3813 | delete_tails(info, cur_row->tail_positions)) |
3814 | { |
3815 | errpos= 4; |
3816 | goto err; |
3817 | } |
3818 | if (cur_row->extents_count && free_full_pages(info, cur_row)) |
3819 | { |
3820 | errpos= 5; |
3821 | goto err; |
3822 | } |
3823 | |
3824 | head_length= uint2korr(dir + 2); |
3825 | if (_ma_bitmap_find_new_place(info, new_row, page, head_length + |
3826 | org_empty_size, blocks)) |
3827 | { |
3828 | errpos= 6; |
3829 | goto err; |
3830 | } |
3831 | |
3832 | /* |
3833 | Allocate all size in block for record |
3834 | TODO: |
3835 | Need to improve this to do compact if we can fit one more blob into |
3836 | the head page |
3837 | */ |
3838 | if ((head_length < new_row->space_on_head_page || |
3839 | (new_row->total_length <= head_length && |
3840 | org_empty_size + head_length >= new_row->total_length))) |
3841 | { |
3842 | _ma_compact_block_page(share, |
3843 | buff, rownr, 1, |
3844 | info->trn->min_read_from, |
3845 | share->base.min_block_length); |
3846 | org_empty_size= 0; |
3847 | head_length= uint2korr(dir + 2); |
3848 | } |
3849 | |
3850 | row_pos.buff= buff; |
3851 | row_pos.rownr= rownr; |
3852 | row_pos.empty_space= org_empty_size + head_length; |
3853 | row_pos.dir= dir; |
3854 | row_pos.data= buff + uint2korr(dir); |
3855 | row_pos.length= head_length; |
3856 | if ((res= write_block_record(info, oldrec, record, new_row, blocks, 1, |
3857 | &row_pos, undo_lsn, old_checksum))) |
3858 | { |
3859 | errpos= 7; |
3860 | goto err; |
3861 | } |
3862 | DBUG_RETURN(0); |
3863 | |
3864 | err: |
3865 | DBUG_ASSERT(!maria_assert_if_crashed_table); |
3866 | DBUG_PRINT("error" , ("errpos: %d" , errpos)); |
3867 | if (info->non_flushable_state) |
3868 | _ma_bitmap_flushable(info, -1); |
3869 | _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); |
3870 | DBUG_RETURN(1); |
3871 | } |
3872 | |
3873 | |
3874 | /* |
3875 | @brief Store new row on it's original position |
3876 | |
3877 | @note |
3878 | This is basicly a copy of _ma_update_block_record2 |
3879 | When we have a purge thread for deleted row, we can remove this function |
3880 | and use _ma_update_block_record2 instead. |
3881 | |
3882 | This is the main reason we don't make a lot of subfunctions that are |
3883 | common between _ma_update_block_record2() and this function. |
3884 | |
3885 | Note: If something goes wrong we mark the file crashed |
3886 | */ |
3887 | |
3888 | static my_bool _ma_update_at_original_place(MARIA_HA *info, |
3889 | pgcache_page_no_t page, |
3890 | uint rownr, |
3891 | uint length_on_head_page, |
3892 | uint extent_count, |
3893 | const uchar *extent_info, |
3894 | const uchar *oldrec, |
3895 | const uchar *record, |
3896 | LSN undo_lsn) |
3897 | { |
3898 | MARIA_BITMAP_BLOCKS *blocks; |
3899 | MARIA_BITMAP_BLOCK *block; |
3900 | MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row; |
3901 | MARIA_PINNED_PAGE page_link; |
3902 | MARIA_SHARE *share= info->s; |
3903 | ha_checksum old_checksum; |
3904 | uint org_empty_size, empty_size; |
3905 | uint block_size= info->s->block_size; |
3906 | uchar *dir, *buff; |
3907 | struct st_row_pos_info row_pos; |
3908 | my_bool res; |
3909 | uint rec_offset, length; |
3910 | DBUG_ENTER("_ma_update_at_original_place" ); |
3911 | |
3912 | #ifdef ENABLE_IF_PROBLEM_WITH_UPDATE |
3913 | DBUG_DUMP("oldrec" , oldrec, share->base.reclength); |
3914 | DBUG_DUMP("newrec" , record, share->base.reclength); |
3915 | #endif |
3916 | |
3917 | /* |
3918 | Checksums of new and old rows were computed by callers already; new |
3919 | row's was put into cur_row, old row's was put into new_row. |
3920 | */ |
3921 | old_checksum= new_row->checksum; |
3922 | new_row->checksum= cur_row->checksum; |
3923 | calc_record_size(info, record, new_row); |
3924 | |
3925 | _ma_bitmap_flushable(info, 1); |
3926 | buff= pagecache_read(share->pagecache, |
3927 | &info->dfile, (pgcache_page_no_t) page, 0, 0, |
3928 | share->page_type, |
3929 | PAGECACHE_LOCK_WRITE, &page_link.link); |
3930 | page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; |
3931 | page_link.changed= buff != 0; |
3932 | push_dynamic(&info->pinned_pages, (void*) &page_link); |
3933 | if (!buff) |
3934 | goto err; |
3935 | |
3936 | org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET); |
3937 | dir= dir_entry_pos(buff, block_size, rownr); |
3938 | |
3939 | if ((org_empty_size + cur_row->head_length) < length_on_head_page) |
3940 | { |
3941 | DBUG_PRINT("error" , |
3942 | ("org_empty_size: %u head_length: %u length_on_page: %u" , |
3943 | org_empty_size, (uint) cur_row->head_length, |
3944 | length_on_head_page)); |
3945 | _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD); |
3946 | goto err; |
3947 | } |
3948 | |
3949 | /* |
3950 | We can fit the new row in the same page as the original head part |
3951 | of the row |
3952 | */ |
3953 | empty_size= org_empty_size; |
3954 | if (extend_area_on_page(info, buff, dir, rownr, |
3955 | length_on_head_page, &empty_size, |
3956 | &rec_offset, &length, 1)) |
3957 | goto err; |
3958 | |
3959 | row_pos.buff= buff; |
3960 | row_pos.rownr= rownr; |
3961 | row_pos.empty_space= empty_size; |
3962 | row_pos.dir= dir; |
3963 | row_pos.data= buff + rec_offset; |
3964 | |
3965 | /* Delete old row */ |
3966 | if (*cur_row->tail_positions && |
3967 | delete_tails(info, cur_row->tail_positions)) |
3968 | goto err; |
3969 | if (cur_row->extents_count && free_full_pages(info, cur_row)) |
3970 | goto err; |
3971 | |
3972 | /* Change extent information to be usable by write_block_record() */ |
3973 | blocks= &cur_row->insert_blocks; |
3974 | if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info)) |
3975 | goto err; |
3976 | block= blocks->block; |
3977 | block->empty_space= row_pos.empty_space; |
3978 | block->org_bitmap_value= |
3979 | _ma_free_size_to_head_pattern(&share->bitmap, |
3980 | (enough_free_entries_on_page(share, buff) ? |
3981 | org_empty_size : 0)); |
3982 | |
3983 | DBUG_ASSERT(block->org_bitmap_value == |
3984 | _ma_bitmap_get_page_bits(info, &info->s->bitmap, page)); |
3985 | block->used|= BLOCKUSED_USE_ORG_BITMAP; |
3986 | |
3987 | /* |
3988 | We have to use <= below as the new_row may be smaller than the original |
3989 | row as the new row doesn't have transaction id |
3990 | */ |
3991 | |
3992 | DBUG_ASSERT(blocks->count > 1 || |
3993 | MY_MAX(new_row->total_length, share->base.min_block_length) <= |
3994 | length_on_head_page); |
3995 | |
3996 | /* Store same amount of data on head page as on original page */ |
3997 | row_pos.length= (length_on_head_page - |
3998 | (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE); |
3999 | set_if_bigger(row_pos.length, share->base.min_block_length); |
4000 | if ((res= write_block_record(info, oldrec, record, new_row, blocks, |
4001 | 1, &row_pos, undo_lsn, old_checksum))) |
4002 | goto err; |
4003 | DBUG_RETURN(0); |
4004 | |
4005 | err: |
4006 | DBUG_ASSERT(!maria_assert_if_crashed_table); |
4007 | _ma_mark_file_crashed(share); |
4008 | if (info->non_flushable_state) |
4009 | _ma_bitmap_flushable(info, -1); |
4010 | _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); |
4011 | DBUG_RETURN(1); |
4012 | } |
4013 | |
4014 | |
4015 | /* Wrapper for _ma_update_block_record2() used by ma_update() */ |
4016 | |
4017 | my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS record_pos, |
4018 | const uchar *orig_rec, const uchar *new_rec) |
4019 | { |
4020 | return _ma_update_block_record2(info, record_pos, orig_rec, new_rec, |
4021 | LSN_ERROR); |
4022 | } |
4023 | |
4024 | |
4025 | /* |
4026 | Delete a directory entry |
4027 | |
4028 | SYNOPSIS |
4029 | delete_dir_entry() |
4030 | buff Page buffer |
4031 | record_number Record number to delete |
4032 | empty_space Empty space on page after delete |
4033 | |
4034 | RETURN |
4035 | -1 Error on page |
4036 | 0 ok |
4037 | 1 Page is now empty |
4038 | */ |
4039 | |
4040 | static int delete_dir_entry(MARIA_SHARE *share, |
4041 | uchar *buff, uint record_number, |
4042 | uint *empty_space_res) |
4043 | { |
4044 | uint block_size= share->block_size; |
4045 | uint number_of_records= (uint) buff[DIR_COUNT_OFFSET]; |
4046 | uint length, empty_space; |
4047 | uchar *dir; |
4048 | DBUG_ENTER("delete_dir_entry" ); |
4049 | DBUG_PRINT("enter" , ("record_number: %u number_of_records: %u" , |
4050 | record_number, number_of_records)); |
4051 | |
4052 | #ifdef SANITY_CHECKS |
4053 | if (record_number >= number_of_records || |
4054 | record_number > ((block_size - LSN_SIZE - PAGE_TYPE_SIZE - 1 - |
4055 | PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE)) |
4056 | { |
4057 | DBUG_PRINT("error" , ("record_number: %u number_of_records: %u" , |
4058 | record_number, number_of_records)); |
4059 | |
4060 | DBUG_RETURN(-1); |
4061 | } |
4062 | #endif |
4063 | |
4064 | check_directory(share, buff, block_size, 0, (uint) -1); |
4065 | empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET); |
4066 | dir= dir_entry_pos(buff, block_size, record_number); |
4067 | length= uint2korr(dir + 2); /* Length of entry we just deleted */ |
4068 | DBUG_ASSERT(uint2korr(dir) != 0 && length < block_size); |
4069 | |
4070 | if (record_number == number_of_records - 1) |
4071 | { |
4072 | /* Delete this entry and all following free directory entries */ |
4073 | uchar *end= buff + block_size - PAGE_SUFFIX_SIZE; |
4074 | number_of_records--; |
4075 | dir+= DIR_ENTRY_SIZE; |
4076 | empty_space+= DIR_ENTRY_SIZE; |
4077 | |
4078 | /* Unlink and free the next empty ones */ |
4079 | while (dir < end && dir[0] == 0 && dir[1] == 0) |
4080 | { |
4081 | number_of_records--; |
4082 | if (dir[2] == END_OF_DIR_FREE_LIST) |
4083 | buff[DIR_FREE_OFFSET]= dir[3]; |
4084 | else |
4085 | { |
4086 | uchar *prev_entry= dir_entry_pos(buff, block_size, (uint) dir[2]); |
4087 | DBUG_ASSERT(uint2korr(prev_entry) == 0 && prev_entry[3] == |
4088 | number_of_records); |
4089 | prev_entry[3]= dir[3]; |
4090 | } |
4091 | if (dir[3] != END_OF_DIR_FREE_LIST) |
4092 | { |
4093 | uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]); |
4094 | DBUG_ASSERT(uint2korr(next_entry) == 0 && next_entry[2] == |
4095 | number_of_records); |
4096 | next_entry[2]= dir[2]; |
4097 | } |
4098 | dir+= DIR_ENTRY_SIZE; |
4099 | empty_space+= DIR_ENTRY_SIZE; |
4100 | } |
4101 | |
4102 | if (number_of_records == 0) |
4103 | { |
4104 | /* All entries on page deleted */ |
4105 | DBUG_PRINT("info" , ("Page marked as unallocated" )); |
4106 | buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE; |
4107 | #ifdef IDENTICAL_PAGES_AFTER_RECOVERY |
4108 | { |
4109 | dir= dir_entry_pos(buff, block_size, record_number); |
4110 | bzero(dir, (record_number+1) * DIR_ENTRY_SIZE); |
4111 | } |
4112 | #endif |
4113 | *empty_space_res= block_size; |
4114 | DBUG_RETURN(1); |
4115 | } |
4116 | buff[DIR_COUNT_OFFSET]= (uchar) number_of_records; |
4117 | } |
4118 | else |
4119 | { |
4120 | /* Update directory */ |
4121 | dir[0]= dir[1]= 0; |
4122 | dir[2]= END_OF_DIR_FREE_LIST; |
4123 | if ((dir[3]= buff[DIR_FREE_OFFSET]) != END_OF_DIR_FREE_LIST) |
4124 | { |
4125 | /* Relink next entry to point to newly freed entry */ |
4126 | uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]); |
4127 | DBUG_ASSERT(uint2korr(next_entry) == 0 && |
4128 | next_entry[2] == END_OF_DIR_FREE_LIST); |
4129 | next_entry[2]= record_number; |
4130 | } |
4131 | buff[DIR_FREE_OFFSET]= record_number; |
4132 | } |
4133 | empty_space+= length; |
4134 | |
4135 | int2store(buff + EMPTY_SPACE_OFFSET, empty_space); |
4136 | buff[PAGE_TYPE_OFFSET]|= (uchar) PAGE_CAN_BE_COMPACTED; |
4137 | |
4138 | *empty_space_res= empty_space; |
4139 | |
4140 | check_directory(share, buff, block_size, 0, empty_space); |
4141 | DBUG_RETURN(0); |
4142 | } |
4143 | |
4144 | |
4145 | /* |
4146 | Delete a head a tail part |
4147 | |
4148 | SYNOPSIS |
4149 | delete_head_or_tail() |
4150 | info Maria handler |
4151 | page Page (not file offset!) on which the row is |
4152 | head 1 if this is a head page |
4153 | from_update 1 if we are called from update. In this case we |
4154 | leave the page as write locked as we may put |
4155 | the new row into the old position. |
4156 | |
4157 | RETURN |
4158 | 0 ok |
4159 | 1 error |
4160 | */ |
4161 | |
4162 | static my_bool delete_head_or_tail(MARIA_HA *info, |
4163 | pgcache_page_no_t page, uint record_number, |
4164 | my_bool head, my_bool from_update) |
4165 | { |
4166 | MARIA_SHARE *share= info->s; |
4167 | uint empty_space; |
4168 | int res; |
4169 | my_bool page_is_empty; |
4170 | uchar *buff; |
4171 | LSN lsn; |
4172 | MARIA_PINNED_PAGE page_link; |
4173 | enum pagecache_page_lock lock_at_write, lock_at_unpin; |
4174 | DBUG_ENTER("delete_head_or_tail" ); |
4175 | DBUG_PRINT("enter" , ("id: %lu (%lu:%u)" , |
4176 | (ulong) ma_recordpos(page, record_number), |
4177 | (ulong) page, record_number)); |
4178 | |
4179 | buff= pagecache_read(share->pagecache, |
4180 | &info->dfile, page, 0, 0, |
4181 | share->page_type, |
4182 | PAGECACHE_LOCK_WRITE, &page_link.link); |
4183 | page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; |
4184 | page_link.changed= buff != 0; |
4185 | push_dynamic(&info->pinned_pages, (void*) &page_link); |
4186 | if (!buff) |
4187 | DBUG_RETURN(1); |
4188 | DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == |
4189 | (head ? HEAD_PAGE : TAIL_PAGE)); |
4190 | |
4191 | if (from_update) |
4192 | { |
4193 | lock_at_write= PAGECACHE_LOCK_LEFT_WRITELOCKED; |
4194 | lock_at_unpin= PAGECACHE_LOCK_WRITE_UNLOCK; |
4195 | } |
4196 | else |
4197 | { |
4198 | lock_at_write= PAGECACHE_LOCK_WRITE_TO_READ; |
4199 | lock_at_unpin= PAGECACHE_LOCK_READ_UNLOCK; |
4200 | } |
4201 | |
4202 | res= delete_dir_entry(share, buff, record_number, &empty_space); |
4203 | if (res < 0) |
4204 | DBUG_RETURN(1); |
4205 | if (res == 0) /* after our deletion, page is still not empty */ |
4206 | { |
4207 | uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE]; |
4208 | LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; |
4209 | page_is_empty= 0; |
4210 | if (share->now_transactional) |
4211 | { |
4212 | /* Log REDO data */ |
4213 | page_store(log_data + FILEID_STORE_SIZE, page); |
4214 | dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, |
4215 | record_number); |
4216 | |
4217 | log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; |
4218 | log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); |
4219 | if (translog_write_record(&lsn, (head ? LOGREC_REDO_PURGE_ROW_HEAD : |
4220 | LOGREC_REDO_PURGE_ROW_TAIL), |
4221 | info->trn, info, |
4222 | (translog_size_t) sizeof(log_data), |
4223 | TRANSLOG_INTERNAL_PARTS + 1, log_array, |
4224 | log_data, NULL)) |
4225 | DBUG_RETURN(1); |
4226 | } |
4227 | } |
4228 | else /* page is now empty */ |
4229 | { |
4230 | page_is_empty= 1; |
4231 | if (share->now_transactional) |
4232 | { |
4233 | uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE]; |
4234 | LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; |
4235 | page_store(log_data + FILEID_STORE_SIZE, page); |
4236 | log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; |
4237 | log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); |
4238 | if (translog_write_record(&lsn, LOGREC_REDO_FREE_HEAD_OR_TAIL, |
4239 | info->trn, info, |
4240 | (translog_size_t) sizeof(log_data), |
4241 | TRANSLOG_INTERNAL_PARTS + 1, log_array, |
4242 | log_data, NULL)) |
4243 | DBUG_RETURN(1); |
4244 | } |
4245 | /* |
4246 | Mark that this page must be written to disk by page cache, even |
4247 | if we could call pagecache_delete() on it. |
4248 | This is needed to ensure that repair finds the empty page on disk |
4249 | and not old data. |
4250 | */ |
4251 | pagecache_set_write_on_delete_by_link(page_link.link); |
4252 | DBUG_ASSERT(empty_space >= share->bitmap.sizes[0]); |
4253 | } |
4254 | |
4255 | pagecache_unlock_by_link(share->pagecache, page_link.link, |
4256 | lock_at_write, |
4257 | PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE, |
4258 | LSN_IMPOSSIBLE, 1, FALSE); |
4259 | page_link.unlock= lock_at_unpin; |
4260 | set_dynamic(&info->pinned_pages, (void*) &page_link, |
4261 | info->pinned_pages.elements-1); |
4262 | |
4263 | DBUG_PRINT("info" , ("empty_space: %u" , empty_space)); |
4264 | |
4265 | /* |
4266 | If there is not enough space for all possible tails, mark the |
4267 | page full |
4268 | */ |
4269 | if (!head && !page_is_empty && !enough_free_entries(buff, share->block_size, |
4270 | 1 + share->base.blobs)) |
4271 | empty_space= 0; |
4272 | |
4273 | DBUG_RETURN(_ma_bitmap_set(info, page, head, empty_space)); |
4274 | } |
4275 | |
4276 | |
4277 | /* |
4278 | delete all tails |
4279 | |
4280 | SYNOPSIS |
4281 | delete_tails() |
4282 | info Handler |
4283 | tails Pointer to vector of tail positions, ending with 0 |
4284 | |
4285 | RETURN |
4286 | 0 ok |
4287 | 1 error |
4288 | */ |
4289 | |
4290 | static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails) |
4291 | { |
4292 | my_bool res= 0; |
4293 | DBUG_ENTER("delete_tails" ); |
4294 | for (; *tails; tails++) |
4295 | { |
4296 | if (delete_head_or_tail(info, |
4297 | ma_recordpos_to_page(*tails), |
4298 | ma_recordpos_to_dir_entry(*tails), 0, 1)) |
4299 | res= 1; |
4300 | } |
4301 | DBUG_RETURN(res); |
4302 | } |
4303 | |
4304 | |
4305 | /* |
4306 | Delete a record |
4307 | |
4308 | NOTES |
4309 | For the moment, we assume that info->cur_row.extents is always updated |
4310 | when a row is read. In the future we may decide to read this on demand |
4311 | for rows with many splits. |
4312 | */ |
4313 | |
4314 | my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record) |
4315 | { |
4316 | pgcache_page_no_t page; |
4317 | uint record_number; |
4318 | MARIA_SHARE *share= info->s; |
4319 | LSN lsn= LSN_IMPOSSIBLE; |
4320 | DBUG_ENTER("_ma_delete_block_record" ); |
4321 | |
4322 | page= ma_recordpos_to_page(info->cur_row.lastpos); |
4323 | record_number= ma_recordpos_to_dir_entry(info->cur_row.lastpos); |
4324 | DBUG_PRINT("enter" , ("rowid: %lu (%lu:%u)" , (ulong) info->cur_row.lastpos, |
4325 | (ulong) page, record_number)); |
4326 | |
4327 | _ma_bitmap_flushable(info, 1); |
4328 | if (delete_head_or_tail(info, page, record_number, 1, 0) || |
4329 | delete_tails(info, info->cur_row.tail_positions)) |
4330 | goto err; |
4331 | |
4332 | if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row)) |
4333 | goto err; |
4334 | |
4335 | if (share->now_transactional) |
4336 | { |
4337 | uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + |
4338 | DIRPOS_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE + |
4339 | HA_CHECKSUM_STORE_SIZE]; |
4340 | uchar *log_pos; |
4341 | size_t row_length; |
4342 | uint row_parts_count, extents_length; |
4343 | ha_checksum checksum_delta; |
4344 | |
4345 | /* Write UNDO record */ |
4346 | lsn_store(log_data, info->trn->undo_lsn); |
4347 | page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, page); |
4348 | log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE; |
4349 | dirpos_store(log_pos, record_number); |
4350 | log_pos+= DIRPOS_STORE_SIZE; |
4351 | int2store(log_pos, info->cur_row.head_length - |
4352 | info->cur_row.header_length); |
4353 | log_pos+= 2; |
4354 | pagerange_store(log_pos, info->cur_row.extents_count); |
4355 | log_pos+= PAGERANGE_STORE_SIZE; |
4356 | |
4357 | info->log_row_parts[TRANSLOG_INTERNAL_PARTS].str= log_data; |
4358 | info->log_row_parts[TRANSLOG_INTERNAL_PARTS].length= |
4359 | sizeof(log_data) - HA_CHECKSUM_STORE_SIZE; |
4360 | store_checksum_in_rec(share, checksum_delta, |
4361 | (ha_checksum) 0 - info->cur_row.checksum, log_pos, |
4362 | info->log_row_parts[TRANSLOG_INTERNAL_PARTS + |
4363 | 0].length); |
4364 | info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str= |
4365 | info->cur_row.extents; |
4366 | info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length= |
4367 | extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE; |
4368 | |
4369 | row_length= fill_insert_undo_parts(info, record, |
4370 | (info->log_row_parts + |
4371 | TRANSLOG_INTERNAL_PARTS + 2), |
4372 | &row_parts_count); |
4373 | |
4374 | if (translog_write_record(&lsn, LOGREC_UNDO_ROW_DELETE, info->trn, |
4375 | info, |
4376 | (translog_size_t) |
4377 | (info->log_row_parts[TRANSLOG_INTERNAL_PARTS + |
4378 | 0].length + row_length + |
4379 | extents_length), |
4380 | TRANSLOG_INTERNAL_PARTS + 2 + row_parts_count, |
4381 | info->log_row_parts, |
4382 | log_data + LSN_STORE_SIZE, |
4383 | &checksum_delta)) |
4384 | goto err; |
4385 | } |
4386 | |
4387 | _ma_bitmap_flushable(info, -1); |
4388 | _ma_unpin_all_pages_and_finalize_row(info, lsn); |
4389 | DBUG_RETURN(0); |
4390 | |
4391 | err: |
4392 | DBUG_ASSERT(!maria_assert_if_crashed_table); |
4393 | _ma_bitmap_flushable(info, -1); |
4394 | _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); |
4395 | DBUG_RETURN(1); |
4396 | } |
4397 | |
4398 | |
4399 | /**************************************************************************** |
4400 | Reading of records |
4401 | ****************************************************************************/ |
4402 | |
4403 | /* |
4404 | Read position to record from record directory at end of page |
4405 | |
4406 | SYNOPSIS |
4407 | get_record_position() |
4408 | buff page buffer |
4409 | block_size block size for page |
4410 | record_number Record number in index |
4411 | end_of_data pointer to end of data for record |
4412 | |
4413 | RETURN |
4414 | 0 Error in data |
4415 | # Pointer to start of record. |
4416 | In this case *end_of_data is set. |
4417 | */ |
4418 | |
4419 | static uchar *get_record_position(MARIA_SHARE *share, uchar *buff, |
4420 | uint record_number, uchar **end_of_data) |
4421 | { |
4422 | uint block_size= share->block_size; |
4423 | uint number_of_records= (uint) buff[DIR_COUNT_OFFSET]; |
4424 | uchar *dir; |
4425 | uchar *data; |
4426 | uint offset, length; |
4427 | |
4428 | #ifdef SANITY_CHECKS |
4429 | if (record_number >= number_of_records || |
4430 | record_number > ((block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE) |
4431 | / DIR_ENTRY_SIZE)) |
4432 | { |
4433 | DBUG_PRINT("error" , |
4434 | ("Wrong row number: record_number: %u number_of_records: %u" , |
4435 | record_number, number_of_records)); |
4436 | return 0; |
4437 | } |
4438 | #endif |
4439 | |
4440 | dir= dir_entry_pos(buff, block_size, record_number); |
4441 | offset= uint2korr(dir); |
4442 | length= uint2korr(dir + 2); |
4443 | #ifdef SANITY_CHECKS |
4444 | if (offset < PAGE_HEADER_SIZE(share) || |
4445 | offset + length > (block_size - |
4446 | number_of_records * DIR_ENTRY_SIZE - |
4447 | PAGE_SUFFIX_SIZE)) |
4448 | { |
4449 | DBUG_PRINT("error" , |
4450 | ("Wrong row position: record_number: %u offset: %u " |
4451 | "length: %u number_of_records: %u" , |
4452 | record_number, offset, length, number_of_records)); |
4453 | return 0; |
4454 | } |
4455 | #endif |
4456 | data= buff + offset; |
4457 | *end_of_data= data + length; |
4458 | return data; |
4459 | } |
4460 | |
4461 | |
4462 | /* |
4463 | Init extent |
4464 | |
4465 | NOTES |
4466 | extent is a cursor over which pages to read |
4467 | */ |
4468 | |
4469 | static void init_extent(MARIA_EXTENT_CURSOR *extent, uchar *extent_info, |
4470 | uint extents, MARIA_RECORD_POS *tail_positions) |
4471 | { |
4472 | uint page_count; |
4473 | extent->extent= extent_info; |
4474 | extent->extent_count= extents; |
4475 | extent->page= page_korr(extent_info); /* First extent */ |
4476 | page_count= (uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE) & |
4477 | ~START_EXTENT_BIT); |
4478 | extent->tail= page_count & TAIL_BIT; |
4479 | if (extent->tail) |
4480 | { |
4481 | extent->page_count= 1; |
4482 | extent->tail_row_nr= page_count & ~TAIL_BIT; |
4483 | } |
4484 | else |
4485 | extent->page_count= page_count; |
4486 | extent->tail_positions= tail_positions; |
4487 | extent->lock_for_tail_pages= PAGECACHE_LOCK_LEFT_UNLOCKED; |
4488 | } |
4489 | |
4490 | |
4491 | /* |
4492 | Read next extent |
4493 | |
4494 | SYNOPSIS |
4495 | read_next_extent() |
4496 | info Maria handler |
4497 | extent Pointer to current extent (this is updated to point |
4498 | to next) |
4499 | end_of_data Pointer to end of data in read block (out) |
4500 | |
4501 | NOTES |
4502 | New block is read into info->buff |
4503 | |
4504 | RETURN |
4505 | 0 Error; my_errno is set |
4506 | # Pointer to start of data in read block |
4507 | In this case end_of_data is updated to point to end of data. |
4508 | */ |
4509 | |
static uchar *read_next_extent(MARIA_HA *info, MARIA_EXTENT_CURSOR *extent,
                               uchar **end_of_data)
{
  MARIA_SHARE *share= info->s;
  uchar *buff, *data;
  MARIA_PINNED_PAGE page_link;
  enum pagecache_page_lock lock;
  DBUG_ENTER("read_next_extent" );

  if (!extent->page_count)
  {
    uint page_count;
    /* Current extent exhausted: advance the cursor to the next extent entry */
    if (!--extent->extent_count)
      goto crashed;                     /* No extents left; row data is broken */
    extent->extent+= ROW_EXTENT_SIZE;
    extent->page= page_korr(extent->extent);
    page_count= (uint2korr(extent->extent+ROW_EXTENT_PAGE_SIZE) &
                 ~START_EXTENT_BIT);
    if (!page_count)
      goto crashed;                     /* Zero-page extent is never valid */
    extent->tail= page_count & TAIL_BIT;
    if (extent->tail)
      extent->tail_row_nr= page_count & ~TAIL_BIT;
    else
      extent->page_count= page_count;
    DBUG_PRINT("info" ,("New extent. Page: %lu page_count: %u tail_flag: %d" ,
                       (ulong) extent->page, extent->page_count,
                       extent->tail != 0));
  }
  extent->first_extent= 0;

  /* Tail pages may need a stronger lock (set by caller for UNDO reads) */
  lock= PAGECACHE_LOCK_LEFT_UNLOCKED;
  if (extent->tail)
    lock= extent->lock_for_tail_pages;

  buff= pagecache_read(share->pagecache,
                       &info->dfile, extent->page, 0,
                       info->buff, share->page_type,
                       lock, &page_link.link);
  if (lock != PAGECACHE_LOCK_LEFT_UNLOCKED)
  {
    /* Read during UNDO: remember the pinned page so it is unlocked later */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    page_link.changed= buff != 0;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
  }
  if (!buff)
  {
    /* check if we tried to read over end of file (ie: bad data in record) */
    if ((extent->page + 1) * share->block_size >
        share->state.state.data_file_length)
      goto crashed;
    DBUG_RETURN(0);                     /* Real read error; my_errno is set */
  }

  if (!extent->tail)
  {
    /* Full data page: whole page belongs to this row (blob/long data) */
    if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != BLOB_PAGE)
      goto crashed;
    extent->page++;                     /* point to next page */
    extent->page_count--;
    *end_of_data= buff + share->block_size - PAGE_SUFFIX_SIZE;
    info->cur_row.full_page_count++;    /* For maria_chk */
    DBUG_RETURN(extent->data_start= buff + FULL_PAGE_HEADER_SIZE(share));
  }

  /* Found tail: locate this row's part via the page directory */
  if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != TAIL_PAGE)
    goto crashed;
  /* Record where the tail part lives, for later delete/update */
  *(extent->tail_positions++)= ma_recordpos(extent->page,
                                            extent->tail_row_nr);
  info->cur_row.tail_count++;           /* For maria_chk */

  if (!(data= get_record_position(share, buff,
                                  extent->tail_row_nr,
                                  end_of_data)))
    goto crashed;
  extent->data_start= data;
  extent->page_count= 0;                /* No more data in extent */
  DBUG_RETURN(data);


crashed:
  /* Extent chain is inconsistent: mark the table as crashed */
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
  DBUG_PRINT("error" , ("wrong extent information" ));
  DBUG_RETURN(0);
}
4599 | |
4600 | |
4601 | /* |
4602 | Read data that may be split over many blocks |
4603 | |
4604 | SYNOPSIS |
4605 | read_long_data() |
4606 | info Maria handler |
4607 | to Store result string here (this is allocated) |
4608 | extent Pointer to current extent position |
4609 | data Current position in buffer |
4610 | end_of_data End of data in buffer |
4611 | |
4612 | NOTES |
4613 | When we have to read a new buffer, it's read into info->buff |
4614 | |
    This loop is implemented with goto's instead of a for() loop as
    the code is notably smaller and faster this way (and it's not nice
    to jump into a for() loop or into a 'then' clause)
4618 | |
4619 | RETURN |
4620 | 0 ok |
4621 | 1 error |
4622 | */ |
4623 | |
4624 | static my_bool read_long_data2(MARIA_HA *info, uchar *to, ulong length, |
4625 | MARIA_EXTENT_CURSOR *extent, |
4626 | uchar **data, uchar **end_of_data) |
4627 | { |
4628 | uint left_length= (uint) (*end_of_data - *data); |
4629 | DBUG_ENTER("read_long_data2" ); |
4630 | DBUG_PRINT("enter" , ("length: %lu left_length: %u" , |
4631 | length, left_length)); |
4632 | DBUG_ASSERT(*data <= *end_of_data); |
4633 | |
4634 | /* |
4635 | Fields are never split in middle. This means that if length > rest-of-data |
4636 | we should start reading from the next extent. The reason we may have |
4637 | data left on the page is that if the fixed part of the row was less than |
4638 | min_block_length the head block was extended to min_block_length. |
4639 | |
4640 | This may change in the future, which is why we have the loop written |
4641 | the way it's written. |
4642 | */ |
4643 | if (extent->first_extent && length > left_length) |
4644 | { |
4645 | *end_of_data= *data; |
4646 | left_length= 0; |
4647 | } |
4648 | |
4649 | for(;;) |
4650 | { |
4651 | if (unlikely(left_length >= length)) |
4652 | { |
4653 | memcpy(to, *data, length); |
4654 | (*data)+= length; |
4655 | DBUG_PRINT("info" , ("left_length: %u" , left_length - (uint) length)); |
4656 | DBUG_RETURN(0); |
4657 | } |
4658 | memcpy(to, *data, left_length); |
4659 | to+= left_length; |
4660 | length-= left_length; |
4661 | if (!(*data= read_next_extent(info, extent, end_of_data))) |
4662 | break; |
4663 | left_length= (uint) (*end_of_data - *data); |
4664 | } |
4665 | DBUG_RETURN(1); |
4666 | } |
4667 | |
4668 | static inline my_bool read_long_data(MARIA_HA *info, uchar *to, ulong length, |
4669 | MARIA_EXTENT_CURSOR *extent, |
4670 | uchar **data, uchar **end_of_data) |
4671 | { |
4672 | uint left_length= (uint) (*end_of_data - *data); |
4673 | if (likely(left_length >= length)) |
4674 | { |
4675 | memcpy(to, *data, length); |
4676 | (*data)+= length; |
4677 | return 0; |
4678 | } |
4679 | return read_long_data2(info, to, length, extent, data, end_of_data); |
4680 | } |
4681 | |
4682 | |
4683 | /* |
4684 | Read a record from page (helper function for _ma_read_block_record()) |
4685 | |
4686 | SYNOPSIS |
4687 | _ma_read_block_record2() |
4688 | info Maria handler |
4689 | record Store record here |
4690 | data Start of head data for row |
4691 | end_of_data End of data for row |
4692 | |
4693 | NOTES |
4694 | The head page is already read by caller |
4695 | Following data is update in info->cur_row: |
4696 | |
4697 | cur_row.head_length is set to size of entry in head block |
4698 | cur_row.tail_positions is set to point to all tail blocks |
4699 | cur_row.extents points to extents data |
4700 | cur_row.extents_counts contains number of extents |
4701 | cur_row.empty_bits is set to empty bits |
4702 | cur_row.field_lengths contains packed length of all fields |
4703 | cur_row.blob_length contains total length of all blobs |
4704 | cur_row.checksum contains checksum of read record. |
4705 | |
4706 | RETURN |
4707 | 0 ok |
4708 | # Error code |
4709 | */ |
4710 | |
int _ma_read_block_record2(MARIA_HA *info, uchar *record,
                           uchar *data, uchar *end_of_data)
{
  MARIA_SHARE *share= info->s;
  uchar *UNINIT_VAR(field_length_data), *UNINIT_VAR(blob_buffer), *start_of_data;
  uint flag, null_bytes, cur_null_bytes, row_extents, field_lengths;
  my_bool found_blob= 0;
  MARIA_EXTENT_CURSOR extent;
  MARIA_COLUMNDEF *column, *end_column;
  MARIA_ROW *cur_row= &info->cur_row;
  DBUG_ENTER("_ma_read_block_record2" );

  /* First byte of the row is the flag byte describing the row header */
  start_of_data= data;
  flag= (uint) (uchar) data[0];
  cur_null_bytes= share->base.original_null_bytes;
  null_bytes= share->base.null_bytes;
  cur_row->head_length= (uint) (end_of_data - data);
  cur_row->full_page_count= cur_row->tail_count= 0;
  cur_row->blob_length= 0;
  /* Number of bytes in header that we don't need to write during undo */
  cur_row->header_length= total_header_size[(flag & PRECALC_HEADER_BITMASK)]-1;

  if (flag & ROW_FLAG_TRANSID)
  {
    /* Row carries the transaction id that created it */
    cur_row->trid= transid_korr(data+1);
    if (!info->trn)
    {
      /* File crashed */
      DBUG_ASSERT(!maria_assert_if_crashed_table);
      _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
      DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
    }
    /* Visibility check: the row's creator must be visible to our trn */
    if (!trnman_can_read_from(info->trn, cur_row->trid))
      DBUG_RETURN(my_errno= HA_ERR_ROW_NOT_VISIBLE);
  }

  /* Skip trans header (for now, until we have MVCC support) */
  data+= cur_row->header_length + 1 ;
  if (flag & ROW_FLAG_NULLS_EXTENDED)
    cur_null_bytes+= data[-1];

  row_extents= 0;
  if (flag & ROW_FLAG_EXTENTS)
  {
    uint row_extent_size;
    /*
      Record is split over many data pages.
      Get number of extents and first extent
    */
    get_key_length(row_extents, data);
    cur_row->extents_count= row_extents;
    row_extent_size= row_extents * ROW_EXTENT_SIZE;
    if (cur_row->extents_buffer_length < row_extent_size &&
        _ma_alloc_buffer(&cur_row->extents,
                         &cur_row->extents_buffer_length,
                         row_extent_size))
      DBUG_RETURN(my_errno);
    /* First extent is stored in the head; the rest are read further below */
    memcpy(cur_row->extents, data, ROW_EXTENT_SIZE);
    data+= ROW_EXTENT_SIZE;
    init_extent(&extent, cur_row->extents, row_extents,
                cur_row->tail_positions);
  }
  else
  {
    /* Whole row fits on the head page; no extents to follow */
    cur_row->extents_count= 0;
    (*cur_row->tail_positions)= 0;
    extent.page_count= 0;
    extent.extent_count= 1;
  }
  extent.first_extent= 1;

  field_lengths= 0;
  if (share->base.max_field_lengths)
  {
    /* Total size of the packed field-length array read below */
    get_key_length(field_lengths, data);
    cur_row->field_lengths_length= field_lengths;
#ifdef SANITY_CHECKS
    if (field_lengths > share->base.max_field_lengths)
      goto err;
#endif
  }

  if (share->calc_checksum)
    cur_row->checksum= (uint) (uchar) *data++;
  /* data now points on null bits */
  memcpy(record, data, cur_null_bytes);
  if (unlikely(cur_null_bytes != null_bytes))
  {
    /*
      This only happens if we have added more NULL columns with
      ALTER TABLE and are fetching an old, not yet modified row
    */
    bzero(record + cur_null_bytes, (uint) (null_bytes - cur_null_bytes));
  }
  data+= null_bytes;
  /* We copy the empty bits to be able to use them for delete/update */
  memcpy(cur_row->empty_bits, data, share->base.pack_bytes);
  data+= share->base.pack_bytes;

  /* TODO: Use field offsets, instead of just skipping them */
  data+= share->base.field_offsets * FIELD_OFFSET_SIZE;

  /*
    Read row extents (note that first extent was already read into
    cur_row->extents above)
  */
  if (row_extents > 1)
  {
    if (read_long_data(info, cur_row->extents + ROW_EXTENT_SIZE,
                       (row_extents - 1) * ROW_EXTENT_SIZE,
                       &extent, &data, &end_of_data))
      DBUG_RETURN(my_errno);
  }

  /*
    Data now points to start of fixed length field data that can't be null
    or 'empty'. Note that these fields can't be split over blocks.
  */
  for (column= share->columndef,
       end_column= column + share->base.fixed_not_null_fields;
       column < end_column; column++)
  {
    uint column_length= column->length;
    if (data + column_length > end_of_data &&
        !(data= read_next_extent(info, &extent, &end_of_data)))
      goto err;
    memcpy(record + column->offset, data, column_length);
    data+= column_length;
  }

  /* Read array of field lengths. This may be stored in several extents */
  if (field_lengths)
  {
    field_length_data= cur_row->field_lengths;
    if (read_long_data(info, field_length_data, field_lengths, &extent,
                       &data, &end_of_data))
      DBUG_RETURN(my_errno);
  }

  /* Read variable length data. Each of these may be split over many extents */
  for (end_column= share->columndef + share->base.fields;
       column < end_column; column++)
  {
    enum en_fieldtype type= column->type;
    uchar *field_pos= record + column->offset;
    /* First check if field is present in record */
    if ((record[column->null_pos] & column->null_bit) ||
        (cur_row->empty_bits[column->empty_pos] & column->empty_bit))
    {
      /* NULL or empty field: fill with spaces (CHAR) or zeroes */
      bfill(record + column->offset, column->fill_length,
            type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
      continue;
    }
    switch (type) {
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_SKIP_PRESPACE:
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      if (data + column->length > end_of_data &&
          !(data= read_next_extent(info, &extent, &end_of_data)))
        goto err;
      memcpy(field_pos, data, column->length);
      data+= column->length;
      break;
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      /* Char that is space filled */
      uint length;
      /* Stored length uses 1 or 2 bytes depending on max column length */
      if (column->length <= 255)
        length= (uint) (uchar) *field_length_data++;
      else
      {
        length= uint2korr(field_length_data);
        field_length_data+= 2;
      }
#ifdef SANITY_CHECKS
      if (length > column->length)
        goto err;
#endif
      if (read_long_data(info, field_pos, length, &extent, &data,
                         &end_of_data))
        DBUG_RETURN(my_errno);
      /* Restore the trailing spaces that were stripped when storing */
      bfill(field_pos + length, column->length - length, ' ');
      break;
    }
    case FIELD_VARCHAR:
    {
      ulong length;
      /*
        NOTE(review): threshold is 256 here (255 for FIELD_SKIP_ENDSPACE
        above); presumably column->length includes the length byte for
        VARCHAR — confirm against the write path.
      */
      if (column->length <= 256)
      {
        /* Copy the single length byte into the record while reading it */
        length= (uint) (uchar) (*field_pos++= *field_length_data++);
      }
      else
      {
        length= uint2korr(field_length_data);
        field_pos[0]= field_length_data[0];
        field_pos[1]= field_length_data[1];
        field_pos+= 2;
        field_length_data+= 2;
      }
#ifdef SANITY_CHECKS
      if (length > column->length)
        goto err;
#endif
      if (read_long_data(info, field_pos, length, &extent, &data,
                         &end_of_data))
        DBUG_RETURN(my_errno);
      break;
    }
    case FIELD_BLOB:
    {
      uint column_size_length= column->length - portable_sizeof_char_ptr;
      ulong blob_length= _ma_calc_blob_length(column_size_length,
                                              field_length_data);

      if (!found_blob)
      {
        /* Calculate total length for all blobs */
        ulong blob_lengths= 0;
        uchar *length_data= field_length_data;
        MARIA_COLUMNDEF *blob_field= column;

        found_blob= 1;
        for (; blob_field < end_column; blob_field++)
        {
          uint size_length;
          /* Skip blobs that are NULL or empty; they take no length bytes */
          if ((record[blob_field->null_pos] & blob_field->null_bit) ||
              (cur_row->empty_bits[blob_field->empty_pos] &
               blob_field->empty_bit))
            continue;
          size_length= blob_field->length - portable_sizeof_char_ptr;
          blob_lengths+= _ma_calc_blob_length(size_length, length_data);
          length_data+= size_length;
        }
        cur_row->blob_length= blob_lengths;
        DBUG_PRINT("info" , ("Total blob length: %lu" , blob_lengths));
        /* One shared buffer holds the data of all blobs in the row */
        if (_ma_alloc_buffer(&info->blob_buff, &info->blob_buff_size,
                             blob_lengths))
          DBUG_RETURN(my_errno);
        blob_buffer= info->blob_buff;
      }

      /* Record stores blob length followed by a pointer into blob_buff */
      memcpy(field_pos, field_length_data, column_size_length);
      memcpy(field_pos + column_size_length, (uchar *) &blob_buffer,
             sizeof(char*));
      field_length_data+= column_size_length;

      /*
        After we have read one extent, then each blob is in its own extent
      */
      if (!extent.first_extent || (ulong) (end_of_data - data) < blob_length)
        end_of_data= data;                      /* Force read of next extent */

      if (read_long_data(info, blob_buffer, blob_length, &extent, &data,
                         &end_of_data))
        DBUG_RETURN(my_errno);
      blob_buffer+= blob_length;
      break;
    }
    default:
#ifdef EXTRA_DEBUG
      DBUG_ASSERT(0);                           /* purecov: deadcode */
#endif
      goto err;                                 /* Unknown field type */
    }
    continue;
  }

  if (row_extents)
  {
    DBUG_PRINT("info" , ("Row read: page_count: %u extent_count: %u" ,
                        extent.page_count, extent.extent_count));
    *extent.tail_positions= 0;                  /* End marker */
    /* All extents and pages must have been consumed exactly */
    if (extent.page_count)
      goto err;
    if (extent.extent_count > 1)
    {
      /* Unused trailing extent entries must be zero-filled */
      if (_ma_check_if_zero(extent.extent + ROW_EXTENT_SIZE,
                            (extent.extent_count-1) * ROW_EXTENT_SIZE))
      {
        DBUG_PRINT("error" , ("Data in extent is not zero" ));
        DBUG_DUMP("extent" , extent.extent + ROW_EXTENT_SIZE,
                  (extent.extent_count-1) * ROW_EXTENT_SIZE);
        goto err;
      }
    }
  }
  else
  {
    DBUG_PRINT("info" , ("Row read" ));
    /*
      data should normally point to end_of_data. The only exception is if
      the row is very short in which case we allocated 'min_block_length' data
      for allowing the row to expand.
    */
    if (data != end_of_data && (uint) (end_of_data - start_of_data) >
        share->base.min_block_length)
      goto err;
  }
#ifdef EXTRA_DEBUG
  if (share->calc_checksum && !info->in_check_table)
  {
    /* Ensure that row checksum is correct */
    DBUG_ASSERT(((share->calc_checksum)(info, record) & 255) ==
                cur_row->checksum);
  }
#endif
  info->update|= HA_STATE_AKTIV;	/* We have an active record */
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  /* Something was wrong with data on record */
  DBUG_PRINT("error" , ("Found record with wrong data" ));
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
  DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
}
5027 | |
5028 | |
5029 | /** @brief Read positions to tail blocks and full blocks |
5030 | |
5031 | @fn read_row_extent_info() |
5032 | @param info Handler |
5033 | |
5034 | @notes |
5035 | This function is a simpler version of _ma_read_block_record2() |
5036 | The data about the used pages is stored in info->cur_row. |
5037 | |
5038 | @return Status |
5039 | @retval 0 ok |
5040 | @retval 1 Error. my_errno contains error number |
5041 | */ |
5042 | |
static my_bool read_row_extent_info(MARIA_HA *info, uchar *buff,
                                    uint record_number)
{
  MARIA_SHARE *share= info->s;
  MARIA_EXTENT_CURSOR extent;
  MARIA_RECORD_POS *tail_pos;
  uchar *data, *end_of_data;
  uint flag, row_extents, row_extents_size;
  uint field_lengths __attribute__ ((unused));
  uchar *extents, *end;
  DBUG_ENTER("read_row_extent_info" );

  if (!(data= get_record_position(share, buff,
                                  record_number, &end_of_data)))
    DBUG_RETURN(1);                             /* Wrong in record */

  flag= (uint) (uchar) data[0];
  /* Skip trans header */
  data+= total_header_size[(flag & PRECALC_HEADER_BITMASK)];

  row_extents= 0;
  row_extents_size= 0;
  if (flag & ROW_FLAG_EXTENTS)
  {
    /*
      Record is split over many data pages.
      Get number of extents and first extent
    */
    get_key_length(row_extents, data);
    row_extents_size= row_extents * ROW_EXTENT_SIZE;
    if (info->cur_row.extents_buffer_length < row_extents_size &&
        _ma_alloc_buffer(&info->cur_row.extents,
                         &info->cur_row.extents_buffer_length,
                         row_extents_size))
      DBUG_RETURN(1);
    /* First extent entry is stored inline in the head page */
    memcpy(info->cur_row.extents, data, ROW_EXTENT_SIZE);
    data+= ROW_EXTENT_SIZE;
    init_extent(&extent, info->cur_row.extents, row_extents,
                info->cur_row.tail_positions);
    extent.first_extent= 1;
  }
  info->cur_row.extents_count= row_extents;

  /*
    field_lengths looks unused, but get_key_length will
    increment data, which is required as data is used later.
  */
  if (share->base.max_field_lengths)
    get_key_length(field_lengths, data);

  if (share->calc_checksum)
    info->cur_row.checksum= (uint) (uchar) *data++;
  if (row_extents > 1)
  {
    /* Skip null bits, empty bits and field offsets to reach extent data */
    data+= share->base.null_bytes;
    data+= share->base.pack_bytes;
    data+= share->base.field_offsets * FIELD_OFFSET_SIZE;

    /*
      Read row extents (note that first extent was already read into
      info->cur_row.extents above)
      Lock tails with write lock as we will delete them later.
    */
    extent.lock_for_tail_pages= PAGECACHE_LOCK_LEFT_WRITELOCKED;
    if (read_long_data(info, info->cur_row.extents + ROW_EXTENT_SIZE,
                       row_extents_size - ROW_EXTENT_SIZE,
                       &extent, &data, &end_of_data))
      DBUG_RETURN(1);
  }

  /* Update tail_positions with pointer to tails */
  tail_pos= info->cur_row.tail_positions;
  for (extents= info->cur_row.extents, end= extents + row_extents_size;
       extents < end;
       extents+= ROW_EXTENT_SIZE)
  {
    pgcache_page_no_t page= uint5korr(extents);
    uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
    /* Only tail extents contribute a position; mask flag bits away */
    if (page_count & TAIL_BIT)
      *(tail_pos++)= ma_recordpos(page, (page_count & ~ (TAIL_BIT |
                                                         START_EXTENT_BIT)));
  }
  *tail_pos= 0;                                 /* End marker */
  DBUG_RETURN(0);
}
5128 | |
5129 | |
5130 | /* |
5131 | Read a record based on record position |
5132 | |
5133 | @fn _ma_read_block_record() |
5134 | @param info Maria handler |
5135 | @param record Store record here |
5136 | @param record_pos Record position |
5137 | |
5138 | @return Status |
5139 | @retval 0 ok |
5140 | @retval # Error number |
5141 | */ |
5142 | |
5143 | int _ma_read_block_record(MARIA_HA *info, uchar *record, |
5144 | MARIA_RECORD_POS record_pos) |
5145 | { |
5146 | MARIA_SHARE *share= info->s; |
5147 | uchar *data, *end_of_data, *buff; |
5148 | uint offset; |
5149 | int ret; |
5150 | DBUG_ENTER("_ma_read_block_record" ); |
5151 | DBUG_PRINT("enter" , ("rowid: %lu page: %lu rownr: %u" , |
5152 | (ulong) record_pos, |
5153 | (ulong) ma_recordpos_to_page(record_pos), |
5154 | ma_recordpos_to_dir_entry(record_pos))); |
5155 | |
5156 | offset= ma_recordpos_to_dir_entry(record_pos); |
5157 | |
5158 | if (!(buff= pagecache_read(share->pagecache, |
5159 | &info->dfile, ma_recordpos_to_page(record_pos), 0, |
5160 | info->buff, share->page_type, |
5161 | PAGECACHE_LOCK_LEFT_UNLOCKED, 0))) |
5162 | DBUG_RETURN(my_errno); |
5163 | DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == HEAD_PAGE); |
5164 | if (!(data= get_record_position(share, buff, offset, &end_of_data))) |
5165 | { |
5166 | DBUG_ASSERT(!maria_assert_if_crashed_table); |
5167 | DBUG_PRINT("error" , ("Wrong directory entry in data block" )); |
5168 | my_errno= HA_ERR_RECORD_DELETED; /* File crashed */ |
5169 | DBUG_RETURN(HA_ERR_RECORD_DELETED); |
5170 | } |
5171 | ret= _ma_read_block_record2(info, record, data, end_of_data); |
5172 | DBUG_RETURN(ret); |
5173 | } |
5174 | |
5175 | |
5176 | /* compare unique constraint between stored rows */ |
5177 | |
5178 | my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, |
5179 | const uchar *record, MARIA_RECORD_POS pos) |
5180 | { |
5181 | uchar *org_rec_buff, *old_record; |
5182 | size_t org_rec_buff_size; |
5183 | int error; |
5184 | DBUG_ENTER("_ma_cmp_block_unique" ); |
5185 | |
5186 | /* |
5187 | Don't allocate more than 16K on the stack to ensure we don't get |
5188 | stack overflow. |
5189 | */ |
5190 | if (!(old_record= my_safe_alloca(info->s->base.reclength))) |
5191 | DBUG_RETURN(1); |
5192 | |
5193 | /* Don't let the compare destroy blobs that may be in use */ |
5194 | org_rec_buff= info->rec_buff; |
5195 | org_rec_buff_size= info->rec_buff_size; |
5196 | if (info->s->base.blobs) |
5197 | { |
5198 | /* Force realloc of record buffer*/ |
5199 | info->rec_buff= 0; |
5200 | info->rec_buff_size= 0; |
5201 | } |
5202 | error= _ma_read_block_record(info, old_record, pos); |
5203 | if (!error) |
5204 | error= _ma_unique_comp(def, record, old_record, def->null_are_equal); |
5205 | if (info->s->base.blobs) |
5206 | { |
5207 | my_free(info->rec_buff); |
5208 | info->rec_buff= org_rec_buff; |
5209 | info->rec_buff_size= org_rec_buff_size; |
5210 | } |
5211 | DBUG_PRINT("exit" , ("result: %d" , error)); |
5212 | my_safe_afree(old_record, info->s->base.reclength); |
5213 | DBUG_RETURN(error != 0); |
5214 | } |
5215 | |
5216 | |
5217 | /**************************************************************************** |
5218 | Table scan |
5219 | ****************************************************************************/ |
5220 | |
5221 | /* |
5222 | Allocate buffers for table scan |
5223 | |
5224 | SYNOPSIS |
5225 | _ma_scan_init_block_record(MARIA_HA *info) |
5226 | |
5227 | IMPLEMENTATION |
5228 | We allocate one buffer for the current bitmap and one buffer for the |
5229 | current page |
5230 | |
5231 | RETURN |
5232 | 0 ok |
5233 | 1 error (couldn't allocate memory or disk error) |
5234 | */ |
5235 | |
5236 | my_bool _ma_scan_init_block_record(MARIA_HA *info) |
5237 | { |
5238 | MARIA_SHARE *share= info->s; |
5239 | DBUG_ENTER("_ma_scan_init_block_record" ); |
5240 | /* |
5241 | bitmap_buff may already be allocated if this is the second call to |
5242 | rnd_init() without a rnd_end() in between, see sql/handler.h |
5243 | */ |
5244 | if (!(info->scan.bitmap_buff || |
5245 | ((info->scan.bitmap_buff= |
5246 | (uchar *) my_malloc(share->block_size * 2, MYF(MY_WME)))))) |
5247 | DBUG_RETURN(1); |
5248 | info->scan.page_buff= info->scan.bitmap_buff + share->block_size; |
5249 | info->scan.bitmap_end= info->scan.bitmap_buff + share->bitmap.max_total_size; |
5250 | |
5251 | /* Set scan variables to get _ma_scan_block() to start with reading bitmap */ |
5252 | info->scan.number_of_rows= 0; |
5253 | info->scan.bitmap_pos= info->scan.bitmap_end; |
5254 | info->scan.bitmap_page= (pgcache_page_no_t) 0 - share->bitmap.pages_covered; |
5255 | info->scan.max_page= share->state.state.data_file_length / share->block_size; |
5256 | /* |
5257 | We need to flush what's in memory (bitmap.map) to page cache otherwise, as |
5258 | we are going to read bitmaps from page cache in table scan (see |
5259 | _ma_scan_block_record()), we may miss recently inserted rows (bitmap page |
5260 | in page cache would be too old). |
5261 | */ |
5262 | DBUG_RETURN(_ma_bitmap_flush(info->s)); |
5263 | } |
5264 | |
5265 | |
5266 | /* Free buffers allocated by _ma_scan_block_init() */ |
5267 | |
5268 | void _ma_scan_end_block_record(MARIA_HA *info) |
5269 | { |
5270 | DBUG_ENTER("_ma_scan_end_block_record" ); |
5271 | my_free(info->scan.bitmap_buff); |
5272 | info->scan.bitmap_buff= 0; |
5273 | if (info->scan_save) |
5274 | { |
5275 | my_free(info->scan_save); |
5276 | info->scan_save= 0; |
5277 | } |
5278 | DBUG_VOID_RETURN; |
5279 | } |
5280 | |
5281 | |
5282 | /** |
5283 | @brief Save current scan position |
5284 | |
5285 | @note |
5286 | For the moment we can only remember one position, but this is |
5287 | good enough for MySQL usage |
5288 | |
5289 | @return |
5290 | @retval 0 ok |
5291 | @retval HA_ERR_WRONG_IN_RECORD Could not allocate memory to hold position |
5292 | */ |
5293 | |
5294 | int _ma_scan_remember_block_record(MARIA_HA *info, |
5295 | MARIA_RECORD_POS *lastpos) |
5296 | { |
5297 | uchar *bitmap_buff; |
5298 | DBUG_ENTER("_ma_scan_remember_block_record" ); |
5299 | if (!(info->scan_save)) |
5300 | { |
5301 | if (!(info->scan_save= my_malloc(ALIGN_SIZE(sizeof(*info->scan_save)) + |
5302 | info->s->block_size * 2, |
5303 | MYF(MY_WME)))) |
5304 | DBUG_RETURN(HA_ERR_OUT_OF_MEM); |
5305 | info->scan_save->bitmap_buff= ((uchar*) info->scan_save + |
5306 | ALIGN_SIZE(sizeof(*info->scan_save))); |
5307 | } |
5308 | /* For checking if pages have changed since we last read it */ |
5309 | info->scan.row_changes= info->row_changes; |
5310 | |
5311 | /* Remember used bitmap and used head page */ |
5312 | bitmap_buff= info->scan_save->bitmap_buff; |
5313 | memcpy(info->scan_save, &info->scan, sizeof(*info->scan_save)); |
5314 | info->scan_save->bitmap_buff= bitmap_buff; |
5315 | memcpy(bitmap_buff, info->scan.bitmap_buff, info->s->block_size * 2); |
5316 | |
5317 | /* Point to the last read row */ |
5318 | *lastpos= info->cur_row.nextpos - 1; |
5319 | info->scan_save->dir+= DIR_ENTRY_SIZE; |
5320 | DBUG_RETURN(0); |
5321 | } |
5322 | |
5323 | |
5324 | /** |
5325 | @brief restore scan block it's original values |
5326 | |
5327 | @return |
5328 | 0 ok |
5329 | # error |
5330 | |
5331 | @note |
5332 | In theory we could swap bitmap buffers instead of copy them. |
5333 | For the moment we don't do that because there are variables pointing |
5334 | inside the buffers and it's a bit of hassle to either make them relative |
5335 | or repoint them. |
5336 | |
5337 | If the data file has changed, we will re-read the new block record |
5338 | to ensure that when we continue scanning we can ignore any deleted rows. |
5339 | */ |
5340 | |
int _ma_scan_restore_block_record(MARIA_HA *info,
                                  MARIA_RECORD_POS lastpos)
{
  uchar *bitmap_buff;
  DBUG_ENTER("_ma_scan_restore_block_record" );

  info->cur_row.nextpos= lastpos;
  /*
    The struct copy below would overwrite scan.bitmap_buff with the saved
    buffer pointer; keep our own live buffer and restore the pointer after.
  */
  bitmap_buff= info->scan.bitmap_buff;
  memcpy(&info->scan, info->scan_save, sizeof(*info->scan_save));
  info->scan.bitmap_buff= bitmap_buff;
  memcpy(bitmap_buff, info->scan_save->bitmap_buff, info->s->block_size * 2);

  if (info->scan.row_changes != info->row_changes)
  {
    /*
      Table has been changed. We have to re-read the current page block as
      data may have changed on it that we have to see.
    */
    if (!(pagecache_read(info->s->pagecache,
                         &info->dfile,
                         ma_recordpos_to_page(info->scan.row_base_page),
                         0, info->scan.page_buff,
                         info->s->page_type,
                         PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
      DBUG_RETURN(my_errno);
    /* Refresh cached row count and directory end from the re-read page */
    info->scan.number_of_rows=
      (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET];
    info->scan.dir_end= (info->scan.page_buff + info->s->block_size -
                         PAGE_SUFFIX_SIZE -
                         info->scan.number_of_rows * DIR_ENTRY_SIZE);
  }
  DBUG_RETURN(0);
}
5374 | |
5375 | |
5376 | /* |
5377 | Read next record while scanning table |
5378 | |
5379 | SYNOPSIS |
5380 | _ma_scan_block_record() |
5381 | info Maria handler |
5382 | record Store found here |
5383 | record_pos Value stored in info->cur_row.next_pos after last call |
5384 | This is offset inside the current pagebuff |
5385 | skip_deleted |
5386 | |
5387 | NOTES |
5388 | - One must have called mi_scan() before this |
5389 | - In this version, we don't actually need record_pos, we as easily |
5390 | use a variable in info->scan |
5391 | |
5392 | IMPLEMENTATION |
5393 | Current code uses a lot of goto's to separate the different kind of |
5394 | states we may be in. This gives us a minimum of executed if's for |
5395 | the normal cases. I tried several different ways to code this, but |
5396 | the current one was in the end the most readable and fastest. |
5397 | |
5398 | RETURN |
5399 | 0 ok |
5400 | # Error code (Normally HA_ERR_END_OF_FILE) |
5401 | */ |
5402 | |
int _ma_scan_block_record(MARIA_HA *info, uchar *record,
                          MARIA_RECORD_POS record_pos,
                          my_bool skip_deleted __attribute__ ((unused)))
{
  uint block_size;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("_ma_scan_block_record" );

restart_record_read:
  /* Find next row in current page */
  while (likely(record_pos < info->scan.number_of_rows))
  {
    uint length, offset;
    uchar *data, *end_of_data;
    int error;

    /* Ensure that scan.dir and record_pos are in sync */
    DBUG_ASSERT(info->scan.dir == dir_entry_pos(info->scan.page_buff,
                                                share->block_size,
                                                (uint) record_pos));

    /* Search for a valid directory entry (not 0) */
    while (!(offset= uint2korr(info->scan.dir)))
    {
      /* A zero offset marks an unused/deleted entry; step past it */
      info->scan.dir-= DIR_ENTRY_SIZE;
      record_pos++;
#ifdef SANITY_CHECKS
      if (info->scan.dir < info->scan.dir_end)
      {
        DBUG_ASSERT(!maria_assert_if_crashed_table);
        goto err;
      }
#endif
    }
    /*
      This should always be true as the directory should always start with
      a valid entry.
    */
    DBUG_ASSERT(info->scan.dir >= info->scan.dir_end);

    /* found row */
    info->cur_row.lastpos= info->scan.row_base_page + record_pos;
    info->cur_row.nextpos= record_pos + 1;
    data= info->scan.page_buff + offset;
    length= uint2korr(info->scan.dir + 2);
    end_of_data= data + length;
    info->scan.dir-= DIR_ENTRY_SIZE;      /* Point to next row to process */
#ifdef SANITY_CHECKS
    if (end_of_data > info->scan.dir_end ||
        offset < PAGE_HEADER_SIZE(share) ||
        length < share->base.min_block_length)
    {
      DBUG_ASSERT(!(end_of_data > info->scan.dir_end));
      DBUG_ASSERT(!(offset < PAGE_HEADER_SIZE(share)));
      DBUG_ASSERT(!(length < share->base.min_block_length));
      goto err;
    }
#endif
    DBUG_PRINT("info" , ("rowid: %lu" , (ulong) info->cur_row.lastpos));
    error= _ma_read_block_record2(info, record, data, end_of_data);
    /* Rows not visible to this transaction are skipped silently */
    if (error != HA_ERR_ROW_NOT_VISIBLE)
      DBUG_RETURN(error);
    record_pos++;
  }

  /* Find next head page in current bitmap */
restart_bitmap_scan:
  block_size= share->block_size;
  if (likely(info->scan.bitmap_pos < info->scan.bitmap_end))
  {
    uchar *data= info->scan.bitmap_pos;
    longlong bits= info->scan.bits;
    uint bit_pos= info->scan.bit_pos;

    do
    {
      while (likely(bits))
      {
        /* Each page is described by a 3-bit pattern in the bitmap */
        uint pattern= (uint) (bits & 7);
        bits >>= 3;
        bit_pos++;
        if (pattern > 0 && pattern <= 4)
        {
          /* Found head page; Read it */
          pgcache_page_no_t page;
          info->scan.bitmap_pos= data;
          info->scan.bits= bits;
          info->scan.bit_pos= bit_pos;
          /* Each 6 bitmap bytes describe 16 pages (3 bits per page) */
          page= (info->scan.bitmap_page + 1 +
                 (data - info->scan.bitmap_buff) / 6 * 16 + bit_pos - 1);
          info->scan.row_base_page= ma_recordpos(page, 0);
          if (page >= info->scan.max_page)
          {
            DBUG_PRINT("info" , ("Found end of file" ));
            DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
          }
          if (!(pagecache_read(share->pagecache,
                               &info->dfile,
                               page, 0, info->scan.page_buff,
                               share->page_type,
                               PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
            DBUG_RETURN(my_errno);
          if (((info->scan.page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) !=
               HEAD_PAGE))
          {
            /*
              This may happen if someone has been deleting all rows
              from a page since we read the bitmap, so it may be ok.
              Print warning in debug log and continue.
            */
            DBUG_PRINT("warning" ,
                       ("Found page of type %d when expecting head page" ,
                        (info->scan.page_buff[PAGE_TYPE_OFFSET] &
                         PAGE_TYPE_MASK)));
            continue;
          }
          if ((info->scan.number_of_rows=
               (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) == 0)
          {
            DBUG_PRINT("error" , ("Wrong page header" ));
            _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
            DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
          }
          DBUG_PRINT("info" , ("Page %lu has %u rows" ,
                               (ulong) page, info->scan.number_of_rows));
          /* The directory grows downwards from the end of the page */
          info->scan.dir= (info->scan.page_buff + block_size -
                           PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE);
          info->scan.dir_end= (info->scan.dir -
                               (info->scan.number_of_rows - 1) *
                               DIR_ENTRY_SIZE);
          record_pos= 0;
          goto restart_record_read;
        }
      }
      for (data+= 6; data < info->scan.bitmap_end; data+= 6)
      {
        bits= uint6korr(data);
        /* Skip not allocated pages and blob / full tail pages */
        if (bits && bits != 07777777777777777LL)
          break;
      }
      bit_pos= 0;
    } while (data < info->scan.bitmap_end);
  }

  /* Read next bitmap */
  info->scan.bitmap_page+= share->bitmap.pages_covered;
  if (unlikely(info->scan.bitmap_page >= info->scan.max_page))
  {
    DBUG_PRINT("info" , ("Found end of file" ));
    DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
  }
  DBUG_PRINT("info" , ("Reading bitmap at %lu" ,
                       (ulong) info->scan.bitmap_page));
  if (!(pagecache_read(share->pagecache, &info->s->bitmap.file,
                       info->scan.bitmap_page,
                       0, info->scan.bitmap_buff, PAGECACHE_PLAIN_PAGE,
                       PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
    DBUG_RETURN(my_errno);
  /* Skip scanning 'bits' in bitmap scan code */
  info->scan.bitmap_pos= info->scan.bitmap_buff - 6;
  info->scan.bits= 0;
  goto restart_bitmap_scan;

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_PRINT("error" , ("Wrong data on page" ));
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
  DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
}
5573 | |
5574 | |
5575 | /* |
5576 | Compare a row against a stored one |
5577 | |
5578 | NOTES |
5579 | Not implemented, as block record is not supposed to be used in a shared |
5580 | global environment |
5581 | */ |
5582 | |
5583 | my_bool _ma_compare_block_record(MARIA_HA *info __attribute__ ((unused)), |
5584 | const uchar *record __attribute__ ((unused))) |
5585 | { |
5586 | return 0; |
5587 | } |
5588 | |
5589 | |
5590 | /* |
5591 | Store an integer with simple packing |
5592 | |
5593 | SYNOPSIS |
5594 | ma_store_integer() |
5595 | to Store the packed integer here |
5596 | nr Integer to store |
5597 | |
5598 | NOTES |
5599 | This is mostly used to store field numbers and lengths of strings. |
5600 | We have to cast the result for the LL() becasue of a bug in Forte CC |
5601 | compiler. |
5602 | |
5603 | Packing used is: |
5604 | nr < 251 is stored as is (in 1 byte) |
5605 | Numbers that require 1-4 bytes are stored as char(250+byte_length), data |
5606 | Bigger numbers are stored as 255, data as ulonglong (not yet done). |
5607 | |
5608 | RETURN |
5609 | Position in 'to' after the packed length |
5610 | */ |
5611 | |
5612 | uchar *ma_store_length(uchar *to, ulong nr) |
5613 | { |
5614 | if (nr < 251) |
5615 | { |
5616 | *to=(uchar) nr; |
5617 | return to+1; |
5618 | } |
5619 | if (nr < 65536) |
5620 | { |
5621 | if (nr <= 255) |
5622 | { |
5623 | to[0]= (uchar) 251; |
5624 | to[1]= (uchar) nr; |
5625 | return to+2; |
5626 | } |
5627 | to[0]= (uchar) 252; |
5628 | int2store(to+1, nr); |
5629 | return to+3; |
5630 | } |
5631 | if (nr < 16777216) |
5632 | { |
5633 | *to++= (uchar) 253; |
5634 | int3store(to, nr); |
5635 | return to+3; |
5636 | } |
5637 | *to++= (uchar) 254; |
5638 | int4store(to, nr); |
5639 | return to+4; |
5640 | } |
5641 | |
5642 | |
/* Calculate how many bytes ma_store_length() needs to store a number */

uint ma_calc_length_for_store_length(ulong nr)
{
  if (nr < 251)
    return 1;                                   /* Stored inline */
  if (nr <= 255)
    return 2;                                   /* Marker 251 + 1 byte */
  if (nr < 65536)
    return 3;                                   /* Marker 252 + 2 bytes */
  if (nr < 16777216)
    return 4;                                   /* Marker 253 + 3 bytes */
  return 5;                                     /* Marker 254 + 4 bytes */
}
5659 | |
5660 | |
5661 | /* Retrive a stored number */ |
5662 | |
5663 | static ulong ma_get_length(const uchar **packet) |
5664 | { |
5665 | reg1 const uchar *pos= *packet; |
5666 | if (*pos < 251) |
5667 | { |
5668 | (*packet)++; |
5669 | return (ulong) *pos; |
5670 | } |
5671 | if (*pos == 251) |
5672 | { |
5673 | (*packet)+= 2; |
5674 | return (ulong) pos[1]; |
5675 | } |
5676 | if (*pos == 252) |
5677 | { |
5678 | (*packet)+= 3; |
5679 | return (ulong) uint2korr(pos+1); |
5680 | } |
5681 | if (*pos == 253) |
5682 | { |
5683 | (*packet)+= 4; |
5684 | return (ulong) uint3korr(pos+1); |
5685 | } |
5686 | DBUG_ASSERT(*pos == 254); |
5687 | (*packet)+= 5; |
5688 | return (ulong) uint4korr(pos+1); |
5689 | } |
5690 | |
5691 | |
5692 | /* |
5693 | Fill array with pointers to field parts to be stored in log for insert |
5694 | |
5695 | SYNOPSIS |
5696 | fill_insert_undo_parts() |
5697 | info Maria handler |
5698 | record Inserted row |
5699 | log_parts Store pointers to changed memory areas here |
5700 | log_parts_count See RETURN |
5701 | |
5702 | NOTES |
5703 | We have information in info->cur_row about the read row. |
5704 | |
5705 | RETURN |
5706 | length of data in log_parts. |
5707 | log_parts_count contains number of used log_parts |
5708 | */ |
5709 | |
static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
                                     LEX_CUSTRING *log_parts,
                                     uint *log_parts_count)
{
  MARIA_SHARE *share= info->s;
  MARIA_COLUMNDEF *column, *end_column;
  uchar *field_lengths= info->cur_row.field_lengths;
  size_t row_length;
  MARIA_ROW *cur_row= &info->cur_row;
  LEX_CUSTRING *start_log_parts;
  DBUG_ENTER("fill_insert_undo_parts" );

  start_log_parts= log_parts;

  /* Store null bits */
  log_parts->str= record;
  log_parts->length= share->base.null_bytes;
  row_length= log_parts->length;
  log_parts++;

  /* Stored bitmap over packed (zero length or all-zero fields) */
  log_parts->str= info->cur_row.empty_bits;
  log_parts->length= share->base.pack_bytes;
  row_length+= log_parts->length;
  log_parts++;

  if (share->base.max_field_lengths)
  {
    /* Store length of all not empty char, varchar and blob fields */
    log_parts->str= field_lengths - 2;
    log_parts->length= info->cur_row.field_lengths_length+2;
    /* Prefix the lengths array with its own total length (2 bytes) */
    int2store(log_parts->str, info->cur_row.field_lengths_length);
    row_length+= log_parts->length;
    log_parts++;
  }

  if (share->base.blobs)
  {
    /*
      Store total blob length to make buffer allocation easier during UNDO
    */
    log_parts->str= info->length_buff;
    log_parts->length= (uint) (ma_store_length(info->length_buff,
                                               info->cur_row.blob_length) -
                               (uchar*) log_parts->str);
    row_length+= log_parts->length;
    log_parts++;
  }

  /* Handle constant length fields that are always present */
  for (column= share->columndef,
       end_column= column+ share->base.fixed_not_null_fields;
       column < end_column;
       column++)
  {
    log_parts->str= record + column->offset;
    log_parts->length= column->length;
    row_length+= log_parts->length;
    log_parts++;
  }

  /* Handle NULL fields and CHAR/VARCHAR fields */
  for (end_column= share->columndef + share->base.fields - share->base.blobs;
       column < end_column;
       column++)
  {
    const uchar *column_pos;
    size_t column_length;
    /* NULL or empty columns are fully described by the bitmaps above */
    if ((record[column->null_pos] & column->null_bit) ||
        cur_row->empty_bits[column->empty_pos] & column->empty_bit)
      continue;

    column_pos= record+ column->offset;
    column_length= column->length;

    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      break;
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      /* Length is stored in 1 or 2 bytes depending on max column length */
      if (column->length <= 255)
        column_length= *field_lengths++;
      else
      {
        column_length= uint2korr(field_lengths);
        field_lengths+= 2;
      }
      break;
    }
    case FIELD_VARCHAR:
    {
      /* fill_length is the size of the length prefix (1 or 2 bytes) */
      if (column->fill_length == 1)
        column_length= *field_lengths;
      else
        column_length= uint2korr(field_lengths);
      field_lengths+= column->fill_length;
      column_pos+= column->fill_length;
      break;
    }
    default:
      DBUG_ASSERT(0);
    }
    log_parts->str= column_pos;
    log_parts->length= column_length;
    row_length+= log_parts->length;
    log_parts++;
  }

  /* Add blobs */
  for (end_column+= share->base.blobs; column < end_column; column++)
  {
    const uchar *field_pos= record + column->offset;
    uint size_length= column->length - portable_sizeof_char_ptr;
    ulong blob_length= _ma_calc_blob_length(size_length, field_pos);

    /*
      We don't have to check for null, as blob_length is guaranteed to be 0
      if the blob is null
    */
    if (blob_length)
    {
      uchar *blob_pos;
      /* The pointer to the blob data is stored after its length bytes */
      memcpy(&blob_pos, record + column->offset + size_length,
             sizeof(blob_pos));
      log_parts->str= blob_pos;
      log_parts->length= blob_length;
      row_length+= log_parts->length;
      log_parts++;
    }
  }
  *log_parts_count= (uint) (log_parts - start_log_parts);
  DBUG_RETURN(row_length);
}
5847 | |
5848 | |
5849 | /* |
5850 | Fill array with pointers to field parts to be stored in log for update |
5851 | |
5852 | SYNOPSIS |
5853 | fill_update_undo_parts() |
5854 | info Maria handler |
5855 | oldrec Original row |
5856 | newrec New row |
5857 | log_parts Store pointers to changed memory areas here |
5858 | log_parts_count See RETURN |
5859 | |
5860 | IMPLEMENTATION |
5861 | Format of undo record: |
5862 | |
5863 | Fields are stored in same order as the field array. |
5864 | |
5865 | Offset to changed field data (packed) |
5866 | |
5867 | For each changed field |
5868 | Fieldnumber (packed) |
5869 | Length, if variable length field (packed) |
5870 | |
5871 | For each changed field |
5872 | Data |
5873 | |
  Packing is done using ma_store_length()
5875 | |
5876 | The reason we store field numbers & length separated from data (ie, not |
5877 | after each other) is to get better cpu caching when we loop over |
5878 | fields (as we probably don't have to access data for each field when we |
5879 | want to read and old row through the undo log record). |
5880 | |
5881 | As a special case, we use '255' for the field number of the null bitmap. |
5882 | |
5883 | RETURN |
5884 | length of data in log_parts. |
5885 | log_parts_count contains number of used log_parts |
5886 | */ |
5887 | |
static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
                                     const uchar *newrec,
                                     LEX_CUSTRING *log_parts,
                                     uint *log_parts_count)
{
  MARIA_SHARE *share= info->s;
  MARIA_COLUMNDEF *column, *end_column;
  MARIA_ROW *old_row= &info->cur_row, *new_row= &info->new_row;
  uchar *field_data, *start_field_data, *length_str;
  uchar *old_field_lengths= old_row->field_lengths;
  uchar *new_field_lengths= new_row->field_lengths;
  size_t row_length= 0;
  uint field_lengths;
  LEX_CUSTRING *start_log_parts;
  my_bool new_column_is_empty;
  DBUG_ENTER("fill_update_undo_parts" );

  start_log_parts= log_parts;

  /*
    First log part is for number of fields, field numbers and lengths
    The +4 is to reserve place for the number of changed fields.
  */
  start_field_data= field_data= info->update_field_data + 4;
  log_parts++;

  if (memcmp(oldrec, newrec, share->base.null_bytes))
  {
    /* Store changed null bits; field number 255 marks the null bitmap */
    *field_data++= (uchar) 255;                 /* Special case */
    log_parts->str= oldrec;
    log_parts->length= share->base.null_bytes;
    row_length= log_parts->length;
    log_parts++;
  }

  /* Handle constant length fields */
  for (column= share->columndef,
       end_column= column+ share->base.fixed_not_null_fields;
       column < end_column;
       column++)
  {
    if (memcmp(oldrec + column->offset, newrec + column->offset,
               column->length))
    {
      /* Changed: log the field number and the old field value */
      field_data= ma_store_length(field_data,
                                  (uint) (column - share->columndef));
      log_parts->str= oldrec + column->offset;
      log_parts->length= column->length;
      row_length+= column->length;
      log_parts++;
    }
  }

  /* Handle the rest: NULL fields and CHAR/VARCHAR fields and BLOB's */
  for (end_column= share->columndef + share->base.fields;
       column < end_column;
       column++)
  {
    const uchar *new_column_pos, *old_column_pos;
    size_t new_column_length, old_column_length;

    /* First check if old column is null or empty */
    if (oldrec[column->null_pos] & column->null_bit)
    {
      /*
        It's safe to skip this one as either the new column is also null
        (no change) or the new_column is not null, in which case the null-bit
        maps differed and we have already stored the null bitmap.
      */
      continue;
    }
    if (old_row->empty_bits[column->empty_pos] & column->empty_bit)
    {
      if (new_row->empty_bits[column->empty_pos] & column->empty_bit)
        continue;                               /* Both are empty; skip */

      /* Store null length column */
      field_data= ma_store_length(field_data,
                                  (uint) (column - share->columndef));
      field_data= ma_store_length(field_data, 0);
      continue;
    }
    /*
      Remember if the 'new' value is empty (as in this case we must always
      log the original value
    */
    new_column_is_empty= ((newrec[column->null_pos] & column->null_bit) ||
                          (new_row->empty_bits[column->empty_pos] &
                           column->empty_bit));

    old_column_pos= oldrec + column->offset;
    new_column_pos= newrec + column->offset;
    old_column_length= new_column_length= column->length;

    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      break;
    case FIELD_VARCHAR:
      new_column_length--;                      /* Skip length prefix */
      old_column_pos+= column->fill_length;
      new_column_pos+= column->fill_length;
      /* Fall through */
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      /* Lengths are stored in 1 or 2 bytes depending on max column length */
      if (new_column_length <= 255)
      {
        old_column_length= *old_field_lengths++;
        if (!new_column_is_empty)
          new_column_length= *new_field_lengths++;
      }
      else
      {
        old_column_length= uint2korr(old_field_lengths);
        old_field_lengths+= 2;
        if (!new_column_is_empty)
        {
          new_column_length= uint2korr(new_field_lengths);
          new_field_lengths+= 2;
        }
      }
      break;
    }
    case FIELD_BLOB:
    {
      uint size_length= column->length - portable_sizeof_char_ptr;
      old_column_length= _ma_calc_blob_length(size_length, old_column_pos);
      /* Repoint at the blob data itself, stored after the length bytes */
      memcpy((void*) &old_column_pos, oldrec + column->offset + size_length,
             sizeof(old_column_pos));
      if (!new_column_is_empty)
      {
        new_column_length= _ma_calc_blob_length(size_length, new_column_pos);
        memcpy((void*) &new_column_pos, newrec + column->offset + size_length,
               sizeof(old_column_pos));
      }
      break;
    }
    default:
      DBUG_ASSERT(0);
    }

    /* Log the old value if the column changed (or new value is empty) */
    if (new_column_is_empty || new_column_length != old_column_length ||
        memcmp(old_column_pos, new_column_pos, new_column_length))
    {
      field_data= ma_store_length(field_data,
                                  (ulong) (column - share->columndef));
      field_data= ma_store_length(field_data, (ulong) old_column_length);

      log_parts->str= old_column_pos;
      log_parts->length= old_column_length;
      row_length+= old_column_length;
      log_parts++;
    }
  }

  *log_parts_count= (uint) (log_parts - start_log_parts);

  /* Store length of field length data before the field/field_lengths */
  field_lengths= (uint) (field_data - start_field_data);
  /* Pack the total length immediately in front of the field data */
  length_str= start_field_data - ma_calc_length_for_store_length(field_lengths);
  start_log_parts->str= length_str;
  ma_store_length(length_str, field_lengths);
  start_log_parts->length= (size_t) (field_data - start_log_parts->str);
  row_length+= start_log_parts->length;
  DBUG_RETURN(row_length);
}
6058 | |
6059 | /*************************************************************************** |
6060 | In-write hooks called under log's lock when log record is written |
6061 | ***************************************************************************/ |
6062 | |
6063 | /** |
6064 | @brief Sets transaction's rec_lsn if needed |
6065 | |
6066 | A transaction sometimes writes a REDO even before the page is in the |
6067 | pagecache (example: brand new head or tail pages; full pages). So, if |
6068 | Checkpoint happens just after the REDO write, it needs to know that the |
6069 | REDO phase must start before this REDO. Scanning the pagecache cannot |
6070 | tell that as the page is not in the cache. So, transaction sets its rec_lsn |
6071 | to the REDO's LSN or somewhere before, and Checkpoint reads the |
6072 | transaction's rec_lsn. |
6073 | |
6074 | @return Operation status, always 0 (success) |
6075 | */ |
6076 | |
6077 | my_bool write_hook_for_redo(enum translog_record_type type |
6078 | __attribute__ ((unused)), |
6079 | TRN *trn, MARIA_HA *tbl_info |
6080 | __attribute__ ((unused)), |
6081 | LSN *lsn, void *hook_arg |
6082 | __attribute__ ((unused))) |
6083 | { |
6084 | /* |
6085 | Users of dummy_transaction_object must keep this TRN clean as it |
6086 | is used by many threads (like those manipulating non-transactional |
6087 | tables). It might be dangerous if one user sets rec_lsn or some other |
6088 | member and it is picked up by another user (like putting this rec_lsn into |
6089 | a page of a non-transactional table); it's safer if all members stay 0. So |
6090 | non-transactional log records (REPAIR, CREATE, RENAME, DROP) should not |
6091 | call this hook; we trust them but verify ;) |
6092 | */ |
6093 | DBUG_ASSERT(trn->trid != 0); |
6094 | /* |
6095 | If the hook stays so simple, it would be faster to pass |
6096 | !trn->rec_lsn ? trn->rec_lsn : some_dummy_lsn |
6097 | to translog_write_record(), like Monty did in his original code, and not |
6098 | have a hook. For now we keep it like this. |
6099 | */ |
6100 | if (trn->rec_lsn == 0) |
6101 | trn->rec_lsn= *lsn; |
6102 | return 0; |
6103 | } |
6104 | |
6105 | |
6106 | /** |
6107 | @brief Sets transaction's undo_lsn, first_undo_lsn if needed |
6108 | |
6109 | @return Operation status, always 0 (success) |
6110 | */ |
6111 | |
6112 | my_bool write_hook_for_undo(enum translog_record_type type |
6113 | __attribute__ ((unused)), |
6114 | TRN *trn, MARIA_HA *tbl_info |
6115 | __attribute__ ((unused)), |
6116 | LSN *lsn, void *hook_arg |
6117 | __attribute__ ((unused))) |
6118 | { |
6119 | DBUG_ASSERT(trn->trid != 0); |
6120 | trn->undo_lsn= *lsn; |
6121 | if (unlikely(LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn) == 0)) |
6122 | trn->first_undo_lsn= |
6123 | trn->undo_lsn | LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn); |
6124 | return 0; |
6125 | /* |
6126 | when we implement purging, we will specialize this hook: UNDO_PURGE |
6127 | records will additionally set trn->undo_purge_lsn |
6128 | */ |
6129 | } |
6130 | |
6131 | |
6132 | /** |
6133 | @brief Sets the table's records count and checksum and others to 0, then |
6134 | calls the generic REDO hook. |
6135 | |
6136 | @return Operation status, always 0 (success) |
6137 | */ |
6138 | |
6139 | my_bool write_hook_for_redo_delete_all(enum translog_record_type type |
6140 | __attribute__ ((unused)), |
6141 | TRN *trn, MARIA_HA *tbl_info |
6142 | __attribute__ ((unused)), |
6143 | LSN *lsn, void *hook_arg) |
6144 | { |
6145 | _ma_reset_status(tbl_info); |
6146 | return write_hook_for_redo(type, trn, tbl_info, lsn, hook_arg); |
6147 | } |
6148 | |
6149 | |
6150 | /** |
6151 | @brief Updates "records" and "checksum" and calls the generic UNDO hook |
6152 | |
6153 | @return Operation status, always 0 (success) |
6154 | */ |
6155 | |
6156 | my_bool write_hook_for_undo_row_insert(enum translog_record_type type |
6157 | __attribute__ ((unused)), |
6158 | TRN *trn, MARIA_HA *tbl_info, |
6159 | LSN *lsn, void *hook_arg) |
6160 | { |
6161 | MARIA_SHARE *share= tbl_info->s; |
6162 | share->state.state.records++; |
6163 | share->state.state.checksum+= *(ha_checksum *)hook_arg; |
6164 | return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg); |
6165 | } |
6166 | |
6167 | |
6168 | /** |
6169 | @brief Updates "records" and calls the generic UNDO hook |
6170 | |
6171 | @return Operation status, always 0 (success) |
6172 | */ |
6173 | |
6174 | my_bool write_hook_for_undo_row_delete(enum translog_record_type type |
6175 | __attribute__ ((unused)), |
6176 | TRN *trn, MARIA_HA *tbl_info, |
6177 | LSN *lsn, void *hook_arg) |
6178 | { |
6179 | MARIA_SHARE *share= tbl_info->s; |
6180 | share->state.state.records--; |
6181 | share->state.state.checksum+= *(ha_checksum *)hook_arg; |
6182 | return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg); |
6183 | } |
6184 | |
6185 | |
6186 | /** |
6187 | @brief Upates "records" and "checksum" and calls the generic UNDO hook |
6188 | |
6189 | @return Operation status, always 0 (success) |
6190 | */ |
6191 | |
6192 | my_bool write_hook_for_undo_row_update(enum translog_record_type type |
6193 | __attribute__ ((unused)), |
6194 | TRN *trn, MARIA_HA *tbl_info, |
6195 | LSN *lsn, void *hook_arg) |
6196 | { |
6197 | MARIA_SHARE *share= tbl_info->s; |
6198 | share->state.state.checksum+= *(ha_checksum *)hook_arg; |
6199 | return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg); |
6200 | } |
6201 | |
6202 | |
6203 | my_bool write_hook_for_undo_bulk_insert(enum translog_record_type type |
6204 | __attribute__ ((unused)), |
6205 | TRN *trn, MARIA_HA *tbl_info, |
6206 | LSN *lsn, void *hook_arg) |
6207 | { |
6208 | /* |
6209 | We are going to call maria_delete_all_rows(), but without logging and |
6210 | syncing, as an optimization (if we crash before commit, the UNDO will |
6211 | empty; if we crash after commit, we have flushed and forced the files). |
6212 | Status still needs to be reset under log mutex, in case of a concurrent |
6213 | checkpoint. |
6214 | */ |
6215 | _ma_reset_status(tbl_info); |
6216 | return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg); |
6217 | } |
6218 | |
6219 | |
6220 | /** |
6221 | @brief Updates table's lsn_of_file_id. |
6222 | |
6223 | @return Operation status, always 0 (success) |
6224 | */ |
6225 | |
6226 | my_bool write_hook_for_file_id(enum translog_record_type type |
6227 | __attribute__ ((unused)), |
6228 | TRN *trn |
6229 | __attribute__ ((unused)), |
6230 | MARIA_HA *tbl_info, |
6231 | LSN *lsn, |
6232 | void *hook_arg |
6233 | __attribute__ ((unused))) |
6234 | { |
6235 | DBUG_ASSERT(cmp_translog_addr(tbl_info->s->lsn_of_file_id, *lsn) < 0); |
6236 | tbl_info->s->lsn_of_file_id= *lsn; |
6237 | return 0; |
6238 | } |
6239 | |
6240 | |
6241 | /** |
6242 | Updates transaction's rec_lsn when committing. |
6243 | |
6244 | A transaction writes its commit record before being committed in trnman, so |
6245 | if Checkpoint happens just between the COMMIT record log write and the |
6246 | commit in trnman, it will record that transaction is not committed. Assume |
6247 | the transaction (trn1) did an INSERT; after the checkpoint, a second |
6248 | transaction (trn2) does a DELETE of what trn1 has inserted. Then crash, |
6249 | Checkpoint record says that trn1 was not committed, and REDO phase starts |
6250 | from Checkpoint record's LSN. So it will not find the COMMIT record of |
6251 | trn1, will want to roll back trn1, which will fail because the row/key |
6252 | which it wants to delete does not exist anymore. |
6253 | To avoid this, Checkpoint needs to know that the REDO phase must start |
6254 | before this COMMIT record, so transaction sets its rec_lsn to the COMMIT's |
6255 | record LSN, and as Checkpoint reads the transaction's rec_lsn, Checkpoint |
6256 | will know. |
6257 | |
6258 | @note so after commit trn->rec_lsn is a "commit LSN", which could be of |
6259 | use later. |
6260 | |
6261 | @return Operation status, always 0 (success) |
6262 | */ |
6263 | |
6264 | my_bool write_hook_for_commit(enum translog_record_type type |
6265 | __attribute__ ((unused)), |
6266 | TRN *trn, |
6267 | MARIA_HA *tbl_info |
6268 | __attribute__ ((unused)), |
6269 | LSN *lsn, |
6270 | void *hook_arg |
6271 | __attribute__ ((unused))) |
6272 | { |
6273 | trn->rec_lsn= *lsn; |
6274 | return 0; |
6275 | } |
6276 | |
6277 | |
6278 | /*************************************************************************** |
6279 | Applying of REDO log records |
6280 | ***************************************************************************/ |
6281 | |
6282 | /* |
6283 | Apply changes to head and tail pages |
6284 | |
6285 | SYNOPSIS |
6286 | _ma_apply_redo_insert_row_head_or_tail() |
6287 | info Maria handler |
6288 | lsn LSN to put on page |
6289 | page_type HEAD_PAGE or TAIL_PAGE |
6290 | new_page True if this is first entry on page |
6291 | header Header (without FILEID) |
6292 | data Data to be put on page |
6293 | data_length Length of data |
6294 | |
6295 | NOTE |
6296 | Handles LOGREC_REDO_INSERT_ROW_HEAD, LOGREC_REDO_INSERT_ROW_TAIL |
6297 | LOGREC_REDO_NEW_ROW_HEAD and LOGREC_REDO_NEW_ROW_TAIL |
6298 | |
6299 | RETURN |
6300 | 0 ok |
6301 | # Error number |
6302 | */ |
6303 | |
6304 | uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn, |
6305 | uint page_type, |
6306 | my_bool new_page, |
6307 | const uchar *, |
6308 | const uchar *data, |
6309 | size_t data_length) |
6310 | { |
6311 | MARIA_SHARE *share= info->s; |
6312 | pgcache_page_no_t page; |
6313 | uint rownr, empty_space; |
6314 | uint block_size= share->block_size; |
6315 | uint rec_offset; |
6316 | uchar *buff, *dir; |
6317 | uint result; |
6318 | MARIA_PINNED_PAGE page_link; |
6319 | enum pagecache_page_lock lock_method; |
6320 | enum pagecache_page_pin pin_method; |
6321 | my_off_t end_of_page; |
6322 | uint error; |
6323 | DBUG_ENTER("_ma_apply_redo_insert_row_head_or_tail" ); |
6324 | |
6325 | page= page_korr(header); |
6326 | rownr= dirpos_korr(header + PAGE_STORE_SIZE); |
6327 | |
6328 | DBUG_PRINT("enter" , ("rowid: %lu page: %lu rownr: %u data_length: %u" , |
6329 | (ulong) ma_recordpos(page, rownr), |
6330 | (ulong) page, rownr, (uint) data_length)); |
6331 | |
6332 | share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED | |
6333 | STATE_NOT_MOVABLE); |
6334 | |
6335 | end_of_page= (page + 1) * share->block_size; |
6336 | if (end_of_page > share->state.state.data_file_length) |
6337 | { |
6338 | DBUG_PRINT("info" , ("Enlarging data file from %lu to %lu" , |
6339 | (ulong) share->state.state.data_file_length, |
6340 | (ulong) end_of_page)); |
6341 | /* |
6342 | New page at end of file. Note that the test above is also positive if |
6343 | data_file_length is not a multiple of block_size (system crashed while |
6344 | writing the last page): in this case we just extend the last page and |
6345 | fill it entirely with zeroes, then the REDO will put correct data on |
6346 | it. |
6347 | */ |
6348 | lock_method= PAGECACHE_LOCK_WRITE; |
6349 | pin_method= PAGECACHE_PIN; |
6350 | |
6351 | DBUG_ASSERT(rownr == 0 && new_page); |
6352 | if (rownr != 0 || !new_page) |
6353 | goto crashed_file; |
6354 | |
6355 | buff= info->keyread_buff; |
6356 | info->keyread_buff_used= 1; |
6357 | make_empty_page(info, buff, page_type, 1); |
6358 | empty_space= (block_size - PAGE_OVERHEAD_SIZE(share)); |
6359 | rec_offset= PAGE_HEADER_SIZE(share); |
6360 | dir= buff+ block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE; |
6361 | } |
6362 | else |
6363 | { |
6364 | lock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED; |
6365 | pin_method= PAGECACHE_PIN_LEFT_PINNED; |
6366 | |
6367 | share->pagecache->readwrite_flags&= ~MY_WME; |
6368 | buff= pagecache_read(share->pagecache, &info->dfile, |
6369 | page, 0, 0, |
6370 | PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, |
6371 | &page_link.link); |
6372 | share->pagecache->readwrite_flags= share->pagecache->org_readwrite_flags; |
6373 | if (!buff) |
6374 | { |
6375 | /* Skip errors when reading outside of file and uninitialized pages */ |
6376 | if (!new_page || (my_errno != HA_ERR_FILE_TOO_SHORT && |
6377 | my_errno != HA_ERR_WRONG_CRC)) |
6378 | { |
6379 | DBUG_PRINT("error" , ("Error %d when reading page" , (int) my_errno)); |
6380 | goto err; |
6381 | } |
6382 | /* Create new page */ |
6383 | buff= pagecache_block_link_to_buffer(page_link.link); |
6384 | buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE; |
6385 | } |
6386 | else if (lsn_korr(buff) >= lsn) /* Test if already applied */ |
6387 | { |
6388 | /* Fix bitmap, just in case */ |
6389 | empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET); |
6390 | if (!enough_free_entries_on_page(share, buff)) |
6391 | empty_space= 0; /* Page is full */ |
6392 | |
6393 | if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space)) |
6394 | goto err; |
6395 | pagecache_unlock_by_link(share->pagecache, page_link.link, |
6396 | PAGECACHE_LOCK_WRITE_UNLOCK, |
6397 | PAGECACHE_UNPIN, LSN_IMPOSSIBLE, |
6398 | LSN_IMPOSSIBLE, 0, FALSE); |
6399 | DBUG_RETURN(0); |
6400 | } |
6401 | |
6402 | if (((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != page_type)) |
6403 | { |
6404 | /* |
6405 | This is a page that has been freed before and now should be |
6406 | changed to new type. |
6407 | */ |
6408 | if (!new_page) |
6409 | { |
6410 | DBUG_PRINT("error" , |
6411 | ("Found page of wrong type: %u, should have been %u" , |
6412 | (uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK), |
6413 | page_type)); |
6414 | goto crashed_file; |
6415 | } |
6416 | make_empty_page(info, buff, page_type, 0); |
6417 | empty_space= block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE; |
6418 | (void) extend_directory(info, buff, block_size, 0, rownr, &empty_space, |
6419 | page_type == HEAD_PAGE); |
6420 | rec_offset= PAGE_HEADER_SIZE(share); |
6421 | dir= dir_entry_pos(buff, block_size, rownr); |
6422 | empty_space+= uint2korr(dir+2); |
6423 | } |
6424 | else |
6425 | { |
6426 | uint max_entry= (uint) buff[DIR_COUNT_OFFSET]; |
6427 | uint length; |
6428 | |
6429 | DBUG_ASSERT(!new_page); |
6430 | dir= dir_entry_pos(buff, block_size, rownr); |
6431 | empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET); |
6432 | |
6433 | if (max_entry <= rownr) |
6434 | { |
6435 | /* Add directory entry first in directory and data last on page */ |
6436 | if (extend_directory(info, buff, block_size, max_entry, rownr, |
6437 | &empty_space, page_type == HEAD_PAGE)) |
6438 | goto crashed_file; |
6439 | } |
6440 | if (extend_area_on_page(info, buff, dir, rownr, |
6441 | (uint) data_length, &empty_space, |
6442 | &rec_offset, &length, page_type == HEAD_PAGE)) |
6443 | goto crashed_file; |
6444 | } |
6445 | } |
6446 | /* Copy data */ |
6447 | int2store(dir+2, data_length); |
6448 | memcpy(buff + rec_offset, data, data_length); |
6449 | empty_space-= (uint) data_length; |
6450 | int2store(buff + EMPTY_SPACE_OFFSET, empty_space); |
6451 | |
6452 | /* Fix bitmap */ |
6453 | if (!enough_free_entries_on_page(share, buff)) |
6454 | empty_space= 0; /* Page is full */ |
6455 | if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space)) |
6456 | goto err; |
6457 | |
6458 | /* |
6459 | If page was not read before, write it but keep it pinned. |
6460 | We don't update its LSN When we have processed all REDOs for this page |
6461 | in the current REDO's group, we will stamp page with UNDO's LSN |
6462 | (if we stamped it now, a next REDO, in |
6463 | this group, for this page, would be skipped) and unpin then. |
6464 | */ |
6465 | result= 0; |
6466 | if (lock_method == PAGECACHE_LOCK_WRITE && |
6467 | pagecache_write(share->pagecache, |
6468 | &info->dfile, page, 0, |
6469 | buff, PAGECACHE_PLAIN_PAGE, |
6470 | lock_method, pin_method, |
6471 | PAGECACHE_WRITE_DELAY, &page_link.link, |
6472 | LSN_IMPOSSIBLE)) |
6473 | result= my_errno; |
6474 | |
6475 | page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; |
6476 | page_link.changed= 1; |
6477 | push_dynamic(&info->pinned_pages, (void*) &page_link); |
6478 | |
6479 | /* |
6480 | Data page and bitmap page are in place, we can update data_file_length in |
6481 | case we extended the file. We could not do it earlier: bitmap code tests |
6482 | data_file_length to know if it has to create a new page or not. |
6483 | */ |
6484 | set_if_bigger(share->state.state.data_file_length, end_of_page); |
6485 | DBUG_RETURN(result); |
6486 | |
6487 | crashed_file: |
6488 | _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD); |
6489 | err: |
6490 | error= my_errno; |
6491 | if (lock_method == PAGECACHE_LOCK_LEFT_WRITELOCKED) |
6492 | pagecache_unlock_by_link(share->pagecache, page_link.link, |
6493 | PAGECACHE_LOCK_WRITE_UNLOCK, |
6494 | PAGECACHE_UNPIN, LSN_IMPOSSIBLE, |
6495 | LSN_IMPOSSIBLE, 0, FALSE); |
6496 | _ma_mark_file_crashed(share); |
6497 | DBUG_ASSERT(!maria_assert_if_crashed_table); /* catch recovery error early */ |
6498 | DBUG_RETURN((my_errno= error)); |
6499 | } |
6500 | |
6501 | |
6502 | /* |
6503 | Apply LOGREC_REDO_PURGE_ROW_HEAD & LOGREC_REDO_PURGE_ROW_TAIL |
6504 | |
6505 | SYNOPSIS |
6506 | _ma_apply_redo_purge_row_head_or_tail() |
6507 | info Maria handler |
6508 | lsn LSN to put on page |
6509 | page_type HEAD_PAGE or TAIL_PAGE |
6510 | header Header (without FILEID) |
6511 | |
6512 | NOTES |
6513 | This function is very similar to delete_head_or_tail() |
6514 | |
6515 | RETURN |
6516 | 0 ok |
6517 | # Error number |
6518 | */ |
6519 | |
6520 | uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn, |
6521 | uint page_type, |
6522 | const uchar *) |
6523 | { |
6524 | MARIA_SHARE *share= info->s; |
6525 | pgcache_page_no_t page; |
6526 | uint rownr, empty_space; |
6527 | uchar *buff; |
6528 | int result; |
6529 | uint error; |
6530 | MARIA_PINNED_PAGE page_link; |
6531 | DBUG_ENTER("_ma_apply_redo_purge_row_head_or_tail" ); |
6532 | |
6533 | page= page_korr(header); |
6534 | rownr= dirpos_korr(header+PAGE_STORE_SIZE); |
6535 | DBUG_PRINT("enter" , ("rowid: %lu page: %lu rownr: %u" , |
6536 | (ulong) ma_recordpos(page, rownr), |
6537 | (ulong) page, rownr)); |
6538 | |
6539 | share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED | |
6540 | STATE_NOT_MOVABLE); |
6541 | |
6542 | if (!(buff= pagecache_read(share->pagecache, &info->dfile, |
6543 | page, 0, 0, |
6544 | PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, |
6545 | &page_link.link))) |
6546 | goto err; |
6547 | |
6548 | if (lsn_korr(buff) >= lsn) |
6549 | { |
6550 | /* |
6551 | Already applied |
6552 | Note that in case the page is not anymore a head or tail page |
6553 | a future redo will fix the bitmap. |
6554 | */ |
6555 | if ((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == page_type) |
6556 | { |
6557 | empty_space= uint2korr(buff+EMPTY_SPACE_OFFSET); |
6558 | if (!enough_free_entries_on_page(share, buff)) |
6559 | empty_space= 0; /* Page is full */ |
6560 | if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, |
6561 | empty_space)) |
6562 | goto err; |
6563 | } |
6564 | pagecache_unlock_by_link(share->pagecache, page_link.link, |
6565 | PAGECACHE_LOCK_WRITE_UNLOCK, |
6566 | PAGECACHE_UNPIN, LSN_IMPOSSIBLE, |
6567 | LSN_IMPOSSIBLE, 0, FALSE); |
6568 | DBUG_RETURN(0); |
6569 | } |
6570 | |
6571 | DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == (uchar) page_type); |
6572 | |
6573 | if (delete_dir_entry(share, buff, rownr, &empty_space) < 0) |
6574 | { |
6575 | _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD); |
6576 | goto err; |
6577 | } |
6578 | |
6579 | page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; |
6580 | page_link.changed= 1; |
6581 | push_dynamic(&info->pinned_pages, (void*) &page_link); |
6582 | |
6583 | result= 0; |
6584 | if (!enough_free_entries_on_page(share, buff)) |
6585 | empty_space= 0; /* Page is full */ |
6586 | /* This will work even if the page was marked as UNALLOCATED_PAGE */ |
6587 | if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space)) |
6588 | result= my_errno; |
6589 | |
6590 | DBUG_RETURN(result); |
6591 | |
6592 | err: |
6593 | error= my_errno; |
6594 | pagecache_unlock_by_link(share->pagecache, page_link.link, |
6595 | PAGECACHE_LOCK_WRITE_UNLOCK, |
6596 | PAGECACHE_UNPIN, LSN_IMPOSSIBLE, |
6597 | LSN_IMPOSSIBLE, 0, FALSE); |
6598 | _ma_mark_file_crashed(share); |
6599 | DBUG_ASSERT(!maria_assert_if_crashed_table); |
6600 | DBUG_RETURN((my_errno= error)); |
6601 | |
6602 | } |
6603 | |
6604 | |
6605 | /** |
6606 | @brief Apply LOGREC_REDO_FREE_BLOCKS |
6607 | |
6608 | @param info Maria handler |
6609 | @param header Header (without FILEID) |
6610 | |
6611 | Mark the pages free in the bitmap. |
6612 | |
6613 | We have to check against _ma_redo_not_needed_for_page() |
6614 | to guard against the case where we first clear a block and after |
6615 | that insert new data into the blocks. If we would unconditionally |
6616 | clear the bitmap here, future changes would be ignored for the page |
6617 | if it's not in the dirty list (ie, it would be flushed). |
6618 | |
6619 | @return Operation status |
6620 | @retval 0 OK |
6621 | @retval 1 Error |
6622 | */ |
6623 | |
6624 | uint _ma_apply_redo_free_blocks(MARIA_HA *info, |
6625 | LSN lsn __attribute__((unused)), |
6626 | LSN redo_lsn, |
6627 | const uchar *) |
6628 | { |
6629 | MARIA_SHARE *share= info->s; |
6630 | uint ranges; |
6631 | uint16 sid; |
6632 | DBUG_ENTER("_ma_apply_redo_free_blocks" ); |
6633 | |
6634 | share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED | |
6635 | STATE_NOT_MOVABLE); |
6636 | |
6637 | sid= fileid_korr(header); |
6638 | header+= FILEID_STORE_SIZE; |
6639 | ranges= pagerange_korr(header); |
6640 | header+= PAGERANGE_STORE_SIZE; |
6641 | DBUG_ASSERT(ranges > 0); |
6642 | |
6643 | /** @todo leave bitmap lock to the bitmap code... */ |
6644 | mysql_mutex_lock(&share->bitmap.bitmap_lock); |
6645 | while (ranges--) |
6646 | { |
6647 | my_bool res; |
6648 | uint page_range; |
6649 | pgcache_page_no_t page, start_page; |
6650 | |
6651 | start_page= page= page_korr(header); |
6652 | header+= PAGE_STORE_SIZE; |
6653 | /* Page range may have this bit set to indicate a tail page */ |
6654 | page_range= pagerange_korr(header) & ~(TAIL_BIT | START_EXTENT_BIT); |
6655 | DBUG_ASSERT(page_range > 0); |
6656 | |
6657 | header+= PAGERANGE_STORE_SIZE; |
6658 | |
6659 | DBUG_PRINT("info" , ("page: %lu pages: %u" , (long) page, page_range)); |
6660 | |
6661 | for ( ; page_range-- ; start_page++) |
6662 | { |
6663 | if (_ma_redo_not_needed_for_page(sid, redo_lsn, start_page, FALSE)) |
6664 | continue; |
6665 | res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, start_page, |
6666 | 1); |
6667 | if (res) |
6668 | { |
6669 | mysql_mutex_unlock(&share->bitmap.bitmap_lock); |
6670 | _ma_mark_file_crashed(share); |
6671 | DBUG_ASSERT(!maria_assert_if_crashed_table); |
6672 | DBUG_RETURN(res); |
6673 | } |
6674 | } |
6675 | } |
6676 | mysql_mutex_unlock(&share->bitmap.bitmap_lock); |
6677 | DBUG_RETURN(0); |
6678 | } |
6679 | |
6680 | |
6681 | /** |
6682 | @brief Apply LOGREC_REDO_FREE_HEAD_OR_TAIL |
6683 | |
6684 | @param info Maria handler |
6685 | @param header Header (without FILEID) |
6686 | |
6687 | @note It marks the page free in the bitmap, and sets the directory's count |
6688 | to 0. |
6689 | |
6690 | @return Operation status |
6691 | @retval 0 OK |
6692 | @retval 1 Error |
6693 | */ |
6694 | |
6695 | uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn, |
6696 | const uchar *) |
6697 | { |
6698 | MARIA_SHARE *share= info->s; |
6699 | uchar *buff; |
6700 | pgcache_page_no_t page; |
6701 | MARIA_PINNED_PAGE page_link; |
6702 | my_bool res; |
6703 | DBUG_ENTER("_ma_apply_redo_free_head_or_tail" ); |
6704 | |
6705 | share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED | |
6706 | STATE_NOT_MOVABLE); |
6707 | |
6708 | page= page_korr(header); |
6709 | |
6710 | if (!(buff= pagecache_read(share->pagecache, |
6711 | &info->dfile, |
6712 | page, 0, 0, |
6713 | PAGECACHE_PLAIN_PAGE, |
6714 | PAGECACHE_LOCK_WRITE, &page_link.link))) |
6715 | { |
6716 | pagecache_unlock_by_link(share->pagecache, page_link.link, |
6717 | PAGECACHE_LOCK_WRITE_UNLOCK, |
6718 | PAGECACHE_UNPIN, LSN_IMPOSSIBLE, |
6719 | LSN_IMPOSSIBLE, 0, FALSE); |
6720 | goto err; |
6721 | } |
6722 | if (lsn_korr(buff) >= lsn) |
6723 | { |
6724 | /* Already applied */ |
6725 | pagecache_unlock_by_link(share->pagecache, page_link.link, |
6726 | PAGECACHE_LOCK_WRITE_UNLOCK, |
6727 | PAGECACHE_UNPIN, LSN_IMPOSSIBLE, |
6728 | LSN_IMPOSSIBLE, 0, FALSE); |
6729 | } |
6730 | else |
6731 | { |
6732 | buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE; |
6733 | #ifdef IDENTICAL_PAGES_AFTER_RECOVERY |
6734 | { |
6735 | uint number_of_records= (uint) buff[DIR_COUNT_OFFSET]; |
6736 | uchar *dir= dir_entry_pos(buff, share->block_size, |
6737 | number_of_records-1); |
6738 | buff[DIR_FREE_OFFSET]= END_OF_DIR_FREE_LIST; |
6739 | bzero(dir, number_of_records * DIR_ENTRY_SIZE); |
6740 | } |
6741 | #endif |
6742 | |
6743 | page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; |
6744 | page_link.changed= 1; |
6745 | push_dynamic(&info->pinned_pages, (void*) &page_link); |
6746 | } |
6747 | /** @todo leave bitmap lock to the bitmap code... */ |
6748 | mysql_mutex_lock(&share->bitmap.bitmap_lock); |
6749 | res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, 1); |
6750 | mysql_mutex_unlock(&share->bitmap.bitmap_lock); |
6751 | if (res) |
6752 | goto err; |
6753 | DBUG_RETURN(0); |
6754 | |
6755 | err: |
6756 | _ma_mark_file_crashed(share); |
6757 | DBUG_ASSERT(!maria_assert_if_crashed_table); |
6758 | DBUG_RETURN(1); |
6759 | } |
6760 | |
6761 | |
6762 | /** |
6763 | @brief Apply LOGREC_REDO_INSERT_ROW_BLOBS |
6764 | |
6765 | @param info Maria handler |
  @param lsn             LSN to put on pages
6767 | @param header Header (with FILEID) |
6768 | @param redo_lsn REDO record's LSN |
6769 | @param[out] number_of_blobs Number of blobs found in log record |
6770 | @param[out] number_of_ranges Number of ranges found |
6771 | @param[out] first_page First page touched |
6772 | @param[out] last_page Last page touched |
6773 | |
6774 | @note Write full pages (full head & blob pages) |
6775 | |
6776 | @return Operation status |
6777 | @retval 0 OK |
6778 | @retval !=0 Error |
6779 | */ |
6780 | |
6781 | uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info, |
6782 | LSN lsn, const uchar *, |
6783 | LSN redo_lsn, |
6784 | uint * const number_of_blobs, |
6785 | uint * const number_of_ranges, |
6786 | pgcache_page_no_t * const first_page, |
6787 | pgcache_page_no_t * const last_page) |
6788 | { |
6789 | MARIA_SHARE *share= info->s; |
6790 | const uchar *data; |
6791 | uint data_size= FULL_PAGE_SIZE(share); |
6792 | uint blob_count, ranges; |
6793 | uint16 sid; |
6794 | pgcache_page_no_t first_page2= ULONGLONG_MAX, last_page2= 0; |
6795 | DBUG_ENTER("_ma_apply_redo_insert_row_blobs" ); |
6796 | |
6797 | share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED | |
6798 | STATE_NOT_MOVABLE); |
6799 | |
6800 | sid= fileid_korr(header); |
6801 | header+= FILEID_STORE_SIZE; |
6802 | *number_of_ranges= ranges= pagerange_korr(header); |
6803 | header+= PAGERANGE_STORE_SIZE; |
6804 | *number_of_blobs= blob_count= pagerange_korr(header); |
6805 | header+= PAGERANGE_STORE_SIZE; |
6806 | DBUG_ASSERT(ranges >= blob_count); |
6807 | |
6808 | data= (header + ranges * ROW_EXTENT_SIZE + |
6809 | blob_count * (SUB_RANGE_SIZE + BLOCK_FILLER_SIZE)); |
6810 | |
6811 | while (blob_count--) |
6812 | { |
6813 | uint sub_ranges, empty_space; |
6814 | |
6815 | sub_ranges= uint2korr(header); |
6816 | header+= SUB_RANGE_SIZE; |
6817 | empty_space= uint2korr(header); |
6818 | header+= BLOCK_FILLER_SIZE; |
6819 | DBUG_ASSERT(sub_ranges <= ranges && empty_space < data_size); |
6820 | ranges-= sub_ranges; |
6821 | |
6822 | while (sub_ranges--) |
6823 | { |
6824 | uint i; |
6825 | uint res; |
6826 | uint page_range; |
6827 | pgcache_page_no_t page; |
6828 | uchar *buff; |
6829 | uint data_on_page= data_size; |
6830 | |
6831 | page= page_korr(header); |
6832 | header+= PAGE_STORE_SIZE; |
6833 | page_range= pagerange_korr(header); |
6834 | header+= PAGERANGE_STORE_SIZE; |
6835 | |
6836 | for (i= page_range; i-- > 0 ; page++, data+= data_on_page) |
6837 | { |
6838 | MARIA_PINNED_PAGE page_link; |
6839 | enum pagecache_page_lock unlock_method; |
6840 | enum pagecache_page_pin unpin_method; |
6841 | |
6842 | set_if_smaller(first_page2, page); |
6843 | set_if_bigger(last_page2, page); |
6844 | if (i == 0 && sub_ranges == 0) |
6845 | data_on_page= data_size - empty_space; /* data on last page */ |
6846 | if (_ma_redo_not_needed_for_page(sid, redo_lsn, page, FALSE)) |
6847 | continue; |
6848 | |
6849 | if (((page + 1) * share->block_size) > |
6850 | share->state.state.data_file_length) |
6851 | { |
6852 | /* New page or half written page at end of file */ |
6853 | DBUG_PRINT("info" , ("Enlarging data file from %lu to %lu" , |
6854 | (ulong) share->state.state.data_file_length, |
6855 | (ulong) ((page + 1 ) * share->block_size))); |
6856 | share->state.state.data_file_length= (page + 1) * share->block_size; |
6857 | buff= info->keyread_buff; |
6858 | info->keyread_buff_used= 1; |
6859 | make_empty_page(info, buff, BLOB_PAGE, 0); |
6860 | unlock_method= PAGECACHE_LOCK_LEFT_UNLOCKED; |
6861 | unpin_method= PAGECACHE_PIN_LEFT_UNPINNED; |
6862 | } |
6863 | else |
6864 | { |
6865 | share->pagecache->readwrite_flags&= ~MY_WME; |
6866 | buff= pagecache_read(share->pagecache, |
6867 | &info->dfile, |
6868 | page, 0, 0, |
6869 | PAGECACHE_PLAIN_PAGE, |
6870 | PAGECACHE_LOCK_WRITE, &page_link.link); |
6871 | share->pagecache->readwrite_flags= share->pagecache-> |
6872 | org_readwrite_flags; |
6873 | if (!buff) |
6874 | { |
6875 | if (my_errno != HA_ERR_FILE_TOO_SHORT && |
6876 | my_errno != HA_ERR_WRONG_CRC) |
6877 | { |
6878 | /* If not read outside of file */ |
6879 | pagecache_unlock_by_link(share->pagecache, page_link.link, |
6880 | PAGECACHE_LOCK_WRITE_UNLOCK, |
6881 | PAGECACHE_UNPIN, LSN_IMPOSSIBLE, |
6882 | LSN_IMPOSSIBLE, 0, FALSE); |
6883 | goto err; |
6884 | } |
6885 | /* |
6886 | Physical file was too short, create new page. It can be that |
6887 | recovery started with a file with N pages, wrote page N+2 into |
6888 | pagecache (increased data_file_length but not physical file |
6889 | length), now reads page N+1: the read fails. |
6890 | */ |
6891 | buff= pagecache_block_link_to_buffer(page_link.link); |
6892 | make_empty_page(info, buff, BLOB_PAGE, 0); |
6893 | } |
6894 | else |
6895 | { |
6896 | #ifdef DBUG_ASSERT_EXISTS |
6897 | uchar found_page_type= (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK); |
6898 | #endif |
6899 | if (lsn_korr(buff) >= lsn) |
6900 | { |
6901 | /* Already applied */ |
6902 | DBUG_PRINT("info" , ("already applied %llu >= %llu" , |
6903 | lsn_korr(buff), lsn)); |
6904 | pagecache_unlock_by_link(share->pagecache, page_link.link, |
6905 | PAGECACHE_LOCK_WRITE_UNLOCK, |
6906 | PAGECACHE_UNPIN, LSN_IMPOSSIBLE, |
6907 | LSN_IMPOSSIBLE, 0, FALSE); |
6908 | goto fix_bitmap; |
6909 | } |
6910 | DBUG_ASSERT((found_page_type == (uchar) BLOB_PAGE) || |
6911 | (found_page_type == (uchar) UNALLOCATED_PAGE)); |
6912 | } |
6913 | unlock_method= PAGECACHE_LOCK_WRITE_UNLOCK; |
6914 | unpin_method= PAGECACHE_UNPIN; |
6915 | } |
6916 | |
6917 | /* |
6918 | Blob pages are never updated twice in same redo-undo chain, so |
6919 | it's safe to update lsn for them here |
6920 | */ |
6921 | lsn_store(buff, lsn); |
6922 | buff[PAGE_TYPE_OFFSET]= BLOB_PAGE; |
6923 | bzero(buff + LSN_SIZE + PAGE_TYPE_SIZE, |
6924 | FULL_PAGE_HEADER_SIZE(share) - (LSN_SIZE + PAGE_TYPE_SIZE)); |
6925 | |
6926 | if (data_on_page != data_size) |
6927 | { |
6928 | /* |
6929 | Last page may be only partly filled. We zero the rest, like |
6930 | write_full_pages() does. |
6931 | */ |
6932 | bzero(buff + share->block_size - PAGE_SUFFIX_SIZE - empty_space, |
6933 | empty_space); |
6934 | } |
6935 | memcpy(buff + FULL_PAGE_HEADER_SIZE(share), data, data_on_page); |
6936 | if (pagecache_write(share->pagecache, |
6937 | &info->dfile, page, 0, |
6938 | buff, PAGECACHE_PLAIN_PAGE, |
6939 | unlock_method, unpin_method, |
6940 | PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE)) |
6941 | goto err; |
6942 | |
6943 | fix_bitmap: |
6944 | /** @todo leave bitmap lock to the bitmap code... */ |
6945 | mysql_mutex_lock(&share->bitmap.bitmap_lock); |
6946 | res= _ma_bitmap_set_full_page_bits(info, &share->bitmap, page, |
6947 | 1); |
6948 | mysql_mutex_unlock(&share->bitmap.bitmap_lock); |
6949 | if (res) |
6950 | goto err; |
6951 | } |
6952 | } |
6953 | } |
6954 | *first_page= first_page2; |
6955 | *last_page= last_page2; |
6956 | DBUG_RETURN(0); |
6957 | |
6958 | err: |
6959 | _ma_mark_file_crashed(share); |
6960 | DBUG_ASSERT(!maria_assert_if_crashed_table); |
6961 | DBUG_RETURN(1); |
6962 | } |
6963 | |
6964 | |
6965 | /**************************************************************************** |
6966 | Applying of UNDO entries |
6967 | ****************************************************************************/ |
6968 | |
6969 | /** Execute undo of a row insert (delete the inserted row) */ |
6970 | |
6971 | my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn, |
6972 | const uchar *) |
6973 | { |
6974 | pgcache_page_no_t page; |
6975 | uint rownr; |
6976 | uchar *buff; |
6977 | my_bool res; |
6978 | MARIA_PINNED_PAGE page_link; |
6979 | MARIA_SHARE *share= info->s; |
6980 | ha_checksum checksum; |
6981 | LSN lsn; |
6982 | DBUG_ENTER("_ma_apply_undo_row_insert" ); |
6983 | |
6984 | page= page_korr(header); |
6985 | header+= PAGE_STORE_SIZE; |
6986 | rownr= dirpos_korr(header); |
6987 | header+= DIRPOS_STORE_SIZE; |
6988 | DBUG_PRINT("enter" , ("rowid: %lu page: %lu rownr: %u" , |
6989 | (ulong) ma_recordpos(page, rownr), |
6990 | (ulong) page, rownr)); |
6991 | |
6992 | buff= pagecache_read(share->pagecache, |
6993 | &info->dfile, page, 0, |
6994 | 0, share->page_type, |
6995 | PAGECACHE_LOCK_WRITE, |
6996 | &page_link.link); |
6997 | page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; |
6998 | page_link.changed= buff != 0; |
6999 | push_dynamic(&info->pinned_pages, (void*) &page_link); |
7000 | if (!buff) |
7001 | goto err; |
7002 | |
7003 | if (read_row_extent_info(info, buff, rownr)) |
7004 | goto err; |
7005 | |
7006 | _ma_bitmap_flushable(info, 1); |
7007 | if (delete_head_or_tail(info, page, rownr, 1, 1) || |
7008 | delete_tails(info, info->cur_row.tail_positions)) |
7009 | goto err; |
7010 | |
7011 | if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row)) |
7012 | goto err; |
7013 | |
7014 | checksum= 0; |
7015 | if (share->calc_checksum) |
7016 | checksum= (ha_checksum) 0 - ha_checksum_korr(header); |
7017 | info->last_auto_increment= ~ (ulonglong) 0; |
7018 | if (_ma_write_clr(info, undo_lsn, LOGREC_UNDO_ROW_INSERT, |
7019 | share->calc_checksum != 0, checksum, &lsn, (void*) 0)) |
7020 | goto err; |
7021 | |
7022 | res= 0; |
7023 | end: |
7024 | /* The following is true only if _ma_bitmap_flushable() was called earlier */ |
7025 | if (info->non_flushable_state) |
7026 | _ma_bitmap_flushable(info, -1); |
7027 | _ma_unpin_all_pages_and_finalize_row(info, lsn); |
7028 | DBUG_RETURN(res); |
7029 | |
7030 | err: |
7031 | DBUG_ASSERT(!maria_assert_if_crashed_table); |
7032 | res= 1; |
7033 | _ma_mark_file_crashed(share); |
7034 | /* |
7035 | Don't write a new LSN on the used pages. Not important as the file is |
7036 | marked as crashed and need to be repaired before it can be used. |
7037 | */ |
7038 | lsn= LSN_IMPOSSIBLE; |
7039 | goto end; |
7040 | } |
7041 | |
7042 | |
7043 | /** Execute undo of a row delete (insert the row back where it was) */ |
7044 | |
7045 | my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn, |
7046 | const uchar *, size_t |
7047 | __attribute__((unused))) |
7048 | { |
7049 | MARIA_SHARE *share= info->s; |
7050 | MARIA_ROW row; |
7051 | MARIA_COLUMNDEF *column, *end_column; |
7052 | MARIA_BITMAP_BLOCKS *blocks; |
7053 | struct st_row_pos_info row_pos; |
7054 | uchar *record; |
7055 | const uchar *null_bits, *field_length_data, *extent_info; |
7056 | pgcache_page_no_t page; |
7057 | ulong *blob_lengths; |
7058 | uint *null_field_lengths, extent_count, rownr, length_on_head_page; |
7059 | DBUG_ENTER("_ma_apply_undo_row_delete" ); |
7060 | |
7061 | /* |
7062 | Use cur row as a base; We need to make a copy as we will change |
7063 | some buffers to point directly to 'header' |
7064 | */ |
7065 | memcpy(&row, &info->cur_row, sizeof(row)); |
7066 | |
7067 | page= page_korr(header); |
7068 | header+= PAGE_STORE_SIZE; |
7069 | rownr= dirpos_korr(header); |
7070 | header+= DIRPOS_STORE_SIZE; |
7071 | length_on_head_page= uint2korr(header); |
7072 | header+= 2; |
7073 | extent_count= pagerange_korr(header); |
7074 | header+= PAGERANGE_STORE_SIZE; |
7075 | DBUG_PRINT("enter" , ("rowid: %lu page: %lu rownr: %u" , |
7076 | (ulong) ma_recordpos(page, rownr), |
7077 | (ulong) page, rownr)); |
7078 | |
7079 | if (share->calc_checksum) |
7080 | { |
7081 | /* |
7082 | We extract the checksum delta here, saving a recomputation in |
7083 | allocate_and_write_block_record(). It's only an optimization. |
7084 | */ |
7085 | row.checksum= (ha_checksum) 0 - ha_checksum_korr(header); |
7086 | header+= HA_CHECKSUM_STORE_SIZE; |
7087 | } |
7088 | extent_info= header; |
7089 | header+= extent_count * ROW_EXTENT_SIZE; |
7090 | |
7091 | null_field_lengths= row.null_field_lengths; |
7092 | blob_lengths= row.blob_lengths; |
7093 | |
7094 | /* |
7095 | Fill in info->cur_row with information about the row, like in |
7096 | calc_record_size(), to be used by write_block_record() |
7097 | */ |
7098 | |
7099 | row.normal_length= row.char_length= row.varchar_length= |
7100 | row.blob_length= row.extents_count= row.field_lengths_length= 0; |
7101 | |
7102 | null_bits= header; |
7103 | header+= share->base.null_bytes; |
7104 | /* This will not be changed */ |
7105 | row.empty_bits= (uchar*) header; |
7106 | header+= share->base.pack_bytes; |
7107 | if (share->base.max_field_lengths) |
7108 | { |
7109 | row.field_lengths_length= uint2korr(header); |
7110 | row.field_lengths= (uchar*) header + 2 ; |
7111 | header+= 2 + row.field_lengths_length; |
7112 | } |
7113 | if (share->base.blobs) |
7114 | row.blob_length= ma_get_length(&header); |
7115 | |
7116 | /* We need to build up a record (without blobs) in rec_buff */ |
7117 | if (!(record= my_malloc(share->base.reclength, MYF(MY_WME)))) |
7118 | DBUG_RETURN(1); |
7119 | |
7120 | memcpy(record, null_bits, share->base.null_bytes); |
7121 | |
7122 | /* Copy field information from header to record */ |
7123 | |
7124 | /* Handle constant length fields that are always present */ |
7125 | for (column= share->columndef, |
7126 | end_column= column+ share->base.fixed_not_null_fields; |
7127 | column < end_column; |
7128 | column++) |
7129 | { |
7130 | memcpy(record + column->offset, header, column->length); |
7131 | header+= column->length; |
7132 | } |
7133 | |
7134 | /* Handle NULL fields and CHAR/VARCHAR fields */ |
7135 | field_length_data= row.field_lengths; |
7136 | for (end_column= share->columndef + share->base.fields; |
7137 | column < end_column; |
7138 | column++, null_field_lengths++) |
7139 | { |
7140 | if ((record[column->null_pos] & column->null_bit) || |
7141 | row.empty_bits[column->empty_pos] & column->empty_bit) |
7142 | { |
7143 | if (column->type != FIELD_BLOB) |
7144 | *null_field_lengths= 0; |
7145 | else |
7146 | *blob_lengths++= 0; |
7147 | if (share->calc_checksum) |
7148 | bfill(record + column->offset, column->fill_length, |
7149 | column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0); |
7150 | continue; |
7151 | } |
7152 | switch (column->type) { |
7153 | case FIELD_CHECK: |
7154 | case FIELD_NORMAL: /* Fixed length field */ |
7155 | case FIELD_ZERO: |
7156 | case FIELD_SKIP_PRESPACE: /* Not packed */ |
7157 | case FIELD_SKIP_ZERO: /* Fixed length field */ |
7158 | row.normal_length+= column->length; |
7159 | *null_field_lengths= column->length; |
7160 | memcpy(record + column->offset, header, column->length); |
7161 | header+= column->length; |
7162 | break; |
7163 | case FIELD_SKIP_ENDSPACE: /* CHAR */ |
7164 | { |
7165 | uint length; |
7166 | if (column->length <= 255) |
7167 | length= (uint) *field_length_data++; |
7168 | else |
7169 | { |
7170 | length= uint2korr(field_length_data); |
7171 | field_length_data+= 2; |
7172 | } |
7173 | row.char_length+= length; |
7174 | *null_field_lengths= length; |
7175 | memcpy(record + column->offset, header, length); |
7176 | if (share->calc_checksum) |
7177 | bfill(record + column->offset + length, (column->length - length), |
7178 | ' '); |
7179 | header+= length; |
7180 | break; |
7181 | } |
7182 | case FIELD_VARCHAR: |
7183 | { |
7184 | uint length; |
7185 | uchar *field_pos= record + column->offset; |
7186 | |
7187 | /* 256 is correct as this includes the length uchar */ |
7188 | if (column->fill_length == 1) |
7189 | { |
7190 | field_pos[0]= *field_length_data; |
7191 | length= (uint) *field_length_data; |
7192 | } |
7193 | else |
7194 | { |
7195 | field_pos[0]= field_length_data[0]; |
7196 | field_pos[1]= field_length_data[1]; |
7197 | length= uint2korr(field_length_data); |
7198 | } |
7199 | field_length_data+= column->fill_length; |
7200 | field_pos+= column->fill_length; |
7201 | row.varchar_length+= length; |
7202 | *null_field_lengths= length; |
7203 | memcpy(field_pos, header, length); |
7204 | header+= length; |
7205 | break; |
7206 | } |
7207 | case FIELD_BLOB: |
7208 | { |
7209 | /* Copy length of blob and pointer to blob data to record */ |
7210 | uchar *field_pos= record + column->offset; |
7211 | uint size_length= column->length - portable_sizeof_char_ptr; |
7212 | ulong blob_length= _ma_calc_blob_length(size_length, field_length_data); |
7213 | |
7214 | memcpy(field_pos, field_length_data, size_length); |
7215 | field_length_data+= size_length; |
7216 | memcpy(field_pos + size_length, &header, sizeof(header)); |
7217 | header+= blob_length; |
7218 | *blob_lengths++= blob_length; |
7219 | break; |
7220 | } |
7221 | default: |
7222 | DBUG_ASSERT(0); |
7223 | } |
7224 | } |
7225 | row.head_length= (info->row_base_length + |
7226 | share->base.fixed_not_null_fields_length + |
7227 | row.field_lengths_length + |
7228 | size_to_store_key_length(row.field_lengths_length) + |
7229 | row.normal_length + |
7230 | row.char_length + row.varchar_length); |
7231 | row.total_length= (row.head_length + row.blob_length); |
7232 | if (row.total_length < share->base.min_block_length) |
7233 | row.total_length= share->base.min_block_length; |
7234 | |
7235 | /* |
7236 | Row is now generated. Now we need to insert record on the original |
7237 | pages with original size on each page. |
7238 | */ |
7239 | |
7240 | _ma_bitmap_flushable(info, 1); |
7241 | /* Change extent information to be usable by write_block_record() */ |
7242 | blocks= &row.insert_blocks; |
7243 | if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info)) |
7244 | goto err; |
7245 | blocks->block->org_bitmap_value= _ma_bitmap_get_page_bits(info, |
7246 | &share->bitmap, |
7247 | page); |
7248 | blocks->block->used|= BLOCKUSED_USE_ORG_BITMAP; |
7249 | |
7250 | /* Read head page and allocate data for rowid */ |
7251 | if (get_rowpos_in_head_or_tail_page(info, blocks->block, |
7252 | info->buff, |
7253 | length_on_head_page, |
7254 | HEAD_PAGE, PAGECACHE_LOCK_WRITE, |
7255 | rownr, &row_pos)) |
7256 | goto err; |
7257 | |
7258 | if (share->calc_checksum) |
7259 | { |
7260 | DBUG_ASSERT(row.checksum == (share->calc_checksum)(info, record)); |
7261 | } |
7262 | /* Store same amount of data on head page as on original page */ |
7263 | row_pos.length= (length_on_head_page - |
7264 | (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE); |
7265 | set_if_bigger(row_pos.length, share->base.min_block_length); |
7266 | if (write_block_record(info, (uchar*) 0, record, &row, |
7267 | blocks, blocks->block->org_bitmap_value != 0, |
7268 | &row_pos, undo_lsn, 0)) |
7269 | goto err; |
7270 | |
7271 | my_free(record); |
7272 | DBUG_RETURN(0); |
7273 | |
7274 | err: |
7275 | DBUG_ASSERT(!maria_assert_if_crashed_table); |
7276 | _ma_mark_file_crashed(share); |
7277 | if (info->non_flushable_state) |
7278 | _ma_bitmap_flushable(info, -1); |
7279 | _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); |
7280 | my_free(record); |
7281 | DBUG_RETURN(1); |
7282 | } |
7283 | |
7284 | |
7285 | /** |
7286 | Execute undo of a row update |
7287 | |
7288 | @fn _ma_apply_undo_row_update() |
7289 | |
7290 | @return Operation status |
7291 | @retval 0 OK |
7292 | @retval 1 Error |
7293 | */ |
7294 | |
7295 | my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn, |
7296 | const uchar *, |
7297 | size_t |
7298 | __attribute__((unused))) |
7299 | { |
7300 | MARIA_SHARE *share= info->s; |
7301 | MARIA_RECORD_POS record_pos; |
7302 | const uchar *field_length_data, *field_length_data_end, *extent_info; |
7303 | uchar *current_record, *orig_record; |
7304 | pgcache_page_no_t page; |
7305 | ha_checksum UNINIT_VAR(checksum_delta); |
7306 | uint rownr, , extent_count, length_on_head_page; |
7307 | int error; |
7308 | DBUG_ENTER("_ma_apply_undo_row_update" ); |
7309 | |
7310 | page= page_korr(header); |
7311 | header+= PAGE_STORE_SIZE; |
7312 | rownr= dirpos_korr(header); |
7313 | header+= DIRPOS_STORE_SIZE; |
7314 | |
7315 | record_pos= ma_recordpos(page, rownr); |
7316 | DBUG_PRINT("enter" , ("rowid: %lu page: %lu rownr: %u" , |
7317 | (ulong) record_pos, (ulong) page, rownr)); |
7318 | |
7319 | if (share->calc_checksum) |
7320 | { |
7321 | checksum_delta= ha_checksum_korr(header); |
7322 | header+= HA_CHECKSUM_STORE_SIZE; |
7323 | } |
7324 | length_on_head_page= uint2korr(header); |
7325 | set_if_bigger(length_on_head_page, share->base.min_block_length); |
7326 | header+= 2; |
7327 | extent_count= pagerange_korr(header); |
7328 | header+= PAGERANGE_STORE_SIZE; |
7329 | extent_info= header; |
7330 | header+= extent_count * ROW_EXTENT_SIZE; |
7331 | |
7332 | /* |
7333 | Set header to point to old field values, generated by |
7334 | fill_update_undo_parts() |
7335 | */ |
7336 | field_length_header= ma_get_length(&header); |
7337 | field_length_data= (uchar*) header; |
7338 | header+= field_length_header; |
7339 | field_length_data_end= header; |
7340 | |
7341 | /* Allocate buffer for current row & original row */ |
7342 | if (!(current_record= my_malloc(share->base.reclength * 2, MYF(MY_WME)))) |
7343 | DBUG_RETURN(1); |
7344 | orig_record= current_record+ share->base.reclength; |
7345 | |
7346 | /* Read current record */ |
7347 | if (_ma_read_block_record(info, current_record, record_pos)) |
7348 | goto err; |
7349 | |
7350 | if (*field_length_data == 255) |
7351 | { |
7352 | /* Bitmap changed */ |
7353 | field_length_data++; |
7354 | memcpy(orig_record, header, share->base.null_bytes); |
7355 | header+= share->base.null_bytes; |
7356 | } |
7357 | else |
7358 | memcpy(orig_record, current_record, share->base.null_bytes); |
7359 | bitmap_clear_all(&info->changed_fields); |
7360 | |
7361 | while (field_length_data < field_length_data_end) |
7362 | { |
7363 | uint field_nr= ma_get_length(&field_length_data), field_length; |
7364 | MARIA_COLUMNDEF *column= share->columndef + field_nr; |
7365 | uchar *orig_field_pos= orig_record + column->offset; |
7366 | |
7367 | bitmap_set_bit(&info->changed_fields, field_nr); |
7368 | if (field_nr >= share->base.fixed_not_null_fields) |
7369 | { |
7370 | if (!(field_length= ma_get_length(&field_length_data))) |
7371 | { |
7372 | /* Null field or empty field */ |
7373 | bfill(orig_field_pos, column->fill_length, |
7374 | column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0); |
7375 | continue; |
7376 | } |
7377 | } |
7378 | else |
7379 | field_length= column->length; |
7380 | |
7381 | switch (column->type) { |
7382 | case FIELD_CHECK: |
7383 | case FIELD_NORMAL: /* Fixed length field */ |
7384 | case FIELD_ZERO: |
7385 | case FIELD_SKIP_PRESPACE: /* Not packed */ |
7386 | memcpy(orig_field_pos, header, column->length); |
7387 | header+= column->length; |
7388 | break; |
7389 | case FIELD_SKIP_ZERO: /* Number */ |
7390 | case FIELD_SKIP_ENDSPACE: /* CHAR */ |
7391 | { |
7392 | uint diff; |
7393 | memcpy(orig_field_pos, header, field_length); |
7394 | if ((diff= (column->length - field_length))) |
7395 | bfill(orig_field_pos + column->length - diff, diff, |
7396 | column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0); |
7397 | header+= field_length; |
7398 | } |
7399 | break; |
7400 | case FIELD_VARCHAR: |
7401 | if (column->length <= 256) |
7402 | { |
7403 | *orig_field_pos++= (uchar) field_length; |
7404 | } |
7405 | else |
7406 | { |
7407 | int2store(orig_field_pos, field_length); |
7408 | orig_field_pos+= 2; |
7409 | } |
7410 | memcpy(orig_field_pos, header, field_length); |
7411 | header+= field_length; |
7412 | break; |
7413 | case FIELD_BLOB: |
7414 | { |
7415 | uint size_length= column->length - portable_sizeof_char_ptr; |
7416 | _ma_store_blob_length(orig_field_pos, size_length, field_length); |
7417 | memcpy(orig_field_pos + size_length, &header, sizeof(header)); |
7418 | header+= field_length; |
7419 | break; |
7420 | } |
7421 | default: |
7422 | DBUG_ASSERT(0); |
7423 | } |
7424 | } |
7425 | copy_not_changed_fields(info, &info->changed_fields, |
7426 | orig_record, current_record); |
7427 | |
7428 | if (share->calc_checksum) |
7429 | { |
7430 | info->new_row.checksum= checksum_delta + |
7431 | (info->cur_row.checksum= (*share->calc_checksum)(info, orig_record)); |
7432 | /* verify that record's content is sane */ |
7433 | DBUG_ASSERT(info->new_row.checksum == |
7434 | (*share->calc_checksum)(info, current_record)); |
7435 | } |
7436 | |
7437 | info->last_auto_increment= ~ (ulonglong) 0; |
7438 | /* Now records are up to date, execute the update to original values */ |
7439 | if (_ma_update_at_original_place(info, page, rownr, length_on_head_page, |
7440 | extent_count, extent_info, |
7441 | current_record, orig_record, undo_lsn)) |
7442 | goto err; |
7443 | |
7444 | error= 0; |
7445 | end: |
7446 | my_free(current_record); |
7447 | DBUG_RETURN(error); |
7448 | |
7449 | err: |
7450 | DBUG_ASSERT(!maria_assert_if_crashed_table); |
7451 | error= 1; |
7452 | _ma_mark_file_crashed(share); |
7453 | goto end; |
7454 | } |
7455 | |
7456 | |
7457 | /** |
7458 | Execute undo of a bulk insert which used repair |
7459 | |
7460 | @return Operation status |
7461 | @retval 0 OK |
7462 | @retval 1 Error |
7463 | */ |
7464 | |
7465 | my_bool _ma_apply_undo_bulk_insert(MARIA_HA *info, LSN undo_lsn) |
7466 | { |
7467 | my_bool error; |
7468 | LSN lsn; |
7469 | DBUG_ENTER("_ma_apply_undo_bulk_insert" ); |
7470 | /* |
7471 | We delete all rows, re-enable indices as bulk insert had disabled |
7472 | non-unique ones. |
7473 | */ |
7474 | error= (maria_delete_all_rows(info) || |
7475 | maria_enable_indexes(info) || |
7476 | /* we enabled indices so need '2' below */ |
7477 | _ma_state_info_write(info->s, |
7478 | MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | |
7479 | MA_STATE_INFO_WRITE_FULL_INFO | |
7480 | MA_STATE_INFO_WRITE_LOCK) || |
7481 | _ma_write_clr(info, undo_lsn, LOGREC_UNDO_BULK_INSERT, |
7482 | FALSE, 0, &lsn, NULL)); |
7483 | DBUG_RETURN(error); |
7484 | } |
7485 | |
7486 | |
7487 | /** |
7488 | @brief Get the TRANSLOG_ADDRESS to flush up to |
7489 | |
7490 | @param page Page's content |
7491 | @param page_no Page's number (<offset>/<page length>) |
7492 | @param data_ptr Callback data pointer (pointer to MARIA_SHARE) |
7493 | |
7494 | @note |
7495 | Usable for data (non-bitmap) and index pages |
7496 | |
7497 | @retval LSN to flush up to |
7498 | */ |
7499 | |
7500 | TRANSLOG_ADDRESS |
7501 | maria_page_get_lsn(uchar *page, |
7502 | pgcache_page_no_t page_no __attribute__((unused)), |
7503 | uchar* data_ptr __attribute__((unused))) |
7504 | { |
7505 | #ifndef DBUG_OFF |
7506 | const MARIA_SHARE *share= (MARIA_SHARE*)data_ptr; |
7507 | DBUG_ASSERT(share->page_type == PAGECACHE_LSN_PAGE && |
7508 | share->now_transactional); |
7509 | #endif |
7510 | return lsn_korr(page); |
7511 | } |
7512 | |
7513 | |
7514 | /** |
7515 | @brief Enable reading of all rows, ignoring versioning |
7516 | |
7517 | @note |
7518 | This is mainly useful in single user applications, like maria_pack, |
7519 | where we want to be able to read all rows without having to read the |
7520 | transaction id from the control file |
7521 | */ |
7522 | |
7523 | void maria_ignore_trids(MARIA_HA *info) |
7524 | { |
7525 | if (info->s->base.born_transactional) |
7526 | { |
7527 | if (!info->trn) |
7528 | _ma_set_trn_for_table(info, &dummy_transaction_object); |
7529 | /* Ignore transaction id when row is read */ |
7530 | info->trn->min_read_from= ~(TrID) 0; |
7531 | } |
7532 | } |
7533 | |
7534 | |
7535 | #ifndef DBUG_OFF |
7536 | |
7537 | /* The following functions are useful to call from debugger */ |
7538 | |
7539 | void _ma_print_block_info(MARIA_SHARE *share, uchar *buff) |
7540 | { |
7541 | LSN lsn= lsn_korr(buff); |
7542 | |
7543 | printf("LSN:" LSN_FMT " type: %u dir_entries: %u dir_free: %u empty_space: %u\n" , |
7544 | LSN_IN_PARTS(lsn), |
7545 | (uint)buff[PAGE_TYPE_OFFSET], |
7546 | (uint)buff[DIR_COUNT_OFFSET], |
7547 | (uint)buff[DIR_FREE_OFFSET], |
7548 | (uint) uint2korr(buff + EMPTY_SPACE_OFFSET)); |
7549 | printf("Start of directory: %lu\n" , |
7550 | maria_block_size - PAGE_SUFFIX_SIZE - |
7551 | (uint) buff[DIR_COUNT_OFFSET] * DIR_ENTRY_SIZE); |
7552 | _ma_print_directory(share, stdout, buff, maria_block_size); |
7553 | } |
7554 | #endif |
7555 | |