1 | /***************************************************************************** |
2 | |
3 | Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. |
4 | Copyright (c) 2012, Facebook Inc. |
5 | Copyright (c) 2017, MariaDB Corporation. |
6 | |
7 | This program is free software; you can redistribute it and/or modify it under |
8 | the terms of the GNU General Public License as published by the Free Software |
9 | Foundation; version 2 of the License. |
10 | |
11 | This program is distributed in the hope that it will be useful, but WITHOUT |
12 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
13 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU General Public License along with |
16 | this program; if not, write to the Free Software Foundation, Inc., |
17 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
18 | |
19 | *****************************************************************************/ |
20 | |
21 | /**************************************************//** |
22 | @file include/page0zip.ic |
23 | Compressed page interface |
24 | |
25 | Created June 2005 by Marko Makela |
26 | *******************************************************/ |
27 | |
28 | #ifdef UNIV_MATERIALIZE |
29 | # undef UNIV_INLINE |
30 | # define UNIV_INLINE |
31 | #endif |
32 | |
33 | #include "page0zip.h" |
34 | #include "mtr0log.h" |
35 | #include "page0page.h" |
36 | #include "srv0srv.h" |
37 | |
38 | /* The format of compressed pages is as follows. |
39 | |
40 | The header and trailer of the uncompressed pages, excluding the page |
41 | directory in the trailer, are copied as is to the header and trailer |
42 | of the compressed page. |
43 | |
44 | At the end of the compressed page, there is a dense page directory |
45 | pointing to every user record contained on the page, including deleted |
46 | records on the free list. The dense directory is indexed in the |
47 | collation order, i.e., in the order in which the record list is |
48 | linked on the uncompressed page. The infimum and supremum records are |
49 | excluded. The two most significant bits of the entries are allocated |
50 | for the delete-mark and an n_owned flag indicating the last record in |
51 | a chain of records pointed to from the sparse page directory on the |
52 | uncompressed page. |
53 | |
54 | The data between PAGE_ZIP_START and the last page directory entry will |
55 | be written in compressed format, starting at offset PAGE_DATA. |
56 | Infimum and supremum records are not stored. We exclude the |
57 | REC_N_NEW_EXTRA_BYTES in every record header. These can be recovered |
58 | from the dense page directory stored at the end of the compressed |
59 | page. |
60 | |
61 | The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and |
62 | roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of |
63 | externally stored columns are stored separately, in ascending order of |
64 | heap_no and column index, starting backwards from the dense page |
65 | directory. |
66 | |
67 | The compressed data stream may be followed by a modification log |
68 | covering the compressed portion of the page, as follows. |
69 | |
70 | MODIFICATION LOG ENTRY FORMAT |
71 | - write record: |
72 | - (heap_no - 1) << 1 (1..2 bytes) |
73 | - extra bytes backwards |
74 | - data bytes |
75 | - clear record: |
76 | - (heap_no - 1) << 1 | 1 (1..2 bytes) |
77 | |
78 | The integer values are stored in a variable-length format: |
79 | - 0xxxxxxx: 0..127 |
80 | - 1xxxxxxx xxxxxxxx: 0..32767 |
81 | |
82 | The end of the modification log is marked by a 0 byte. |
83 | |
84 | In summary, the compressed page looks like this: |
85 | |
86 | (1) Uncompressed page header (PAGE_DATA bytes) |
87 | (2) Compressed index information |
88 | (3) Compressed page data |
89 | (4) Page modification log (page_zip->m_start..page_zip->m_end) |
90 | (5) Empty zero-filled space |
91 | (6) BLOB pointers (on leaf pages) |
92 | - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column |
93 | - in descending collation order |
94 | (7) Uncompressed columns of user records, n_dense * uncompressed_size bytes, |
95 | - indexed by heap_no |
96 | - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN for leaf pages of clustered indexes |
97 | - REC_NODE_PTR_SIZE for non-leaf pages |
98 | - 0 otherwise |
99 | (8) dense page directory, stored backwards |
100 | - n_dense = n_heap - 2 |
101 | - existing records in ascending collation order |
102 | - deleted records (free list) in link order |
103 | */ |
104 | |
105 | /**********************************************************************//** |
106 | Determine the size of a compressed page in bytes. |
107 | @return size in bytes */ |
108 | UNIV_INLINE |
109 | ulint |
110 | page_zip_get_size( |
111 | /*==============*/ |
112 | const page_zip_des_t* page_zip) /*!< in: compressed page */ |
113 | { |
114 | ulint size; |
115 | |
116 | if (!page_zip->ssize) { |
117 | return(0); |
118 | } |
119 | |
120 | size = (UNIV_ZIP_SIZE_MIN >> 1) << page_zip->ssize; |
121 | |
122 | ut_ad(size >= UNIV_ZIP_SIZE_MIN); |
123 | ut_ad(size <= srv_page_size); |
124 | |
125 | return(size); |
126 | } |
127 | /**********************************************************************//** |
128 | Set the size of a compressed page in bytes. */ |
129 | UNIV_INLINE |
130 | void |
131 | page_zip_set_size( |
132 | /*==============*/ |
133 | page_zip_des_t* page_zip, /*!< in/out: compressed page */ |
134 | ulint size) /*!< in: size in bytes */ |
135 | { |
136 | if (size) { |
137 | unsigned ssize; |
138 | |
139 | ut_ad(ut_is_2pow(size)); |
140 | |
141 | for (ssize = 1; size > (512U << ssize); ssize++) { |
142 | } |
143 | |
144 | page_zip->ssize = ssize; |
145 | } else { |
146 | page_zip->ssize = 0; |
147 | } |
148 | |
149 | ut_ad(page_zip_get_size(page_zip) == size); |
150 | } |
151 | |
152 | /** Determine if a record is so big that it needs to be stored externally. |
153 | @param[in] rec_size length of the record in bytes |
154 | @param[in] comp nonzero=compact format |
155 | @param[in] n_fields number of fields in the record; ignored if |
156 | tablespace is not compressed |
157 | @param[in] page_size page size |
158 | @return FALSE if the entire record can be stored locally on the page */ |
159 | UNIV_INLINE |
160 | ibool |
161 | page_zip_rec_needs_ext( |
162 | ulint rec_size, |
163 | ulint comp, |
164 | ulint n_fields, |
165 | const page_size_t& page_size) |
166 | { |
167 | ut_ad(rec_size |
168 | > ulint(comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES)); |
169 | ut_ad(comp || !page_size.is_compressed()); |
170 | |
171 | #if UNIV_PAGE_SIZE_MAX > COMPRESSED_REC_MAX_DATA_SIZE |
172 | if (comp ? rec_size >= COMPRESSED_REC_MAX_DATA_SIZE : |
173 | rec_size >= REDUNDANT_REC_MAX_DATA_SIZE) { |
174 | return(TRUE); |
175 | } |
176 | #endif |
177 | |
178 | if (page_size.is_compressed()) { |
179 | ut_ad(comp); |
180 | /* On a compressed page, there is a two-byte entry in |
181 | the dense page directory for every record. But there |
182 | is no record header. There should be enough room for |
183 | one record on an empty leaf page. Subtract 1 byte for |
184 | the encoded heap number. Check also the available space |
185 | on the uncompressed page. */ |
186 | return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2 - 1) |
187 | >= page_zip_empty_size(n_fields, page_size.physical()) |
188 | || rec_size >= page_get_free_space_of_empty(TRUE) / 2); |
189 | } |
190 | |
191 | return(rec_size >= page_get_free_space_of_empty(comp) / 2); |
192 | } |
193 | |
194 | #ifdef UNIV_DEBUG |
195 | /**********************************************************************//** |
196 | Validate a compressed page descriptor. |
197 | @return TRUE if ok */ |
198 | UNIV_INLINE |
199 | ibool |
200 | page_zip_simple_validate( |
201 | /*=====================*/ |
202 | const page_zip_des_t* page_zip)/*!< in: compressed page descriptor */ |
203 | { |
204 | ut_ad(page_zip); |
205 | ut_ad(page_zip->data); |
206 | ut_ad(page_zip->ssize <= PAGE_ZIP_SSIZE_MAX); |
207 | ut_ad(page_zip_get_size(page_zip) |
208 | > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE); |
209 | ut_ad(page_zip->m_start <= page_zip->m_end); |
210 | ut_ad(page_zip->m_end < page_zip_get_size(page_zip)); |
211 | ut_ad(page_zip->n_blobs |
212 | < page_zip_get_size(page_zip) / BTR_EXTERN_FIELD_REF_SIZE); |
213 | return(TRUE); |
214 | } |
215 | #endif /* UNIV_DEBUG */ |
216 | |
217 | /**********************************************************************//** |
218 | Determine if the length of the page trailer. |
219 | @return length of the page trailer, in bytes, not including the |
220 | terminating zero byte of the modification log */ |
221 | UNIV_INLINE |
222 | ibool |
223 | page_zip_get_trailer_len( |
224 | /*=====================*/ |
225 | const page_zip_des_t* page_zip,/*!< in: compressed page */ |
226 | ibool is_clust)/*!< in: TRUE if clustered index */ |
227 | { |
228 | ulint uncompressed_size; |
229 | |
230 | ut_ad(page_zip_simple_validate(page_zip)); |
231 | UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); |
232 | |
233 | if (!page_is_leaf(page_zip->data)) { |
234 | uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE |
235 | + REC_NODE_PTR_SIZE; |
236 | ut_ad(!page_zip->n_blobs); |
237 | } else if (is_clust) { |
238 | uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE |
239 | + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; |
240 | } else { |
241 | uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE; |
242 | ut_ad(!page_zip->n_blobs); |
243 | } |
244 | |
245 | return (ulint(page_dir_get_n_heap(page_zip->data)) - 2) |
246 | * uncompressed_size |
247 | + ulint(page_zip->n_blobs) * BTR_EXTERN_FIELD_REF_SIZE; |
248 | } |
249 | |
250 | /**********************************************************************//** |
251 | Determine how big record can be inserted without recompressing the page. |
252 | @return a positive number indicating the maximum size of a record |
253 | whose insertion is guaranteed to succeed, or zero or negative */ |
254 | UNIV_INLINE |
255 | lint |
256 | page_zip_max_ins_size( |
257 | /*==================*/ |
258 | const page_zip_des_t* page_zip,/*!< in: compressed page */ |
259 | ibool is_clust)/*!< in: TRUE if clustered index */ |
260 | { |
261 | ulint trailer_len; |
262 | |
263 | trailer_len = page_zip_get_trailer_len(page_zip, is_clust); |
264 | |
265 | /* When a record is created, a pointer may be added to |
266 | the dense directory. |
267 | Likewise, space for the columns that will not be |
268 | compressed will be allocated from the page trailer. |
269 | Also the BLOB pointers will be allocated from there, but |
270 | we may as well count them in the length of the record. */ |
271 | |
272 | trailer_len += PAGE_ZIP_DIR_SLOT_SIZE; |
273 | |
274 | return(lint(page_zip_get_size(page_zip) |
275 | - trailer_len - page_zip->m_end |
276 | - (REC_N_NEW_EXTRA_BYTES - 2))); |
277 | } |
278 | |
279 | /**********************************************************************//** |
280 | Determine if enough space is available in the modification log. |
281 | @return TRUE if enough space is available */ |
282 | UNIV_INLINE |
283 | ibool |
284 | page_zip_available( |
285 | /*===============*/ |
286 | const page_zip_des_t* page_zip,/*!< in: compressed page */ |
287 | ibool is_clust,/*!< in: TRUE if clustered index */ |
288 | ulint length, /*!< in: combined size of the record */ |
289 | ulint create) /*!< in: nonzero=add the record to |
290 | the heap */ |
291 | { |
292 | ulint trailer_len; |
293 | |
294 | ut_ad(length > REC_N_NEW_EXTRA_BYTES); |
295 | |
296 | trailer_len = page_zip_get_trailer_len(page_zip, is_clust); |
297 | |
298 | /* Subtract the fixed extra bytes and add the maximum |
299 | space needed for identifying the record (encoded heap_no). */ |
300 | length -= REC_N_NEW_EXTRA_BYTES - 2; |
301 | |
302 | if (create > 0) { |
303 | /* When a record is created, a pointer may be added to |
304 | the dense directory. |
305 | Likewise, space for the columns that will not be |
306 | compressed will be allocated from the page trailer. |
307 | Also the BLOB pointers will be allocated from there, but |
308 | we may as well count them in the length of the record. */ |
309 | |
310 | trailer_len += PAGE_ZIP_DIR_SLOT_SIZE; |
311 | } |
312 | |
313 | return(length + trailer_len + page_zip->m_end |
314 | < page_zip_get_size(page_zip)); |
315 | } |
316 | |
317 | /**********************************************************************//** |
318 | Initialize a compressed page descriptor. */ |
319 | UNIV_INLINE |
320 | void |
321 | page_zip_des_init( |
322 | /*==============*/ |
323 | page_zip_des_t* page_zip) /*!< in/out: compressed page |
324 | descriptor */ |
325 | { |
326 | memset(page_zip, 0, sizeof *page_zip); |
327 | } |
328 | |
329 | /**********************************************************************//** |
330 | Write a log record of writing to the uncompressed header portion of a page. */ |
331 | void |
332 | ( |
333 | /*======================*/ |
334 | const byte* data,/*!< in: data on the uncompressed page */ |
335 | ulint length, /*!< in: length of the data */ |
336 | mtr_t* mtr); /*!< in: mini-transaction */ |
337 | |
338 | /**********************************************************************//** |
339 | Write data to the uncompressed header portion of a page. The data must |
340 | already have been written to the uncompressed page. |
341 | However, the data portion of the uncompressed page may differ from |
342 | the compressed page when a record is being inserted in |
343 | page_cur_insert_rec_zip(). */ |
344 | UNIV_INLINE |
345 | void |
346 | ( |
347 | /*==================*/ |
348 | page_zip_des_t* page_zip,/*!< in/out: compressed page */ |
349 | const byte* str, /*!< in: address on the uncompressed page */ |
350 | ulint length, /*!< in: length of the data */ |
351 | mtr_t* mtr) /*!< in: mini-transaction, or NULL */ |
352 | { |
353 | ulint pos; |
354 | |
355 | ut_ad(page_zip_simple_validate(page_zip)); |
356 | UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); |
357 | |
358 | pos = page_offset(str); |
359 | |
360 | ut_ad(pos < PAGE_DATA); |
361 | |
362 | memcpy(page_zip->data + pos, str, length); |
363 | |
364 | /* The following would fail in page_cur_insert_rec_zip(). */ |
365 | /* ut_ad(page_zip_validate(page_zip, str - pos)); */ |
366 | |
367 | if (mtr) { |
368 | page_zip_write_header_log(str, length, mtr); |
369 | } |
370 | } |
371 | |
372 | /**********************************************************************//** |
373 | Write a log record of compressing an index page without the data on the page. */ |
374 | UNIV_INLINE |
375 | void |
376 | page_zip_compress_write_log_no_data( |
377 | /*================================*/ |
378 | ulint level, /*!< in: compression level */ |
379 | const page_t* page, /*!< in: page that is compressed */ |
380 | dict_index_t* index, /*!< in: index */ |
381 | mtr_t* mtr) /*!< in: mtr */ |
382 | { |
383 | byte* log_ptr = mlog_open_and_write_index( |
384 | mtr, page, index, MLOG_ZIP_PAGE_COMPRESS_NO_DATA, 1); |
385 | |
386 | if (log_ptr) { |
387 | mach_write_to_1(log_ptr, level); |
388 | mlog_close(mtr, log_ptr + 1); |
389 | } |
390 | } |
391 | |
392 | /**********************************************************************//** |
393 | Parses a log record of compressing an index page without the data. |
394 | @return end of log record or NULL */ |
395 | UNIV_INLINE |
396 | byte* |
397 | page_zip_parse_compress_no_data( |
398 | /*============================*/ |
399 | byte* ptr, /*!< in: buffer */ |
400 | byte* end_ptr, /*!< in: buffer end */ |
401 | page_t* page, /*!< in: uncompressed page */ |
402 | page_zip_des_t* page_zip, /*!< out: compressed page */ |
403 | dict_index_t* index) /*!< in: index */ |
404 | { |
405 | ulint level; |
406 | if (end_ptr == ptr) { |
407 | return(NULL); |
408 | } |
409 | |
410 | level = mach_read_from_1(ptr); |
411 | |
412 | /* If page compression fails then there must be something wrong |
413 | because a compress log record is logged only if the compression |
414 | was successful. Crash in this case. */ |
415 | |
416 | if (page |
417 | && !page_zip_compress(page_zip, page, index, level, NULL, NULL)) { |
418 | ut_error; |
419 | } |
420 | |
421 | return(ptr + 1); |
422 | } |
423 | |
424 | /**********************************************************************//** |
425 | Reset the counters used for filling |
426 | INFORMATION_SCHEMA.innodb_cmp_per_index. */ |
427 | UNIV_INLINE |
428 | void |
429 | page_zip_reset_stat_per_index() |
430 | /*===========================*/ |
431 | { |
432 | mutex_enter(&page_zip_stat_per_index_mutex); |
433 | |
434 | page_zip_stat_per_index.erase( |
435 | page_zip_stat_per_index.begin(), |
436 | page_zip_stat_per_index.end()); |
437 | |
438 | mutex_exit(&page_zip_stat_per_index_mutex); |
439 | } |
440 | |
441 | #ifdef UNIV_MATERIALIZE |
442 | # undef UNIV_INLINE |
443 | # define UNIV_INLINE UNIV_INLINE_ORIGINAL |
444 | #endif |
445 | |