1/*****************************************************************************
2
3Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
4Copyright (c) 2012, Facebook Inc.
5Copyright (c) 2017, MariaDB Corporation.
6
7This program is free software; you can redistribute it and/or modify it under
8the terms of the GNU General Public License as published by the Free Software
9Foundation; version 2 of the License.
10
11This program is distributed in the hope that it will be useful, but WITHOUT
12ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
14
15You should have received a copy of the GNU General Public License along with
16this program; if not, write to the Free Software Foundation, Inc.,
1751 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
18
19*****************************************************************************/
20
21/**************************************************//**
22@file include/page0zip.ic
23Compressed page interface
24
25Created June 2005 by Marko Makela
26*******************************************************/
27
28#ifdef UNIV_MATERIALIZE
29# undef UNIV_INLINE
30# define UNIV_INLINE
31#endif
32
33#include "page0zip.h"
34#include "mtr0log.h"
35#include "page0page.h"
36#include "srv0srv.h"
37
38/* The format of compressed pages is as follows.
39
40The header and trailer of the uncompressed pages, excluding the page
41directory in the trailer, are copied as is to the header and trailer
42of the compressed page.
43
44At the end of the compressed page, there is a dense page directory
45pointing to every user record contained on the page, including deleted
46records on the free list. The dense directory is indexed in the
47collation order, i.e., in the order in which the record list is
48linked on the uncompressed page. The infimum and supremum records are
49excluded. The two most significant bits of the entries are allocated
50for the delete-mark and an n_owned flag indicating the last record in
51a chain of records pointed to from the sparse page directory on the
52uncompressed page.
53
54The data between PAGE_ZIP_START and the last page directory entry will
55be written in compressed format, starting at offset PAGE_DATA.
56Infimum and supremum records are not stored. We exclude the
57REC_N_NEW_EXTRA_BYTES in every record header. These can be recovered
58from the dense page directory stored at the end of the compressed
59page.
60
61The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and
62roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of
63externally stored columns are stored separately, in ascending order of
64heap_no and column index, starting backwards from the dense page
65directory.
66
67The compressed data stream may be followed by a modification log
68covering the compressed portion of the page, as follows.
69
70MODIFICATION LOG ENTRY FORMAT
71- write record:
72 - (heap_no - 1) << 1 (1..2 bytes)
73 - extra bytes backwards
74 - data bytes
75- clear record:
76 - (heap_no - 1) << 1 | 1 (1..2 bytes)
77
78The integer values are stored in a variable-length format:
79- 0xxxxxxx: 0..127
80- 1xxxxxxx xxxxxxxx: 0..32767
81
82The end of the modification log is marked by a 0 byte.
83
84In summary, the compressed page looks like this:
85
86(1) Uncompressed page header (PAGE_DATA bytes)
87(2) Compressed index information
88(3) Compressed page data
89(4) Page modification log (page_zip->m_start..page_zip->m_end)
90(5) Empty zero-filled space
91(6) BLOB pointers (on leaf pages)
92 - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column
93 - in descending collation order
94(7) Uncompressed columns of user records, n_dense * uncompressed_size bytes,
95 - indexed by heap_no
96 - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN for leaf pages of clustered indexes
97 - REC_NODE_PTR_SIZE for non-leaf pages
98 - 0 otherwise
99(8) dense page directory, stored backwards
100 - n_dense = n_heap - 2
101 - existing records in ascending collation order
102 - deleted records (free list) in link order
103*/
104
105/**********************************************************************//**
106Determine the size of a compressed page in bytes.
107@return size in bytes */
108UNIV_INLINE
109ulint
110page_zip_get_size(
111/*==============*/
112 const page_zip_des_t* page_zip) /*!< in: compressed page */
113{
114 ulint size;
115
116 if (!page_zip->ssize) {
117 return(0);
118 }
119
120 size = (UNIV_ZIP_SIZE_MIN >> 1) << page_zip->ssize;
121
122 ut_ad(size >= UNIV_ZIP_SIZE_MIN);
123 ut_ad(size <= srv_page_size);
124
125 return(size);
126}
127/**********************************************************************//**
128Set the size of a compressed page in bytes. */
129UNIV_INLINE
130void
131page_zip_set_size(
132/*==============*/
133 page_zip_des_t* page_zip, /*!< in/out: compressed page */
134 ulint size) /*!< in: size in bytes */
135{
136 if (size) {
137 unsigned ssize;
138
139 ut_ad(ut_is_2pow(size));
140
141 for (ssize = 1; size > (512U << ssize); ssize++) {
142 }
143
144 page_zip->ssize = ssize;
145 } else {
146 page_zip->ssize = 0;
147 }
148
149 ut_ad(page_zip_get_size(page_zip) == size);
150}
151
152/** Determine if a record is so big that it needs to be stored externally.
153@param[in] rec_size length of the record in bytes
154@param[in] comp nonzero=compact format
155@param[in] n_fields number of fields in the record; ignored if
156tablespace is not compressed
157@param[in] page_size page size
158@return FALSE if the entire record can be stored locally on the page */
159UNIV_INLINE
160ibool
161page_zip_rec_needs_ext(
162 ulint rec_size,
163 ulint comp,
164 ulint n_fields,
165 const page_size_t& page_size)
166{
167 ut_ad(rec_size
168 > ulint(comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES));
169 ut_ad(comp || !page_size.is_compressed());
170
171#if UNIV_PAGE_SIZE_MAX > COMPRESSED_REC_MAX_DATA_SIZE
172 if (comp ? rec_size >= COMPRESSED_REC_MAX_DATA_SIZE :
173 rec_size >= REDUNDANT_REC_MAX_DATA_SIZE) {
174 return(TRUE);
175 }
176#endif
177
178 if (page_size.is_compressed()) {
179 ut_ad(comp);
180 /* On a compressed page, there is a two-byte entry in
181 the dense page directory for every record. But there
182 is no record header. There should be enough room for
183 one record on an empty leaf page. Subtract 1 byte for
184 the encoded heap number. Check also the available space
185 on the uncompressed page. */
186 return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2 - 1)
187 >= page_zip_empty_size(n_fields, page_size.physical())
188 || rec_size >= page_get_free_space_of_empty(TRUE) / 2);
189 }
190
191 return(rec_size >= page_get_free_space_of_empty(comp) / 2);
192}
193
#ifdef UNIV_DEBUG
/** Perform basic sanity checks on a compressed page descriptor.
Every check is a debug assertion; the function itself never returns FALSE.
@param[in]	page_zip	compressed page descriptor
@return TRUE */
UNIV_INLINE
ibool
page_zip_simple_validate(
	const page_zip_des_t*	page_zip)
{
	/* The descriptor and its data must exist before anything
	else may be inspected. */
	ut_ad(page_zip);
	ut_ad(page_zip->data);
	ut_ad(page_zip->ssize <= PAGE_ZIP_SSIZE_MAX);

	const ulint	zip_size = page_zip_get_size(page_zip);

	/* The page must be larger than the uncompressed header plus
	one dense directory slot. */
	ut_ad(zip_size > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE);

	/* The modification log bounds must be ordered and must lie
	within the page. */
	ut_ad(page_zip->m_start <= page_zip->m_end);
	ut_ad(page_zip->m_end < zip_size);

	/* There cannot be more BLOB pointers than would fit on the page. */
	ut_ad(page_zip->n_blobs < zip_size / BTR_EXTERN_FIELD_REF_SIZE);

	return(TRUE);
}
#endif /* UNIV_DEBUG */
216
217/**********************************************************************//**
218Determine if the length of the page trailer.
219@return length of the page trailer, in bytes, not including the
220terminating zero byte of the modification log */
221UNIV_INLINE
222ibool
223page_zip_get_trailer_len(
224/*=====================*/
225 const page_zip_des_t* page_zip,/*!< in: compressed page */
226 ibool is_clust)/*!< in: TRUE if clustered index */
227{
228 ulint uncompressed_size;
229
230 ut_ad(page_zip_simple_validate(page_zip));
231 UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
232
233 if (!page_is_leaf(page_zip->data)) {
234 uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
235 + REC_NODE_PTR_SIZE;
236 ut_ad(!page_zip->n_blobs);
237 } else if (is_clust) {
238 uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
239 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
240 } else {
241 uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE;
242 ut_ad(!page_zip->n_blobs);
243 }
244
245 return (ulint(page_dir_get_n_heap(page_zip->data)) - 2)
246 * uncompressed_size
247 + ulint(page_zip->n_blobs) * BTR_EXTERN_FIELD_REF_SIZE;
248}
249
250/**********************************************************************//**
251Determine how big record can be inserted without recompressing the page.
252@return a positive number indicating the maximum size of a record
253whose insertion is guaranteed to succeed, or zero or negative */
254UNIV_INLINE
255lint
256page_zip_max_ins_size(
257/*==================*/
258 const page_zip_des_t* page_zip,/*!< in: compressed page */
259 ibool is_clust)/*!< in: TRUE if clustered index */
260{
261 ulint trailer_len;
262
263 trailer_len = page_zip_get_trailer_len(page_zip, is_clust);
264
265 /* When a record is created, a pointer may be added to
266 the dense directory.
267 Likewise, space for the columns that will not be
268 compressed will be allocated from the page trailer.
269 Also the BLOB pointers will be allocated from there, but
270 we may as well count them in the length of the record. */
271
272 trailer_len += PAGE_ZIP_DIR_SLOT_SIZE;
273
274 return(lint(page_zip_get_size(page_zip)
275 - trailer_len - page_zip->m_end
276 - (REC_N_NEW_EXTRA_BYTES - 2)));
277}
278
279/**********************************************************************//**
280Determine if enough space is available in the modification log.
281@return TRUE if enough space is available */
282UNIV_INLINE
283ibool
284page_zip_available(
285/*===============*/
286 const page_zip_des_t* page_zip,/*!< in: compressed page */
287 ibool is_clust,/*!< in: TRUE if clustered index */
288 ulint length, /*!< in: combined size of the record */
289 ulint create) /*!< in: nonzero=add the record to
290 the heap */
291{
292 ulint trailer_len;
293
294 ut_ad(length > REC_N_NEW_EXTRA_BYTES);
295
296 trailer_len = page_zip_get_trailer_len(page_zip, is_clust);
297
298 /* Subtract the fixed extra bytes and add the maximum
299 space needed for identifying the record (encoded heap_no). */
300 length -= REC_N_NEW_EXTRA_BYTES - 2;
301
302 if (create > 0) {
303 /* When a record is created, a pointer may be added to
304 the dense directory.
305 Likewise, space for the columns that will not be
306 compressed will be allocated from the page trailer.
307 Also the BLOB pointers will be allocated from there, but
308 we may as well count them in the length of the record. */
309
310 trailer_len += PAGE_ZIP_DIR_SLOT_SIZE;
311 }
312
313 return(length + trailer_len + page_zip->m_end
314 < page_zip_get_size(page_zip));
315}
316
317/**********************************************************************//**
318Initialize a compressed page descriptor. */
319UNIV_INLINE
320void
321page_zip_des_init(
322/*==============*/
323 page_zip_des_t* page_zip) /*!< in/out: compressed page
324 descriptor */
325{
326 memset(page_zip, 0, sizeof *page_zip);
327}
328
329/**********************************************************************//**
330Write a log record of writing to the uncompressed header portion of a page. */
331void
332page_zip_write_header_log(
333/*======================*/
334 const byte* data,/*!< in: data on the uncompressed page */
335 ulint length, /*!< in: length of the data */
336 mtr_t* mtr); /*!< in: mini-transaction */
337
338/**********************************************************************//**
339Write data to the uncompressed header portion of a page. The data must
340already have been written to the uncompressed page.
341However, the data portion of the uncompressed page may differ from
342the compressed page when a record is being inserted in
343page_cur_insert_rec_zip(). */
344UNIV_INLINE
345void
346page_zip_write_header(
347/*==================*/
348 page_zip_des_t* page_zip,/*!< in/out: compressed page */
349 const byte* str, /*!< in: address on the uncompressed page */
350 ulint length, /*!< in: length of the data */
351 mtr_t* mtr) /*!< in: mini-transaction, or NULL */
352{
353 ulint pos;
354
355 ut_ad(page_zip_simple_validate(page_zip));
356 UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
357
358 pos = page_offset(str);
359
360 ut_ad(pos < PAGE_DATA);
361
362 memcpy(page_zip->data + pos, str, length);
363
364 /* The following would fail in page_cur_insert_rec_zip(). */
365 /* ut_ad(page_zip_validate(page_zip, str - pos)); */
366
367 if (mtr) {
368 page_zip_write_header_log(str, length, mtr);
369 }
370}
371
372/**********************************************************************//**
373Write a log record of compressing an index page without the data on the page. */
374UNIV_INLINE
375void
376page_zip_compress_write_log_no_data(
377/*================================*/
378 ulint level, /*!< in: compression level */
379 const page_t* page, /*!< in: page that is compressed */
380 dict_index_t* index, /*!< in: index */
381 mtr_t* mtr) /*!< in: mtr */
382{
383 byte* log_ptr = mlog_open_and_write_index(
384 mtr, page, index, MLOG_ZIP_PAGE_COMPRESS_NO_DATA, 1);
385
386 if (log_ptr) {
387 mach_write_to_1(log_ptr, level);
388 mlog_close(mtr, log_ptr + 1);
389 }
390}
391
392/**********************************************************************//**
393Parses a log record of compressing an index page without the data.
394@return end of log record or NULL */
395UNIV_INLINE
396byte*
397page_zip_parse_compress_no_data(
398/*============================*/
399 byte* ptr, /*!< in: buffer */
400 byte* end_ptr, /*!< in: buffer end */
401 page_t* page, /*!< in: uncompressed page */
402 page_zip_des_t* page_zip, /*!< out: compressed page */
403 dict_index_t* index) /*!< in: index */
404{
405 ulint level;
406 if (end_ptr == ptr) {
407 return(NULL);
408 }
409
410 level = mach_read_from_1(ptr);
411
412 /* If page compression fails then there must be something wrong
413 because a compress log record is logged only if the compression
414 was successful. Crash in this case. */
415
416 if (page
417 && !page_zip_compress(page_zip, page, index, level, NULL, NULL)) {
418 ut_error;
419 }
420
421 return(ptr + 1);
422}
423
424/**********************************************************************//**
425Reset the counters used for filling
426INFORMATION_SCHEMA.innodb_cmp_per_index. */
427UNIV_INLINE
428void
429page_zip_reset_stat_per_index()
430/*===========================*/
431{
432 mutex_enter(&page_zip_stat_per_index_mutex);
433
434 page_zip_stat_per_index.erase(
435 page_zip_stat_per_index.begin(),
436 page_zip_stat_per_index.end());
437
438 mutex_exit(&page_zip_stat_per_index_mutex);
439}
440
441#ifdef UNIV_MATERIALIZE
442# undef UNIV_INLINE
443# define UNIV_INLINE UNIV_INLINE_ORIGINAL
444#endif
445