| 1 | /***************************************************************************** |
| 2 | |
| 3 | Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. |
| 4 | Copyright (c) 2012, Facebook Inc. |
| 5 | Copyright (c) 2017, MariaDB Corporation. |
| 6 | |
| 7 | This program is free software; you can redistribute it and/or modify it under |
| 8 | the terms of the GNU General Public License as published by the Free Software |
| 9 | Foundation; version 2 of the License. |
| 10 | |
| 11 | This program is distributed in the hope that it will be useful, but WITHOUT |
| 12 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 13 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
| 14 | |
| 15 | You should have received a copy of the GNU General Public License along with |
| 16 | this program; if not, write to the Free Software Foundation, Inc., |
| 17 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
| 18 | |
| 19 | *****************************************************************************/ |
| 20 | |
| 21 | /**************************************************//** |
| 22 | @file include/page0zip.ic |
| 23 | Compressed page interface |
| 24 | |
| 25 | Created June 2005 by Marko Makela |
| 26 | *******************************************************/ |
| 27 | |
| 28 | #ifdef UNIV_MATERIALIZE |
| 29 | # undef UNIV_INLINE |
| 30 | # define UNIV_INLINE |
| 31 | #endif |
| 32 | |
| 33 | #include "page0zip.h" |
| 34 | #include "mtr0log.h" |
| 35 | #include "page0page.h" |
| 36 | #include "srv0srv.h" |
| 37 | |
| 38 | /* The format of compressed pages is as follows. |
| 39 | |
| 40 | The header and trailer of the uncompressed pages, excluding the page |
| 41 | directory in the trailer, are copied as is to the header and trailer |
| 42 | of the compressed page. |
| 43 | |
| 44 | At the end of the compressed page, there is a dense page directory |
| 45 | pointing to every user record contained on the page, including deleted |
| 46 | records on the free list. The dense directory is indexed in the |
| 47 | collation order, i.e., in the order in which the record list is |
| 48 | linked on the uncompressed page. The infimum and supremum records are |
| 49 | excluded. The two most significant bits of the entries are allocated |
| 50 | for the delete-mark and an n_owned flag indicating the last record in |
| 51 | a chain of records pointed to from the sparse page directory on the |
| 52 | uncompressed page. |
| 53 | |
| 54 | The data between PAGE_ZIP_START and the last page directory entry will |
| 55 | be written in compressed format, starting at offset PAGE_DATA. |
| 56 | Infimum and supremum records are not stored. We exclude the |
| 57 | REC_N_NEW_EXTRA_BYTES in every record header. These can be recovered |
| 58 | from the dense page directory stored at the end of the compressed |
| 59 | page. |
| 60 | |
| 61 | The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and |
| 62 | roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of |
| 63 | externally stored columns are stored separately, in ascending order of |
| 64 | heap_no and column index, starting backwards from the dense page |
| 65 | directory. |
| 66 | |
| 67 | The compressed data stream may be followed by a modification log |
| 68 | covering the compressed portion of the page, as follows. |
| 69 | |
| 70 | MODIFICATION LOG ENTRY FORMAT |
| 71 | - write record: |
| 72 | - (heap_no - 1) << 1 (1..2 bytes) |
| 73 | - extra bytes backwards |
| 74 | - data bytes |
| 75 | - clear record: |
| 76 | - (heap_no - 1) << 1 | 1 (1..2 bytes) |
| 77 | |
| 78 | The integer values are stored in a variable-length format: |
| 79 | - 0xxxxxxx: 0..127 |
| 80 | - 1xxxxxxx xxxxxxxx: 0..32767 |
| 81 | |
| 82 | The end of the modification log is marked by a 0 byte. |
| 83 | |
| 84 | In summary, the compressed page looks like this: |
| 85 | |
| 86 | (1) Uncompressed page header (PAGE_DATA bytes) |
| 87 | (2) Compressed index information |
| 88 | (3) Compressed page data |
| 89 | (4) Page modification log (page_zip->m_start..page_zip->m_end) |
| 90 | (5) Empty zero-filled space |
| 91 | (6) BLOB pointers (on leaf pages) |
| 92 | - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column |
| 93 | - in descending collation order |
| 94 | (7) Uncompressed columns of user records, n_dense * uncompressed_size bytes, |
| 95 | - indexed by heap_no |
| 96 | - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN for leaf pages of clustered indexes |
| 97 | - REC_NODE_PTR_SIZE for non-leaf pages |
| 98 | - 0 otherwise |
| 99 | (8) dense page directory, stored backwards |
| 100 | - n_dense = n_heap - 2 |
| 101 | - existing records in ascending collation order |
| 102 | - deleted records (free list) in link order |
| 103 | */ |
| 104 | |
| 105 | /**********************************************************************//** |
| 106 | Determine the size of a compressed page in bytes. |
| 107 | @return size in bytes */ |
| 108 | UNIV_INLINE |
| 109 | ulint |
| 110 | page_zip_get_size( |
| 111 | /*==============*/ |
| 112 | const page_zip_des_t* page_zip) /*!< in: compressed page */ |
| 113 | { |
| 114 | ulint size; |
| 115 | |
| 116 | if (!page_zip->ssize) { |
| 117 | return(0); |
| 118 | } |
| 119 | |
| 120 | size = (UNIV_ZIP_SIZE_MIN >> 1) << page_zip->ssize; |
| 121 | |
| 122 | ut_ad(size >= UNIV_ZIP_SIZE_MIN); |
| 123 | ut_ad(size <= srv_page_size); |
| 124 | |
| 125 | return(size); |
| 126 | } |
| 127 | /**********************************************************************//** |
| 128 | Set the size of a compressed page in bytes. */ |
| 129 | UNIV_INLINE |
| 130 | void |
| 131 | page_zip_set_size( |
| 132 | /*==============*/ |
| 133 | page_zip_des_t* page_zip, /*!< in/out: compressed page */ |
| 134 | ulint size) /*!< in: size in bytes */ |
| 135 | { |
| 136 | if (size) { |
| 137 | unsigned ssize; |
| 138 | |
| 139 | ut_ad(ut_is_2pow(size)); |
| 140 | |
| 141 | for (ssize = 1; size > (512U << ssize); ssize++) { |
| 142 | } |
| 143 | |
| 144 | page_zip->ssize = ssize; |
| 145 | } else { |
| 146 | page_zip->ssize = 0; |
| 147 | } |
| 148 | |
| 149 | ut_ad(page_zip_get_size(page_zip) == size); |
| 150 | } |
| 151 | |
| 152 | /** Determine if a record is so big that it needs to be stored externally. |
| 153 | @param[in] rec_size length of the record in bytes |
| 154 | @param[in] comp nonzero=compact format |
| 155 | @param[in] n_fields number of fields in the record; ignored if |
| 156 | tablespace is not compressed |
| 157 | @param[in] page_size page size |
| 158 | @return FALSE if the entire record can be stored locally on the page */ |
| 159 | UNIV_INLINE |
| 160 | ibool |
| 161 | page_zip_rec_needs_ext( |
| 162 | ulint rec_size, |
| 163 | ulint comp, |
| 164 | ulint n_fields, |
| 165 | const page_size_t& page_size) |
| 166 | { |
| 167 | ut_ad(rec_size |
| 168 | > ulint(comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES)); |
| 169 | ut_ad(comp || !page_size.is_compressed()); |
| 170 | |
| 171 | #if UNIV_PAGE_SIZE_MAX > COMPRESSED_REC_MAX_DATA_SIZE |
| 172 | if (comp ? rec_size >= COMPRESSED_REC_MAX_DATA_SIZE : |
| 173 | rec_size >= REDUNDANT_REC_MAX_DATA_SIZE) { |
| 174 | return(TRUE); |
| 175 | } |
| 176 | #endif |
| 177 | |
| 178 | if (page_size.is_compressed()) { |
| 179 | ut_ad(comp); |
| 180 | /* On a compressed page, there is a two-byte entry in |
| 181 | the dense page directory for every record. But there |
| 182 | is no record header. There should be enough room for |
| 183 | one record on an empty leaf page. Subtract 1 byte for |
| 184 | the encoded heap number. Check also the available space |
| 185 | on the uncompressed page. */ |
| 186 | return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2 - 1) |
| 187 | >= page_zip_empty_size(n_fields, page_size.physical()) |
| 188 | || rec_size >= page_get_free_space_of_empty(TRUE) / 2); |
| 189 | } |
| 190 | |
| 191 | return(rec_size >= page_get_free_space_of_empty(comp) / 2); |
| 192 | } |
| 193 | |
| 194 | #ifdef UNIV_DEBUG |
| 195 | /**********************************************************************//** |
| 196 | Validate a compressed page descriptor. |
| 197 | @return TRUE if ok */ |
| 198 | UNIV_INLINE |
| 199 | ibool |
| 200 | page_zip_simple_validate( |
| 201 | /*=====================*/ |
| 202 | const page_zip_des_t* page_zip)/*!< in: compressed page descriptor */ |
| 203 | { |
| 204 | ut_ad(page_zip); |
| 205 | ut_ad(page_zip->data); |
| 206 | ut_ad(page_zip->ssize <= PAGE_ZIP_SSIZE_MAX); |
| 207 | ut_ad(page_zip_get_size(page_zip) |
| 208 | > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE); |
| 209 | ut_ad(page_zip->m_start <= page_zip->m_end); |
| 210 | ut_ad(page_zip->m_end < page_zip_get_size(page_zip)); |
| 211 | ut_ad(page_zip->n_blobs |
| 212 | < page_zip_get_size(page_zip) / BTR_EXTERN_FIELD_REF_SIZE); |
| 213 | return(TRUE); |
| 214 | } |
| 215 | #endif /* UNIV_DEBUG */ |
| 216 | |
| 217 | /**********************************************************************//** |
| 218 | Determine if the length of the page trailer. |
| 219 | @return length of the page trailer, in bytes, not including the |
| 220 | terminating zero byte of the modification log */ |
| 221 | UNIV_INLINE |
| 222 | ibool |
| 223 | page_zip_get_trailer_len( |
| 224 | /*=====================*/ |
| 225 | const page_zip_des_t* page_zip,/*!< in: compressed page */ |
| 226 | ibool is_clust)/*!< in: TRUE if clustered index */ |
| 227 | { |
| 228 | ulint uncompressed_size; |
| 229 | |
| 230 | ut_ad(page_zip_simple_validate(page_zip)); |
| 231 | UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); |
| 232 | |
| 233 | if (!page_is_leaf(page_zip->data)) { |
| 234 | uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE |
| 235 | + REC_NODE_PTR_SIZE; |
| 236 | ut_ad(!page_zip->n_blobs); |
| 237 | } else if (is_clust) { |
| 238 | uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE |
| 239 | + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; |
| 240 | } else { |
| 241 | uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE; |
| 242 | ut_ad(!page_zip->n_blobs); |
| 243 | } |
| 244 | |
| 245 | return (ulint(page_dir_get_n_heap(page_zip->data)) - 2) |
| 246 | * uncompressed_size |
| 247 | + ulint(page_zip->n_blobs) * BTR_EXTERN_FIELD_REF_SIZE; |
| 248 | } |
| 249 | |
| 250 | /**********************************************************************//** |
| 251 | Determine how big record can be inserted without recompressing the page. |
| 252 | @return a positive number indicating the maximum size of a record |
| 253 | whose insertion is guaranteed to succeed, or zero or negative */ |
| 254 | UNIV_INLINE |
| 255 | lint |
| 256 | page_zip_max_ins_size( |
| 257 | /*==================*/ |
| 258 | const page_zip_des_t* page_zip,/*!< in: compressed page */ |
| 259 | ibool is_clust)/*!< in: TRUE if clustered index */ |
| 260 | { |
| 261 | ulint trailer_len; |
| 262 | |
| 263 | trailer_len = page_zip_get_trailer_len(page_zip, is_clust); |
| 264 | |
| 265 | /* When a record is created, a pointer may be added to |
| 266 | the dense directory. |
| 267 | Likewise, space for the columns that will not be |
| 268 | compressed will be allocated from the page trailer. |
| 269 | Also the BLOB pointers will be allocated from there, but |
| 270 | we may as well count them in the length of the record. */ |
| 271 | |
| 272 | trailer_len += PAGE_ZIP_DIR_SLOT_SIZE; |
| 273 | |
| 274 | return(lint(page_zip_get_size(page_zip) |
| 275 | - trailer_len - page_zip->m_end |
| 276 | - (REC_N_NEW_EXTRA_BYTES - 2))); |
| 277 | } |
| 278 | |
| 279 | /**********************************************************************//** |
| 280 | Determine if enough space is available in the modification log. |
| 281 | @return TRUE if enough space is available */ |
| 282 | UNIV_INLINE |
| 283 | ibool |
| 284 | page_zip_available( |
| 285 | /*===============*/ |
| 286 | const page_zip_des_t* page_zip,/*!< in: compressed page */ |
| 287 | ibool is_clust,/*!< in: TRUE if clustered index */ |
| 288 | ulint length, /*!< in: combined size of the record */ |
| 289 | ulint create) /*!< in: nonzero=add the record to |
| 290 | the heap */ |
| 291 | { |
| 292 | ulint trailer_len; |
| 293 | |
| 294 | ut_ad(length > REC_N_NEW_EXTRA_BYTES); |
| 295 | |
| 296 | trailer_len = page_zip_get_trailer_len(page_zip, is_clust); |
| 297 | |
| 298 | /* Subtract the fixed extra bytes and add the maximum |
| 299 | space needed for identifying the record (encoded heap_no). */ |
| 300 | length -= REC_N_NEW_EXTRA_BYTES - 2; |
| 301 | |
| 302 | if (create > 0) { |
| 303 | /* When a record is created, a pointer may be added to |
| 304 | the dense directory. |
| 305 | Likewise, space for the columns that will not be |
| 306 | compressed will be allocated from the page trailer. |
| 307 | Also the BLOB pointers will be allocated from there, but |
| 308 | we may as well count them in the length of the record. */ |
| 309 | |
| 310 | trailer_len += PAGE_ZIP_DIR_SLOT_SIZE; |
| 311 | } |
| 312 | |
| 313 | return(length + trailer_len + page_zip->m_end |
| 314 | < page_zip_get_size(page_zip)); |
| 315 | } |
| 316 | |
| 317 | /**********************************************************************//** |
| 318 | Initialize a compressed page descriptor. */ |
| 319 | UNIV_INLINE |
| 320 | void |
| 321 | page_zip_des_init( |
| 322 | /*==============*/ |
| 323 | page_zip_des_t* page_zip) /*!< in/out: compressed page |
| 324 | descriptor */ |
| 325 | { |
| 326 | memset(page_zip, 0, sizeof *page_zip); |
| 327 | } |
| 328 | |
| 329 | /**********************************************************************//** |
| 330 | Write a log record of writing to the uncompressed header portion of a page. */ |
| 331 | void |
| 332 | ( |
| 333 | /*======================*/ |
| 334 | const byte* data,/*!< in: data on the uncompressed page */ |
| 335 | ulint length, /*!< in: length of the data */ |
| 336 | mtr_t* mtr); /*!< in: mini-transaction */ |
| 337 | |
| 338 | /**********************************************************************//** |
| 339 | Write data to the uncompressed header portion of a page. The data must |
| 340 | already have been written to the uncompressed page. |
| 341 | However, the data portion of the uncompressed page may differ from |
| 342 | the compressed page when a record is being inserted in |
| 343 | page_cur_insert_rec_zip(). */ |
| 344 | UNIV_INLINE |
| 345 | void |
| 346 | ( |
| 347 | /*==================*/ |
| 348 | page_zip_des_t* page_zip,/*!< in/out: compressed page */ |
| 349 | const byte* str, /*!< in: address on the uncompressed page */ |
| 350 | ulint length, /*!< in: length of the data */ |
| 351 | mtr_t* mtr) /*!< in: mini-transaction, or NULL */ |
| 352 | { |
| 353 | ulint pos; |
| 354 | |
| 355 | ut_ad(page_zip_simple_validate(page_zip)); |
| 356 | UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); |
| 357 | |
| 358 | pos = page_offset(str); |
| 359 | |
| 360 | ut_ad(pos < PAGE_DATA); |
| 361 | |
| 362 | memcpy(page_zip->data + pos, str, length); |
| 363 | |
| 364 | /* The following would fail in page_cur_insert_rec_zip(). */ |
| 365 | /* ut_ad(page_zip_validate(page_zip, str - pos)); */ |
| 366 | |
| 367 | if (mtr) { |
| 368 | page_zip_write_header_log(str, length, mtr); |
| 369 | } |
| 370 | } |
| 371 | |
| 372 | /**********************************************************************//** |
| 373 | Write a log record of compressing an index page without the data on the page. */ |
| 374 | UNIV_INLINE |
| 375 | void |
| 376 | page_zip_compress_write_log_no_data( |
| 377 | /*================================*/ |
| 378 | ulint level, /*!< in: compression level */ |
| 379 | const page_t* page, /*!< in: page that is compressed */ |
| 380 | dict_index_t* index, /*!< in: index */ |
| 381 | mtr_t* mtr) /*!< in: mtr */ |
| 382 | { |
| 383 | byte* log_ptr = mlog_open_and_write_index( |
| 384 | mtr, page, index, MLOG_ZIP_PAGE_COMPRESS_NO_DATA, 1); |
| 385 | |
| 386 | if (log_ptr) { |
| 387 | mach_write_to_1(log_ptr, level); |
| 388 | mlog_close(mtr, log_ptr + 1); |
| 389 | } |
| 390 | } |
| 391 | |
| 392 | /**********************************************************************//** |
| 393 | Parses a log record of compressing an index page without the data. |
| 394 | @return end of log record or NULL */ |
| 395 | UNIV_INLINE |
| 396 | byte* |
| 397 | page_zip_parse_compress_no_data( |
| 398 | /*============================*/ |
| 399 | byte* ptr, /*!< in: buffer */ |
| 400 | byte* end_ptr, /*!< in: buffer end */ |
| 401 | page_t* page, /*!< in: uncompressed page */ |
| 402 | page_zip_des_t* page_zip, /*!< out: compressed page */ |
| 403 | dict_index_t* index) /*!< in: index */ |
| 404 | { |
| 405 | ulint level; |
| 406 | if (end_ptr == ptr) { |
| 407 | return(NULL); |
| 408 | } |
| 409 | |
| 410 | level = mach_read_from_1(ptr); |
| 411 | |
| 412 | /* If page compression fails then there must be something wrong |
| 413 | because a compress log record is logged only if the compression |
| 414 | was successful. Crash in this case. */ |
| 415 | |
| 416 | if (page |
| 417 | && !page_zip_compress(page_zip, page, index, level, NULL, NULL)) { |
| 418 | ut_error; |
| 419 | } |
| 420 | |
| 421 | return(ptr + 1); |
| 422 | } |
| 423 | |
| 424 | /**********************************************************************//** |
| 425 | Reset the counters used for filling |
| 426 | INFORMATION_SCHEMA.innodb_cmp_per_index. */ |
| 427 | UNIV_INLINE |
| 428 | void |
| 429 | page_zip_reset_stat_per_index() |
| 430 | /*===========================*/ |
| 431 | { |
| 432 | mutex_enter(&page_zip_stat_per_index_mutex); |
| 433 | |
| 434 | page_zip_stat_per_index.erase( |
| 435 | page_zip_stat_per_index.begin(), |
| 436 | page_zip_stat_per_index.end()); |
| 437 | |
| 438 | mutex_exit(&page_zip_stat_per_index_mutex); |
| 439 | } |
| 440 | |
| 441 | #ifdef UNIV_MATERIALIZE |
| 442 | # undef UNIV_INLINE |
| 443 | # define UNIV_INLINE UNIV_INLINE_ORIGINAL |
| 444 | #endif |
| 445 | |