| 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
| 2 | // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: |
| 3 | #ident "$Id$" |
| 4 | /*====== |
| 5 | This file is part of TokuDB |
| 6 | |
| 7 | |
| 8 | Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. |
| 9 | |
| 10 | TokuDBis is free software: you can redistribute it and/or modify |
| 11 | it under the terms of the GNU General Public License, version 2, |
| 12 | as published by the Free Software Foundation. |
| 13 | |
| 14 | TokuDB is distributed in the hope that it will be useful, |
| 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 17 | GNU General Public License for more details. |
| 18 | |
| 19 | You should have received a copy of the GNU General Public License |
| 20 | along with TokuDB. If not, see <http://www.gnu.org/licenses/>. |
| 21 | |
| 22 | ======= */ |
| 23 | |
| 24 | #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." |
| 25 | |
| 26 | #ifndef _HATOKU_CMP |
| 27 | #define _HATOKU_CMP |
| 28 | |
| 29 | #include "hatoku_defines.h" |
| 30 | #include "tokudb_debug.h" |
| 31 | |
| 32 | |
| 33 | // |
| 34 | // A MySQL row is encoded in TokuDB, as follows: |
| 35 | // Keys: |
| 36 | // Keys pack the defined columns in the order that they are declared. |
| 37 | // The primary key contains only the columns listed |
| 38 | // If no primary key is defined, then an eight byte hidden primary key is autogenerated (like an auto increment) and used |
| 39 | // Secondary keys contains the defined key and the primary key. |
| 40 | // Two examples: |
| 41 | // 1) table foo (a int, b int, c int, d int, key(b)) |
| 42 | // The key of the main dictionary contains an eight byte autogenerated hidden primary key |
| 43 | // The key of key-b is the column 'b' followed by the hidden primary key |
| 44 | // 2) table foo (a int, b int, c int, d int, primary key(a), key(b)) |
| 45 | // The key of the main dictionary contains 'a' |
| 46 | // The key of key-b is the column 'b followed by 'a' |
| 47 | // Vals: |
| 48 | // For secondary keys they are empty. |
| 49 | // For the main dictionary and clustering keys, they contain all columns that do not show up in the dictionary's key |
| 50 | // Two examples: |
| 51 | // 1) table foo (a int, b int, c int, d varchar(100), primary key(a), clustering key d(d), clustering key d2(d(20)) |
| 52 | // the val of the main dictionary contains (b,c,d) |
| 53 | // the val of d contains (b,c) |
| 54 | // the val of d2 contains (b,c,d). d is there because the entire row does not show up in the key |
| 55 | // Vals are encoded as follows. They have four components: |
| 56 | // 1) Null bytes: contains a bit field that states what columns are NULL. |
| 57 | // 2) Fixed fields: all fixed fields are then packed together. If a fixed field is NULL, its data is considered junk |
| 58 | // 3) varchars and varbinaries: stored in two pieces, first all the offsets and then all the data. If a var field is NULL, its data is considered junk |
| 59 | // 4) blobs: stored in (length, data) pairs. If a blob is NULL, its data is considered junk |
| 60 | // An example: |
| 61 | // Table: (a int, b varchar(20), c blob, d bigint, e varbinary(10), f largeblob, g varchar(10)) <-- no primary key defined |
| 62 | // Row inserted: (1, "bbb", "cc", 100, "eeeee", "ffff", "g") |
| 63 | // The packed format of the val looks like: |
| 64 | // NULL byte <-- 1 byte to encode nothing is NULL |
| 65 | // 1 <-- four bytes for 'a' |
| 66 | // 100 <-- four bytes for 'd' |
| 67 | // 3,8,9 <--offsets for location of data fields, note offsets point to where data ENDS |
| 68 | // "bbbeeeeeg" <-- data for variable length stuff |
| 69 | // 2,"cc",4,"ffff"<-- data that stores the blobs |
| 70 | // The structures below describe are used for the TokuDB encoding of a row |
| 71 | // |
| 72 | |
| 73 | // used for queries |
| 74 | typedef struct st_col_pack_info { |
| 75 | uint32_t col_pack_val; //offset if fixed, pack_index if var |
| 76 | } COL_PACK_INFO; |
| 77 | |
| 78 | // |
| 79 | // used to define a couple of characteristics of a packed val for the main dictionary or a clustering dictionary |
| 80 | // fixed_field_size is the size of the fixed fields in the val. |
| 81 | // len_of_offsets is the size of the bytes that make up the offsets of variable size columns |
| 82 | // Some notes: |
| 83 | // If the val has no fixed fields, fixed_field_size is 0 |
| 84 | // If the val has no variable fields, len_of_offsets is 0 |
| 85 | // The number of null bytes at the beginning of a row is not saved, it is derived from table_share->null_bytes |
| 86 | // The pointer to where the variable data in a val starts is table_share->null_bytes + fixed_field_size + len_of_offsets |
| 87 | // To figure out where the blobs start, find the last offset listed (if offsets exist) |
| 88 | // |
| 89 | typedef struct st_multi_col_pack_info { |
| 90 | uint32_t fixed_field_size; //where the fixed length stuff ends and the offsets for var stuff begins |
| 91 | uint32_t len_of_offsets; //length of the offset bytes in a packed row |
| 92 | } MULTI_COL_PACK_INFO; |
| 93 | |
| 94 | typedef struct st_key_and_col_info { |
| 95 | // |
| 96 | // bitmaps for each key. key_filters[i] is associated with the i'th dictionary |
| 97 | // States what columns are not stored in the vals of each key, because |
| 98 | // the column is stored in the key. So, for example, the table (a int, b int, c int, d int, primary key (b,d)) will |
| 99 | // have bit 1 (for 'b') and bit 3 (for 'd') of the primary key's bitmap set for the main dictionary's bitmap, |
| 100 | // because 'b' and 'd' do not show up in the val |
| 101 | // |
| 102 | MY_BITMAP key_filters[MAX_KEY+1]; |
| 103 | // |
| 104 | // following three arrays are used to identify the types of rows in the field |
| 105 | // If table->field[i] is a fixed field: |
| 106 | // field_lengths[i] stores the field length, which is fixed |
| 107 | // length_bytes[i] is 0 |
| 108 | // 'i' does not show up in the array blob_fields |
| 109 | // If table->field[i] is a varchar or varbinary: |
| 110 | // field_lengths[i] is 0 |
| 111 | // length_bytes[i] stores the number of bytes MySQL uses to encode the length of the field in table->record[0] |
| 112 | // 'i' does not show up in the array blob_fields |
| 113 | // If table->field[i] is a blob: |
| 114 | // field_lengths[i] is 0 |
| 115 | // length_bytes[i] is 0 |
| 116 | // 'i' shows up in blob_fields |
| 117 | // |
| 118 | void *multi_ptr; |
| 119 | enum { TOKUDB_FIXED_FIELD, TOKUDB_VARIABLE_FIELD, TOKUDB_BLOB_FIELD}; |
| 120 | uint8_t *field_types; |
| 121 | uint16_t* field_lengths; //stores the field lengths of fixed size fields (1<<16 - 1 max), |
| 122 | uint8_t* length_bytes; // stores the length of lengths of varchars and varbinaries |
| 123 | uint32_t* blob_fields; // list of indexes of blob fields, |
| 124 | uint32_t num_blobs; // number of blobs in the table |
| 125 | // |
| 126 | // val packing info for all dictionaries. i'th one represents info for i'th dictionary |
| 127 | // |
| 128 | MULTI_COL_PACK_INFO mcp_info[MAX_KEY+1]; |
| 129 | COL_PACK_INFO* cp_info[MAX_KEY+1]; |
| 130 | // |
| 131 | // number bytes used to represent an offset in a val. Can be 1 or 2. |
| 132 | // The number of var fields in a val for dictionary i can be evaluated by |
| 133 | // mcp_info[i].len_of_offsets/num_offset_bytes. |
| 134 | // |
| 135 | uint32_t num_offset_bytes; //number of bytes needed to encode the offset |
| 136 | } KEY_AND_COL_INFO; |
| 137 | |
| 138 | static bool is_fixed_field(KEY_AND_COL_INFO *kcinfo, uint field_num) { |
| 139 | return kcinfo->field_types[field_num] == KEY_AND_COL_INFO::TOKUDB_FIXED_FIELD; |
| 140 | } |
| 141 | |
| 142 | static bool is_variable_field(KEY_AND_COL_INFO *kcinfo, uint field_num) { |
| 143 | return kcinfo->field_types[field_num] == KEY_AND_COL_INFO::TOKUDB_VARIABLE_FIELD; |
| 144 | } |
| 145 | |
| 146 | static bool is_blob_field(KEY_AND_COL_INFO *kcinfo, uint field_num) { |
| 147 | return kcinfo->field_types[field_num] == KEY_AND_COL_INFO::TOKUDB_BLOB_FIELD; |
| 148 | } |
| 149 | |
| 150 | static bool field_valid_for_tokudb_table(Field* field); |
| 151 | |
| 152 | static void get_var_field_info( |
| 153 | uint32_t* field_len, |
| 154 | uint32_t* start_offset, |
| 155 | uint32_t var_field_index, |
| 156 | const uchar* var_field_offset_ptr, |
| 157 | uint32_t num_offset_bytes |
| 158 | ); |
| 159 | |
| 160 | static void get_blob_field_info( |
| 161 | uint32_t* start_offset, |
| 162 | uint32_t len_of_offsets, |
| 163 | const uchar* var_field_data_ptr, |
| 164 | uint32_t num_offset_bytes |
| 165 | ); |
| 166 | |
| 167 | static inline uint32_t get_blob_field_len(const uchar* from_tokudb, uint32_t len_bytes) { |
| 168 | uint32_t length = 0; |
| 169 | switch (len_bytes) { |
| 170 | case (1): |
| 171 | length = (uint32_t)(*from_tokudb); |
| 172 | break; |
| 173 | case (2): |
| 174 | length = uint2korr(from_tokudb); |
| 175 | break; |
| 176 | case (3): |
| 177 | length = tokudb_uint3korr(from_tokudb); |
| 178 | break; |
| 179 | case (4): |
| 180 | length = uint4korr(from_tokudb); |
| 181 | break; |
| 182 | default: |
| 183 | assert_unreachable(); |
| 184 | } |
| 185 | return length; |
| 186 | } |
| 187 | |
| 188 | |
| 189 | static inline const uchar* unpack_toku_field_blob(uchar *to_mysql, const uchar* from_tokudb, uint32_t len_bytes, bool skip) { |
| 190 | uint32_t length = 0; |
| 191 | const uchar* data_ptr = NULL; |
| 192 | if (!skip) { |
| 193 | memcpy(to_mysql, from_tokudb, len_bytes); |
| 194 | } |
| 195 | length = get_blob_field_len(from_tokudb,len_bytes); |
| 196 | |
| 197 | data_ptr = from_tokudb + len_bytes; |
| 198 | if (!skip) { |
| 199 | memcpy(to_mysql + len_bytes, (uchar *)(&data_ptr), sizeof data_ptr); |
| 200 | } |
| 201 | return from_tokudb + len_bytes + length; |
| 202 | } |
| 203 | |
| 204 | static inline uint get_null_offset(TABLE* table, Field* field) { |
| 205 | #if (50606 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699) || \ |
| 206 | (50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799) |
| 207 | return field->null_offset(table->record[0]); |
| 208 | #else |
| 209 | return (uint) ((uchar*) field->null_ptr - (uchar*) table->record[0]); |
| 210 | #endif |
| 211 | } |
| 212 | |
| 213 | typedef enum { |
| 214 | toku_type_int = 0, |
| 215 | toku_type_double, |
| 216 | toku_type_float, |
| 217 | toku_type_fixbinary, |
| 218 | toku_type_fixstring, |
| 219 | toku_type_varbinary, |
| 220 | toku_type_varstring, |
| 221 | toku_type_blob, |
| 222 | toku_type_hpk, //for hidden primary key |
| 223 | toku_type_unknown |
| 224 | } TOKU_TYPE; |
| 225 | |
| 226 | |
| 227 | static TOKU_TYPE mysql_to_toku_type (Field* field); |
| 228 | |
| 229 | static uchar* pack_toku_varbinary_from_desc( |
| 230 | uchar* to_tokudb, |
| 231 | const uchar* from_desc, |
| 232 | uint32_t key_part_length, //number of bytes to use to encode the length in to_tokudb |
| 233 | uint32_t field_length //length of field |
| 234 | ); |
| 235 | |
| 236 | static uchar* pack_toku_varstring_from_desc( |
| 237 | uchar* to_tokudb, |
| 238 | const uchar* from_desc, |
| 239 | uint32_t key_part_length, //number of bytes to use to encode the length in to_tokudb |
| 240 | uint32_t field_length, |
| 241 | uint32_t charset_num//length of field |
| 242 | ); |
| 243 | |
| 244 | |
| 245 | static uchar* pack_toku_key_field( |
| 246 | uchar* to_tokudb, |
| 247 | uchar* from_mysql, |
| 248 | Field* field, |
| 249 | uint32_t key_part_length //I really hope this is temporary as I phase out the pack_cmp stuff |
| 250 | ); |
| 251 | |
| 252 | static uchar* pack_key_toku_key_field( |
| 253 | uchar* to_tokudb, |
| 254 | uchar* from_mysql, |
| 255 | Field* field, |
| 256 | uint32_t key_part_length //I really hope this is temporary as I phase out the pack_cmp stuff |
| 257 | ); |
| 258 | |
| 259 | static uchar* unpack_toku_key_field( |
| 260 | uchar* to_mysql, |
| 261 | uchar* from_tokudb, |
| 262 | Field* field, |
| 263 | uint32_t key_part_length |
| 264 | ); |
| 265 | |
| 266 | |
| 267 | // |
| 268 | // for storing NULL byte in keys |
| 269 | // |
| 270 | #define NULL_COL_VAL 0 |
| 271 | #define NONNULL_COL_VAL 1 |
| 272 | |
| 273 | // |
| 274 | // for storing if rest of key is +/- infinity |
| 275 | // |
| 276 | #define COL_NEG_INF -1 |
| 277 | #define COL_ZERO 0 |
| 278 | #define COL_POS_INF 1 |
| 279 | |
| 280 | #define COL_FIX_FIELD 0x11 |
| 281 | #define COL_VAR_FIELD 0x22 |
| 282 | #define COL_BLOB_FIELD 0x33 |
| 283 | |
| 284 | // |
| 285 | // information for hidden primary keys |
| 286 | // |
| 287 | #define TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH 8 |
| 288 | |
| 289 | // |
| 290 | // function to convert a hidden primary key into a byte stream that can be stored in DBT |
| 291 | // |
| 292 | static inline void hpk_num_to_char(uchar* to, ulonglong num) { |
| 293 | int8store(to, num); |
| 294 | } |
| 295 | |
| 296 | // |
| 297 | // function that takes a byte stream of a hidden primary key and returns a ulonglong |
| 298 | // |
| 299 | static inline ulonglong hpk_char_to_num(uchar* val) { |
| 300 | return uint8korr(val); |
| 301 | } |
| 302 | |
| 303 | static int tokudb_compare_two_keys( |
| 304 | const void* new_key_data, |
| 305 | const uint32_t new_key_size, |
| 306 | const void* saved_key_data, |
| 307 | const uint32_t saved_key_size, |
| 308 | const void* row_desc, |
| 309 | const uint32_t row_desc_size, |
| 310 | bool cmp_prefix, |
| 311 | bool* read_string |
| 312 | ); |
| 313 | |
| 314 | static int tokudb_cmp_dbt_key(DB* db, const DBT *keya, const DBT *keyb); |
| 315 | |
| 316 | //TODO: QQQ Only do one direction for prefix. |
| 317 | static int tokudb_prefix_cmp_dbt_key(DB *file, const DBT *keya, const DBT *keyb); |
| 318 | |
| 319 | static int tokudb_compare_two_key_parts( |
| 320 | const void* new_key_data, |
| 321 | const uint32_t new_key_size, |
| 322 | const void* saved_key_data, |
| 323 | const uint32_t saved_key_size, |
| 324 | const void* row_desc, |
| 325 | const uint32_t row_desc_size, |
| 326 | uint max_parts |
| 327 | ); |
| 328 | |
| 329 | static int tokudb_cmp_dbt_key_parts(DB *file, const DBT *keya, const DBT *keyb, uint max_parts); |
| 330 | |
| 331 | static int create_toku_key_descriptor( |
| 332 | uchar* buf, |
| 333 | bool is_first_hpk, |
| 334 | KEY* first_key, |
| 335 | bool is_second_hpk, |
| 336 | KEY* second_key |
| 337 | ); |
| 338 | |
| 339 | |
| 340 | static uint32_t create_toku_main_key_pack_descriptor ( |
| 341 | uchar* buf |
| 342 | ); |
| 343 | |
| 344 | static uint32_t get_max_clustering_val_pack_desc_size( |
| 345 | TABLE_SHARE* table_share |
| 346 | ); |
| 347 | |
| 348 | static uint32_t create_toku_clustering_val_pack_descriptor ( |
| 349 | uchar* buf, |
| 350 | uint pk_index, |
| 351 | TABLE_SHARE* table_share, |
| 352 | KEY_AND_COL_INFO* kc_info, |
| 353 | uint32_t keynr, |
| 354 | bool is_clustering |
| 355 | ); |
| 356 | |
| 357 | static inline bool is_key_clustering( |
| 358 | void* row_desc, |
| 359 | uint32_t row_desc_size |
| 360 | ) |
| 361 | { |
| 362 | return (row_desc_size > 0); |
| 363 | } |
| 364 | |
| 365 | static uint32_t pack_clustering_val_from_desc( |
| 366 | uchar* buf, |
| 367 | void* row_desc, |
| 368 | uint32_t row_desc_size, |
| 369 | const DBT* pk_val |
| 370 | ); |
| 371 | |
| 372 | static uint32_t get_max_secondary_key_pack_desc_size( |
| 373 | KEY_AND_COL_INFO* kc_info |
| 374 | ); |
| 375 | |
| 376 | static uint32_t create_toku_secondary_key_pack_descriptor ( |
| 377 | uchar* buf, |
| 378 | bool has_hpk, |
| 379 | uint pk_index, |
| 380 | TABLE_SHARE* table_share, |
| 381 | TABLE* table, |
| 382 | KEY_AND_COL_INFO* kc_info, |
| 383 | KEY* key_info, |
| 384 | KEY* prim_key |
| 385 | ); |
| 386 | |
| 387 | static inline bool is_key_pk( |
| 388 | void* row_desc, |
| 389 | uint32_t row_desc_size |
| 390 | ) |
| 391 | { |
| 392 | uchar* buf = (uchar *)row_desc; |
| 393 | return buf[0]; |
| 394 | } |
| 395 | |
| 396 | static uint32_t max_key_size_from_desc( |
| 397 | void* row_desc, |
| 398 | uint32_t row_desc_size |
| 399 | ); |
| 400 | |
| 401 | |
| 402 | static uint32_t pack_key_from_desc( |
| 403 | uchar* buf, |
| 404 | void* row_desc, |
| 405 | uint32_t row_desc_size, |
| 406 | const DBT* pk_key, |
| 407 | const DBT* pk_val |
| 408 | ); |
| 409 | |
| 410 | static bool fields_have_same_name( |
| 411 | Field* a, |
| 412 | Field* b |
| 413 | ); |
| 414 | |
| 415 | static bool fields_are_same_type( |
| 416 | Field* a, |
| 417 | Field* b |
| 418 | ); |
| 419 | |
| 420 | static bool are_two_fields_same( |
| 421 | Field* a, |
| 422 | Field* b |
| 423 | ); |
| 424 | |
| 425 | #endif |
| 426 | |
| 427 | |