1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
2 | // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: |
3 | #ident "$Id$" |
4 | /*====== |
5 | This file is part of TokuDB |
6 | |
7 | |
8 | Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. |
9 | |
10 | TokuDBis is free software: you can redistribute it and/or modify |
11 | it under the terms of the GNU General Public License, version 2, |
12 | as published by the Free Software Foundation. |
13 | |
14 | TokuDB is distributed in the hope that it will be useful, |
15 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | GNU General Public License for more details. |
18 | |
19 | You should have received a copy of the GNU General Public License |
20 | along with TokuDB. If not, see <http://www.gnu.org/licenses/>. |
21 | |
22 | ======= */ |
23 | |
24 | #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." |
25 | |
26 | #ifndef _HATOKU_CMP |
27 | #define _HATOKU_CMP |
28 | |
29 | #include "hatoku_defines.h" |
30 | #include "tokudb_debug.h" |
31 | |
32 | |
33 | // |
34 | // A MySQL row is encoded in TokuDB, as follows: |
35 | // Keys: |
36 | // Keys pack the defined columns in the order that they are declared. |
37 | // The primary key contains only the columns listed |
38 | // If no primary key is defined, then an eight byte hidden primary key is autogenerated (like an auto increment) and used |
39 | // Secondary keys contains the defined key and the primary key. |
40 | // Two examples: |
41 | // 1) table foo (a int, b int, c int, d int, key(b)) |
42 | // The key of the main dictionary contains an eight byte autogenerated hidden primary key |
43 | // The key of key-b is the column 'b' followed by the hidden primary key |
44 | // 2) table foo (a int, b int, c int, d int, primary key(a), key(b)) |
45 | // The key of the main dictionary contains 'a' |
46 | // The key of key-b is the column 'b followed by 'a' |
47 | // Vals: |
48 | // For secondary keys they are empty. |
49 | // For the main dictionary and clustering keys, they contain all columns that do not show up in the dictionary's key |
50 | // Two examples: |
51 | // 1) table foo (a int, b int, c int, d varchar(100), primary key(a), clustering key d(d), clustering key d2(d(20)) |
52 | // the val of the main dictionary contains (b,c,d) |
53 | // the val of d contains (b,c) |
54 | // the val of d2 contains (b,c,d). d is there because the entire row does not show up in the key |
55 | // Vals are encoded as follows. They have four components: |
56 | // 1) Null bytes: contains a bit field that states what columns are NULL. |
57 | // 2) Fixed fields: all fixed fields are then packed together. If a fixed field is NULL, its data is considered junk |
58 | // 3) varchars and varbinaries: stored in two pieces, first all the offsets and then all the data. If a var field is NULL, its data is considered junk |
59 | // 4) blobs: stored in (length, data) pairs. If a blob is NULL, its data is considered junk |
60 | // An example: |
61 | // Table: (a int, b varchar(20), c blob, d bigint, e varbinary(10), f largeblob, g varchar(10)) <-- no primary key defined |
62 | // Row inserted: (1, "bbb", "cc", 100, "eeeee", "ffff", "g") |
63 | // The packed format of the val looks like: |
64 | // NULL byte <-- 1 byte to encode nothing is NULL |
65 | // 1 <-- four bytes for 'a' |
66 | // 100 <-- four bytes for 'd' |
67 | // 3,8,9 <--offsets for location of data fields, note offsets point to where data ENDS |
68 | // "bbbeeeeeg" <-- data for variable length stuff |
69 | // 2,"cc",4,"ffff"<-- data that stores the blobs |
70 | // The structures below describe are used for the TokuDB encoding of a row |
71 | // |
72 | |
73 | // used for queries |
74 | typedef struct st_col_pack_info { |
75 | uint32_t col_pack_val; //offset if fixed, pack_index if var |
76 | } COL_PACK_INFO; |
77 | |
78 | // |
79 | // used to define a couple of characteristics of a packed val for the main dictionary or a clustering dictionary |
80 | // fixed_field_size is the size of the fixed fields in the val. |
81 | // len_of_offsets is the size of the bytes that make up the offsets of variable size columns |
82 | // Some notes: |
83 | // If the val has no fixed fields, fixed_field_size is 0 |
84 | // If the val has no variable fields, len_of_offsets is 0 |
85 | // The number of null bytes at the beginning of a row is not saved, it is derived from table_share->null_bytes |
86 | // The pointer to where the variable data in a val starts is table_share->null_bytes + fixed_field_size + len_of_offsets |
87 | // To figure out where the blobs start, find the last offset listed (if offsets exist) |
88 | // |
89 | typedef struct st_multi_col_pack_info { |
90 | uint32_t fixed_field_size; //where the fixed length stuff ends and the offsets for var stuff begins |
91 | uint32_t len_of_offsets; //length of the offset bytes in a packed row |
92 | } MULTI_COL_PACK_INFO; |
93 | |
94 | typedef struct st_key_and_col_info { |
95 | // |
96 | // bitmaps for each key. key_filters[i] is associated with the i'th dictionary |
97 | // States what columns are not stored in the vals of each key, because |
98 | // the column is stored in the key. So, for example, the table (a int, b int, c int, d int, primary key (b,d)) will |
99 | // have bit 1 (for 'b') and bit 3 (for 'd') of the primary key's bitmap set for the main dictionary's bitmap, |
100 | // because 'b' and 'd' do not show up in the val |
101 | // |
102 | MY_BITMAP key_filters[MAX_KEY+1]; |
103 | // |
104 | // following three arrays are used to identify the types of rows in the field |
105 | // If table->field[i] is a fixed field: |
106 | // field_lengths[i] stores the field length, which is fixed |
107 | // length_bytes[i] is 0 |
108 | // 'i' does not show up in the array blob_fields |
109 | // If table->field[i] is a varchar or varbinary: |
110 | // field_lengths[i] is 0 |
111 | // length_bytes[i] stores the number of bytes MySQL uses to encode the length of the field in table->record[0] |
112 | // 'i' does not show up in the array blob_fields |
113 | // If table->field[i] is a blob: |
114 | // field_lengths[i] is 0 |
115 | // length_bytes[i] is 0 |
116 | // 'i' shows up in blob_fields |
117 | // |
118 | void *multi_ptr; |
119 | enum { TOKUDB_FIXED_FIELD, TOKUDB_VARIABLE_FIELD, TOKUDB_BLOB_FIELD}; |
120 | uint8_t *field_types; |
121 | uint16_t* field_lengths; //stores the field lengths of fixed size fields (1<<16 - 1 max), |
122 | uint8_t* length_bytes; // stores the length of lengths of varchars and varbinaries |
123 | uint32_t* blob_fields; // list of indexes of blob fields, |
124 | uint32_t num_blobs; // number of blobs in the table |
125 | // |
126 | // val packing info for all dictionaries. i'th one represents info for i'th dictionary |
127 | // |
128 | MULTI_COL_PACK_INFO mcp_info[MAX_KEY+1]; |
129 | COL_PACK_INFO* cp_info[MAX_KEY+1]; |
130 | // |
131 | // number bytes used to represent an offset in a val. Can be 1 or 2. |
132 | // The number of var fields in a val for dictionary i can be evaluated by |
133 | // mcp_info[i].len_of_offsets/num_offset_bytes. |
134 | // |
135 | uint32_t num_offset_bytes; //number of bytes needed to encode the offset |
136 | } KEY_AND_COL_INFO; |
137 | |
138 | static bool is_fixed_field(KEY_AND_COL_INFO *kcinfo, uint field_num) { |
139 | return kcinfo->field_types[field_num] == KEY_AND_COL_INFO::TOKUDB_FIXED_FIELD; |
140 | } |
141 | |
142 | static bool is_variable_field(KEY_AND_COL_INFO *kcinfo, uint field_num) { |
143 | return kcinfo->field_types[field_num] == KEY_AND_COL_INFO::TOKUDB_VARIABLE_FIELD; |
144 | } |
145 | |
146 | static bool is_blob_field(KEY_AND_COL_INFO *kcinfo, uint field_num) { |
147 | return kcinfo->field_types[field_num] == KEY_AND_COL_INFO::TOKUDB_BLOB_FIELD; |
148 | } |
149 | |
150 | static bool field_valid_for_tokudb_table(Field* field); |
151 | |
152 | static void get_var_field_info( |
153 | uint32_t* field_len, |
154 | uint32_t* start_offset, |
155 | uint32_t var_field_index, |
156 | const uchar* var_field_offset_ptr, |
157 | uint32_t num_offset_bytes |
158 | ); |
159 | |
160 | static void get_blob_field_info( |
161 | uint32_t* start_offset, |
162 | uint32_t len_of_offsets, |
163 | const uchar* var_field_data_ptr, |
164 | uint32_t num_offset_bytes |
165 | ); |
166 | |
167 | static inline uint32_t get_blob_field_len(const uchar* from_tokudb, uint32_t len_bytes) { |
168 | uint32_t length = 0; |
169 | switch (len_bytes) { |
170 | case (1): |
171 | length = (uint32_t)(*from_tokudb); |
172 | break; |
173 | case (2): |
174 | length = uint2korr(from_tokudb); |
175 | break; |
176 | case (3): |
177 | length = tokudb_uint3korr(from_tokudb); |
178 | break; |
179 | case (4): |
180 | length = uint4korr(from_tokudb); |
181 | break; |
182 | default: |
183 | assert_unreachable(); |
184 | } |
185 | return length; |
186 | } |
187 | |
188 | |
189 | static inline const uchar* unpack_toku_field_blob(uchar *to_mysql, const uchar* from_tokudb, uint32_t len_bytes, bool skip) { |
190 | uint32_t length = 0; |
191 | const uchar* data_ptr = NULL; |
192 | if (!skip) { |
193 | memcpy(to_mysql, from_tokudb, len_bytes); |
194 | } |
195 | length = get_blob_field_len(from_tokudb,len_bytes); |
196 | |
197 | data_ptr = from_tokudb + len_bytes; |
198 | if (!skip) { |
199 | memcpy(to_mysql + len_bytes, (uchar *)(&data_ptr), sizeof data_ptr); |
200 | } |
201 | return from_tokudb + len_bytes + length; |
202 | } |
203 | |
204 | static inline uint get_null_offset(TABLE* table, Field* field) { |
205 | #if (50606 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699) || \ |
206 | (50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799) |
207 | return field->null_offset(table->record[0]); |
208 | #else |
209 | return (uint) ((uchar*) field->null_ptr - (uchar*) table->record[0]); |
210 | #endif |
211 | } |
212 | |
213 | typedef enum { |
214 | toku_type_int = 0, |
215 | toku_type_double, |
216 | toku_type_float, |
217 | toku_type_fixbinary, |
218 | toku_type_fixstring, |
219 | toku_type_varbinary, |
220 | toku_type_varstring, |
221 | toku_type_blob, |
222 | toku_type_hpk, //for hidden primary key |
223 | toku_type_unknown |
224 | } TOKU_TYPE; |
225 | |
226 | |
227 | static TOKU_TYPE mysql_to_toku_type (Field* field); |
228 | |
229 | static uchar* pack_toku_varbinary_from_desc( |
230 | uchar* to_tokudb, |
231 | const uchar* from_desc, |
232 | uint32_t key_part_length, //number of bytes to use to encode the length in to_tokudb |
233 | uint32_t field_length //length of field |
234 | ); |
235 | |
236 | static uchar* pack_toku_varstring_from_desc( |
237 | uchar* to_tokudb, |
238 | const uchar* from_desc, |
239 | uint32_t key_part_length, //number of bytes to use to encode the length in to_tokudb |
240 | uint32_t field_length, |
241 | uint32_t charset_num//length of field |
242 | ); |
243 | |
244 | |
245 | static uchar* pack_toku_key_field( |
246 | uchar* to_tokudb, |
247 | uchar* from_mysql, |
248 | Field* field, |
249 | uint32_t key_part_length //I really hope this is temporary as I phase out the pack_cmp stuff |
250 | ); |
251 | |
252 | static uchar* pack_key_toku_key_field( |
253 | uchar* to_tokudb, |
254 | uchar* from_mysql, |
255 | Field* field, |
256 | uint32_t key_part_length //I really hope this is temporary as I phase out the pack_cmp stuff |
257 | ); |
258 | |
259 | static uchar* unpack_toku_key_field( |
260 | uchar* to_mysql, |
261 | uchar* from_tokudb, |
262 | Field* field, |
263 | uint32_t key_part_length |
264 | ); |
265 | |
266 | |
267 | // |
268 | // for storing NULL byte in keys |
269 | // |
270 | #define NULL_COL_VAL 0 |
271 | #define NONNULL_COL_VAL 1 |
272 | |
273 | // |
274 | // for storing if rest of key is +/- infinity |
275 | // |
276 | #define COL_NEG_INF -1 |
277 | #define COL_ZERO 0 |
278 | #define COL_POS_INF 1 |
279 | |
280 | #define COL_FIX_FIELD 0x11 |
281 | #define COL_VAR_FIELD 0x22 |
282 | #define COL_BLOB_FIELD 0x33 |
283 | |
284 | // |
285 | // information for hidden primary keys |
286 | // |
287 | #define TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH 8 |
288 | |
289 | // |
290 | // function to convert a hidden primary key into a byte stream that can be stored in DBT |
291 | // |
292 | static inline void hpk_num_to_char(uchar* to, ulonglong num) { |
293 | int8store(to, num); |
294 | } |
295 | |
296 | // |
297 | // function that takes a byte stream of a hidden primary key and returns a ulonglong |
298 | // |
299 | static inline ulonglong hpk_char_to_num(uchar* val) { |
300 | return uint8korr(val); |
301 | } |
302 | |
303 | static int tokudb_compare_two_keys( |
304 | const void* new_key_data, |
305 | const uint32_t new_key_size, |
306 | const void* saved_key_data, |
307 | const uint32_t saved_key_size, |
308 | const void* row_desc, |
309 | const uint32_t row_desc_size, |
310 | bool cmp_prefix, |
311 | bool* read_string |
312 | ); |
313 | |
314 | static int tokudb_cmp_dbt_key(DB* db, const DBT *keya, const DBT *keyb); |
315 | |
316 | //TODO: QQQ Only do one direction for prefix. |
317 | static int tokudb_prefix_cmp_dbt_key(DB *file, const DBT *keya, const DBT *keyb); |
318 | |
319 | static int tokudb_compare_two_key_parts( |
320 | const void* new_key_data, |
321 | const uint32_t new_key_size, |
322 | const void* saved_key_data, |
323 | const uint32_t saved_key_size, |
324 | const void* row_desc, |
325 | const uint32_t row_desc_size, |
326 | uint max_parts |
327 | ); |
328 | |
329 | static int tokudb_cmp_dbt_key_parts(DB *file, const DBT *keya, const DBT *keyb, uint max_parts); |
330 | |
331 | static int create_toku_key_descriptor( |
332 | uchar* buf, |
333 | bool is_first_hpk, |
334 | KEY* first_key, |
335 | bool is_second_hpk, |
336 | KEY* second_key |
337 | ); |
338 | |
339 | |
340 | static uint32_t create_toku_main_key_pack_descriptor ( |
341 | uchar* buf |
342 | ); |
343 | |
344 | static uint32_t get_max_clustering_val_pack_desc_size( |
345 | TABLE_SHARE* table_share |
346 | ); |
347 | |
348 | static uint32_t create_toku_clustering_val_pack_descriptor ( |
349 | uchar* buf, |
350 | uint pk_index, |
351 | TABLE_SHARE* table_share, |
352 | KEY_AND_COL_INFO* kc_info, |
353 | uint32_t keynr, |
354 | bool is_clustering |
355 | ); |
356 | |
357 | static inline bool is_key_clustering( |
358 | void* row_desc, |
359 | uint32_t row_desc_size |
360 | ) |
361 | { |
362 | return (row_desc_size > 0); |
363 | } |
364 | |
365 | static uint32_t pack_clustering_val_from_desc( |
366 | uchar* buf, |
367 | void* row_desc, |
368 | uint32_t row_desc_size, |
369 | const DBT* pk_val |
370 | ); |
371 | |
372 | static uint32_t get_max_secondary_key_pack_desc_size( |
373 | KEY_AND_COL_INFO* kc_info |
374 | ); |
375 | |
376 | static uint32_t create_toku_secondary_key_pack_descriptor ( |
377 | uchar* buf, |
378 | bool has_hpk, |
379 | uint pk_index, |
380 | TABLE_SHARE* table_share, |
381 | TABLE* table, |
382 | KEY_AND_COL_INFO* kc_info, |
383 | KEY* key_info, |
384 | KEY* prim_key |
385 | ); |
386 | |
387 | static inline bool is_key_pk( |
388 | void* row_desc, |
389 | uint32_t row_desc_size |
390 | ) |
391 | { |
392 | uchar* buf = (uchar *)row_desc; |
393 | return buf[0]; |
394 | } |
395 | |
396 | static uint32_t max_key_size_from_desc( |
397 | void* row_desc, |
398 | uint32_t row_desc_size |
399 | ); |
400 | |
401 | |
402 | static uint32_t pack_key_from_desc( |
403 | uchar* buf, |
404 | void* row_desc, |
405 | uint32_t row_desc_size, |
406 | const DBT* pk_key, |
407 | const DBT* pk_val |
408 | ); |
409 | |
410 | static bool fields_have_same_name( |
411 | Field* a, |
412 | Field* b |
413 | ); |
414 | |
415 | static bool fields_are_same_type( |
416 | Field* a, |
417 | Field* b |
418 | ); |
419 | |
420 | static bool are_two_fields_same( |
421 | Field* a, |
422 | Field* b |
423 | ); |
424 | |
425 | #endif |
426 | |
427 | |