1/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3#ident "$Id$"
4/*======
5This file is part of TokuDB
6
7
8Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
9
    TokuDB is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License, version 2,
12 as published by the Free Software Foundation.
13
14 TokuDB is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with TokuDB. If not, see <http://www.gnu.org/licenses/>.
21
22======= */
23
24#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
25
26#ifndef _HATOKU_CMP
27#define _HATOKU_CMP
28
29#include "hatoku_defines.h"
30#include "tokudb_debug.h"
31
32
33//
34// A MySQL row is encoded in TokuDB, as follows:
35// Keys:
36// Keys pack the defined columns in the order that they are declared.
37// The primary key contains only the columns listed
38// If no primary key is defined, then an eight byte hidden primary key is autogenerated (like an auto increment) and used
// Secondary keys contain the defined key and the primary key.
40// Two examples:
41// 1) table foo (a int, b int, c int, d int, key(b))
42// The key of the main dictionary contains an eight byte autogenerated hidden primary key
43// The key of key-b is the column 'b' followed by the hidden primary key
44// 2) table foo (a int, b int, c int, d int, primary key(a), key(b))
45// The key of the main dictionary contains 'a'
// The key of key-b is the column 'b' followed by 'a'
47// Vals:
48// For secondary keys they are empty.
49// For the main dictionary and clustering keys, they contain all columns that do not show up in the dictionary's key
50// Two examples:
// 1) table foo (a int, b int, c int, d varchar(100), primary key(a), clustering key d(d), clustering key d2(d(20)))
52// the val of the main dictionary contains (b,c,d)
53// the val of d contains (b,c)
54// the val of d2 contains (b,c,d). d is there because the entire row does not show up in the key
55// Vals are encoded as follows. They have four components:
56// 1) Null bytes: contains a bit field that states what columns are NULL.
57// 2) Fixed fields: all fixed fields are then packed together. If a fixed field is NULL, its data is considered junk
58// 3) varchars and varbinaries: stored in two pieces, first all the offsets and then all the data. If a var field is NULL, its data is considered junk
59// 4) blobs: stored in (length, data) pairs. If a blob is NULL, its data is considered junk
60// An example:
61// Table: (a int, b varchar(20), c blob, d bigint, e varbinary(10), f largeblob, g varchar(10)) <-- no primary key defined
62// Row inserted: (1, "bbb", "cc", 100, "eeeee", "ffff", "g")
63// The packed format of the val looks like:
64// NULL byte <-- 1 byte to encode nothing is NULL
65// 1 <-- four bytes for 'a'
// 100 <-- eight bytes for 'd' (it is declared bigint)
67// 3,8,9 <--offsets for location of data fields, note offsets point to where data ENDS
68// "bbbeeeeeg" <-- data for variable length stuff
69// 2,"cc",4,"ffff"<-- data that stores the blobs
// The structures below are used for the TokuDB encoding of a row
71//
72
//
// Per-column packing info, used for queries.
// The single member is interpreted according to the kind of the column:
//   fixed-size column    -> the column's offset
//   variable-size column -> the column's pack index
//
typedef struct st_col_pack_info {
    uint32_t col_pack_val; //offset if fixed, pack_index if var
} COL_PACK_INFO;
77
78//
79// used to define a couple of characteristics of a packed val for the main dictionary or a clustering dictionary
80// fixed_field_size is the size of the fixed fields in the val.
81// len_of_offsets is the size of the bytes that make up the offsets of variable size columns
82// Some notes:
83// If the val has no fixed fields, fixed_field_size is 0
84// If the val has no variable fields, len_of_offsets is 0
85// The number of null bytes at the beginning of a row is not saved, it is derived from table_share->null_bytes
86// The pointer to where the variable data in a val starts is table_share->null_bytes + fixed_field_size + len_of_offsets
87// To figure out where the blobs start, find the last offset listed (if offsets exist)
88//
typedef struct st_multi_col_pack_info {
    // size in bytes of the fixed fields in the val, i.e. where the fixed length
    // stuff ends and the offsets for var stuff begin; 0 if the val has no fixed fields
    uint32_t fixed_field_size;
    // length in bytes of the offsets of the variable size columns in a packed row;
    // 0 if the val has no variable fields
    uint32_t len_of_offsets;
} MULTI_COL_PACK_INFO;
93
//
// Describes, for one table, which columns live in which dictionary's key and
// how the remaining columns are packed into each dictionary's val.
//
typedef struct st_key_and_col_info {
    //
    // bitmaps for each key. key_filters[i] is associated with the i'th dictionary.
    // A set bit states that the column is NOT stored in the val of that
    // dictionary, because the column is already stored in its key.
    // So, for example, for the table
    //   (a int, b int, c int, d int, primary key (b,d))
    // the main dictionary's bitmap has bit 1 (for 'b') and bit 3 (for 'd') set,
    // because 'b' and 'd' do not show up in the val.
    //
    MY_BITMAP key_filters[MAX_KEY+1];
    //
    // the following arrays are used to identify the types of the fields in a row
    // If table->field[i] is a fixed field:
    //   field_lengths[i] stores the field length, which is fixed
    //   length_bytes[i] is 0
    //   'i' does not show up in the array blob_fields
    // If table->field[i] is a varchar or varbinary:
    //   field_lengths[i] is 0
    //   length_bytes[i] stores the number of bytes MySQL uses to encode the length of the field in table->record[0]
    //   'i' does not show up in the array blob_fields
    // If table->field[i] is a blob:
    //   field_lengths[i] is 0
    //   length_bytes[i] is 0
    //   'i' shows up in blob_fields
    //
    // NOTE(review): multi_ptr is not described anywhere in this header;
    // presumably it is the single allocation that backs the arrays below -
    // confirm against the code that allocates and frees this structure.
    void *multi_ptr;
    // the possible values of an entry of field_types
    enum { TOKUDB_FIXED_FIELD, TOKUDB_VARIABLE_FIELD, TOKUDB_BLOB_FIELD};
    uint8_t *field_types; // field_types[i] holds one of the TOKUDB_*_FIELD values above for field i
    uint16_t* field_lengths; //stores the field lengths of fixed size fields ((1<<16) - 1 max)
    uint8_t* length_bytes; // stores the length of lengths of varchars and varbinaries
    uint32_t* blob_fields; // list of indexes of blob fields
    uint32_t num_blobs; // number of blobs in the table
    //
    // val packing info for all dictionaries. i'th one represents info for i'th dictionary
    //
    MULTI_COL_PACK_INFO mcp_info[MAX_KEY+1];
    COL_PACK_INFO* cp_info[MAX_KEY+1];
    //
    // number of bytes used to represent an offset in a val. Can be 1 or 2.
    // The number of var fields in a val for dictionary i can be evaluated by
    // mcp_info[i].len_of_offsets/num_offset_bytes.
    //
    uint32_t num_offset_bytes; //number of bytes needed to encode the offset
} KEY_AND_COL_INFO;
137
138static bool is_fixed_field(KEY_AND_COL_INFO *kcinfo, uint field_num) {
139 return kcinfo->field_types[field_num] == KEY_AND_COL_INFO::TOKUDB_FIXED_FIELD;
140}
141
142static bool is_variable_field(KEY_AND_COL_INFO *kcinfo, uint field_num) {
143 return kcinfo->field_types[field_num] == KEY_AND_COL_INFO::TOKUDB_VARIABLE_FIELD;
144}
145
146static bool is_blob_field(KEY_AND_COL_INFO *kcinfo, uint field_num) {
147 return kcinfo->field_types[field_num] == KEY_AND_COL_INFO::TOKUDB_BLOB_FIELD;
148}
149
150static bool field_valid_for_tokudb_table(Field* field);
151
152static void get_var_field_info(
153 uint32_t* field_len,
154 uint32_t* start_offset,
155 uint32_t var_field_index,
156 const uchar* var_field_offset_ptr,
157 uint32_t num_offset_bytes
158 );
159
160static void get_blob_field_info(
161 uint32_t* start_offset,
162 uint32_t len_of_offsets,
163 const uchar* var_field_data_ptr,
164 uint32_t num_offset_bytes
165 );
166
167static inline uint32_t get_blob_field_len(const uchar* from_tokudb, uint32_t len_bytes) {
168 uint32_t length = 0;
169 switch (len_bytes) {
170 case (1):
171 length = (uint32_t)(*from_tokudb);
172 break;
173 case (2):
174 length = uint2korr(from_tokudb);
175 break;
176 case (3):
177 length = tokudb_uint3korr(from_tokudb);
178 break;
179 case (4):
180 length = uint4korr(from_tokudb);
181 break;
182 default:
183 assert_unreachable();
184 }
185 return length;
186}
187
188
189static inline const uchar* unpack_toku_field_blob(uchar *to_mysql, const uchar* from_tokudb, uint32_t len_bytes, bool skip) {
190 uint32_t length = 0;
191 const uchar* data_ptr = NULL;
192 if (!skip) {
193 memcpy(to_mysql, from_tokudb, len_bytes);
194 }
195 length = get_blob_field_len(from_tokudb,len_bytes);
196
197 data_ptr = from_tokudb + len_bytes;
198 if (!skip) {
199 memcpy(to_mysql + len_bytes, (uchar *)(&data_ptr), sizeof data_ptr);
200 }
201 return from_tokudb + len_bytes + length;
202}
203
//
// Returns the offset of the byte that holds the NULL indicator of `field`,
// measured from the start of table->record[0].
// For MYSQL_VERSION_ID in 50606..50699 or 50700..50799 the server's
// Field::null_offset() is asked for it; every other version computes it
// directly from field->null_ptr.
// NOTE(review): the two branches are presumably equivalent - confirm against
// the server's definition of Field::null_offset().
//
static inline uint get_null_offset(TABLE* table, Field* field) {
#if (50606 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699) || \
    (50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799)
    return field->null_offset(table->record[0]);
#else
    // distance in bytes between the field's null byte and the start of the row buffer
    return (uint) ((uchar*) field->null_ptr - (uchar*) table->record[0]);
#endif
}
212
//
// TokuDB's classification of a column, as returned by mysql_to_toku_type().
// Only toku_type_int has an explicit value (0); the remaining enumerators
// follow in declaration order, so reordering them changes their values.
//
typedef enum {
    toku_type_int = 0,
    toku_type_double,
    toku_type_float,
    toku_type_fixbinary,
    toku_type_fixstring,
    toku_type_varbinary,
    toku_type_varstring,
    toku_type_blob,
    toku_type_hpk, //for hidden primary key
    toku_type_unknown
} TOKU_TYPE;
225
226
227static TOKU_TYPE mysql_to_toku_type (Field* field);
228
229static uchar* pack_toku_varbinary_from_desc(
230 uchar* to_tokudb,
231 const uchar* from_desc,
232 uint32_t key_part_length, //number of bytes to use to encode the length in to_tokudb
233 uint32_t field_length //length of field
234 );
235
static uchar* pack_toku_varstring_from_desc(
    uchar* to_tokudb,
    const uchar* from_desc,
    uint32_t key_part_length, //number of bytes to use to encode the length in to_tokudb
    uint32_t field_length, //length of field
    uint32_t charset_num //charset number of the field; NOTE(review): presumably a MySQL collation id - confirm against the definition
    );
243
244
245static uchar* pack_toku_key_field(
246 uchar* to_tokudb,
247 uchar* from_mysql,
248 Field* field,
249 uint32_t key_part_length //I really hope this is temporary as I phase out the pack_cmp stuff
250 );
251
252static uchar* pack_key_toku_key_field(
253 uchar* to_tokudb,
254 uchar* from_mysql,
255 Field* field,
256 uint32_t key_part_length //I really hope this is temporary as I phase out the pack_cmp stuff
257 );
258
259static uchar* unpack_toku_key_field(
260 uchar* to_mysql,
261 uchar* from_tokudb,
262 Field* field,
263 uint32_t key_part_length
264 );
265
266
267//
268// for storing NULL byte in keys
269//
270#define NULL_COL_VAL 0
271#define NONNULL_COL_VAL 1
272
//
// for storing if rest of key is +/- infinity
//
// The negative expansion is parenthesized so that it always stays a single
// operand, whatever expression the macro is used in.
//
#define COL_NEG_INF (-1)
#define COL_ZERO 0
#define COL_POS_INF 1
279
280#define COL_FIX_FIELD 0x11
281#define COL_VAR_FIELD 0x22
282#define COL_BLOB_FIELD 0x33
283
284//
285// information for hidden primary keys
286//
287#define TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH 8
288
//
// function to convert a hidden primary key into a byte stream that can be stored in DBT
// Parameters:
//   to  - destination buffer; the value is written with int8store()
//   num - the hidden primary key value
// NOTE(review): `to` presumably needs room for
// TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH (8) bytes - confirm against int8store().
//
static inline void hpk_num_to_char(uchar* to, ulonglong num) {
    int8store(to, num);
}
295
//
// function that takes a byte stream of a hidden primary key and returns a ulonglong
// Parameters:
//   val - the stored key bytes, read with uint8korr(), the counterpart of
//         the int8store() used by hpk_num_to_char()
//
static inline ulonglong hpk_char_to_num(uchar* val) {
    return uint8korr(val);
}
302
303static int tokudb_compare_two_keys(
304 const void* new_key_data,
305 const uint32_t new_key_size,
306 const void* saved_key_data,
307 const uint32_t saved_key_size,
308 const void* row_desc,
309 const uint32_t row_desc_size,
310 bool cmp_prefix,
311 bool* read_string
312 );
313
314static int tokudb_cmp_dbt_key(DB* db, const DBT *keya, const DBT *keyb);
315
316//TODO: QQQ Only do one direction for prefix.
317static int tokudb_prefix_cmp_dbt_key(DB *file, const DBT *keya, const DBT *keyb);
318
319static int tokudb_compare_two_key_parts(
320 const void* new_key_data,
321 const uint32_t new_key_size,
322 const void* saved_key_data,
323 const uint32_t saved_key_size,
324 const void* row_desc,
325 const uint32_t row_desc_size,
326 uint max_parts
327 );
328
329static int tokudb_cmp_dbt_key_parts(DB *file, const DBT *keya, const DBT *keyb, uint max_parts);
330
331static int create_toku_key_descriptor(
332 uchar* buf,
333 bool is_first_hpk,
334 KEY* first_key,
335 bool is_second_hpk,
336 KEY* second_key
337 );
338
339
340static uint32_t create_toku_main_key_pack_descriptor (
341 uchar* buf
342 );
343
344static uint32_t get_max_clustering_val_pack_desc_size(
345 TABLE_SHARE* table_share
346 );
347
348static uint32_t create_toku_clustering_val_pack_descriptor (
349 uchar* buf,
350 uint pk_index,
351 TABLE_SHARE* table_share,
352 KEY_AND_COL_INFO* kc_info,
353 uint32_t keynr,
354 bool is_clustering
355 );
356
//
// Tells whether a dictionary is a clustering one, judged purely by whether
// its row descriptor is non-empty.
// Parameters:
//   row_desc      - the descriptor; not inspected
//   row_desc_size - size of the descriptor in bytes
// Returns true when row_desc_size is non-zero.
//
static inline bool is_key_clustering(
    void* row_desc,
    uint32_t row_desc_size
    )
{
    const bool has_descriptor = (row_desc_size != 0);
    return has_descriptor;
}
364
365static uint32_t pack_clustering_val_from_desc(
366 uchar* buf,
367 void* row_desc,
368 uint32_t row_desc_size,
369 const DBT* pk_val
370 );
371
372static uint32_t get_max_secondary_key_pack_desc_size(
373 KEY_AND_COL_INFO* kc_info
374 );
375
376static uint32_t create_toku_secondary_key_pack_descriptor (
377 uchar* buf,
378 bool has_hpk,
379 uint pk_index,
380 TABLE_SHARE* table_share,
381 TABLE* table,
382 KEY_AND_COL_INFO* kc_info,
383 KEY* key_info,
384 KEY* prim_key
385 );
386
//
// Tells whether a key pack descriptor belongs to the primary key.
// The answer is the first byte of the descriptor: non-zero means primary key.
// Parameters:
//   row_desc      - the descriptor
//   row_desc_size - size of the descriptor in bytes
// Returns false for a NULL or empty descriptor instead of reading a byte
// that is not there.
//
static inline bool is_key_pk(
    void* row_desc,
    uint32_t row_desc_size
    )
{
    if (row_desc == NULL || row_desc_size == 0) {
        return false;
    }
    const unsigned char* desc = (const unsigned char *)row_desc;
    return desc[0] != 0;
}
395
396static uint32_t max_key_size_from_desc(
397 void* row_desc,
398 uint32_t row_desc_size
399 );
400
401
402static uint32_t pack_key_from_desc(
403 uchar* buf,
404 void* row_desc,
405 uint32_t row_desc_size,
406 const DBT* pk_key,
407 const DBT* pk_val
408 );
409
410static bool fields_have_same_name(
411 Field* a,
412 Field* b
413 );
414
415static bool fields_are_same_type(
416 Field* a,
417 Field* b
418 );
419
420static bool are_two_fields_same(
421 Field* a,
422 Field* b
423 );
424
425#endif
426
427