1/*
2 Copyright (c) 2009, 2011, Monty Program Ab
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */
16
17/**
18 @defgroup DS-MRR declarations
19 @{
20*/
21
22/**
23 A Disk-Sweep implementation of MRR Interface (DS-MRR for short)
24
25 This is a "plugin"(*) for storage engines that allows to
26 1. When doing index scans, read table rows in rowid order;
27 2. when making many index lookups, do them in key order and don't
28 lookup the same key value multiple times;
29 3. Do both #1 and #2, when applicable.
30 These changes are expected to speed up query execution for disk-based
31 storage engines running io-bound loads and "big" queries (ie. queries that
32 do joins and enumerate lots of records).
33
34 (*) - only conceptually. No dynamic loading or binary compatibility of any
35 kind.
36
37 General scheme of things:
38
39 SQL Layer code
40 | | |
41 v v v
42 -|---|---|---- handler->multi_range_read_XXX() function calls
43 | | |
44 _____________________________________
45 / DS-MRR module \
46 | (order/de-duplicate lookup keys, |
47 | scan indexes in key order, |
48 | order/de-duplicate rowids, |
49 | retrieve full record reads in rowid |
50 | order) |
51 \_____________________________________/
52 | | |
53 -|---|---|----- handler->read_range_first()/read_range_next(),
54 | | | handler->index_read(), handler->rnd_pos() calls.
55 | | |
56 v v v
57 Storage engine internals
58
59
60 Currently DS-MRR is used by MyISAM, InnoDB and Maria storage engines.
61 Potentially it can be used with any table handler that has disk-based data
62 storage and has better performance when reading data in rowid order.
63*/
64
65#include "sql_lifo_buffer.h"
66
67class DsMrr_impl;
68class Mrr_ordered_index_reader;
69
70
71/* A structure with key parameters that's shared among several classes */
72class Key_parameters
73{
74public:
75 uint key_tuple_length; /* Length of index lookup tuple, in bytes */
76 key_part_map key_tuple_map; /* keyparts used in index lookup tuples */
77
78 /*
79 This is
80 = key_tuple_length if we copy keys to buffer
81 = sizeof(void*) if we're using pointers to materialized keys.
82 */
83 uint key_size_in_keybuf;
84
85 /* TRUE <=> don't copy key values, use pointers to them instead. */
86 bool use_key_pointers;
87
88 /* TRUE <=> We can get at most one index tuple for a lookup key */
89 bool index_ranges_unique;
90};
91
92
93/**
94 A class to enumerate (record, range_id) pairs that match given key value.
95
96 @note
97
98 The idea is that we have a Lifo_buffer which holds (key, range_id) pairs
99 ordered by key value. From the front of the buffer we see
100
101 (key_val1, range_id1), (key_val1, range_id2) ... (key_val2, range_idN)
102
103 we take the first elements that have the same key value (key_val1 in the
104 example above), and make lookup into the table. The table will have
105 multiple matches for key_val1:
106
107 == Table Index ==
108 ...
109 key_val1 -> key_val1, index_tuple1
110 key_val1, index_tuple2
111 ...
112 key_val1, index_tupleN
113 ...
114
115 Our goal is to produce all possible combinations, i.e. we need:
116
117 {(key_val1, index_tuple1), range_id1}
118 {(key_val1, index_tuple1), range_id2}
119 ... ... |
120 {(key_val1, index_tuple1), range_idN},
121
122 {(key_val1, index_tuple2), range_id1}
123 {(key_val1, index_tuple2), range_id2}
124 ... ... |
125 {(key_val1, index_tuple2), range_idN},
126
127 ... ... ...
128
129 {(key_val1, index_tupleK), range_idN}
130*/
131
132class Key_value_records_iterator
133{
134 /* Use this to get table handler, key buffer and other parameters */
135 Mrr_ordered_index_reader *owner;
136
137 /* Iterator to get (key, range_id) pairs from */
138 Lifo_buffer_iterator identical_key_it;
139
140 /*
141 Last of the identical key values (when we get this pointer from
142 identical_key_it, it will be time to stop).
143 */
144 uchar *last_identical_key_ptr;
145
146 /*
147 FALSE <=> we're right after the init() call, the record has been already
148 read with owner->file->index_read_map() call
149 */
150 bool get_next_row;
151
152public:
153 int init(Mrr_ordered_index_reader *owner_arg);
154 int get_next(range_id_t *range_info);
155 void move_to_next_key_value();
156};
157
158
159/*
160 Buffer manager interface. Mrr_reader objects use it to inqure DsMrr_impl
161 to manage buffer space for them.
162*/
163typedef struct st_buffer_manager
164{
165public:
166 /* Opaque value to be passed as the first argument to all member functions */
167 void *arg;
168
169 /*
170 This is called when we've freed more space from the rowid buffer. The
171 callee will get the unused space from the rowid buffer and give it to the
172 key buffer.
173 */
174 void (*redistribute_buffer_space)(void *arg);
175
176 /*
177 This is called when both key and rowid buffers are empty, and so it's time
178 to reset them to their original size (They've lost their original size,
179 because we were dynamically growing rowid buffer and shrinking key buffer).
180 */
181 void (*reset_buffer_sizes)(void *arg);
182
183} Buffer_manager;
184
185
186/*
187 Mrr_reader - DS-MRR execution strategy abstraction
188
189 A reader produces ([index]_record, range_info) pairs, and requires periodic
190 refill operations.
191
192 - one starts using the reader by calling reader->get_next(),
193 - when a get_next() call returns HA_ERR_END_OF_FILE, one must call
194 refill_buffer() before they can make more get_next() calls.
195 - when refill_buffer() returns HA_ERR_END_OF_FILE, this means the real
196 end of stream and get_next() should not be called anymore.
197
198 Both functions can return other error codes, these mean unrecoverable errors
199 after which one cannot continue.
200*/
201
202class Mrr_reader
203{
204public:
205 virtual int get_next(range_id_t *range_info) = 0;
206 virtual int refill_buffer(bool initial) = 0;
207 virtual ~Mrr_reader() {}; /* just to remove compiler warning */
208};
209
210
211/*
212 A common base for readers that do index scans and produce index tuples
213*/
214
215class Mrr_index_reader : public Mrr_reader
216{
217protected:
218 handler *file; /* Handler object to use */
219public:
220 virtual int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
221 void *seq_init_param, uint n_ranges,
222 uint mode, Key_parameters *key_par,
223 Lifo_buffer *key_buffer,
224 Buffer_manager *buf_manager_arg) = 0;
225
226 /* Get pointer to place where every get_next() call will put rowid */
227 virtual uchar *get_rowid_ptr() = 0;
228 /* Get the rowid (call this after get_next() call) */
229 virtual void position();
230 virtual bool skip_record(range_id_t range_id, uchar *rowid) = 0;
231
232 virtual void interrupt_read() {}
233 virtual void resume_read() {}
234};
235
236
237/*
238 A "bypass" index reader that just does and index scan. The index scan is done
239 by calling default MRR implementation (i.e. handler::multi_range_read_XXX())
240 functions.
241*/
242
243class Mrr_simple_index_reader : public Mrr_index_reader
244{
245public:
246 int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
247 void *seq_init_param, uint n_ranges,
248 uint mode, Key_parameters *key_par,
249 Lifo_buffer *key_buffer,
250 Buffer_manager *buf_manager_arg);
251 int get_next(range_id_t *range_info);
252 int refill_buffer(bool initial) { return initial? 0: HA_ERR_END_OF_FILE; }
253 uchar *get_rowid_ptr() { return file->ref; }
254 bool skip_record(range_id_t range_id, uchar *rowid)
255 {
256 return (file->mrr_funcs.skip_record &&
257 file->mrr_funcs.skip_record(file->mrr_iter, range_id, rowid));
258 }
259};
260
261
262/*
263 A reader that sorts the key values before it makes the index lookups.
264*/
265
266class Mrr_ordered_index_reader : public Mrr_index_reader
267{
268public:
269 int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
270 void *seq_init_param, uint n_ranges,
271 uint mode, Key_parameters *key_par,
272 Lifo_buffer *key_buffer,
273 Buffer_manager *buf_manager_arg);
274 int get_next(range_id_t *range_info);
275 int refill_buffer(bool initial);
276 uchar *get_rowid_ptr() { return file->ref; }
277
278 bool skip_record(range_id_t range_info, uchar *rowid)
279 {
280 return (mrr_funcs.skip_record &&
281 mrr_funcs.skip_record(mrr_iter, range_info, rowid));
282 }
283
284 bool skip_index_tuple(range_id_t range_info)
285 {
286 return (mrr_funcs.skip_index_tuple &&
287 mrr_funcs.skip_index_tuple(mrr_iter, range_info));
288 }
289
290 bool set_interruption_temp_buffer(uint rowid_length, uint key_len,
291 uint saved_pk_len,
292 uchar **space_start, uchar *space_end);
293 void set_no_interruption_temp_buffer();
294
295 void interrupt_read();
296 void resume_read();
297 void position();
298private:
299 Key_value_records_iterator kv_it;
300
301 bool scanning_key_val_iter;
302
303 /* Buffer to store (key, range_id) pairs */
304 Lifo_buffer *key_buffer;
305
306 /* This manages key buffer allocation and sizing for us */
307 Buffer_manager *buf_manager;
308
309 Key_parameters keypar; /* index scan and lookup tuple parameters */
310
311 /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
312 bool is_mrr_assoc;
313
314 /* Range sequence iteration members */
315 RANGE_SEQ_IF mrr_funcs;
316 range_seq_t mrr_iter;
317
318 /* TRUE == reached eof when enumerating ranges */
319 bool source_exhausted;
320
321 /*
322 Following members are for interrupt_read()/resume_read(). The idea is that
323 in some cases index scan that is done by this object is interrupted by
324 rnd_pos() calls made by Mrr_ordered_rndpos_reader. The problem is that
325 we're sharing handler->record[0] with that object, and it destroys its
326 contents.
327 We need to save/restore our current
328 - index tuple (for pushed index condition checks)
329 - clustered primary key values (again, for pushed index condition checks)
330 - rowid of the last record we've retrieved (in case this rowid matches
331 multiple ranges and we'll need to return it again)
332 */
333 bool support_scan_interruptions;
334 /* Space where we save the rowid of the last record we've returned */
335 uchar *saved_rowid;
336
337 /* TRUE <=> saved_rowid has the last saved rowid */
338 bool have_saved_rowid;
339
340 uchar *saved_key_tuple; /* Saved current key tuple */
341 uchar *saved_primary_key; /* Saved current primary key tuple */
342
343 /*
344 TRUE<=> saved_key_tuple (and saved_primary_key when applicable) have
345 valid values.
346 */
347 bool read_was_interrupted;
348
349 static int compare_keys(void* arg, uchar* key1, uchar* key2);
350 static int compare_keys_reverse(void* arg, uchar* key1, uchar* key2);
351
352 friend class Key_value_records_iterator;
353 friend class DsMrr_impl;
354 friend class Mrr_ordered_rndpos_reader;
355};
356
357
358/*
359 A reader that gets rowids from an Mrr_index_reader, and then sorts them
360 before getting full records with handler->rndpos() calls.
361*/
362
363class Mrr_ordered_rndpos_reader : public Mrr_reader
364{
365public:
366 int init(handler *file, Mrr_index_reader *index_reader, uint mode,
367 Lifo_buffer *buf);
368 int get_next(range_id_t *range_info);
369 int refill_buffer(bool initial);
370private:
371 handler *file; /* Handler to use */
372
373 /* This what we get (rowid, range_info) pairs from */
374 Mrr_index_reader *index_reader;
375
376 /* index_reader->get_next() puts rowid here */
377 uchar *index_rowid;
378
379 /* TRUE <=> index_reader->refill_buffer() call has returned EOF */
380 bool index_reader_exhausted;
381
382 /*
383 TRUE <=> We should call index_reader->refill_buffer(). This happens if
384 1. we've made index_reader->get_next() call which returned EOF
385 2. we haven't made any index_reader calls (and our first call should
386 be index_reader->refill_buffer(initial=TRUE)
387 */
388 bool index_reader_needs_refill;
389
390 /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
391 bool is_mrr_assoc;
392
393 /*
394 When reading from ordered rowid buffer: the rowid element of the last
395 buffer element that has rowid identical to this one.
396 */
397 uchar *last_identical_rowid;
398
399 /* Buffer to store (rowid, range_id) pairs */
400 Lifo_buffer *rowid_buffer;
401
402 int refill_from_index_reader();
403};
404
405
406/*
407 A primitive "factory" of various Mrr_*_reader classes (the point is to
408 get various kinds of readers without having to allocate them on the heap)
409*/
410
411class Mrr_reader_factory
412{
413public:
414 Mrr_ordered_rndpos_reader ordered_rndpos_reader;
415 Mrr_ordered_index_reader ordered_index_reader;
416 Mrr_simple_index_reader simple_index_reader;
417};
418
419
420#define DSMRR_IMPL_SORT_KEYS HA_MRR_IMPLEMENTATION_FLAG1
421#define DSMRR_IMPL_SORT_ROWIDS HA_MRR_IMPLEMENTATION_FLAG2
422
423/*
424 DS-MRR implementation for one table. Create/use one object of this class for
425 each ha_{myisam/innobase/etc} object. That object will be further referred to
426 as "the handler"
427
428 DsMrr_impl supports has the following execution strategies:
429
430 - Bypass DS-MRR, pass all calls to default MRR implementation, which is
431 an MRR-to-non-MRR call converter.
432 - Key-Ordered Retrieval
433 - Rowid-Ordered Retrieval
434
435 DsMrr_impl will use one of the above strategies, or a combination of them,
436 according to the following diagram:
437
438 (mrr function calls)
439 |
440 +----------------->-----------------+
441 | |
442 ___________v______________ _______________v________________
443 / default: use lookup keys \ / KEY-ORDERED RETRIEVAL: \
444 | (or ranges) in whatever | | sort lookup keys and then make |
445 | order they are supplied | | index lookups in index order |
446 \__________________________/ \________________________________/
447 | | | | |
448 +---<---+ | +--------------->-----------|----+
449 | | | |
450 | | +---------------+ |
451 | ______v___ ______ | _______________v_______________
452 | / default: read \ | / ROWID-ORDERED RETRIEVAL: \
453 | | table records | | | Before reading table records, |
454 v | in random order | v | sort their rowids and then |
455 | \_________________/ | | read them in rowid order |
456 | | | \_______________________________/
457 | | | |
458 | | | |
459 +-->---+ | +----<------+-----------<--------+
460 | | |
461 v v v
462 (table records and range_ids)
463
464 The choice of strategy depends on MRR scan properties, table properties
465 (whether we're scanning clustered primary key), and @@optimizer_switch
466 settings.
467
468 Key-Ordered Retrieval
469 ---------------------
470 The idea is: if MRR scan is essentially a series of lookups on
471
472 tbl.key=value1 OR tbl.key=value2 OR ... OR tbl.key=valueN
473
474 then it makes sense to collect and order the set of lookup values, i.e.
475
476 sort(value1, value2, .. valueN)
477
478 and then do index lookups in index order. This results in fewer index page
479 fetch operations, and we also can avoid making multiple index lookups for the
480 same value. That is, if value1=valueN we can easily discover that after
481 sorting and make one index lookup for them instead of two.
482
483 Rowid-Ordered Retrieval
484 -----------------------
485 If we do a regular index scan or a series of index lookups, we'll be hitting
486 table records at random. For disk-based engines, this is much slower than
487 reading the same records in disk order. We assume that disk ordering of
488 rows is the same as ordering of their rowids (which is provided by
489 handler::cmp_ref())
490 In order to retrieve records in different order, we must separate index
491 scanning and record fetching, that is, MRR scan uses the following steps:
492
493 1. Scan the index (and only index, that is, with HA_EXTRA_KEYREAD on) and
494 fill a buffer with {rowid, range_id} pairs
495 2. Sort the buffer by rowid value
496 3. for each {rowid, range_id} pair in the buffer
497 get record by rowid and return the {record, range_id} pair
498 4. Repeat the above steps until we've exhausted the list of ranges we're
499 scanning.
500
501 Buffer space management considerations
502 --------------------------------------
503 With regards to buffer/memory management, MRR interface specifies that
504 - SQL layer provides multi_range_read_init() with buffer of certain size.
505 - MRR implementation may use (i.e. have at its disposal till the end of
506 the MRR scan) all of the buffer, or return the unused end of the buffer
507 to SQL layer.
508
509 DS-MRR needs buffer in order to accumulate and sort rowids and/or keys. When
510 we need to accumulate/sort only keys (or only rowids), it is fairly trivial.
511
512 When we need to accumulate/sort both keys and rowids, efficient buffer use
513 gets complicated. We need to:
514 - First, accumulate keys and sort them
515 - Then use the keys (smaller values go first) to obtain rowids. A key is not
516 needed after we've got matching rowids for it.
517 - Make sure that rowids are accumulated at the front of the buffer, so that we
518 can return the end part of the buffer to SQL layer, should there be too
519 few rowid values to occupy the buffer.
520
521 All of these goals are achieved by using the following scheme:
522
523 | | We get an empty buffer from SQL layer.
524
525 | *-|
526 | *----| First, we fill the buffer with keys. Key_buffer
527 | *-------| part grows from end of the buffer space to start
528 | *----------| (In this picture, the buffer is big enough to
529 | *-------------| accomodate all keys and even have some space left)
530
531 | *=============| We want to do key-ordered index scan, so we sort
532 the keys
533
534 |-x *===========| Then we use the keys get rowids. Rowids are
535 |----x *========| stored from start of buffer space towards the end.
536 |--------x *=====| The part of the buffer occupied with keys
537 |------------x *===| gradually frees up space for rowids. In this
538 |--------------x *=| picture we run out of keys before we've ran out
539 |----------------x | of buffer space (it can be other way as well).
540
541 |================x | Then we sort the rowids.
542
543 | |~~~| The unused part of the buffer is at the end, so
544 we can return it to the SQL layer.
545
546 |================* Sorted rowids are then used to read table records
547 in disk order
548
549*/
550
551class DsMrr_impl
552{
553public:
554 typedef void (handler::*range_check_toggle_func_t)(bool on);
555
556 DsMrr_impl()
557 : secondary_file(NULL) {};
558
559 void init(handler *h_arg, TABLE *table_arg)
560 {
561 primary_file= h_arg;
562 table= table_arg;
563 }
564 int dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
565 void *seq_init_param, uint n_ranges, uint mode,
566 HANDLER_BUFFER *buf);
567 void dsmrr_close();
568 int dsmrr_next(range_id_t *range_info);
569
570 ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint key_parts,
571 uint *bufsz, uint *flags, Cost_estimate *cost);
572
573 ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq,
574 void *seq_init_param, uint n_ranges, uint *bufsz,
575 uint *flags, Cost_estimate *cost);
576
577 int dsmrr_explain_info(uint mrr_mode, char *str, size_t size);
578private:
579 /* Buffer to store (key, range_id) pairs */
580 Lifo_buffer *key_buffer;
581
582 /*
583 The "owner" handler object (the one that is expected to "own" this object
584 and call its functions).
585 */
586 handler *primary_file;
587 TABLE *table; /* Always equal to primary_file->table */
588
589 /*
590 Secondary handler object. (created when needed, we need it when we need
591 to run both index scan and rnd_pos() scan at the same time)
592 */
593 handler *secondary_file;
594
595 uint keyno; /* index we're running the scan on */
596 /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
597 bool is_mrr_assoc;
598
599 Mrr_reader_factory reader_factory;
600
601 Mrr_reader *strategy;
602 bool strategy_exhausted;
603
604 Mrr_index_reader *index_strategy;
605
606 /* The whole buffer space that we're using */
607 uchar *full_buf;
608 uchar *full_buf_end;
609
610 /*
611 When using both rowid and key buffers: the boundary between key and rowid
612 parts of the buffer. This is the "original" value, actual memory ranges
613 used by key and rowid parts may be different because of dynamic space
614 reallocation between them.
615 */
616 uchar *rowid_buffer_end;
617
618 /*
619 One of the following two is used for key buffer: forward is used when
620 we only need key buffer, backward is used when we need both key and rowid
621 buffers.
622 */
623 Forward_lifo_buffer forward_key_buf;
624 Backward_lifo_buffer backward_key_buf;
625
626 /*
627 Buffer to store (rowid, range_id) pairs, or just rowids if
628 is_mrr_assoc==FALSE
629 */
630 Forward_lifo_buffer rowid_buffer;
631
632 bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz,
633 Cost_estimate *cost);
634 bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
635 uint *buffer_size, Cost_estimate *cost);
636 bool check_cpk_scan(THD *thd, TABLE_SHARE *share, uint keyno, uint mrr_flags);
637
638 bool setup_buffer_sharing(uint key_size_in_keybuf, key_part_map key_tuple_map);
639
640 /* Buffer_manager and its member functions */
641 Buffer_manager buf_manager;
642 static void redistribute_buffer_space(void *dsmrr_arg);
643 static void reset_buffer_sizes(void *dsmrr_arg);
644 static void do_nothing(void *dsmrr_arg);
645
646 Lifo_buffer* get_key_buffer() { return key_buffer; }
647
648 friend class Key_value_records_iterator;
649 friend class Mrr_ordered_index_reader;
650 friend class Mrr_ordered_rndpos_reader;
651
652 int setup_two_handlers();
653 void close_second_handler();
654};
655
656/**
657 @} (end of group DS-MRR declarations)
658*/
659
660