multi_range_read.h source code [MariaDB/sql/multi_range_read.h]

1	/*
2	Copyright (c) 2009, 2011, Monty Program Ab
3
4	This program is free software; you can redistribute it and/or modify
5	it under the terms of the GNU General Public License as published by
6	the Free Software Foundation; version 2 of the License.
7
8	This program is distributed in the hope that it will be useful,
9	but WITHOUT ANY WARRANTY; without even the implied warranty of
10	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11	GNU General Public License for more details.
12
13	You should have received a copy of the GNU General Public License
14	along with this program; if not, write to the Free Software
15	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */
16
17	/**
18	@defgroup DS-MRR declarations
19	@{
20	*/
21
22	/**
23	A Disk-Sweep implementation of MRR Interface (DS-MRR for short)
24
25	This is a "plugin"(*) for storage engines that allows to
26	1. When doing index scans, read table rows in rowid order;
27	2. when making many index lookups, do them in key order and don't
28	lookup the same key value multiple times;
29	3. Do both #1 and #2, when applicable.
30	These changes are expected to speed up query execution for disk-based
31	storage engines running io-bound loads and "big" queries (ie. queries that
32	do joins and enumerate lots of records).
33
34	(*) - only conceptually. No dynamic loading or binary compatibility of any
35	kind.
36
37	General scheme of things:
38
39	    SQL Layer code
40	     |   |   |
41	     v   v   v
42	    -|---|---|---- handler->multi_range_read_XXX() function calls
43	     |   |   |
44	    _____________________________________
45	   / DS-MRR module                       \
46	   | (order/de-duplicate lookup keys,    |
47	   | scan indexes in key order,          |
48	   | order/de-duplicate rowids,          |
49	   | retrieve full record reads in rowid |
50	   | order)                              |
51	   \_____________________________________/
52	     |   |   |
53	    -|---|---|----- handler->read_range_first()/read_range_next(),
54	     |   |   |      handler->index_read(), handler->rnd_pos() calls.
55	     |   |   |
56	     v   v   v
57	    Storage engine internals
58
59
60	Currently DS-MRR is used by MyISAM, InnoDB and Maria storage engines.
61	Potentially it can be used with any table handler that has disk-based data
62	storage and has better performance when reading data in rowid order.
63	*/
64
65	#include "sql_lifo_buffer.h"
66
/* Forward declarations: both classes are referenced before they are defined. */
class DsMrr_impl;
class Mrr_ordered_index_reader;
69
70
71	/ A structure with key parameters that's shared among several classes /
72	class Key_parameters
73	{
74	public:
75	uint key_tuple_length; / Length of index lookup tuple, in bytes /
76	key_part_map key_tuple_map; / keyparts used in index lookup tuples /
77
78	/*
79	This is
80	= key_tuple_length if we copy keys to buffer
81	= sizeof(void) if we're using pointers to materialized keys.*
82	*/
83	uint key_size_in_keybuf;
84
85	/ TRUE <=> don't copy key values, use pointers to them instead. /
86	bool use_key_pointers;
87
88	/ TRUE <=> We can get at most one index tuple for a lookup key /
89	bool index_ranges_unique;
90	};
91
92
93	/**
94	A class to enumerate (record, range_id) pairs that match given key value.
95
96	@note
97
98	The idea is that we have a Lifo_buffer which holds (key, range_id) pairs
99	ordered by key value. From the front of the buffer we see
100
101	(key_val1, range_id1), (key_val1, range_id2) ... (key_val2, range_idN)
102
103	we take the first elements that have the same key value (key_val1 in the
104	example above), and make lookup into the table. The table will have
105	multiple matches for key_val1:
106
107	== Table Index ==
108	...
109	key_val1 -> key_val1, index_tuple1
110	key_val1, index_tuple2
111	...
112	key_val1, index_tupleN
113	...
114
115	Our goal is to produce all possible combinations, i.e. we need:
116
117	{(key_val1, index_tuple1), range_id1}
118	{(key_val1, index_tuple1), range_id2}
119	... ... \|
120	{(key_val1, index_tuple1), range_idN},
121
122	{(key_val1, index_tuple2), range_id1}
123	{(key_val1, index_tuple2), range_id2}
124	... ... \|
125	{(key_val1, index_tuple2), range_idN},
126
127	... ... ...
128
129	{(key_val1, index_tupleK), range_idN}
130	*/
131
132	class Key_value_records_iterator
133	{
134	/ Use this to get table handler, key buffer and other parameters /
135	Mrr_ordered_index_reader *owner;
136
137	/ Iterator to get (key, range_id) pairs from /
138	Lifo_buffer_iterator identical_key_it;
139
140	/*
141	Last of the identical key values (when we get this pointer from
142	identical_key_it, it will be time to stop).
143	*/
144	uchar *last_identical_key_ptr;
145
146	/*
147	FALSE <=> we're right after the init() call, the record has been already
148	read with owner->file->index_read_map() call
149	*/
150	bool get_next_row;
151
152	public:
153	int init(Mrr_ordered_index_reader *owner_arg);
154	int get_next(range_id_t *range_info);
155	void move_to_next_key_value();
156	};
157
158
/*
  Buffer manager interface. Mrr_reader objects use it to ask DsMrr_impl
  to manage buffer space for them.

  This is a plain table of callbacks: each function pointer takes the opaque
  'arg' stored alongside it as its only argument.
*/
typedef struct st_buffer_manager
{
  /* Opaque value to be passed as the first argument to all member functions */
  void *arg;

  /*
    This is called when we've freed more space from the rowid buffer. The
    callee will get the unused space from the rowid buffer and give it to the
    key buffer.
  */
  void (*redistribute_buffer_space)(void *arg);

  /*
    This is called when both key and rowid buffers are empty, and so it's time
    to reset them to their original size (They've lost their original size,
    because we were dynamically growing rowid buffer and shrinking key buffer).
  */
  void (*reset_buffer_sizes)(void *arg);

} Buffer_manager;
184
185
186	/*
187	Mrr_reader - DS-MRR execution strategy abstraction
188
189	A reader produces ([index]_record, range_info) pairs, and requires periodic
190	refill operations.
191
192	- one starts using the reader by calling reader->get_next(),
193	- when a get_next() call returns HA_ERR_END_OF_FILE, one must call
194	refill_buffer() before they can make more get_next() calls.
195	- when refill_buffer() returns HA_ERR_END_OF_FILE, this means the real
196	end of stream and get_next() should not be called anymore.
197
198	Both functions can return other error codes, these mean unrecoverable errors
199	after which one cannot continue.
200	*/
201
202	class Mrr_reader
203	{
204	public:
205	virtual int get_next(range_id_t *range_info) = `0`;
206	virtual int refill_buffer(bool initial) = `0`;
207	virtual ~Mrr_reader() {}; / just to remove compiler warning /
208	};
209
210
211	/*
212	A common base for readers that do index scans and produce index tuples
213	*/
214
215	class Mrr_index_reader : public Mrr_reader
216	{
217	protected:
218	handler file; /* Handler object to use /
219	public:
220	virtual int init(handler h_arg, RANGE_SEQ_IF seq_funcs,
221	void *seq_init_param, uint n_ranges,
222	uint mode, Key_parameters *key_par,
223	Lifo_buffer *key_buffer,
224	Buffer_manager *buf_manager_arg) = `0`;
225
226	/ Get pointer to place where every get_next() call will put rowid /
227	virtual uchar *get_rowid_ptr() = `0`;
228	/ Get the rowid (call this after get_next() call) /
229	virtual void position();
230	virtual bool skip_record(range_id_t range_id, uchar *rowid) = `0`;
231
232	virtual void interrupt_read() {}
233	virtual void resume_read() {}
234	};
235
236
237	/*
238	A "bypass" index reader that just does and index scan. The index scan is done
239	by calling default MRR implementation (i.e. handler::multi_range_read_XXX())
240	functions.
241	*/
242
243	class Mrr_simple_index_reader : public Mrr_index_reader
244	{
245	public:
246	int init(handler h_arg, RANGE_SEQ_IF seq_funcs,
247	void *seq_init_param, uint n_ranges,
248	uint mode, Key_parameters *key_par,
249	Lifo_buffer *key_buffer,
250	Buffer_manager *buf_manager_arg);
251	int get_next(range_id_t *range_info);
252	int refill_buffer(bool initial) { return initial? `0`: HA_ERR_END_OF_FILE; }
253	uchar get_rowid_ptr() { return* file->ref; }
254	bool skip_record(range_id_t range_id, uchar *rowid)
255	{
256	return (file->mrr_funcs.skip_record &&
257	file->mrr_funcs.skip_record(file->mrr_iter, range_id, rowid));
258	}
259	};
260
261
262	/*
263	A reader that sorts the key values before it makes the index lookups.
264	*/
265
266	class Mrr_ordered_index_reader : public Mrr_index_reader
267	{
268	public:
269	int init(handler h_arg, RANGE_SEQ_IF seq_funcs,
270	void *seq_init_param, uint n_ranges,
271	uint mode, Key_parameters *key_par,
272	Lifo_buffer *key_buffer,
273	Buffer_manager *buf_manager_arg);
274	int get_next(range_id_t *range_info);
275	int refill_buffer(bool initial);
276	uchar get_rowid_ptr() { return* file->ref; }
277
278	bool skip_record(range_id_t range_info, uchar *rowid)
279	{
280	return (mrr_funcs.skip_record &&
281	mrr_funcs.skip_record(mrr_iter, range_info, rowid));
282	}
283
284	bool skip_index_tuple(range_id_t range_info)
285	{
286	return (mrr_funcs.skip_index_tuple &&
287	mrr_funcs.skip_index_tuple(mrr_iter, range_info));
288	}
289
290	bool set_interruption_temp_buffer(uint rowid_length, uint key_len,
291	uint saved_pk_len,
292	uchar *space_start, uchar space_end);
293	void set_no_interruption_temp_buffer();
294
295	void interrupt_read();
296	void resume_read();
297	void position();
298	private:
299	Key_value_records_iterator kv_it;
300
301	bool scanning_key_val_iter;
302
303	/ Buffer to store (key, range_id) pairs /
304	Lifo_buffer *key_buffer;
305
306	/ This manages key buffer allocation and sizing for us /
307	Buffer_manager *buf_manager;
308
309	Key_parameters keypar; / index scan and lookup tuple parameters /
310
311	/ TRUE <=> need range association, buffers hold {rowid, range_id} pairs /
312	bool is_mrr_assoc;
313
314	/ Range sequence iteration members /
315	RANGE_SEQ_IF mrr_funcs;
316	range_seq_t mrr_iter;
317
318	/ TRUE == reached eof when enumerating ranges /
319	bool source_exhausted;
320
321	/*
322	Following members are for interrupt_read()/resume_read(). The idea is that
323	in some cases index scan that is done by this object is interrupted by
324	rnd_pos() calls made by Mrr_ordered_rndpos_reader. The problem is that
325	we're sharing handler->record[0] with that object, and it destroys its
326	contents.
327	We need to save/restore our current
328	- index tuple (for pushed index condition checks)
329	- clustered primary key values (again, for pushed index condition checks)
330	- rowid of the last record we've retrieved (in case this rowid matches
331	multiple ranges and we'll need to return it again)
332	*/
333	bool support_scan_interruptions;
334	/ Space where we save the rowid of the last record we've returned /
335	uchar *saved_rowid;
336
337	/ TRUE <=> saved_rowid has the last saved rowid /
338	bool have_saved_rowid;
339
340	uchar saved_key_tuple; /* Saved current key tuple /
341	uchar saved_primary_key; /* Saved current primary key tuple /
342
343	/*
344	TRUE<=> saved_key_tuple (and saved_primary_key when applicable) have
345	valid values.
346	*/
347	bool read_was_interrupted;
348
349	static int compare_keys(void* arg, uchar* key1, uchar* key2);
350	static int compare_keys_reverse(void* arg, uchar* key1, uchar* key2);
351
352	friend class Key_value_records_iterator;
353	friend class DsMrr_impl;
354	friend class Mrr_ordered_rndpos_reader;
355	};
356
357
358	/*
359	A reader that gets rowids from an Mrr_index_reader, and then sorts them
360	before getting full records with handler->rndpos() calls.
361	*/
362
363	class Mrr_ordered_rndpos_reader : public Mrr_reader
364	{
365	public:
366	int init(handler file, Mrr_index_reader index_reader, uint mode,
367	Lifo_buffer *buf);
368	int get_next(range_id_t *range_info);
369	int refill_buffer(bool initial);
370	private:
371	handler file; /* Handler to use /
372
373	/ This what we get (rowid, range_info) pairs from /
374	Mrr_index_reader *index_reader;
375
376	/ index_reader->get_next() puts rowid here /
377	uchar *index_rowid;
378
379	/ TRUE <=> index_reader->refill_buffer() call has returned EOF /
380	bool index_reader_exhausted;
381
382	/*
383	TRUE <=> We should call index_reader->refill_buffer(). This happens if
384	1. we've made index_reader->get_next() call which returned EOF
385	2. we haven't made any index_reader calls (and our first call should
386	be index_reader->refill_buffer(initial=TRUE)
387	*/
388	bool index_reader_needs_refill;
389
390	/ TRUE <=> need range association, buffers hold {rowid, range_id} pairs /
391	bool is_mrr_assoc;
392
393	/*
394	When reading from ordered rowid buffer: the rowid element of the last
395	buffer element that has rowid identical to this one.
396	*/
397	uchar *last_identical_rowid;
398
399	/ Buffer to store (rowid, range_id) pairs /
400	Lifo_buffer *rowid_buffer;
401
402	int refill_from_index_reader();
403	};
404
405
406	/*
407	A primitive "factory" of various Mrr__reader classes (the point is to*
408	get various kinds of readers without having to allocate them on the heap)
409	*/
410
411	class Mrr_reader_factory
412	{
413	public:
414	Mrr_ordered_rndpos_reader ordered_rndpos_reader;
415	Mrr_ordered_index_reader ordered_index_reader;
416	Mrr_simple_index_reader simple_index_reader;
417	};
418
419
/*
  DS-MRR's own flags, aliased onto the two HA_MRR_IMPLEMENTATION_FLAGn values.
  NOTE(review): presumably these record whether keys and/or rowids are sorted
  for a given scan - confirm against the users of these flags.
*/
#define DSMRR_IMPL_SORT_KEYS   HA_MRR_IMPLEMENTATION_FLAG1
#define DSMRR_IMPL_SORT_ROWIDS HA_MRR_IMPLEMENTATION_FLAG2
422
423	/*
424	DS-MRR implementation for one table. Create/use one object of this class for
425	each ha_{myisam/innobase/etc} object. That object will be further referred to
426	as "the handler"
427
428	DsMrr_impl supports the following execution strategies:
429
430	- Bypass DS-MRR, pass all calls to default MRR implementation, which is
431	an MRR-to-non-MRR call converter.
432	- Key-Ordered Retrieval
433	- Rowid-Ordered Retrieval
434
435	DsMrr_impl will use one of the above strategies, or a combination of them,
436	according to the following diagram:
437
438	                       (mrr function calls)
439	                                 |
440	               +----------------->-----------------+
441	               |                                   |
442	    ___________v______________      _______________v________________
443	   / default: use lookup keys \    / KEY-ORDERED RETRIEVAL:         \
444	   | (or ranges) in whatever  |    | sort lookup keys and then make |
445	   | order they are supplied  |    | index lookups in index order   |
446	   \__________________________/    \________________________________/
447	              | | |                           |    |
448	      +---<---+ | +--------------->-----------|----+
449	      |         |                             |    |
450	      |         |             +---------------+    |
451	      |   ______v___ ______   |     _______________v_______________
452	      |  / default: read   \  |    / ROWID-ORDERED RETRIEVAL:      \
453	      |  | table records   |  |    | Before reading table records, |
454	      v  | in random order |  v    | sort their rowids and then    |
455	      |  \_________________/  |    | read them in rowid order      |
456	      |         |             |    \_______________________________/
457	      |         |             |                    |
458	      |         |             |                    |
459	      +-->---+  | +----<------+-----------<--------+
460	             |  | |
461	             v  v v
462	      (table records and range_ids)
463
464	The choice of strategy depends on MRR scan properties, table properties
465	(whether we're scanning clustered primary key), and @@optimizer_switch
466	settings.
467
468	Key-Ordered Retrieval
469	---------------------
470	The idea is: if MRR scan is essentially a series of lookups on
471
472	tbl.key=value1 OR tbl.key=value2 OR ... OR tbl.key=valueN
473
474	then it makes sense to collect and order the set of lookup values, i.e.
475
476	sort(value1, value2, .. valueN)
477
478	and then do index lookups in index order. This results in fewer index page
479	fetch operations, and we also can avoid making multiple index lookups for the
480	same value. That is, if value1=valueN we can easily discover that after
481	sorting and make one index lookup for them instead of two.
482
483	Rowid-Ordered Retrieval
484	-----------------------
485	If we do a regular index scan or a series of index lookups, we'll be hitting
486	table records at random. For disk-based engines, this is much slower than
487	reading the same records in disk order. We assume that disk ordering of
488	rows is the same as ordering of their rowids (which is provided by
489	handler::cmp_ref())
490	In order to retrieve records in different order, we must separate index
491	scanning and record fetching, that is, MRR scan uses the following steps:
492
493	1. Scan the index (and only index, that is, with HA_EXTRA_KEYREAD on) and
494	fill a buffer with {rowid, range_id} pairs
495	2. Sort the buffer by rowid value
496	3. for each {rowid, range_id} pair in the buffer
497	get record by rowid and return the {record, range_id} pair
498	4. Repeat the above steps until we've exhausted the list of ranges we're
499	scanning.
500
501	Buffer space management considerations
502	--------------------------------------
503	With regards to buffer/memory management, MRR interface specifies that
504	- SQL layer provides multi_range_read_init() with buffer of certain size.
505	- MRR implementation may use (i.e. have at its disposal till the end of
506	the MRR scan) all of the buffer, or return the unused end of the buffer
507	to SQL layer.
508
509	DS-MRR needs buffer in order to accumulate and sort rowids and/or keys. When
510	we need to accumulate/sort only keys (or only rowids), it is fairly trivial.
511
512	When we need to accumulate/sort both keys and rowids, efficient buffer use
513	gets complicated. We need to:
514	- First, accumulate keys and sort them
515	- Then use the keys (smaller values go first) to obtain rowids. A key is not
516	needed after we've got matching rowids for it.
517	- Make sure that rowids are accumulated at the front of the buffer, so that we
518	can return the end part of the buffer to SQL layer, should there be too
519	few rowid values to occupy the buffer.
520
521	All of these goals are achieved by using the following scheme:
522
523	     |                    |   We get an empty buffer from SQL layer.
524
525	     |                  *-|
526	     |               *----|   First, we fill the buffer with keys. Key_buffer
527	     |            *-------|   part grows from end of the buffer space to start
528	     |         *----------|   (In this picture, the buffer is big enough to
529	     |      *-------------|    accommodate all keys and even have some space left)
530
531	     |      *=============|   We want to do key-ordered index scan, so we sort
532	                              the keys
533
534	     |-x      *===========|   Then we use the keys to get rowids. Rowids are
535	     |----x      *========|   stored from start of buffer space towards the end.
536	     |--------x     *=====|   The part of the buffer occupied with keys
537	     |------------x   *===|   gradually frees up space for rowids. In this
538	     |--------------x   *=|   picture we run out of keys before we've run out
539	     |----------------x   |   of buffer space (it can be other way as well).
540
541	     |================x   |   Then we sort the rowids.
542
543	     |                |~~~|   The unused part of the buffer is at the end, so
544	                              we can return it to the SQL layer.
545
546	     |================*       Sorted rowids are then used to read table records
547	                              in disk order
548
549	*/
550
551	class DsMrr_impl
552	{
553	public:
554	typedef void (handler::range_check_toggle_func_t)(bool* on);
555
556	DsMrr_impl()
557	: secondary_file(NULL) {};
558
559	void init(handler h_arg, TABLE table_arg)
560	{
561	primary_file= h_arg;
562	table= table_arg;
563	}
564	int dsmrr_init(handler h_arg, RANGE_SEQ_IF seq_funcs,
565	void *seq_init_param, uint n_ranges, uint mode,
566	HANDLER_BUFFER *buf);
567	void dsmrr_close();
568	int dsmrr_next(range_id_t *range_info);
569
570	ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint key_parts,
571	uint bufsz, uint flags, Cost_estimate *cost);
572
573	ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq,
574	void seq_init_param, uint n_ranges, uint bufsz,
575	uint flags, Cost_estimate cost);
576
577	int dsmrr_explain_info(uint mrr_mode, char *str, size_t size);
578	private:
579	/ Buffer to store (key, range_id) pairs /
580	Lifo_buffer *key_buffer;
581
582	/*
583	The "owner" handler object (the one that is expected to "own" this object
584	and call its functions).
585	*/
586	handler *primary_file;
587	TABLE table; /* Always equal to primary_file->table /
588
589	/*
590	Secondary handler object. (created when needed, we need it when we need
591	to run both index scan and rnd_pos() scan at the same time)
592	*/
593	handler *secondary_file;
594
595	uint keyno; / index we're running the scan on /
596	/ TRUE <=> need range association, buffers hold {rowid, range_id} pairs /
597	bool is_mrr_assoc;
598
599	Mrr_reader_factory reader_factory;
600
601	Mrr_reader *strategy;
602	bool strategy_exhausted;
603
604	Mrr_index_reader *index_strategy;
605
606	/ The whole buffer space that we're using /
607	uchar *full_buf;
608	uchar *full_buf_end;
609
610	/*
611	When using both rowid and key buffers: the boundary between key and rowid
612	parts of the buffer. This is the "original" value, actual memory ranges
613	used by key and rowid parts may be different because of dynamic space
614	reallocation between them.
615	*/
616	uchar *rowid_buffer_end;
617
618	/*
619	One of the following two is used for key buffer: forward is used when
620	we only need key buffer, backward is used when we need both key and rowid
621	buffers.
622	*/
623	Forward_lifo_buffer forward_key_buf;
624	Backward_lifo_buffer backward_key_buf;
625
626	/*
627	Buffer to store (rowid, range_id) pairs, or just rowids if
628	is_mrr_assoc==FALSE
629	*/
630	Forward_lifo_buffer rowid_buffer;
631
632	bool choose_mrr_impl(uint keyno, ha_rows rows, uint flags, uint bufsz,
633	Cost_estimate *cost);
634	bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
635	uint buffer_size, Cost_estimate cost);
636	bool check_cpk_scan(THD thd, TABLE_SHARE share, uint keyno, uint mrr_flags);
637
638	bool setup_buffer_sharing(uint key_size_in_keybuf, key_part_map key_tuple_map);
639
640	/ Buffer_manager and its member functions /
641	Buffer_manager buf_manager;
642	static void redistribute_buffer_space(void *dsmrr_arg);
643	static void reset_buffer_sizes(void *dsmrr_arg);
644	static void do_nothing(void *dsmrr_arg);
645
646	Lifo_buffer* get_key_buffer() { return key_buffer; }
647
648	friend class Key_value_records_iterator;
649	friend class Mrr_ordered_index_reader;
650	friend class Mrr_ordered_rndpos_reader;
651
652	int setup_two_handlers();
653	void close_second_handler();
654	};
655
656	/**
657	@} (end of group DS-MRR declarations)
658	*/
659
660

Browse the source code of MariaDB/sql/multi_range_read.h