/* Copyright (C) 2006,2007 MySQL AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */

/*
  WL#3071 Maria checkpoint
  First version written by Guilhem Bichot on 2006-04-27.
*/

/* Here is the implementation of this module */

/** @todo RECOVERY BUG this is unreviewed code */
/*
  Summary:
  Checkpoints are done either by a background thread (a checkpoint every Nth
  second) or by a client.
  In ha_maria, checkpointing is not made available to clients; it is done by
  a background thread which periodically takes checkpoints and flushes dirty
  pages.
*/
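
/**
  Typical life cycle of this module, as a minimal sketch (the 30-second
  interval and the calling context are illustrative assumptions, not fixed
  by this file):

  @code
    /* at engine startup: spawn a background checkpointer */
    if (ma_checkpoint_init(30))
      return 1;                      /* could not start the module */

    /* optionally, run a checkpoint from a client thread, waiting for any
       already-running one to finish first */
    ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE);

    /* at engine shutdown: the background thread takes a final checkpoint,
       then the module is destroyed */
    ma_checkpoint_end();
  @endcode
*/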

#include "maria_def.h"
#include "ma_pagecache.h"
#include "ma_blockrec.h"
#include "ma_checkpoint.h"
#include "ma_loghandler_lsn.h"
#include "ma_servicethread.h"
#include "ma_crypt.h"

/** @brief type of checkpoint currently running */
static CHECKPOINT_LEVEL checkpoint_in_progress= CHECKPOINT_NONE;
/** @brief protects checkpoint_in_progress */
static mysql_mutex_t LOCK_checkpoint;
/** @brief for killing the background checkpoint thread */
static mysql_cond_t COND_checkpoint;
/** @brief control structure for checkpoint background thread */
static MA_SERVICE_THREAD_CONTROL checkpoint_control=
  {0, FALSE, FALSE, &LOCK_checkpoint, &COND_checkpoint};
/* same quantity as pagecache->blocks_changed, which is a ulong */
static uint pages_to_flush_before_next_checkpoint;
static PAGECACHE_FILE *dfiles, /**< data files to flush in background */
  *dfiles_end; /**< list of data files ends here */
static PAGECACHE_FILE *kfiles, /**< index files to flush in background */
  *kfiles_end; /**< list of index files ends here */
/* the two statistics below could serve in SHOW GLOBAL STATUS */
static uint checkpoints_total= 0, /**< all checkpoint requests made */
  checkpoints_ok_total= 0; /**< all checkpoints which succeeded */
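
/**
  If the two counters above were ever exposed in SHOW GLOBAL STATUS, a
  minimal sketch could look like the following (SHOW_VAR/SHOW_LONG come from
  the server's status-variable interface; the variable names, the exact type
  enum, and the idea of registering this array from ha_maria.cc are
  illustrative assumptions):

  @code
    static SHOW_VAR maria_checkpoint_status_vars[]=
    {
      {"Maria_checkpoints_requested", (char*) &checkpoints_total, SHOW_LONG},
      {"Maria_checkpoints_ok", (char*) &checkpoints_ok_total, SHOW_LONG},
      {NullS, NullS, SHOW_LONG}
    };
  @endcode
*/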

struct st_filter_param
{
  LSN up_to_lsn;  /**< only pages with rec_lsn < this LSN */
  uint max_pages; /**< stop after flushing this number of pages */
}; /**< information to determine which dirty pages should be flushed */

static enum pagecache_flush_filter_result
filter_flush_file_medium(enum pagecache_page_type type,
                         pgcache_page_no_t pageno,
                         LSN rec_lsn, void *arg);
static enum pagecache_flush_filter_result
filter_flush_file_full(enum pagecache_page_type type,
                       pgcache_page_no_t pageno,
                       LSN rec_lsn, void *arg);
static enum pagecache_flush_filter_result
filter_flush_file_evenly(enum pagecache_page_type type,
                         pgcache_page_no_t pageno,
                         LSN rec_lsn, void *arg);
static int really_execute_checkpoint(void);
pthread_handler_t ma_checkpoint_background(void *arg);
static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon);

/**
  @brief Does a checkpoint

  @param level          what level of checkpoint to do
  @param no_wait        if another checkpoint of same or stronger level
                        is already running, consider our job done

  @note In ha_maria, there can never be two threads trying a checkpoint at
  the same time.

  @return Operation status
    @retval 0   ok
    @retval !=0 error
*/

int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait)
{
  int result= 0;
  DBUG_ENTER("ma_checkpoint_execute");

  if (!checkpoint_control.inited)
  {
    /*
      If ha_maria failed to start, maria_panic_hton is called and we come
      here.
    */
    DBUG_RETURN(0);
  }
  DBUG_ASSERT(level > CHECKPOINT_NONE);

  /* look for already running checkpoints */
  mysql_mutex_lock(&LOCK_checkpoint);
  while (checkpoint_in_progress != CHECKPOINT_NONE)
  {
    if (no_wait && (checkpoint_in_progress >= level))
    {
      /*
        If we are the checkpoint background thread, we don't wait (it's
        smarter to flush pages instead of waiting here while the other thread
        finishes its checkpoint).
      */
      mysql_mutex_unlock(&LOCK_checkpoint);
      goto end;
    }
    mysql_cond_wait(&COND_checkpoint, &LOCK_checkpoint);
  }

  checkpoint_in_progress= level;
  mysql_mutex_unlock(&LOCK_checkpoint);
  /* from then on, we are sure to be and stay the only checkpointer */

  result= really_execute_checkpoint();
  DBUG_EXECUTE_IF("maria_crash_after_checkpoint",
                  { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); });

  mysql_cond_broadcast(&COND_checkpoint);
end:
  DBUG_RETURN(result);
}


/**
  @brief Does a checkpoint, really; expects no other checkpoints
  running.

  Checkpoint level requested is read from checkpoint_in_progress.

  @return Operation status
    @retval 0   ok
    @retval !=0 error
*/

static int really_execute_checkpoint(void)
{
  uint i, error= 0;
  char *ptr;
  LEX_STRING record_pieces[4]; /**< only malloc-ed pieces */
  LSN min_page_rec_lsn, min_trn_rec_lsn, min_first_undo_lsn;
  TRANSLOG_ADDRESS checkpoint_start_log_horizon;
  /** @brief checkpoint_start_log_horizon will be stored there */
  char checkpoint_start_log_horizon_char[LSN_STORE_SIZE];
  DBUG_ENTER("really_execute_checkpoint");
  DBUG_PRINT("enter", ("level: %d", checkpoint_in_progress));
  bzero(&record_pieces, sizeof(record_pieces));

  /*
    STEP 1: record current end-of-log position using log's lock. It is
    critical for the correctness of Checkpoint (related to memory visibility
    rules, the log's lock is a mutex).
    "Horizon" is a lower bound of the LSN of the next log record.
  */
  checkpoint_start_log_horizon= translog_get_horizon();
  DBUG_PRINT("info",("checkpoint_start_log_horizon " LSN_FMT,
                     LSN_IN_PARTS(checkpoint_start_log_horizon)));
  lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon);

  /*
    STEP 2: fetch information about transactions.
    We must fetch transactions before dirty pages. Indeed, a transaction
    first sets its rec_lsn, then sets the page's rec_lsn, then resets its own
    rec_lsn to 0. If we fetched pages first, we might see no dirty page yet;
    when we then fetch transactions, the transaction may already have reset
    its rec_lsn to 0, so we would miss the rec_lsn on both sides.
    For a similar reason (over-allocated bitmap pages) we have to fetch
    transactions before flushing bitmap pages.

    min_trn_rec_lsn will serve to lower the starting point of the REDO phase
    (down from checkpoint_start_log_horizon).
  */
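  /*
    A concrete interleaving of the race described above (timestamps purely
    illustrative): at t1 the checkpointer scans pages and sees none dirty
    yet; at t2 a transaction sets its rec_lsn, dirties a page (setting the
    page's rec_lsn) and resets its own rec_lsn to 0; at t3 the checkpointer
    scans transactions and sees rec_lsn == 0, so both scans missed it.
    Scanning transactions first and pages second guarantees that at least
    one of the two scans sees the rec_lsn.
  */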
  if (unlikely(trnman_collect_transactions(&record_pieces[0],
                                           &record_pieces[1],
                                           &min_trn_rec_lsn,
                                           &min_first_undo_lsn)))
    goto err;


  /* STEP 3: fetch information about table files */
  if (unlikely(collect_tables(&record_pieces[2],
                              checkpoint_start_log_horizon)))
    goto err;


  /* STEP 4: fetch information about dirty pages */
  /*
    It's better to do it _after_ having flushed some data pages (which
    collect_tables() may have done), because those are now non-dirty and so we
    have a more up-to-date dirty pages list to put into the checkpoint record,
    and thus we will have less work at Recovery.
  */
  /* Using default pagecache for now */
  if (unlikely(pagecache_collect_changed_blocks_with_lsn(maria_pagecache,
                                                         &record_pieces[3],
                                                         &min_page_rec_lsn)))
    goto err;


  /* LAST STEP: now write the checkpoint log record */
  {
    LSN lsn;
    translog_size_t total_rec_length;
    /*
      the log handler is allowed to modify "str" and "length" (but not "*str")
      of its argument, so we must not pass it record_pieces directly,
      otherwise we would later not know what memory pieces to my_free().
    */
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 5];
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=
      (uchar*) checkpoint_start_log_horizon_char;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= total_rec_length=
      sizeof(checkpoint_start_log_horizon_char);
    for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
    {
      log_array[TRANSLOG_INTERNAL_PARTS + 1 + i].str=
        (uchar*) record_pieces[i].str;
      log_array[TRANSLOG_INTERNAL_PARTS + 1 + i].length=
        record_pieces[i].length;
      total_rec_length+= (translog_size_t) record_pieces[i].length;
    }
    if (unlikely(translog_write_record(&lsn, LOGREC_CHECKPOINT,
                                       &dummy_transaction_object, NULL,
                                       total_rec_length,
                                       sizeof(log_array)/sizeof(log_array[0]),
                                       log_array, NULL, NULL) ||
                 translog_flush(lsn)))
      goto err;
    translog_lock();
    /*
      This cannot be done as an inwrite_rec_hook of LOGREC_CHECKPOINT, because
      such a hook would be called before translog_flush (and we must be sure
      that the log was flushed before we write to the control file).
    */
    if (unlikely(ma_control_file_write_and_force(lsn, last_logno,
                                                 max_trid_in_control_file,
                                                 recovery_failures)))
    {
      translog_unlock();
      goto err;
    }
    translog_unlock();
  }
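
  /*
    For orientation, the record written above is the concatenation of five
    pieces (sizes as implied by the code above, not a separate on-disk
    specification):

      [0] checkpoint_start_log_horizon      LSN_STORE_SIZE bytes
      [1],[2] record_pieces[0..1]           transaction info collected by
                                            trnman_collect_transactions()
      [3] record_pieces[2]                  open-table info from
                                            collect_tables()
      [4] record_pieces[3]                  dirty-page info from
                                            pagecache_collect_changed_blocks_with_lsn();
                                            its first 4 bytes (a page count)
                                            are re-read just below
  */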

  /*
    Note that we should not alter memory structures until we have
    successfully written the checkpoint record and control file.
  */
  /* checkpoint succeeded */
  ptr= record_pieces[3].str;
  pages_to_flush_before_next_checkpoint= uint4korr(ptr);
  DBUG_PRINT("checkpoint", ("%u pages to flush before next checkpoint",
                            pages_to_flush_before_next_checkpoint));

  /* compute log's low-water mark */
  {
    TRANSLOG_ADDRESS log_low_water_mark= min_page_rec_lsn;
    set_if_smaller(log_low_water_mark, min_trn_rec_lsn);
    set_if_smaller(log_low_water_mark, min_first_undo_lsn);
    set_if_smaller(log_low_water_mark, checkpoint_start_log_horizon);
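    /*
      A worked example with illustrative values: if the four candidates are
      (3,0x2000), (2,0x5000), (4,0x1000) and a horizon of (5,0x800), the
      low-water mark is (2,0x5000), the smallest address; log files wholly
      below it are no longer needed for Recovery and can be purged.
    */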
    /**
      Now purge unneeded logs.
      As some systems have an unreliable fsync (drive lying), we could try to
      be robust against that: remember a few previous checkpoints in the
      control file, and not purge logs immediately... Think about it.
    */
    if (translog_purge(log_low_water_mark))
      ma_message_no_user(0, "log purging failed");
  }

  goto end;

err:
  error= 1;
  ma_message_no_user(0, "checkpoint failed");
  /* we were possibly not able to determine what pages to flush */
  pages_to_flush_before_next_checkpoint= 0;

end:
  for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
    my_free(record_pieces[i].str);
  mysql_mutex_lock(&LOCK_checkpoint);
  checkpoint_in_progress= CHECKPOINT_NONE;
  checkpoints_total++;
  checkpoints_ok_total+= !error;
  mysql_mutex_unlock(&LOCK_checkpoint);
  DBUG_RETURN(error);
}


/**
  @brief Initializes the checkpoint module

  @param interval         If one wants the module to create a
                          thread which will periodically do
                          checkpoints, and flush dirty pages, in the
                          background, it should specify a non-zero
                          interval in seconds. The thread will then be
                          created and will take checkpoints separated by
                          approximately 'interval' seconds.

  @note A checkpoint is taken only if there has been some significant
  activity since the previous checkpoint. Between checkpoint N and N+1 the
  thread flushes all dirty pages which were already dirty at the time of
  checkpoint N.

  @return Operation status
    @retval 0   ok
    @retval !=0 error
*/

int ma_checkpoint_init(ulong interval)
{
  int res= 0;
  DBUG_ENTER("ma_checkpoint_init");
  if (ma_service_thread_control_init(&checkpoint_control))
    res= 1;
  else if (interval > 0)
  {
    size_t intv= interval;
    compile_time_assert(sizeof(void *) >= sizeof(ulong));
    if ((res= mysql_thread_create(key_thread_checkpoint,
                                  &checkpoint_control.thread, NULL,
                                  ma_checkpoint_background,
                                  (void*) intv)))
      checkpoint_control.killed= TRUE;
  }
  else
    checkpoint_control.killed= TRUE;
  DBUG_RETURN(res);
}


#ifndef DBUG_OFF
/**
  Function used to test recovery: flush some table pieces and then caller
  crashes.

  @param what_to_flush   0: current bitmap and all data pages
                         1: state
                         2: all bitmap pages
*/
static void flush_all_tables(int what_to_flush)
{
  int res= 0;
  LIST *pos; /**< to iterate over open tables */
  mysql_mutex_lock(&THR_LOCK_maria);
  for (pos= maria_open_list; pos; pos= pos->next)
  {
    MARIA_HA *info= (MARIA_HA*)pos->data;
    if (info->s->now_transactional)
    {
      switch (what_to_flush)
      {
      case 0:
        res= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
                                   FLUSH_KEEP, FLUSH_KEEP);
        break;
      case 1:
        res= _ma_state_info_write(info->s,
                                  MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET|
                                  MA_STATE_INFO_WRITE_LOCK);
        DBUG_PRINT("maria_flush_states",
                   ("is_of_horizon: LSN " LSN_FMT,
                    LSN_IN_PARTS(info->s->state.is_of_horizon)));
        break;
      case 2:
        res= _ma_bitmap_flush_all(info->s);
        break;
      }
    }
    DBUG_ASSERT(res == 0);
  }
  mysql_mutex_unlock(&THR_LOCK_maria);
}
#endif


/**
  @brief Destroys the checkpoint module
*/

void ma_checkpoint_end(void)
{
  DBUG_ENTER("ma_checkpoint_end");
  /*
    Some intentional crash methods, usually triggered by
    SET MARIA_CHECKPOINT_INTERVAL=X
  */
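  /*
    As an illustrative (not normative) way to drive these from a debug
    build's test: something like
    SET GLOBAL debug_dbug="+d,maria_flush_whole_log,maria_crash" before
    changing the checkpoint interval would activate the corresponding
    keywords below.
  */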
  DBUG_EXECUTE_IF("maria_flush_bitmap",
                  {
                    DBUG_PRINT("maria_flush_bitmap", ("now"));
                    flush_all_tables(2);
                  });
  DBUG_EXECUTE_IF("maria_flush_whole_page_cache",
                  {
                    DBUG_PRINT("maria_flush_whole_page_cache", ("now"));
                    flush_all_tables(0);
                  });
  DBUG_EXECUTE_IF("maria_flush_whole_log",
                  {
                    DBUG_PRINT("maria_flush_whole_log", ("now"));
                    translog_flush(translog_get_horizon());
                  });
  /*
    Note that for WAL reasons, maria_flush_states requires
    maria_flush_whole_log.
  */
  DBUG_EXECUTE_IF("maria_flush_states",
                  {
                    DBUG_PRINT("maria_flush_states", ("now"));
                    flush_all_tables(1);
                  });
  DBUG_EXECUTE_IF("maria_crash",
                  { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); });

  if (checkpoint_control.inited)
  {
    ma_service_thread_control_end(&checkpoint_control);
    my_free(dfiles);
    my_free(kfiles);
    dfiles= kfiles= NULL;
  }
  DBUG_VOID_RETURN;
}


/**
  @brief dirty-page filtering criteria for MEDIUM checkpoint.

  We flush data/index pages which have been dirty since the previous
  checkpoint (this is the two-checkpoint rule: the REDO phase will not have
  to start from earlier than the next-to-last checkpoint).
  Bitmap pages are handled by _ma_bitmap_flush_all().

  @param type        Page's type
  @param pageno      Page's number
  @param rec_lsn     Page's rec_lsn
  @param arg         filter_param
*/

static enum pagecache_flush_filter_result
filter_flush_file_medium(enum pagecache_page_type type,
                         pgcache_page_no_t pageno __attribute__ ((unused)),
                         LSN rec_lsn, void *arg)
{
  struct st_filter_param *param= (struct st_filter_param *)arg;
  return (type == PAGECACHE_LSN_PAGE) &&
    (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0);
}


/**
  @brief dirty-page filtering criteria for FULL checkpoint.

  We flush all dirty data/index pages.
  Bitmap pages are handled by _ma_bitmap_flush_all().

  @param type        Page's type
  @param pageno      Page's number
  @param rec_lsn     Page's rec_lsn
  @param arg         filter_param
*/

static enum pagecache_flush_filter_result
filter_flush_file_full(enum pagecache_page_type type,
                       pgcache_page_no_t pageno __attribute__ ((unused)),
                       LSN rec_lsn __attribute__ ((unused)),
                       void *arg __attribute__ ((unused)))
{
  return (type == PAGECACHE_LSN_PAGE);
}


/**
  @brief dirty-page filtering criteria for background flushing thread.

  We flush data/index pages which have been dirty since the previous
  checkpoint (this is the two-checkpoint rule: the REDO phase will not have
  to start from earlier than the next-to-last checkpoint), and no
  bitmap pages. But we flush no more than a certain number of pages (to have
  an even flushing, no write burst).
  The reason to not flush bitmap pages is that they may not be in a flushable
  state at this moment and we don't want to wait for them.

  @param type        Page's type
  @param pageno      Page's number
  @param rec_lsn     Page's rec_lsn
  @param arg         filter_param
*/

static enum pagecache_flush_filter_result
filter_flush_file_evenly(enum pagecache_page_type type,
                         pgcache_page_no_t pageno __attribute__ ((unused)),
                         LSN rec_lsn, void *arg)
{
  struct st_filter_param *param= (struct st_filter_param *)arg;
  if (unlikely(param->max_pages == 0)) /* all flushed already */
    return FLUSH_FILTER_SKIP_ALL;
  if ((type == PAGECACHE_LSN_PAGE) &&
      (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0))
  {
    param->max_pages--;
    return FLUSH_FILTER_OK;
  }
  return FLUSH_FILTER_SKIP_TRY_NEXT;
}
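
/*
  A small worked trace of filter_flush_file_evenly() above, assuming a
  budget of max_pages=2 and candidate pages A, B, C of which only A and C
  are LSN pages with rec_lsn <= up_to_lsn:

    call on A -> FLUSH_FILTER_OK             (budget 2 -> 1, A gets flushed)
    call on B -> FLUSH_FILTER_SKIP_TRY_NEXT  (not eligible, keep scanning)
    call on C -> FLUSH_FILTER_OK             (budget 1 -> 0, C gets flushed)
    any later call -> FLUSH_FILTER_SKIP_ALL  (budget exhausted, stop scan)
*/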


/**
  @brief Background thread which does checkpoints and flushes periodically.

  Takes a checkpoint. After this, all pages dirty at the time of that
  checkpoint are flushed evenly until it is time to take another checkpoint.
  This ensures that the REDO phase starts at earliest (in LSN time) at the
  next-to-last checkpoint record ("two-checkpoint rule").

  @note MikaelR questioned why the same thread does two different jobs; the
  risk could be that no LRD flushing happens while a checkpoint is running.
*/
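
/*
  A worked example of the cadence implemented below, assuming interval=30:
  at second 0 of each cycle ("case 0") the thread attempts a checkpoint; at
  second 1 ("case 1") it computes per-second bunches of
  pages_to_flush_before_next_checkpoint/30 pages (e.g. 3000 pages give
  bunches of 100) and falls through to flush the first bunch; each following
  second ("default") flushes another bunch, or the thread sleeps until the
  next cycle once nothing is left to flush.
*/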

/** @brief at least this many log/page bytes should be written between
    checkpoints */
static ulong maria_checkpoint_min_cache_activity= 10*1024*1024;
/* Set in ha_maria.cc */
ulong maria_checkpoint_min_log_activity= 1*1024*1024;

pthread_handler_t ma_checkpoint_background(void *arg)
{
  /*
    If the interval could be changed by the user while we are in this thread,
    it could be annoying: for example it could cause "case 2" to be executed
    right after "case 0", thus having 'dfile' unset. So the thread cares only
    about the interval's value when it started.
  */
  const size_t interval= (size_t)arg;
  size_t sleeps, sleep_time;
  TRANSLOG_ADDRESS log_horizon_at_last_checkpoint=
    translog_get_horizon();
  ulonglong pagecache_flushes_at_last_checkpoint=
    maria_pagecache->global_cache_write;
  uint UNINIT_VAR(pages_bunch_size);
  struct st_filter_param filter_param;
  PAGECACHE_FILE *UNINIT_VAR(dfile); /**< data file currently being flushed */
  PAGECACHE_FILE *UNINIT_VAR(kfile); /**< index file currently being flushed */

  my_thread_init();
  DBUG_PRINT("info",("Maria background checkpoint thread starts"));
  DBUG_ASSERT(interval > 0);

  PSI_CALL_set_thread_user_host(0,0,0,0);

  /*
    Recovery ended with all tables closed and a checkpoint: no need to take
    one immediately.
  */
  sleeps= 1;
  pages_to_flush_before_next_checkpoint= 0;

  for(;;) /* iterations of checkpoints and dirty page flushing */
  {
#if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */
    sleeps= 0;
#endif
    switch (sleeps % interval)
    {
    case 0:
    {
      /* If checkpoints are disabled, wait 1 second and try again */
      if (maria_checkpoint_disabled)
      {
        sleep_time= 1;
        break;
      }
      {
        TRANSLOG_ADDRESS horizon= translog_get_horizon();

        /*
          With background flushing evenly distributed over the time
          between two checkpoints, we should have only little flushing to do
          in the checkpoint.
        */
        /*
          No checkpoint if little work of interest for recovery was done
          since last checkpoint. Such work includes log writing (lengthens
          recovery, checkpoint would shorten it), page flushing (checkpoint
          would decrease the amount of read pages in recovery).
          In case of one short statement per minute (very low load), we don't
          want to checkpoint every minute, hence the positive
          maria_checkpoint_min_log_activity and
          maria_checkpoint_min_cache_activity.
        */
        if ((ulonglong) (horizon - log_horizon_at_last_checkpoint) <=
            maria_checkpoint_min_log_activity &&
            ((ulonglong) (maria_pagecache->global_cache_write -
                          pagecache_flushes_at_last_checkpoint) *
             maria_pagecache->block_size) <=
            maria_checkpoint_min_cache_activity)
        {
          /*
            Not enough has happened since last checkpoint.
            Sleep for a while and try again later.
          */
          sleep_time= interval;
          break;
        }
        sleep_time= 1;
        ma_checkpoint_execute(CHECKPOINT_MEDIUM, TRUE);
        /*
          Snapshot this kind of "state" of the engine. Note that the value
          below is possibly greater than last_checkpoint_lsn.
        */
        log_horizon_at_last_checkpoint= translog_get_horizon();
        pagecache_flushes_at_last_checkpoint=
          maria_pagecache->global_cache_write;
        /*
          If the checkpoint above succeeded it has set d|kfiles and
          d|kfiles_end. If it has failed, it has set
          pages_to_flush_before_next_checkpoint to 0 so we will skip flushing
          and sleep until the next checkpoint.
        */
      }
      break;
    }
    case 1:
      /* set up parameters for background page flushing */
      filter_param.up_to_lsn= last_checkpoint_lsn;
      pages_bunch_size= pages_to_flush_before_next_checkpoint /
        (uint) interval;
      dfile= dfiles;
      kfile= kfiles;
      /* fall through */
    default:
      if (pages_bunch_size > 0)
      {
        DBUG_PRINT("checkpoint",
                   ("Maria background checkpoint thread: %u pages",
                    pages_bunch_size));
        /* flush a bunch of dirty pages */
        filter_param.max_pages= pages_bunch_size;
        while (dfile != dfiles_end)
        {
          /*
            We use FLUSH_KEEP_LAZY: if a file is already in flush, it's
            smarter to move to the next file than wait for this one to be
            completely flushed, which may take long.
            StaleFilePointersInFlush: notice how below we use "dfile" which
            is an OS file descriptor plus some function and MARIA_SHARE
            pointers; this data dates from a previous checkpoint; since then,
            the table may have been closed (so MARIA_SHARE* became stale),
            and the file descriptor reassigned to another table which does
            not have the same CRC-read-set callbacks: it is thus important
            that flush_pagecache_blocks_with_filter() does not use the
            pointers, only the OS file descriptor.
          */
          int res=
            flush_pagecache_blocks_with_filter(maria_pagecache,
                                               dfile, FLUSH_KEEP_LAZY,
                                               filter_flush_file_evenly,
                                               &filter_param);
          if (unlikely(res & PCFLUSH_ERROR))
            ma_message_no_user(0, "background data page flush failed");
          if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
            break; /* and we will continue with the same file */
          dfile++; /* otherwise all this file is flushed, move to next file */
          /*
            MikaelR noted that he observed that Linux's file cache may never
            fsync to disk until this cache is full, at which point it decides
            to empty the cache, making the machine very slow. A solution was
            to fsync after writing 2 MB. So we might want to fsync() here if
            we wrote enough pages.
          */
        }
        while (kfile != kfiles_end)
        {
          int res=
            flush_pagecache_blocks_with_filter(maria_pagecache,
                                               kfile, FLUSH_KEEP_LAZY,
                                               filter_flush_file_evenly,
                                               &filter_param);
          if (unlikely(res & PCFLUSH_ERROR))
            ma_message_no_user(0, "background index page flush failed");
          if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
            break; /* and we will continue with the same file */
          kfile++; /* otherwise all this file is flushed, move to next file */
        }
        sleep_time= 1;
      }
      else
      {
        /* Can directly sleep until the next checkpoint moment */
        sleep_time= interval - (sleeps % interval);
      }
    }
    if (my_service_thread_sleep(&checkpoint_control,
                                sleep_time * 1000000000ULL))
      break;
    sleeps+= sleep_time;
  }
  DBUG_PRINT("info",("Maria background checkpoint thread ends"));
  {
    CHECKPOINT_LEVEL level= CHECKPOINT_FULL;
    /*
      That's the final one, which guarantees that a clean shutdown always
      ends with a checkpoint.
    */
    DBUG_EXECUTE_IF("maria_checkpoint_indirect", level= CHECKPOINT_INDIRECT;);
    ma_checkpoint_execute(level, FALSE);
  }
  my_thread_end();
  return 0;
}


/**
  @brief Allocates buffer and stores in it some info about open tables,
  does some flushing on those.

  Does the allocation because the caller cannot know the size itself.
  Memory freeing is to be done by the caller (if the "str" member of the
  LEX_STRING is not NULL).
  The caller is taking a checkpoint.

  @param[out] str                       pointer to where the allocated
                                        buffer, and its size, will be put;
                                        buffer will be filled with info
                                        about open tables
  @param checkpoint_start_log_horizon   Of the in-progress checkpoint record.

  @return Operation status
    @retval 0 OK
    @retval 1 Error
*/

static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
{
  MARIA_SHARE **distinct_shares= NULL;
  char *ptr;
  uint error= 1, sync_error= 0, nb, nb_stored, i;
  my_bool unmark_tables= TRUE;
  size_t total_names_length;
  LIST *pos; /**< to iterate over open tables */
  struct st_state_copy {
    uint index;
    MARIA_STATE_INFO state;
  };
  struct st_state_copy *state_copies= NULL, /**< fixed-size cache of states */
    *state_copies_end, /**< cache ends here */
    *state_copy; /**< iterator in cache */
  TRANSLOG_ADDRESS UNINIT_VAR(state_copies_horizon); /**< horizon of states'
                                                          _copies_ */
  struct st_filter_param filter_param;
  PAGECACHE_FLUSH_FILTER filter;
  DBUG_ENTER("collect_tables");

  /* let's make a list of distinct shares */
  mysql_mutex_lock(&THR_LOCK_maria);
  for (nb= 0, pos= maria_open_list; pos; pos= pos->next)
  {
    MARIA_HA *info= (MARIA_HA*)pos->data;
    MARIA_SHARE *share= info->s;
    /* the first three variables below can never change */
    if (share->base.born_transactional && !share->temporary &&
        share->mode != O_RDONLY &&
        !(share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP))
    {
      /*
        Apart from us, only maria_close() reads/sets in_checkpoint, but it
        cannot run now as we hold THR_LOCK_maria.
      */
      /*
        This table is relevant for checkpoint and not already seen. Mark it,
        so that it is not seen again in the loop.
      */
      nb++;
      DBUG_ASSERT(share->in_checkpoint == 0);
      /* This flag ensures that we count only _distinct_ shares. */
      share->in_checkpoint= MARIA_CHECKPOINT_SEEN_IN_LOOP;
    }
  }
  if (unlikely((distinct_shares=
                (MARIA_SHARE **)my_malloc(nb * sizeof(MARIA_SHARE *),
                                          MYF(MY_WME))) == NULL))
    goto err;
  for (total_names_length= 0, i= 0, pos= maria_open_list; pos; pos= pos->next)
  {
    MARIA_HA *info= (MARIA_HA*)pos->data;
    MARIA_SHARE *share= info->s;
    if (share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP)
    {
      distinct_shares[i++]= share;
      /*
        With this we prevent the share from going away while we later flush
        and force it without holding THR_LOCK_maria. For example if the share
        could be my_free()d by maria_close() we would have a problem when we
        access it to flush the table. We "pin" the share pointer.
        And we also take down MARIA_CHECKPOINT_SEEN_IN_LOOP, so that it is
        not seen again in the loop.
      */
      share->in_checkpoint= MARIA_CHECKPOINT_LOOKS_AT_ME;
      total_names_length+= share->open_file_name.length;
    }
  }

  DBUG_ASSERT(i == nb);
  mysql_mutex_unlock(&THR_LOCK_maria);
  DBUG_PRINT("info",("found %u table shares", nb));

  str->length=
    4 +                /* number of tables */
    (2 +               /* short id */
     LSN_STORE_SIZE +  /* first_log_write_at_lsn */
     1                 /* end-of-name 0 */
    ) * nb + total_names_length;
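
  /*
    Illustrative layout of the piece being built here (this mirrors the size
    computation above and the stores done in the loop below; it is not a
    separate on-disk specification):

      4 bytes            number of tables actually stored (filled at the end)
      then, per stored table:
        2 bytes          share->id (short id)
        LSN_STORE_SIZE   share->lsn_of_file_id (first_log_write_at_lsn)
        N+1 bytes        zero-terminated open_file_name
  */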
  if (unlikely((str->str= my_malloc(str->length, MYF(MY_WME))) == NULL))
    goto err;

  ptr= str->str;
  ptr+= 4; /* real number of stored tables is not yet known */

  /* only possible checkpointer, so can do the read below without mutex */
  filter_param.up_to_lsn= last_checkpoint_lsn;
  switch (checkpoint_in_progress)
  {
  case CHECKPOINT_MEDIUM:
    filter= &filter_flush_file_medium;
    break;
  case CHECKPOINT_FULL:
    filter= &filter_flush_file_full;
    break;
  case CHECKPOINT_INDIRECT:
    filter= NULL;
    break;
  default:
    DBUG_ASSERT(0);
    goto err;
  }

  /*
    The principle of reading/writing the state below is explained in
    ma_recovery.c, look for "Recovery of the state".
  */
#define STATE_COPIES 1024
  state_copies= (struct st_state_copy *)
    my_malloc(STATE_COPIES * sizeof(struct st_state_copy), MYF(MY_WME));
  dfiles= (PAGECACHE_FILE *)my_realloc((uchar *)dfiles,
                                       /* avoid size of 0 for my_realloc */
                                       MY_MAX(1, nb) * sizeof(PAGECACHE_FILE),
                                       MYF(MY_WME | MY_ALLOW_ZERO_PTR));
  kfiles= (PAGECACHE_FILE *)my_realloc((uchar *)kfiles,
                                       /* avoid size of 0 for my_realloc */
                                       MY_MAX(1, nb) * sizeof(PAGECACHE_FILE),
                                       MYF(MY_WME | MY_ALLOW_ZERO_PTR));
  if (unlikely((state_copies == NULL) ||
               (dfiles == NULL) || (kfiles == NULL)))
    goto err;
  state_copy= state_copies_end= NULL;
  dfiles_end= dfiles;
  kfiles_end= kfiles;

  for (nb_stored= 0, i= 0; i < nb; i++)
  {
    MARIA_SHARE *share= distinct_shares[i];
    PAGECACHE_FILE kfile, dfile;
    my_bool ignore_share;
    if (!(share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
    {
      /*
        No need for a mutex to read the above; only we can write *this* bit
        of the in_checkpoint bitmap.
      */
      continue;
    }
    /**
      @todo We should not look at tables which didn't change since last
      checkpoint.
    */
    DBUG_PRINT("info",("looking at table '%s'", share->open_file_name.str));
    if (state_copy == state_copies_end) /* we have no more cached states */
    {
      /*
        Collect and cache a bunch of states. We do this for many states at a
        time, to not lock/unlock the log's lock too often.
      */
      uint j, bound= MY_MIN(nb, i + STATE_COPIES);
      state_copy= state_copies;
      /* part of the state is protected by log's lock */
      translog_lock();
      state_copies_horizon= translog_get_horizon_no_lock();
      for (j= i; j < bound; j++)
      {
        MARIA_SHARE *share2= distinct_shares[j];
        if (!(share2->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
          continue;
        state_copy->index= j;
        state_copy->state= share2->state; /* we copy the state */
        state_copy++;
        /*
          data_file_length is not updated under log's lock by the bitmap
          code, but writing a wrong data_file_length is ok: a next
          maria_close() will correct it; if we crash before, Recovery will
          set it to the true physical size.
        */
      }
      translog_unlock();
      if (state_copy == state_copies)
        break; /* Nothing to do */

      /**
        We are going to flush these states.
        Before that, all records describing how to undo such state must be
        in the log (WAL). Usually this means UNDOs. In the special case of
        data|key_file_length, recovery just needs to open the table to fix
        the length, so any LOGREC_FILE_ID/REDO/UNDO which lets recovery
        understand that it must open the table is enough; so as long as
        data|key_file_length is updated after writing any log record it's
        ok: if we copied the new value above, it means the record was before
        state_copies_horizon and we flush such a record below.
        Apart from data|key_file_length, which are easily recoverable from
        the real file's size, all other state members must be updated only
        when writing the UNDO; otherwise, if updated before, and their new
        value is flushed by a checkpoint and there is a crash before the
        UNDO is written, their REDO group will be missing or at least
        incomplete and skipped by recovery, so a bad state value will stay.
        For example, setting key_root before writing the UNDO: the table
        would have old index pages (they were pinned at time of crash) and a
        new, thus wrong, key_root.
        @todo RECOVERY BUG check that all code honours that.
      */
      if (translog_flush(state_copies_horizon))
        goto err;
      /* now we have cached states and they are WAL-safe */
      state_copies_end= state_copy-1;
      state_copy= state_copies;
    }

    /* locate our state among these cached ones */
    for ( ; state_copy->index != i; state_copy++)
      DBUG_ASSERT(state_copy <= state_copies_end);

    /* OS file descriptors are ints which we stored in 4 bytes */
    compile_time_assert(sizeof(int) <= 4);
    /*
      Protect against maria_close() (which does some memory freeing in
      MARIA_FILE_BITMAP) with close_lock. intern_lock is not
      sufficient as we, as well as maria_close(), are going to unlock
      intern_lock in the middle of manipulating the table. Serializing us and
      maria_close() should help avoid problems.
    */
    mysql_mutex_lock(&share->close_lock);
    mysql_mutex_lock(&share->intern_lock);
    /*
      Tables in a normal state have their two file descriptors open.
      In some rare cases like REPAIR, some descriptor may be closed or even
      -1. If that happened, the _ma_state_info_write() may fail. This is
      prevented by enclosing all places which close/change kfile.file with
      intern_lock.
    */
    kfile= share->kfile;
    dfile= share->bitmap.file;
    /*
      Ignore tables which have no logged writes (all their future log
      records will be found naturally by Recovery). Ignore obsolete shares
      (_before_ setting themselves to last_version=0 they already did all
      flush and sync; if we flushed their state now we might be flushing an
      obsolete state onto a newer one, assuming the table has been reopened
      with a different share but of course the same physical index file).
    */
    ignore_share= (share->id == 0) | (share->last_version == 0);
    DBUG_PRINT("info", ("ignore_share: %d", ignore_share));
    if (!ignore_share)
    {
      size_t open_file_name_len= share->open_file_name.length + 1;
      /* remember the descriptors for background flush */
      *(dfiles_end++)= dfile;
      *(kfiles_end++)= kfile;
      /* we will store this table in the record */
      nb_stored++;
      int2store(ptr, share->id);
      ptr+= 2;
      lsn_store(ptr, share->lsn_of_file_id);
      ptr+= LSN_STORE_SIZE;
      /*
        first_bitmap_with_space is not updated under log's lock, and is
        important. We would need the bitmap's lock to get it right. Recovery
        of this is not clear, so we just play safe: write it out as
        unknown: if crash, _ma_bitmap_init() at next open (for example in
        Recovery) will convert it to 0 and thus the first insertion will
        search for free space from the file's first bitmap (0) -
        under-optimal but safe.
        If no crash, maria_close() will write the exact value.
      */
      state_copy->state.first_bitmap_with_space= ~(ulonglong)0;
      memcpy(ptr, share->open_file_name.str, open_file_name_len);
      ptr+= open_file_name_len;
      if (cmp_translog_addr(share->state.is_of_horizon,
                            checkpoint_start_log_horizon) >= 0)
      {
        /*
          State was flushed recently, it does not hold down the log's
          low-water mark and will not give avoidable work to Recovery. So we
          needn't flush it. Also, it is possible that while we copied the
          state above (under log's lock, without intern_lock) it was being
          modified in memory or flushed to disk (without log's lock, under
          intern_lock, like in maria_extra()), so our copy may be incorrect
          and we should not flush it.
          It may also be a share which got last_version==0 since we checked
          last_version; in this case, it flushed its state and the LSN test
          above will catch it.
        */
      }
      else
      {
        /*
          We could do the state flush only if share->changed, but it's
          tricky.
          Consider a maria_write() which has written REDO,UNDO, and before it
          calls _ma_writeinfo() (setting share->changed=1), checkpoint
          happens and sees share->changed=0, does not flush state. It is
          possible that Recovery does not start from before the REDO and thus
          the state is not recovered. A solution may be to set
          share->changed=1 under log mutex when writing log records.

          The current solution is to keep a copy of the last saved state and
          not write the state if it was the same as last time. It's ok if
          is_of_horizon would be different on disk if all other data is
          the same.
        */
        DBUG_ASSERT(share->last_version != 0);
        state_copy->state.is_of_horizon= share->state.is_of_horizon=
          share->checkpoint_state.is_of_horizon= state_copies_horizon;
        if (kfile.file >= 0 && memcmp(&share->checkpoint_state,
                                      &state_copy->state,
                                      sizeof(state_copy->state)))
        {
          sync_error|=
            _ma_state_info_write_sub(kfile.file, &state_copy->state,
                                     MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET);
          memcpy(&share->checkpoint_state,
                 &state_copy->state, sizeof(state_copy->state));
        }
        /*
          We don't set share->changed=0 because it may interfere with a
          concurrent _ma_writeinfo() doing share->changed=1 (cancel its
          effect). The sad consequence is that we will flush the same state
          at each checkpoint if the table was once written and then not
          anymore.
        */
      }
    }
#ifdef EXTRA_DEBUG_BITMAP
    else
    {
      DBUG_ASSERT(share->bitmap.changed == 0 &&
                  share->bitmap.changed_not_flushed == 0);
    }
#endif

    /*
      _ma_bitmap_flush_all() may wait, so don't keep intern_lock as
      otherwise this would deadlock with allocate_and_write_block_record()
      calling _ma_set_share_data_file_length()
    */
    mysql_mutex_unlock(&share->intern_lock);

    if (!ignore_share)
    {
      /*
        share->bitmap is valid because it's destroyed under close_lock which
        we hold.
      */
      if (_ma_bitmap_flush_all(share))
      {
        sync_error= 1;
        /** @todo all write failures should mark table corrupted */
        ma_message_no_user(0, "checkpoint bitmap page flush failed");
      }
      DBUG_ASSERT(share->pagecache == maria_pagecache);
    }
    /*
      Clean up any unused states.
      TODO: Only do this call if there have been # (10?) ended transactions
      since the last call.
      We had to release intern_lock to respect lock order with LOCK_trn_list.
    */
    _ma_remove_not_visible_states_with_lock(share, FALSE);

    if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
    {
      /*
        maria_close() left us to free the share. When it ran, it set
        share->id to 0. As it ran before we locked close_lock, we should
        have seen this, and so this assertion should be true:
      */
      DBUG_ASSERT(ignore_share);
      mysql_mutex_destroy(&share->intern_lock);
      mysql_mutex_unlock(&share->close_lock);
      mysql_mutex_destroy(&share->close_lock);
      ma_crypt_free(share);
      my_free(share);
    }
    else
    {
      /* share goes back to normal state */
      share->in_checkpoint= 0;
      mysql_mutex_unlock(&share->close_lock);
    }

    /*
      We do the big disk writes out of intern_lock to not block other
      users of this table (intern_lock is taken at the start and end of
      every statement). This means that file descriptors may be invalid
      (files may have been closed for example by HA_EXTRA_PREPARE_FOR_*
      under Windows, or REPAIR). This should not be a problem as we use
      MY_IGNORE_BADFD. Descriptors may even point to other files but then
      the old blocks (of before the close) must have been flushed for sure,
      so our flush will flush new blocks (of after the latest open) and that
      should do no harm.
    */
    /*
      If CHECKPOINT_MEDIUM, this big flush below may result in a
      serious write burst. Realize that all pages dirtied between the
      last checkpoint and the one we are doing now, will be flushed at
      next checkpoint, except those evicted by LRU eviction (depending on
      the size of the page cache compared to the size of the working data
      set, eviction may be rare or frequent).
      We avoid that burst by anticipating: those pages are flushed
      in bunches spanned regularly over the time interval between now and
      the next checkpoint, by a background thread. Thus the next checkpoint
      will have only little flushing to do (CHECKPOINT_MEDIUM should thus be
      only a little slower than CHECKPOINT_INDIRECT).
    */

    /*
      PageCacheFlushConcurrencyBugs
      Inside the page cache, calls to flush_pagecache_blocks_int() on the
      same file are serialized. Examples of concurrency bugs which happened
      when we didn't have this serialization:
      - maria_chk_size() (via CHECK TABLE) happens concurrently with
      Checkpoint: Checkpoint is flushing a page: it pins the page and is
      pre-empted, maria_chk_size() wants to flush this page too so gets an
      error because Checkpoint pinned this page. Such an error makes
      maria_chk_size() mark the table as corrupted.
      - maria_close() happens concurrently with Checkpoint:
      Checkpoint is flushing a page: it registers a request on the page, is
      pre-empted; maria_close() flushes this page too with FLUSH_RELEASE:
      FLUSH_RELEASE will cause a free_block() which assumes the page is in
      the LRU, but it is not (as Checkpoint registered a request). Crash.
      - one thread is evicting a page of the file out of the LRU: it marks
      it with PCBLOCK_IN_SWITCH and is pre-empted. Then two other threads do
      flushes of the same file concurrently (like above). Then one flusher
      sees the page is in switch, removes it from changed_blocks[] and puts
      it in its first_in_switch, so the other flusher will not see the page
      at all and return too early. If it's maria_close() which returns too
      early, then maria_close() may close the file descriptor, and the other
      flusher, and the evicter, will fail to write their page: corruption.
    */

    if (!ignore_share)
    {
      if (filter != NULL)
      {
        if ((flush_pagecache_blocks_with_filter(maria_pagecache,
                                                &dfile, FLUSH_KEEP_LAZY,
                                                filter, &filter_param) &
             PCFLUSH_ERROR))
          ma_message_no_user(0, "checkpoint data page flush failed");
        if ((flush_pagecache_blocks_with_filter(maria_pagecache,
                                                &kfile, FLUSH_KEEP_LAZY,
                                                filter, &filter_param) &
             PCFLUSH_ERROR))
          ma_message_no_user(0, "checkpoint index page flush failed");
      }
      /*
        fsyncs the fd, that's the loooong operation (e.g. max 150 fsync
        per second, so if you have touched 1000 files it's 7 seconds).
      */
      sync_error|=
        mysql_file_sync(dfile.file, MYF(MY_WME | MY_IGNORE_BADFD)) |
        mysql_file_sync(kfile.file, MYF(MY_WME | MY_IGNORE_BADFD));
      /*
        in case of error, we continue because writing other tables to disk
        is still useful.
      */
    }
  }

  if (sync_error)
    goto err;
  /* We may have over-estimated (due to share->id==0 or last_version==0) */
  DBUG_ASSERT(str->length >= (uint)(ptr - str->str));
  str->length= (uint)(ptr - str->str);
  /*
    As we support max 65k tables open at a time (2-byte short id), we
    assume uint is enough for the cumulated length of table names; and
    LEX_STRING::length is uint.
  */
  int4store(str->str, nb_stored);
  error= unmark_tables= 0;

err:
  if (unlikely(unmark_tables))
  {
    /* maria_close() uses THR_LOCK_maria from start to end */
    mysql_mutex_lock(&THR_LOCK_maria);
    for (i= 0; i < nb; i++)
    {
      MARIA_SHARE *share= distinct_shares[i];
      if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
      {
        /* maria_close() left us to free the share */
        mysql_mutex_destroy(&share->intern_lock);
        ma_crypt_free(share);
        my_free(share);
      }
      else
      {
        /* share goes back to normal state */
        share->in_checkpoint= 0;
      }
    }
    mysql_mutex_unlock(&THR_LOCK_maria);
  }
  my_free(distinct_shares);
  my_free(state_copies);
  DBUG_RETURN(error);
}
