ram.c source code [qemu/migration/ram.c]

1	/*
2	* QEMU System Emulator
3	*
4	* Copyright (c) 2003-2008 Fabrice Bellard
5	* Copyright (c) 2011-2015 Red Hat Inc
6	*
7	* Authors:
8	* Juan Quintela <quintela@redhat.com>
9	*
10	* Permission is hereby granted, free of charge, to any person obtaining a copy
11	* of this software and associated documentation files (the "Software"), to deal
12	* in the Software without restriction, including without limitation the rights
13	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14	* copies of the Software, and to permit persons to whom the Software is
15	* furnished to do so, subject to the following conditions:
16	*
17	* The above copyright notice and this permission notice shall be included in
18	* all copies or substantial portions of the Software.
19	*
20	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26	* THE SOFTWARE.
27	*/
28
29	#include "qemu/osdep.h"
30	#include "cpu.h"
31	#include <zlib.h>
32	#include "qemu/cutils.h"
33	#include "qemu/bitops.h"
34	#include "qemu/bitmap.h"
35	#include "qemu/main-loop.h"
36	#include "qemu/pmem.h"
37	#include "xbzrle.h"
38	#include "ram.h"
39	#include "migration.h"
40	#include "socket.h"
41	#include "migration/register.h"
42	#include "migration/misc.h"
43	#include "qemu-file.h"
44	#include "postcopy-ram.h"
45	#include "page_cache.h"
46	#include "qemu/error-report.h"
47	#include "qapi/error.h"
48	#include "qapi/qapi-events-migration.h"
49	#include "qapi/qmp/qerror.h"
50	#include "trace.h"
51	#include "exec/ram_addr.h"
52	#include "exec/target_page.h"
53	#include "qemu/rcu_queue.h"
54	#include "migration/colo.h"
55	#include "block.h"
56	#include "sysemu/sysemu.h"
57	#include "qemu/uuid.h"
58	#include "savevm.h"
59	#include "qemu/iov.h"
60
/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
79
/* Report whether the @size bytes starting at @p are all zero. */
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    bool all_zero = buffer_is_zero(p, size);

    return all_zero;
}
84
85	XBZRLECacheStats xbzrle_counters;
86
87	/ struct contains XBZRLE cache and a static page*
88	used by the compression /*
89	static struct {
90	/ buffer used for XBZRLE encoding /
91	uint8_t *encoded_buf;
92	/ buffer for storing page content /
93	uint8_t *current_buf;
94	/ Cache for XBZRLE, Protected by lock. /
95	PageCache *cache;
96	QemuMutex lock;
97	/ it will store a page full of zeros /
98	uint8_t *zero_target_page;
99	/ buffer used for XBZRLE decoding /
100	uint8_t *decoded_buf;
101	} XBZRLE;
102
103	static void XBZRLE_cache_lock(void)
104	{
105	if (migrate_use_xbzrle())
106	qemu_mutex_lock(&XBZRLE.lock);
107	}
108
109	static void XBZRLE_cache_unlock(void)
110	{
111	if (migrate_use_xbzrle())
112	qemu_mutex_unlock(&XBZRLE.lock);
113	}
114
115	/**
116	* xbzrle_cache_resize: resize the xbzrle cache
117	*
118	* This function is called from qmp_migrate_set_cache_size in main
119	* thread, possibly while a migration is in progress. A running
120	* migration may be using the cache and might finish during this call,
121	* hence changes to the cache are protected by XBZRLE.lock().
122	*
123	* Returns 0 for success or -1 for error
124	*
125	* @new_size: new cache size
126	* @errp: set *errp if the check failed, with reason
127	*/
128	int xbzrle_cache_resize(int64_t new_size, Error **errp)
129	{
130	PageCache *new_cache;
131	int64_t ret = `0`;
132
133	/ Check for truncation /
134	if (new_size != (size_t)new_size) {
135	error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
136	"exceeding address space");
137	return -`1`;
138	}
139
140	if (new_size == migrate_xbzrle_cache_size()) {
141	/ nothing to do /
142	return `0`;
143	}
144
145	XBZRLE_cache_lock();
146
147	if (XBZRLE.cache != NULL) {
148	new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
149	if (!new_cache) {
150	ret = -`1`;
151	goto out;
152	}
153
154	cache_fini(XBZRLE.cache);
155	XBZRLE.cache = new_cache;
156	}
157	out:
158	XBZRLE_cache_unlock();
159	return ret;
160	}
161
162	static bool ramblock_is_ignored(RAMBlock *block)
163	{
164	return !qemu_ram_is_migratable(block) \|\|
165	(migrate_ignore_shared() && qemu_ram_is_shared(block));
166	}
167
/* Should be holding either ram_list.mutex, or the RCU lock. */
#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (ramblock_is_ignored(block)) {} else

#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

#undef RAMBLOCK_FOREACH
178
179	int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
180	{
181	RAMBlock *block;
182	int ret = `0`;
183
184	rcu_read_lock();
185	RAMBLOCK_FOREACH_NOT_IGNORED(block) {
186	ret = func(block, opaque);
187	if (ret) {
188	break;
189	}
190	}
191	rcu_read_unlock();
192	return ret;
193	}
194
195	static void ramblock_recv_map_init(void)
196	{
197	RAMBlock *rb;
198
199	RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
200	assert(!rb->receivedmap);
201	rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
202	}
203	}
204
205	int ramblock_recv_bitmap_test(RAMBlock rb, void* *host_addr)
206	{
207	return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
208	rb->receivedmap);
209	}
210
211	bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
212	{
213	return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
214	}
215
216	void ramblock_recv_bitmap_set(RAMBlock rb, void* *host_addr)
217	{
218	set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
219	}
220
221	void ramblock_recv_bitmap_set_range(RAMBlock rb, void* *host_addr,
222	size_t nr)
223	{
224	bitmap_set_atomic(rb->receivedmap,
225	ramblock_recv_bitmap_offset(host_addr, rb),
226	nr);
227	}
228
229	#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
230
231	/*
232	* Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
233	*
234	* Returns >0 if success with sent bytes, or <0 if error.
235	*/
236	int64_t ramblock_recv_bitmap_send(QEMUFile *file,
237	const char *block_name)
238	{
239	RAMBlock *block = qemu_ram_block_by_name(block_name);
240	unsigned long *le_bitmap, nbits;
241	uint64_t size;
242
243	if (!block) {
244	error_report("%s: invalid block name: %s", __func__, block_name);
245	return -`1`;
246	}
247
248	nbits = block->used_length >> TARGET_PAGE_BITS;
249
250	/*
251	* Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
252	* machines we may need 4 more bytes for padding (see below
253	* comment). So extend it a bit before hand.
254	*/
255	le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
256
257	/*
258	* Always use little endian when sending the bitmap. This is
259	* required that when source and destination VMs are not using the
260	* same endianess. (Note: big endian won't work.)
261	*/
262	bitmap_to_le(le_bitmap, block->receivedmap, nbits);
263
264	/ Size of the bitmap, in bytes /
265	size = DIV_ROUND_UP(nbits, `8`);
266
267	/*
268	* size is always aligned to 8 bytes for 64bit machines, but it
269	* may not be true for 32bit machines. We need this padding to
270	* make sure the migration can survive even between 32bit and
271	* 64bit machines.
272	*/
273	size = ROUND_UP(size, `8`);
274
275	qemu_put_be64(file, size);
276	qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
277	/*
278	* Mark as an end, in case the middle part is screwed up due to
279	* some "misterious" reason.
280	*/
281	qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
282	qemu_fflush(file);
283
284	g_free(le_bitmap);
285
286	if (qemu_file_get_error(file)) {
287	return qemu_file_get_error(file);
288	}
289
290	return size + sizeof(size);
291	}
292
293	/*
294	* An outstanding page request, on the source, having been received
295	* and queued
296	*/
297	struct RAMSrcPageRequest {
298	RAMBlock *rb;
299	hwaddr offset;
300	hwaddr len;
301
302	QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
303	};
304
305	/ State of RAM for migration /
306	struct RAMState {
307	/ QEMUFile used for this migration /
308	QEMUFile *f;
309	/ Last block that we have visited searching for dirty pages /
310	RAMBlock *last_seen_block;
311	/ Last block from where we have sent data /
312	RAMBlock *last_sent_block;
313	/ Last dirty target page we have sent /
314	ram_addr_t last_page;
315	/ last ram version we have seen /
316	uint32_t last_version;
317	/ We are in the first round /
318	bool ram_bulk_stage;
319	/ The free page optimization is enabled /
320	bool fpo_enabled;
321	/ How many times we have dirty too many pages /
322	int dirty_rate_high_cnt;
323	/ these variables are used for bitmap sync /
324	/ last time we did a full bitmap_sync /
325	int64_t time_last_bitmap_sync;
326	/ bytes transferred at start_time /
327	uint64_t bytes_xfer_prev;
328	/ number of dirty pages since start_time /
329	uint64_t num_dirty_pages_period;
330	/ xbzrle misses since the beginning of the period /
331	uint64_t xbzrle_cache_miss_prev;
332
333	/ compression statistics since the beginning of the period /
334	/ amount of count that no free thread to compress data /
335	uint64_t compress_thread_busy_prev;
336	/ amount bytes after compression /
337	uint64_t compressed_size_prev;
338	/ amount of compressed pages /
339	uint64_t compress_pages_prev;
340
341	/ total handled target pages at the beginning of period /
342	uint64_t target_page_count_prev;
343	/ total handled target pages since start /
344	uint64_t target_page_count;
345	/ number of dirty bits in the bitmap /
346	uint64_t migration_dirty_pages;
347	/ Protects modification of the bitmap and migration dirty pages /
348	QemuMutex bitmap_mutex;
349	/ The RAMBlock used in the last src_page_requests /
350	RAMBlock *last_req_rb;
351	/ Queue of outstanding page requests from the destination /
352	QemuMutex src_page_req_mutex;
353	QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
354	};
355	typedef struct RAMState RAMState;
356
357	static RAMState *ram_state;
358
359	static NotifierWithReturnList precopy_notifier_list;
360
361	void precopy_infrastructure_init(void)
362	{
363	notifier_with_return_list_init(&precopy_notifier_list);
364	}
365
366	void precopy_add_notifier(NotifierWithReturn *n)
367	{
368	notifier_with_return_list_add(&precopy_notifier_list, n);
369	}
370
371	void precopy_remove_notifier(NotifierWithReturn *n)
372	{
373	notifier_with_return_remove(n);
374	}
375
376	int precopy_notify(PrecopyNotifyReason reason, Error **errp)
377	{
378	PrecopyNotifyData pnd;
379	pnd.reason = reason;
380	pnd.errp = errp;
381
382	return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
383	}
384
385	void precopy_enable_free_page_optimization(void)
386	{
387	if (!ram_state) {
388	return;
389	}
390
391	ram_state->fpo_enabled = true;
392	}
393
394	uint64_t ram_bytes_remaining(void)
395	{
396	return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
397	`0`;
398	}
399
400	MigrationStats ram_counters;
401
402	/ used by the search for pages to send /
403	struct PageSearchStatus {
404	/ Current block being searched /
405	RAMBlock *block;
406	/ Current page to search from /
407	unsigned long page;
408	/ Set once we wrap around /
409	bool complete_round;
410	};
411	typedef struct PageSearchStatus PageSearchStatus;
412
413	CompressionStats compression_counters;
414
415	struct CompressParam {
416	bool done;
417	bool quit;
418	bool zero_page;
419	QEMUFile *file;
420	QemuMutex mutex;
421	QemuCond cond;
422	RAMBlock *block;
423	ram_addr_t offset;
424
425	/ internally used fields /
426	z_stream stream;
427	uint8_t *originbuf;
428	};
429	typedef struct CompressParam CompressParam;
430
431	struct DecompressParam {
432	bool done;
433	bool quit;
434	QemuMutex mutex;
435	QemuCond cond;
436	void *des;
437	uint8_t *compbuf;
438	int len;
439	z_stream stream;
440	};
441	typedef struct DecompressParam DecompressParam;
442
443	static CompressParam *comp_param;
444	static QemuThread *compress_threads;
445	/ comp_done_cond is used to wake up the migration thread when*
446	* one of the compression threads has finished the compression.
447	* comp_done_lock is used to co-work with comp_done_cond.
448	*/
449	static QemuMutex comp_done_lock;
450	static QemuCond comp_done_cond;
451	/ The empty QEMUFileOps will be used by file in CompressParam /
452	static const QEMUFileOps empty_ops = { };
453
454	static QEMUFile *decomp_file;
455	static DecompressParam *decomp_param;
456	static QemuThread *decompress_threads;
457	static QemuMutex decomp_done_lock;
458	static QemuCond decomp_done_cond;
459
460	static bool do_compress_ram_page(QEMUFile f, z_stream stream, RAMBlock *block,
461	ram_addr_t offset, uint8_t *source_buf);
462
463	static void do_data_compress(void* *opaque)
464	{
465	CompressParam *param = opaque;
466	RAMBlock *block;
467	ram_addr_t offset;
468	bool zero_page;
469
470	qemu_mutex_lock(&param->mutex);
471	while (!param->quit) {
472	if (param->block) {
473	block = param->block;
474	offset = param->offset;
475	param->block = NULL;
476	qemu_mutex_unlock(&param->mutex);
477
478	zero_page = do_compress_ram_page(param->file, &param->stream,
479	block, offset, param->originbuf);
480
481	qemu_mutex_lock(&comp_done_lock);
482	param->done = true;
483	param->zero_page = zero_page;
484	qemu_cond_signal(&comp_done_cond);
485	qemu_mutex_unlock(&comp_done_lock);
486
487	qemu_mutex_lock(&param->mutex);
488	} else {
489	qemu_cond_wait(&param->cond, &param->mutex);
490	}
491	}
492	qemu_mutex_unlock(&param->mutex);
493
494	return NULL;
495	}
496
497	static void compress_threads_save_cleanup(void)
498	{
499	int i, thread_count;
500
501	if (!migrate_use_compression() \|\| !comp_param) {
502	return;
503	}
504
505	thread_count = migrate_compress_threads();
506	for (i = `0`; i < thread_count; i++) {
507	/*
508	* we use it as a indicator which shows if the thread is
509	* properly init'd or not
510	*/
511	if (!comp_param[i].file) {
512	break;
513	}
514
515	qemu_mutex_lock(&comp_param[i].mutex);
516	comp_param[i].quit = true;
517	qemu_cond_signal(&comp_param[i].cond);
518	qemu_mutex_unlock(&comp_param[i].mutex);
519
520	qemu_thread_join(compress_threads + i);
521	qemu_mutex_destroy(&comp_param[i].mutex);
522	qemu_cond_destroy(&comp_param[i].cond);
523	deflateEnd(&comp_param[i].stream);
524	g_free(comp_param[i].originbuf);
525	qemu_fclose(comp_param[i].file);
526	comp_param[i].file = NULL;
527	}
528	qemu_mutex_destroy(&comp_done_lock);
529	qemu_cond_destroy(&comp_done_cond);
530	g_free(compress_threads);
531	g_free(comp_param);
532	compress_threads = NULL;
533	comp_param = NULL;
534	}
535
536	static int compress_threads_save_setup(void)
537	{
538	int i, thread_count;
539
540	if (!migrate_use_compression()) {
541	return `0`;
542	}
543	thread_count = migrate_compress_threads();
544	compress_threads = g_new0(QemuThread, thread_count);
545	comp_param = g_new0(CompressParam, thread_count);
546	qemu_cond_init(&comp_done_cond);
547	qemu_mutex_init(&comp_done_lock);
548	for (i = `0`; i < thread_count; i++) {
549	comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
550	if (!comp_param[i].originbuf) {
551	goto exit;
552	}
553
554	if (deflateInit(&comp_param[i].stream,
555	migrate_compress_level()) != Z_OK) {
556	g_free(comp_param[i].originbuf);
557	goto exit;
558	}
559
560	/ comp_param[i].file is just used as a dummy buffer to save data,*
561	* set its ops to empty.
562	*/
563	comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
564	comp_param[i].done = true;
565	comp_param[i].quit = false;
566	qemu_mutex_init(&comp_param[i].mutex);
567	qemu_cond_init(&comp_param[i].cond);
568	qemu_thread_create(compress_threads + i, "compress",
569	do_data_compress, comp_param + i,
570	QEMU_THREAD_JOINABLE);
571	}
572	return `0`;
573
574	exit:
575	compress_threads_save_cleanup();
576	return -`1`;
577	}
578
/* Multiple fd's */

#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

#define MULTIFD_FLAG_SYNC (1 << 0)

/* This value needs to be a multiple of qemu_target_page_size() */
#define MULTIFD_PACKET_SIZE (512 * 1024)
588
/* First message exchanged on every multifd channel (packed wire layout). */
typedef struct {
    uint32_t magic;
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    uint8_t id;
    uint8_t unused1[7];     /* Reserved for future use */
    uint64_t unused2[4];    /* Reserved for future use */
} __attribute__((packed)) MultiFDInit_t;
597
/* Header of a multifd packet (packed wire layout), followed by one
 * 64-bit offset per page. */
typedef struct {
    uint32_t magic;
    uint32_t version;
    uint32_t flags;
    /* maximum number of allocated pages */
    uint32_t pages_alloc;
    uint32_t pages_used;
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    uint64_t packet_num;
    uint64_t unused[4];    /* Reserved for future use */
    char ramblock[256];
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
612
613	typedef struct {
614	/ number of used pages /
615	uint32_t used;
616	/ number of allocated pages /
617	uint32_t allocated;
618	/ global number of generated multifd packets /
619	uint64_t packet_num;
620	/ offset of each page /
621	ram_addr_t *offset;
622	/ pointer to each page /
623	struct iovec *iov;
624	RAMBlock *block;
625	} MultiFDPages_t;
626
627	typedef struct {
628	/ this fields are not changed once the thread is created /
629	/ channel number /
630	uint8_t id;
631	/ channel thread name /
632	char *name;
633	/ channel thread id /
634	QemuThread thread;
635	/ communication channel /
636	QIOChannel *c;
637	/ sem where to wait for more work /
638	QemuSemaphore sem;
639	/ this mutex protects the following parameters /
640	QemuMutex mutex;
641	/ is this channel thread running /
642	bool running;
643	/ should this thread finish /
644	bool quit;
645	/ thread has work to do /
646	int pending_job;
647	/ array of pages to sent /
648	MultiFDPages_t *pages;
649	/ packet allocated len /
650	uint32_t packet_len;
651	/ pointer to the packet /
652	MultiFDPacket_t *packet;
653	/ multifd flags for each packet /
654	uint32_t flags;
655	/ size of the next packet that contains pages /
656	uint32_t next_packet_size;
657	/ global number of generated multifd packets /
658	uint64_t packet_num;
659	/ thread local variables /
660	/ packets sent through this channel /
661	uint64_t num_packets;
662	/ pages sent through this channel /
663	uint64_t num_pages;
664	/ syncs main thread and channels /
665	QemuSemaphore sem_sync;
666	} MultiFDSendParams;
667
668	typedef struct {
669	/ this fields are not changed once the thread is created /
670	/ channel number /
671	uint8_t id;
672	/ channel thread name /
673	char *name;
674	/ channel thread id /
675	QemuThread thread;
676	/ communication channel /
677	QIOChannel *c;
678	/ this mutex protects the following parameters /
679	QemuMutex mutex;
680	/ is this channel thread running /
681	bool running;
682	/ should this thread finish /
683	bool quit;
684	/ array of pages to receive /
685	MultiFDPages_t *pages;
686	/ packet allocated len /
687	uint32_t packet_len;
688	/ pointer to the packet /
689	MultiFDPacket_t *packet;
690	/ multifd flags for each packet /
691	uint32_t flags;
692	/ global number of generated multifd packets /
693	uint64_t packet_num;
694	/ thread local variables /
695	/ size of the next packet that contains pages /
696	uint32_t next_packet_size;
697	/ packets sent through this channel /
698	uint64_t num_packets;
699	/ pages sent through this channel /
700	uint64_t num_pages;
701	/ syncs main thread and channels /
702	QemuSemaphore sem_sync;
703	} MultiFDRecvParams;
704
705	static int multifd_send_initial_packet(MultiFDSendParams p, Error *errp)
706	{
707	MultiFDInit_t msg;
708	int ret;
709
710	msg.magic = cpu_to_be32(MULTIFD_MAGIC);
711	msg.version = cpu_to_be32(MULTIFD_VERSION);
712	msg.id = p->id;
713	memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
714
715	ret = qio_channel_write_all(p->c, (char )&msg, sizeof*(msg), errp);
716	if (ret != `0`) {
717	return -`1`;
718	}
719	return `0`;
720	}
721
722	static int multifd_recv_initial_packet(QIOChannel c, Error *errp)
723	{
724	MultiFDInit_t msg;
725	int ret;
726
727	ret = qio_channel_read_all(c, (char )&msg, sizeof*(msg), errp);
728	if (ret != `0`) {
729	return -`1`;
730	}
731
732	msg.magic = be32_to_cpu(msg.magic);
733	msg.version = be32_to_cpu(msg.version);
734
735	if (msg.magic != MULTIFD_MAGIC) {
736	error_setg(errp, "multifd: received packet magic %x "
737	"expected %x", msg.magic, MULTIFD_MAGIC);
738	return -`1`;
739	}
740
741	if (msg.version != MULTIFD_VERSION) {
742	error_setg(errp, "multifd: received packet version %d "
743	"expected %d", msg.version, MULTIFD_VERSION);
744	return -`1`;
745	}
746
747	if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
748	char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
749	char msg_uuid = qemu_uuid_unparse_strdup((const* QemuUUID *)msg.uuid);
750
751	error_setg(errp, "multifd: received uuid '%s' and expected "
752	"uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
753	g_free(uuid);
754	g_free(msg_uuid);
755	return -`1`;
756	}
757
758	if (msg.id > migrate_multifd_channels()) {
759	error_setg(errp, "multifd: received channel version %d "
760	"expected %d", msg.version, MULTIFD_VERSION);
761	return -`1`;
762	}
763
764	return msg.id;
765	}
766
767	static MultiFDPages_t *multifd_pages_init(size_t size)
768	{
769	MultiFDPages_t *pages = g_new0(MultiFDPages_t, `1`);
770
771	pages->allocated = size;
772	pages->iov = g_new0(struct iovec, size);
773	pages->offset = g_new0(ram_addr_t, size);
774
775	return pages;
776	}
777
778	static void multifd_pages_clear(MultiFDPages_t *pages)
779	{
780	pages->used = `0`;
781	pages->allocated = `0`;
782	pages->packet_num = `0`;
783	pages->block = NULL;
784	g_free(pages->iov);
785	pages->iov = NULL;
786	g_free(pages->offset);
787	pages->offset = NULL;
788	g_free(pages);
789	}
790
791	static void multifd_send_fill_packet(MultiFDSendParams *p)
792	{
793	MultiFDPacket_t *packet = p->packet;
794	uint32_t page_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
795	int i;
796
797	packet->magic = cpu_to_be32(MULTIFD_MAGIC);
798	packet->version = cpu_to_be32(MULTIFD_VERSION);
799	packet->flags = cpu_to_be32(p->flags);
800	packet->pages_alloc = cpu_to_be32(page_max);
801	packet->pages_used = cpu_to_be32(p->pages->used);
802	packet->next_packet_size = cpu_to_be32(p->next_packet_size);
803	packet->packet_num = cpu_to_be64(p->packet_num);
804
805	if (p->pages->block) {
806	strncpy(packet->ramblock, p->pages->block->idstr, `256`);
807	}
808
809	for (i = `0`; i < p->pages->used; i++) {
810	packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
811	}
812	}
813
814	static int multifd_recv_unfill_packet(MultiFDRecvParams p, Error *errp)
815	{
816	MultiFDPacket_t *packet = p->packet;
817	uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
818	RAMBlock *block;
819	int i;
820
821	packet->magic = be32_to_cpu(packet->magic);
822	if (packet->magic != MULTIFD_MAGIC) {
823	error_setg(errp, "multifd: received packet "
824	"magic %x and expected magic %x",
825	packet->magic, MULTIFD_MAGIC);
826	return -`1`;
827	}
828
829	packet->version = be32_to_cpu(packet->version);
830	if (packet->version != MULTIFD_VERSION) {
831	error_setg(errp, "multifd: received packet "
832	"version %d and expected version %d",
833	packet->version, MULTIFD_VERSION);
834	return -`1`;
835	}
836
837	p->flags = be32_to_cpu(packet->flags);
838
839	packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
840	/*
841	* If we recevied a packet that is 100 times bigger than expected
842	* just stop migration. It is a magic number.
843	*/
844	if (packet->pages_alloc > pages_max * `100`) {
845	error_setg(errp, "multifd: received packet "
846	"with size %d and expected a maximum size of %d",
847	packet->pages_alloc, pages_max * `100`) ;
848	return -`1`;
849	}
850	/*
851	* We received a packet that is bigger than expected but inside
852	* reasonable limits (see previous comment). Just reallocate.
853	*/
854	if (packet->pages_alloc > p->pages->allocated) {
855	multifd_pages_clear(p->pages);
856	p->pages = multifd_pages_init(packet->pages_alloc);
857	}
858
859	p->pages->used = be32_to_cpu(packet->pages_used);
860	if (p->pages->used > packet->pages_alloc) {
861	error_setg(errp, "multifd: received packet "
862	"with %d pages and expected maximum pages are %d",
863	p->pages->used, packet->pages_alloc) ;
864	return -`1`;
865	}
866
867	p->next_packet_size = be32_to_cpu(packet->next_packet_size);
868	p->packet_num = be64_to_cpu(packet->packet_num);
869
870	if (p->pages->used) {
871	/ make sure that ramblock is 0 terminated /
872	packet->ramblock[`255`] = `0`;
873	block = qemu_ram_block_by_name(packet->ramblock);
874	if (!block) {
875	error_setg(errp, "multifd: unknown ram block %s",
876	packet->ramblock);
877	return -`1`;
878	}
879	}
880
881	for (i = `0`; i < p->pages->used; i++) {
882	ram_addr_t offset = be64_to_cpu(packet->offset[i]);
883
884	if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
885	error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
886	" (max " RAM_ADDR_FMT ")",
887	offset, block->max_length);
888	return -`1`;
889	}
890	p->pages->iov[i].iov_base = block->host + offset;
891	p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
892	}
893
894	return `0`;
895	}
896
897	struct {
898	MultiFDSendParams *params;
899	/ array of pages to sent /
900	MultiFDPages_t *pages;
901	/ global number of generated multifd packets /
902	uint64_t packet_num;
903	/ send channels ready /
904	QemuSemaphore channels_ready;
905	} *multifd_send_state;
906
907	/*
908	* How we use multifd_send_state->pages and channel->pages?
909	*
910	* We create a pages for each channel, and a main one. Each time that
911	* we need to send a batch of pages we interchange the ones between
912	* multifd_send_state and the channel that is sending it. There are
913	* two reasons for that:
914	* - to not have to do so many mallocs during migration
915	* - to make easier to know what to free at the end of migration
916	*
917	* This way we always know who is the owner of each "pages" struct,
918	* and we don't need any locking. It belongs to the migration thread
919	* or to the channel thread. Switching is safe because the migration
920	* thread is using the channel mutex when changing it, and the channel
921	* have to had finish with its own, otherwise pending_job can't be
922	* false.
923	*/
924
925	static int multifd_send_pages(RAMState *rs)
926	{
927	int i;
928	static int next_channel;
929	MultiFDSendParams p = NULL; /* make happy gcc /
930	MultiFDPages_t *pages = multifd_send_state->pages;
931	uint64_t transferred;
932
933	qemu_sem_wait(&multifd_send_state->channels_ready);
934	for (i = next_channel;; i = (i + `1`) % migrate_multifd_channels()) {
935	p = &multifd_send_state->params[i];
936
937	qemu_mutex_lock(&p->mutex);
938	if (p->quit) {
939	error_report("%s: channel %d has already quit!", __func__, i);
940	qemu_mutex_unlock(&p->mutex);
941	return -`1`;
942	}
943	if (!p->pending_job) {
944	p->pending_job++;
945	next_channel = (i + `1`) % migrate_multifd_channels();
946	break;
947	}
948	qemu_mutex_unlock(&p->mutex);
949	}
950	p->pages->used = `0`;
951
952	p->packet_num = multifd_send_state->packet_num++;
953	p->pages->block = NULL;
954	multifd_send_state->pages = p->pages;
955	p->pages = pages;
956	transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
957	qemu_file_update_transfer(rs->f, transferred);
958	ram_counters.multifd_bytes += transferred;
959	ram_counters.transferred += transferred;;
960	qemu_mutex_unlock(&p->mutex);
961	qemu_sem_post(&p->sem);
962
963	return `1`;
964	}
965
966	static int multifd_queue_page(RAMState rs, RAMBlock block, ram_addr_t offset)
967	{
968	MultiFDPages_t *pages = multifd_send_state->pages;
969
970	if (!pages->block) {
971	pages->block = block;
972	}
973
974	if (pages->block == block) {
975	pages->offset[pages->used] = offset;
976	pages->iov[pages->used].iov_base = block->host + offset;
977	pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
978	pages->used++;
979
980	if (pages->used < pages->allocated) {
981	return `1`;
982	}
983	}
984
985	if (multifd_send_pages(rs) < `0`) {
986	return -`1`;
987	}
988
989	if (pages->block != block) {
990	return multifd_queue_page(rs, block, offset);
991	}
992
993	return `1`;
994	}
995
996	static void multifd_send_terminate_threads(Error *err)
997	{
998	int i;
999
1000	trace_multifd_send_terminate_threads(err != NULL);
1001
1002	if (err) {
1003	MigrationState *s = migrate_get_current();
1004	migrate_set_error(s, err);
1005	if (s->state == MIGRATION_STATUS_SETUP \|\|
1006	s->state == MIGRATION_STATUS_PRE_SWITCHOVER \|\|
1007	s->state == MIGRATION_STATUS_DEVICE \|\|
1008	s->state == MIGRATION_STATUS_ACTIVE) {
1009	migrate_set_state(&s->state, s->state,
1010	MIGRATION_STATUS_FAILED);
1011	}
1012	}
1013
1014	for (i = `0`; i < migrate_multifd_channels(); i++) {
1015	MultiFDSendParams *p = &multifd_send_state->params[i];
1016
1017	qemu_mutex_lock(&p->mutex);
1018	p->quit = true;
1019	qemu_sem_post(&p->sem);
1020	qemu_mutex_unlock(&p->mutex);
1021	}
1022	}
1023
1024	void multifd_save_cleanup(void)
1025	{
1026	int i;
1027
1028	if (!migrate_use_multifd()) {
1029	return;
1030	}
1031	multifd_send_terminate_threads(NULL);
1032	for (i = `0`; i < migrate_multifd_channels(); i++) {
1033	MultiFDSendParams *p = &multifd_send_state->params[i];
1034
1035	if (p->running) {
1036	qemu_thread_join(&p->thread);
1037	}
1038	socket_send_channel_destroy(p->c);
1039	p->c = NULL;
1040	qemu_mutex_destroy(&p->mutex);
1041	qemu_sem_destroy(&p->sem);
1042	qemu_sem_destroy(&p->sem_sync);
1043	g_free(p->name);
1044	p->name = NULL;
1045	multifd_pages_clear(p->pages);
1046	p->pages = NULL;
1047	p->packet_len = `0`;
1048	g_free(p->packet);
1049	p->packet = NULL;
1050	}
1051	qemu_sem_destroy(&multifd_send_state->channels_ready);
1052	g_free(multifd_send_state->params);
1053	multifd_send_state->params = NULL;
1054	multifd_pages_clear(multifd_send_state->pages);
1055	multifd_send_state->pages = NULL;
1056	g_free(multifd_send_state);
1057	multifd_send_state = NULL;
1058	}
1059
1060	static void multifd_send_sync_main(RAMState *rs)
1061	{
1062	int i;
1063
1064	if (!migrate_use_multifd()) {
1065	return;
1066	}
1067	if (multifd_send_state->pages->used) {
1068	if (multifd_send_pages(rs) < `0`) {
1069	error_report("%s: multifd_send_pages fail", __func__);
1070	return;
1071	}
1072	}
1073	for (i = `0`; i < migrate_multifd_channels(); i++) {
1074	MultiFDSendParams *p = &multifd_send_state->params[i];
1075
1076	trace_multifd_send_sync_main_signal(p->id);
1077
1078	qemu_mutex_lock(&p->mutex);
1079
1080	if (p->quit) {
1081	error_report("%s: channel %d has already quit", __func__, i);
1082	qemu_mutex_unlock(&p->mutex);
1083	return;
1084	}
1085
1086	p->packet_num = multifd_send_state->packet_num++;
1087	p->flags \|= MULTIFD_FLAG_SYNC;
1088	p->pending_job++;
1089	qemu_file_update_transfer(rs->f, p->packet_len);
1090	ram_counters.multifd_bytes += p->packet_len;
1091	ram_counters.transferred += p->packet_len;
1092	qemu_mutex_unlock(&p->mutex);
1093	qemu_sem_post(&p->sem);
1094	}
1095	for (i = `0`; i < migrate_multifd_channels(); i++) {
1096	MultiFDSendParams *p = &multifd_send_state->params[i];
1097
1098	trace_multifd_send_sync_main_wait(p->id);
1099	qemu_sem_wait(&p->sem_sync);
1100	}
1101	trace_multifd_send_sync_main(multifd_send_state->packet_num);
1102	}
1103
1104	static void multifd_send_thread(void* *opaque)
1105	{
1106	MultiFDSendParams *p = opaque;
1107	Error *local_err = NULL;
1108	int ret = `0`;
1109	uint32_t flags = `0`;
1110
1111	trace_multifd_send_thread_start(p->id);
1112	rcu_register_thread();
1113
1114	if (multifd_send_initial_packet(p, &local_err) < `0`) {
1115	goto out;
1116	}
1117	/ initial packet /
1118	p->num_packets = `1`;
1119
1120	while (true) {
1121	qemu_sem_wait(&p->sem);
1122	qemu_mutex_lock(&p->mutex);
1123
1124	if (p->pending_job) {
1125	uint32_t used = p->pages->used;
1126	uint64_t packet_num = p->packet_num;
1127	flags = p->flags;
1128
1129	p->next_packet_size = used * qemu_target_page_size();
1130	multifd_send_fill_packet(p);
1131	p->flags = `0`;
1132	p->num_packets++;
1133	p->num_pages += used;
1134	p->pages->used = `0`;
1135	qemu_mutex_unlock(&p->mutex);
1136
1137	trace_multifd_send(p->id, packet_num, used, flags,
1138	p->next_packet_size);
1139
1140	ret = qio_channel_write_all(p->c, (void *)p->packet,
1141	p->packet_len, &local_err);
1142	if (ret != `0`) {
1143	break;
1144	}
1145
1146	if (used) {
1147	ret = qio_channel_writev_all(p->c, p->pages->iov,
1148	used, &local_err);
1149	if (ret != `0`) {
1150	break;
1151	}
1152	}
1153
1154	qemu_mutex_lock(&p->mutex);
1155	p->pending_job--;
1156	qemu_mutex_unlock(&p->mutex);
1157
1158	if (flags & MULTIFD_FLAG_SYNC) {
1159	qemu_sem_post(&p->sem_sync);
1160	}
1161	qemu_sem_post(&multifd_send_state->channels_ready);
1162	} else if (p->quit) {
1163	qemu_mutex_unlock(&p->mutex);
1164	break;
1165	} else {
1166	qemu_mutex_unlock(&p->mutex);
1167	/ sometimes there are spurious wakeups /
1168	}
1169	}
1170
1171	out:
1172	if (local_err) {
1173	trace_multifd_send_error(p->id);
1174	multifd_send_terminate_threads(local_err);
1175	}
1176
1177	/*
1178	* Error happen, I will exit, but I can't just leave, tell
1179	* who pay attention to me.
1180	*/
1181	if (ret != `0`) {
1182	if (flags & MULTIFD_FLAG_SYNC) {
1183	qemu_sem_post(&p->sem_sync);
1184	}
1185	qemu_sem_post(&multifd_send_state->channels_ready);
1186	}
1187
1188	qemu_mutex_lock(&p->mutex);
1189	p->running = false;
1190	qemu_mutex_unlock(&p->mutex);
1191
1192	rcu_unregister_thread();
1193	trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1194
1195	return NULL;
1196	}
1197
1198	static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1199	{
1200	MultiFDSendParams *p = opaque;
1201	QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1202	Error *local_err = NULL;
1203
1204	trace_multifd_new_send_channel_async(p->id);
1205	if (qio_task_propagate_error(task, &local_err)) {
1206	migrate_set_error(migrate_get_current(), local_err);
1207	multifd_save_cleanup();
1208	} else {
1209	p->c = QIO_CHANNEL(sioc);
1210	qio_channel_set_delay(p->c, false);
1211	p->running = true;
1212	qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1213	QEMU_THREAD_JOINABLE);
1214	}
1215	}
1216
1217	int multifd_save_setup(void)
1218	{
1219	int thread_count;
1220	uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1221	uint8_t i;
1222
1223	if (!migrate_use_multifd()) {
1224	return `0`;
1225	}
1226	thread_count = migrate_multifd_channels();
1227	multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1228	multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1229	multifd_send_state->pages = multifd_pages_init(page_count);
1230	qemu_sem_init(&multifd_send_state->channels_ready, `0`);
1231
1232	for (i = `0`; i < thread_count; i++) {
1233	MultiFDSendParams *p = &multifd_send_state->params[i];
1234
1235	qemu_mutex_init(&p->mutex);
1236	qemu_sem_init(&p->sem, `0`);
1237	qemu_sem_init(&p->sem_sync, `0`);
1238	p->quit = false;
1239	p->pending_job = `0`;
1240	p->id = i;
1241	p->pages = multifd_pages_init(page_count);
1242	p->packet_len = sizeof(MultiFDPacket_t)
1243	+ sizeof(ram_addr_t) * page_count;
1244	p->packet = g_malloc0(p->packet_len);
1245	p->name = g_strdup_printf("multifdsend_%d", i);
1246	socket_send_channel_create(multifd_new_send_channel_async, p);
1247	}
1248	return `0`;
1249	}
1250
1251	struct {
1252	MultiFDRecvParams *params;
1253	/ number of created threads /
1254	int count;
1255	/ syncs main thread and channels /
1256	QemuSemaphore sem_sync;
1257	/ global number of generated multifd packets /
1258	uint64_t packet_num;
1259	} *multifd_recv_state;
1260
1261	static void multifd_recv_terminate_threads(Error *err)
1262	{
1263	int i;
1264
1265	trace_multifd_recv_terminate_threads(err != NULL);
1266
1267	if (err) {
1268	MigrationState *s = migrate_get_current();
1269	migrate_set_error(s, err);
1270	if (s->state == MIGRATION_STATUS_SETUP \|\|
1271	s->state == MIGRATION_STATUS_ACTIVE) {
1272	migrate_set_state(&s->state, s->state,
1273	MIGRATION_STATUS_FAILED);
1274	}
1275	}
1276
1277	for (i = `0`; i < migrate_multifd_channels(); i++) {
1278	MultiFDRecvParams *p = &multifd_recv_state->params[i];
1279
1280	qemu_mutex_lock(&p->mutex);
1281	p->quit = true;
1282	/ We could arrive here for two reasons:*
1283	- normal quit, i.e. everything went fine, just finished
1284	- error quit: We close the channels so the channel threads
1285	finish the qio_channel_read_all_eof() /*
1286	qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1287	qemu_mutex_unlock(&p->mutex);
1288	}
1289	}
1290
1291	int multifd_load_cleanup(Error **errp)
1292	{
1293	int i;
1294	int ret = `0`;
1295
1296	if (!migrate_use_multifd()) {
1297	return `0`;
1298	}
1299	multifd_recv_terminate_threads(NULL);
1300	for (i = `0`; i < migrate_multifd_channels(); i++) {
1301	MultiFDRecvParams *p = &multifd_recv_state->params[i];
1302
1303	if (p->running) {
1304	p->quit = true;
1305	/*
1306	* multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code,
1307	* however try to wakeup it without harm in cleanup phase.
1308	*/
1309	qemu_sem_post(&p->sem_sync);
1310	qemu_thread_join(&p->thread);
1311	}
1312	object_unref(OBJECT(p->c));
1313	p->c = NULL;
1314	qemu_mutex_destroy(&p->mutex);
1315	qemu_sem_destroy(&p->sem_sync);
1316	g_free(p->name);
1317	p->name = NULL;
1318	multifd_pages_clear(p->pages);
1319	p->pages = NULL;
1320	p->packet_len = `0`;
1321	g_free(p->packet);
1322	p->packet = NULL;
1323	}
1324	qemu_sem_destroy(&multifd_recv_state->sem_sync);
1325	g_free(multifd_recv_state->params);
1326	multifd_recv_state->params = NULL;
1327	g_free(multifd_recv_state);
1328	multifd_recv_state = NULL;
1329
1330	return ret;
1331	}
1332
1333	static void multifd_recv_sync_main(void)
1334	{
1335	int i;
1336
1337	if (!migrate_use_multifd()) {
1338	return;
1339	}
1340	for (i = `0`; i < migrate_multifd_channels(); i++) {
1341	MultiFDRecvParams *p = &multifd_recv_state->params[i];
1342
1343	trace_multifd_recv_sync_main_wait(p->id);
1344	qemu_sem_wait(&multifd_recv_state->sem_sync);
1345	}
1346	for (i = `0`; i < migrate_multifd_channels(); i++) {
1347	MultiFDRecvParams *p = &multifd_recv_state->params[i];
1348
1349	qemu_mutex_lock(&p->mutex);
1350	if (multifd_recv_state->packet_num < p->packet_num) {
1351	multifd_recv_state->packet_num = p->packet_num;
1352	}
1353	qemu_mutex_unlock(&p->mutex);
1354	trace_multifd_recv_sync_main_signal(p->id);
1355	qemu_sem_post(&p->sem_sync);
1356	}
1357	trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1358	}
1359
1360	static void multifd_recv_thread(void* *opaque)
1361	{
1362	MultiFDRecvParams *p = opaque;
1363	Error *local_err = NULL;
1364	int ret;
1365
1366	trace_multifd_recv_thread_start(p->id);
1367	rcu_register_thread();
1368
1369	while (true) {
1370	uint32_t used;
1371	uint32_t flags;
1372
1373	if (p->quit) {
1374	break;
1375	}
1376
1377	ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1378	p->packet_len, &local_err);
1379	if (ret == `0`) { / EOF /
1380	break;
1381	}
1382	if (ret == -`1`) { / Error /
1383	break;
1384	}
1385
1386	qemu_mutex_lock(&p->mutex);
1387	ret = multifd_recv_unfill_packet(p, &local_err);
1388	if (ret) {
1389	qemu_mutex_unlock(&p->mutex);
1390	break;
1391	}
1392
1393	used = p->pages->used;
1394	flags = p->flags;
1395	trace_multifd_recv(p->id, p->packet_num, used, flags,
1396	p->next_packet_size);
1397	p->num_packets++;
1398	p->num_pages += used;
1399	qemu_mutex_unlock(&p->mutex);
1400
1401	if (used) {
1402	ret = qio_channel_readv_all(p->c, p->pages->iov,
1403	used, &local_err);
1404	if (ret != `0`) {
1405	break;
1406	}
1407	}
1408
1409	if (flags & MULTIFD_FLAG_SYNC) {
1410	qemu_sem_post(&multifd_recv_state->sem_sync);
1411	qemu_sem_wait(&p->sem_sync);
1412	}
1413	}
1414
1415	if (local_err) {
1416	multifd_recv_terminate_threads(local_err);
1417	}
1418	qemu_mutex_lock(&p->mutex);
1419	p->running = false;
1420	qemu_mutex_unlock(&p->mutex);
1421
1422	rcu_unregister_thread();
1423	trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1424
1425	return NULL;
1426	}
1427
1428	int multifd_load_setup(void)
1429	{
1430	int thread_count;
1431	uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1432	uint8_t i;
1433
1434	if (!migrate_use_multifd()) {
1435	return `0`;
1436	}
1437	thread_count = migrate_multifd_channels();
1438	multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1439	multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1440	atomic_set(&multifd_recv_state->count, `0`);
1441	qemu_sem_init(&multifd_recv_state->sem_sync, `0`);
1442
1443	for (i = `0`; i < thread_count; i++) {
1444	MultiFDRecvParams *p = &multifd_recv_state->params[i];
1445
1446	qemu_mutex_init(&p->mutex);
1447	qemu_sem_init(&p->sem_sync, `0`);
1448	p->quit = false;
1449	p->id = i;
1450	p->pages = multifd_pages_init(page_count);
1451	p->packet_len = sizeof(MultiFDPacket_t)
1452	+ sizeof(ram_addr_t) * page_count;
1453	p->packet = g_malloc0(p->packet_len);
1454	p->name = g_strdup_printf("multifdrecv_%d", i);
1455	}
1456	return `0`;
1457	}
1458
1459	bool multifd_recv_all_channels_created(void)
1460	{
1461	int thread_count = migrate_multifd_channels();
1462
1463	if (!migrate_use_multifd()) {
1464	return true;
1465	}
1466
1467	return thread_count == atomic_read(&multifd_recv_state->count);
1468	}
1469
1470	/*
1471	* Try to receive all multifd channels to get ready for the migration.
1472	* - Return true and do not set @errp when correctly receving all channels;
1473	* - Return false and do not set @errp when correctly receiving the current one;
1474	* - Return false and set @errp when failing to receive the current channel.
1475	*/
1476	bool multifd_recv_new_channel(QIOChannel ioc, Error *errp)
1477	{
1478	MultiFDRecvParams *p;
1479	Error *local_err = NULL;
1480	int id;
1481
1482	id = multifd_recv_initial_packet(ioc, &local_err);
1483	if (id < `0`) {
1484	multifd_recv_terminate_threads(local_err);
1485	error_propagate_prepend(errp, local_err,
1486	"failed to receive packet"
1487	" via multifd channel %d: ",
1488	atomic_read(&multifd_recv_state->count));
1489	return false;
1490	}
1491	trace_multifd_recv_new_channel(id);
1492
1493	p = &multifd_recv_state->params[id];
1494	if (p->c != NULL) {
1495	error_setg(&local_err, "multifd: received id '%d' already setup'",
1496	id);
1497	multifd_recv_terminate_threads(local_err);
1498	error_propagate(errp, local_err);
1499	return false;
1500	}
1501	p->c = ioc;
1502	object_ref(OBJECT(ioc));
1503	/ initial packet /
1504	p->num_packets = `1`;
1505
1506	p->running = true;
1507	qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1508	QEMU_THREAD_JOINABLE);
1509	atomic_inc(&multifd_recv_state->count);
1510	return atomic_read(&multifd_recv_state->count) ==
1511	migrate_multifd_channels();
1512	}
1513
1514	/**
1515	* save_page_header: write page header to wire
1516	*
1517	* If this is the 1st block, it also writes the block identification
1518	*
1519	* Returns the number of bytes written
1520	*
1521	* @f: QEMUFile where to send the data
1522	* @block: block that contains the page we want to send
1523	* @offset: offset inside the block for the page
1524	* in the lower bits, it contains flags
1525	*/
1526	static size_t save_page_header(RAMState rs, QEMUFile f, RAMBlock *block,
1527	ram_addr_t offset)
1528	{
1529	size_t size, len;
1530
1531	if (block == rs->last_sent_block) {
1532	offset \|= RAM_SAVE_FLAG_CONTINUE;
1533	}
1534	qemu_put_be64(f, offset);
1535	size = `8`;
1536
1537	if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1538	len = strlen(block->idstr);
1539	qemu_put_byte(f, len);
1540	qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1541	size += `1` + len;
1542	rs->last_sent_block = block;
1543	}
1544	return size;
1545	}
1546
1547	/**
1548	* mig_throttle_guest_down: throotle down the guest
1549	*
1550	* Reduce amount of guest cpu execution to hopefully slow down memory
1551	* writes. If guest dirty memory rate is reduced below the rate at
1552	* which we can transfer pages to the destination then we should be
1553	* able to complete migration. Some workloads dirty memory way too
1554	* fast and will not effectively converge, even with auto-converge.
1555	*/
1556	static void mig_throttle_guest_down(void)
1557	{
1558	MigrationState *s = migrate_get_current();
1559	uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1560	uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1561	int pct_max = s->parameters.max_cpu_throttle;
1562
1563	/ We have not started throttling yet. Let's start it. /
1564	if (!cpu_throttle_active()) {
1565	cpu_throttle_set(pct_initial);
1566	} else {
1567	/ Throttling already on, just increase the rate /
1568	cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1569	pct_max));
1570	}
1571	}
1572
1573	/**
1574	* xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1575	*
1576	* @rs: current RAM state
1577	* @current_addr: address for the zero page
1578	*
1579	* Update the xbzrle cache to reflect a page that's been sent as all 0.
1580	* The important thing is that a stale (not-yet-0'd) page be replaced
1581	* by the new data.
1582	* As a bonus, if the page wasn't in the cache it gets added so that
1583	* when a small write is made into the 0'd page it gets XBZRLE sent.
1584	*/
1585	static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1586	{
1587	if (rs->ram_bulk_stage \|\| !migrate_use_xbzrle()) {
1588	return;
1589	}
1590
1591	/ We don't care if this fails to allocate a new cache page*
1592	* as long as it updated an old one */
1593	cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1594	ram_counters.dirty_sync_count);
1595	}
1596
1597	#define ENCODING_FLAG_XBZRLE 0x1
1598
1599	/**
1600	* save_xbzrle_page: compress and send current page
1601	*
1602	* Returns: 1 means that we wrote the page
1603	* 0 means that page is identical to the one already sent
1604	* -1 means that xbzrle would be longer than normal
1605	*
1606	* @rs: current RAM state
1607	* @current_data: pointer to the address of the page contents
1608	* @current_addr: addr of the page
1609	* @block: block that contains the page we want to send
1610	* @offset: offset inside the block for the page
1611	* @last_stage: if we are at the completion stage
1612	*/
1613	static int save_xbzrle_page(RAMState rs, uint8_t *current_data,
1614	ram_addr_t current_addr, RAMBlock *block,
1615	ram_addr_t offset, bool last_stage)
1616	{
1617	int encoded_len = `0`, bytes_xbzrle;
1618	uint8_t *prev_cached_page;
1619
1620	if (!cache_is_cached(XBZRLE.cache, current_addr,
1621	ram_counters.dirty_sync_count)) {
1622	xbzrle_counters.cache_miss++;
1623	if (!last_stage) {
1624	if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1625	ram_counters.dirty_sync_count) == -`1`) {
1626	return -`1`;
1627	} else {
1628	/ update current_data when the page has been
1629	inserted into cache /*
1630	*current_data = get_cached_data(XBZRLE.cache, current_addr);
1631	}
1632	}
1633	return -`1`;
1634	}
1635
1636	prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1637
1638	/ save current buffer into memory /
1639	memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1640
1641	/ XBZRLE encoding (if there is no overflow) /
1642	encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1643	TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1644	TARGET_PAGE_SIZE);
1645
1646	/*
1647	* Update the cache contents, so that it corresponds to the data
1648	* sent, in all cases except where we skip the page.
1649	*/
1650	if (!last_stage && encoded_len != `0`) {
1651	memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1652	/*
1653	* In the case where we couldn't compress, ensure that the caller
1654	* sends the data from the cache, since the guest might have
1655	* changed the RAM since we copied it.
1656	*/
1657	*current_data = prev_cached_page;
1658	}
1659
1660	if (encoded_len == `0`) {
1661	trace_save_xbzrle_page_skipping();
1662	return `0`;
1663	} else if (encoded_len == -`1`) {
1664	trace_save_xbzrle_page_overflow();
1665	xbzrle_counters.overflow++;
1666	return -`1`;
1667	}
1668
1669	/ Send XBZRLE based compressed page /
1670	bytes_xbzrle = save_page_header(rs, rs->f, block,
1671	offset \| RAM_SAVE_FLAG_XBZRLE);
1672	qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1673	qemu_put_be16(rs->f, encoded_len);
1674	qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1675	bytes_xbzrle += encoded_len + `1` + `2`;
1676	xbzrle_counters.pages++;
1677	xbzrle_counters.bytes += bytes_xbzrle;
1678	ram_counters.transferred += bytes_xbzrle;
1679
1680	return `1`;
1681	}
1682
1683	/**
1684	* migration_bitmap_find_dirty: find the next dirty page from start
1685	*
1686	* Returns the page offset within memory region of the start of a dirty page
1687	*
1688	* @rs: current RAM state
1689	* @rb: RAMBlock where to search for dirty pages
1690	* @start: page where we start the search
1691	*/
1692	static inline
1693	unsigned long migration_bitmap_find_dirty(RAMState rs, RAMBlock rb,
1694	unsigned long start)
1695	{
1696	unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1697	unsigned long *bitmap = rb->bmap;
1698	unsigned long next;
1699
1700	if (ramblock_is_ignored(rb)) {
1701	return size;
1702	}
1703
1704	/*
1705	* When the free page optimization is enabled, we need to check the bitmap
1706	* to send the non-free pages rather than all the pages in the bulk stage.
1707	*/
1708	if (!rs->fpo_enabled && rs->ram_bulk_stage && start > `0`) {
1709	next = start + `1`;
1710	} else {
1711	next = find_next_bit(bitmap, size, start);
1712	}
1713
1714	return next;
1715	}
1716
1717	static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1718	RAMBlock *rb,
1719	unsigned long page)
1720	{
1721	bool ret;
1722
1723	qemu_mutex_lock(&rs->bitmap_mutex);
1724
1725	/*
1726	* Clear dirty bitmap if needed. This _must_ be called before we
1727	* send any of the page in the chunk because we need to make sure
1728	* we can capture further page content changes when we sync dirty
1729	* log the next time. So as long as we are going to send any of
1730	* the page in the chunk we clear the remote dirty bitmap for all.
1731	* Clearing it earlier won't be a problem, but too late will.
1732	*/
1733	if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
1734	uint8_t shift = rb->clear_bmap_shift;
1735	hwaddr size = `1ULL` << (TARGET_PAGE_BITS + shift);
1736	hwaddr start = (page << TARGET_PAGE_BITS) & (-size);
1737
1738	/*
1739	* CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
1740	* can make things easier sometimes since then start address
1741	* of the small chunk will always be 64 pages aligned so the
1742	* bitmap will always be aligned to unsigned long. We should
1743	* even be able to remove this restriction but I'm simply
1744	* keeping it.
1745	*/
1746	assert(shift >= `6`);
1747	trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
1748	memory_region_clear_dirty_bitmap(rb->mr, start, size);
1749	}
1750
1751	ret = test_and_clear_bit(page, rb->bmap);
1752
1753	if (ret) {
1754	rs->migration_dirty_pages--;
1755	}
1756	qemu_mutex_unlock(&rs->bitmap_mutex);
1757
1758	return ret;
1759	}
1760
1761	/ Called with RCU critical section /
1762	static void ramblock_sync_dirty_bitmap(RAMState rs, RAMBlock rb)
1763	{
1764	rs->migration_dirty_pages +=
1765	cpu_physical_memory_sync_dirty_bitmap(rb, `0`, rb->used_length,
1766	&rs->num_dirty_pages_period);
1767	}
1768
1769	/**
1770	* ram_pagesize_summary: calculate all the pagesizes of a VM
1771	*
1772	* Returns a summary bitmap of the page sizes of all RAMBlocks
1773	*
1774	* For VMs with just normal pages this is equivalent to the host page
1775	* size. If it's got some huge pages then it's the OR of all the
1776	* different page sizes.
1777	*/
1778	uint64_t ram_pagesize_summary(void)
1779	{
1780	RAMBlock *block;
1781	uint64_t summary = `0`;
1782
1783	RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1784	summary \|= block->page_size;
1785	}
1786
1787	return summary;
1788	}
1789
1790	uint64_t ram_get_total_transferred_pages(void)
1791	{
1792	return ram_counters.normal + ram_counters.duplicate +
1793	compression_counters.pages + xbzrle_counters.pages;
1794	}
1795
1796	static void migration_update_rates(RAMState *rs, int64_t end_time)
1797	{
1798	uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1799	double compressed_size;
1800
1801	/ calculate period counters /
1802	ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * `1000`
1803	/ (end_time - rs->time_last_bitmap_sync);
1804
1805	if (!page_count) {
1806	return;
1807	}
1808
1809	if (migrate_use_xbzrle()) {
1810	xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1811	rs->xbzrle_cache_miss_prev) / page_count;
1812	rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1813	}
1814
1815	if (migrate_use_compression()) {
1816	compression_counters.busy_rate = (double)(compression_counters.busy -
1817	rs->compress_thread_busy_prev) / page_count;
1818	rs->compress_thread_busy_prev = compression_counters.busy;
1819
1820	compressed_size = compression_counters.compressed_size -
1821	rs->compressed_size_prev;
1822	if (compressed_size) {
1823	double uncompressed_size = (compression_counters.pages -
1824	rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1825
1826	/ Compression-Ratio = Uncompressed-size / Compressed-size /
1827	compression_counters.compression_rate =
1828	uncompressed_size / compressed_size;
1829
1830	rs->compress_pages_prev = compression_counters.pages;
1831	rs->compressed_size_prev = compression_counters.compressed_size;
1832	}
1833	}
1834	}
1835
1836	static void migration_bitmap_sync(RAMState *rs)
1837	{
1838	RAMBlock *block;
1839	int64_t end_time;
1840	uint64_t bytes_xfer_now;
1841
1842	ram_counters.dirty_sync_count++;
1843
1844	if (!rs->time_last_bitmap_sync) {
1845	rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1846	}
1847
1848	trace_migration_bitmap_sync_start();
1849	memory_global_dirty_log_sync();
1850
1851	qemu_mutex_lock(&rs->bitmap_mutex);
1852	rcu_read_lock();
1853	RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1854	ramblock_sync_dirty_bitmap(rs, block);
1855	}
1856	ram_counters.remaining = ram_bytes_remaining();
1857	rcu_read_unlock();
1858	qemu_mutex_unlock(&rs->bitmap_mutex);
1859
1860	memory_global_after_dirty_log_sync();
1861	trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1862
1863	end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1864
1865	/ more than 1 second = 1000 millisecons /
1866	if (end_time > rs->time_last_bitmap_sync + `1000`) {
1867	bytes_xfer_now = ram_counters.transferred;
1868
1869	/ During block migration the auto-converge logic incorrectly detects*
1870	* that ram migration makes no progress. Avoid this by disabling the
1871	* throttling logic during the bulk phase of block migration. */
1872	if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1873	/ The following detection logic can be refined later. For now:*
1874	Check to see if the dirtied bytes is 50% more than the approx.
1875	amount of bytes that just got transferred since the last time we
1876	were in this routine. If that happens twice, start or increase
1877	throttling /*
1878
1879	if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1880	(bytes_xfer_now - rs->bytes_xfer_prev) / `2`) &&
1881	(++rs->dirty_rate_high_cnt >= `2`)) {
1882	trace_migration_throttle();
1883	rs->dirty_rate_high_cnt = `0`;
1884	mig_throttle_guest_down();
1885	}
1886	}
1887
1888	migration_update_rates(rs, end_time);
1889
1890	rs->target_page_count_prev = rs->target_page_count;
1891
1892	/ reset period counters /
1893	rs->time_last_bitmap_sync = end_time;
1894	rs->num_dirty_pages_period = `0`;
1895	rs->bytes_xfer_prev = bytes_xfer_now;
1896	}
1897	if (migrate_use_events()) {
1898	qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1899	}
1900	}
1901
1902	static void migration_bitmap_sync_precopy(RAMState *rs)
1903	{
1904	Error *local_err = NULL;
1905
1906	/*
1907	* The current notifier usage is just an optimization to migration, so we
1908	* don't stop the normal migration process in the error case.
1909	*/
1910	if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1911	error_report_err(local_err);
1912	}
1913
1914	migration_bitmap_sync(rs);
1915
1916	if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1917	error_report_err(local_err);
1918	}
1919	}
1920
1921	/**
1922	* save_zero_page_to_file: send the zero page to the file
1923	*
1924	* Returns the size of data written to the file, 0 means the page is not
1925	* a zero page
1926	*
1927	* @rs: current RAM state
1928	* @file: the file where the data is saved
1929	* @block: block that contains the page we want to send
1930	* @offset: offset inside the block for the page
1931	*/
1932	static int save_zero_page_to_file(RAMState rs, QEMUFile file,
1933	RAMBlock *block, ram_addr_t offset)
1934	{
1935	uint8_t *p = block->host + offset;
1936	int len = `0`;
1937
1938	if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1939	len += save_page_header(rs, file, block, offset \| RAM_SAVE_FLAG_ZERO);
1940	qemu_put_byte(file, `0`);
1941	len += `1`;
1942	}
1943	return len;
1944	}
1945
1946	/**
1947	* save_zero_page: send the zero page to the stream
1948	*
1949	* Returns the number of pages written.
1950	*
1951	* @rs: current RAM state
1952	* @block: block that contains the page we want to send
1953	* @offset: offset inside the block for the page
1954	*/
1955	static int save_zero_page(RAMState rs, RAMBlock block, ram_addr_t offset)
1956	{
1957	int len = save_zero_page_to_file(rs, rs->f, block, offset);
1958
1959	if (len) {
1960	ram_counters.duplicate++;
1961	ram_counters.transferred += len;
1962	return `1`;
1963	}
1964	return -`1`;
1965	}
1966
1967	static void ram_release_pages(const char rbname, uint64_t offset, int* pages)
1968	{
1969	if (!migrate_release_ram() \|\| !migration_in_postcopy()) {
1970	return;
1971	}
1972
1973	ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1974	}
1975
1976	/*
1977	* @pages: the number of pages written by the control path,
1978	* < 0 - error
1979	* > 0 - number of pages written
1980	*
1981	* Return true if the pages has been saved, otherwise false is returned.
1982	*/
1983	static bool control_save_page(RAMState rs, RAMBlock block, ram_addr_t offset,
1984	int *pages)
1985	{
1986	uint64_t bytes_xmit = `0`;
1987	int ret;
1988
1989	*pages = -`1`;
1990	ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1991	&bytes_xmit);
1992	if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1993	return false;
1994	}
1995
1996	if (bytes_xmit) {
1997	ram_counters.transferred += bytes_xmit;
1998	*pages = `1`;
1999	}
2000
2001	if (ret == RAM_SAVE_CONTROL_DELAYED) {
2002	return true;
2003	}
2004
2005	if (bytes_xmit > `0`) {
2006	ram_counters.normal++;
2007	} else if (bytes_xmit == `0`) {
2008	ram_counters.duplicate++;
2009	}
2010
2011	return true;
2012	}
2013
2014	/*
2015	* directly send the page to the stream
2016	*
2017	* Returns the number of pages written.
2018	*
2019	* @rs: current RAM state
2020	* @block: block that contains the page we want to send
2021	* @offset: offset inside the block for the page
2022	* @buf: the page to be sent
2023	* @async: send to page asyncly
2024	*/
2025	static int save_normal_page(RAMState rs, RAMBlock block, ram_addr_t offset,
2026	uint8_t *buf, bool async)
2027	{
2028	ram_counters.transferred += save_page_header(rs, rs->f, block,
2029	offset \| RAM_SAVE_FLAG_PAGE);
2030	if (async) {
2031	qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
2032	migrate_release_ram() &
2033	migration_in_postcopy());
2034	} else {
2035	qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
2036	}
2037	ram_counters.transferred += TARGET_PAGE_SIZE;
2038	ram_counters.normal++;
2039	return `1`;
2040	}
2041
2042	/**
2043	* ram_save_page: send the given page to the stream
2044	*
2045	* Returns the number of pages written.
2046	* < 0 - error
2047	* >=0 - Number of pages written - this might legally be 0
2048	* if xbzrle noticed the page was the same.
2049	*
2050	* @rs: current RAM state
2051	* @block: block that contains the page we want to send
2052	* @offset: offset inside the block for the page
2053	* @last_stage: if we are at the completion stage
2054	*/
2055	static int ram_save_page(RAMState rs, PageSearchStatus pss, bool last_stage)
2056	{
2057	int pages = -`1`;
2058	uint8_t *p;
2059	bool send_async = true;
2060	RAMBlock *block = pss->block;
2061	ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2062	ram_addr_t current_addr = block->offset + offset;
2063
2064	p = block->host + offset;
2065	trace_ram_save_page(block->idstr, (uint64_t)offset, p);
2066
2067	XBZRLE_cache_lock();
2068	if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
2069	migrate_use_xbzrle()) {
2070	pages = save_xbzrle_page(rs, &p, current_addr, block,
2071	offset, last_stage);
2072	if (!last_stage) {
2073	/ Can't send this cached data async, since the cache page*
2074	* might get updated before it gets to the wire
2075	*/
2076	send_async = false;
2077	}
2078	}
2079
2080	/ XBZRLE overflow or normal page /
2081	if (pages == -`1`) {
2082	pages = save_normal_page(rs, block, offset, p, send_async);
2083	}
2084
2085	XBZRLE_cache_unlock();
2086
2087	return pages;
2088	}
2089
2090	static int ram_save_multifd_page(RAMState rs, RAMBlock block,
2091	ram_addr_t offset)
2092	{
2093	if (multifd_queue_page(rs, block, offset) < `0`) {
2094	return -`1`;
2095	}
2096	ram_counters.normal++;
2097
2098	return `1`;
2099	}
2100
2101	static bool do_compress_ram_page(QEMUFile f, z_stream stream, RAMBlock *block,
2102	ram_addr_t offset, uint8_t *source_buf)
2103	{
2104	RAMState *rs = ram_state;
2105	uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
2106	bool zero_page = false;
2107	int ret;
2108
2109	if (save_zero_page_to_file(rs, f, block, offset)) {
2110	zero_page = true;
2111	goto exit;
2112	}
2113
2114	save_page_header(rs, f, block, offset \| RAM_SAVE_FLAG_COMPRESS_PAGE);
2115
2116	/*
2117	* copy it to a internal buffer to avoid it being modified by VM
2118	* so that we can catch up the error during compression and
2119	* decompression
2120	*/
2121	memcpy(source_buf, p, TARGET_PAGE_SIZE);
2122	ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2123	if (ret < `0`) {
2124	qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2125	error_report("compressed data failed!");
2126	return false;
2127	}
2128
2129	exit:
2130	ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, `1`);
2131	return zero_page;
2132	}
2133
2134	static void
2135	update_compress_thread_counts(const CompressParam param, int* bytes_xmit)
2136	{
2137	ram_counters.transferred += bytes_xmit;
2138
2139	if (param->zero_page) {
2140	ram_counters.duplicate++;
2141	return;
2142	}
2143
2144	/ 8 means a header with RAM_SAVE_FLAG_CONTINUE. /
2145	compression_counters.compressed_size += bytes_xmit - `8`;
2146	compression_counters.pages++;
2147	}
2148
2149	static bool save_page_use_compression(RAMState *rs);
2150
2151	static void flush_compressed_data(RAMState *rs)
2152	{
2153	int idx, len, thread_count;
2154
2155	if (!save_page_use_compression(rs)) {
2156	return;
2157	}
2158	thread_count = migrate_compress_threads();
2159
2160	qemu_mutex_lock(&comp_done_lock);
2161	for (idx = `0`; idx < thread_count; idx++) {
2162	while (!comp_param[idx].done) {
2163	qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2164	}
2165	}
2166	qemu_mutex_unlock(&comp_done_lock);
2167
2168	for (idx = `0`; idx < thread_count; idx++) {
2169	qemu_mutex_lock(&comp_param[idx].mutex);
2170	if (!comp_param[idx].quit) {
2171	len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2172	/*
2173	* it's safe to fetch zero_page without holding comp_done_lock
2174	* as there is no further request submitted to the thread,
2175	* i.e, the thread should be waiting for a request at this point.
2176	*/
2177	update_compress_thread_counts(&comp_param[idx], len);
2178	}
2179	qemu_mutex_unlock(&comp_param[idx].mutex);
2180	}
2181	}
2182
2183	static inline void set_compress_params(CompressParam param, RAMBlock block,
2184	ram_addr_t offset)
2185	{
2186	param->block = block;
2187	param->offset = offset;
2188	}
2189
2190	static int compress_page_with_multi_thread(RAMState rs, RAMBlock block,
2191	ram_addr_t offset)
2192	{
2193	int idx, thread_count, bytes_xmit = -`1`, pages = -`1`;
2194	bool wait = migrate_compress_wait_thread();
2195
2196	thread_count = migrate_compress_threads();
2197	qemu_mutex_lock(&comp_done_lock);
2198	retry:
2199	for (idx = `0`; idx < thread_count; idx++) {
2200	if (comp_param[idx].done) {
2201	comp_param[idx].done = false;
2202	bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2203	qemu_mutex_lock(&comp_param[idx].mutex);
2204	set_compress_params(&comp_param[idx], block, offset);
2205	qemu_cond_signal(&comp_param[idx].cond);
2206	qemu_mutex_unlock(&comp_param[idx].mutex);
2207	pages = `1`;
2208	update_compress_thread_counts(&comp_param[idx], bytes_xmit);
2209	break;
2210	}
2211	}
2212
2213	/*
2214	* wait for the free thread if the user specifies 'compress-wait-thread',
2215	* otherwise we will post the page out in the main thread as normal page.
2216	*/
2217	if (pages < `0` && wait) {
2218	qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2219	goto retry;
2220	}
2221	qemu_mutex_unlock(&comp_done_lock);
2222
2223	return pages;
2224	}
2225
2226	/**
2227	* find_dirty_block: find the next dirty page and update any state
2228	* associated with the search process.
2229	*
2230	* Returns true if a page is found
2231	*
2232	* @rs: current RAM state
2233	* @pss: data about the state of the current dirty page scan
2234	* @again: set to false if the search has scanned the whole of RAM
2235	*/
2236	static bool find_dirty_block(RAMState rs, PageSearchStatus pss, bool *again)
2237	{
2238	pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2239	if (pss->complete_round && pss->block == rs->last_seen_block &&
2240	pss->page >= rs->last_page) {
2241	/*
2242	* We've been once around the RAM and haven't found anything.
2243	* Give up.
2244	*/
2245	*again = false;
2246	return false;
2247	}
2248	if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
2249	/ Didn't find anything in this RAM Block /
2250	pss->page = `0`;
2251	pss->block = QLIST_NEXT_RCU(pss->block, next);
2252	if (!pss->block) {
2253	/*
2254	* If memory migration starts over, we will meet a dirtied page
2255	* which may still exists in compression threads's ring, so we
2256	* should flush the compressed data to make sure the new page
2257	* is not overwritten by the old one in the destination.
2258	*
2259	* Also If xbzrle is on, stop using the data compression at this
2260	* point. In theory, xbzrle can do better than compression.
2261	*/
2262	flush_compressed_data(rs);
2263
2264	/ Hit the end of the list /
2265	pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2266	/ Flag that we've looped /
2267	pss->complete_round = true;
2268	rs->ram_bulk_stage = false;
2269	}
2270	/ Didn't find anything this time, but try again on the new block /
2271	*again = true;
2272	return false;
2273	} else {
2274	/ Can go around again, but... /
2275	*again = true;
2276	/ We've found something so probably don't need to /
2277	return true;
2278	}
2279	}
2280
2281	/**
2282	* unqueue_page: gets a page of the queue
2283	*
2284	* Helper for 'get_queued_page' - gets a page off the queue
2285	*
2286	* Returns the block of the page (or NULL if none available)
2287	*
2288	* @rs: current RAM state
2289	* @offset: used to return the offset within the RAMBlock
2290	*/
2291	static RAMBlock unqueue_page(RAMState rs, ram_addr_t *offset)
2292	{
2293	RAMBlock *block = NULL;
2294
2295	if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2296	return NULL;
2297	}
2298
2299	qemu_mutex_lock(&rs->src_page_req_mutex);
2300	if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2301	struct RAMSrcPageRequest *entry =
2302	QSIMPLEQ_FIRST(&rs->src_page_requests);
2303	block = entry->rb;
2304	*offset = entry->offset;
2305
2306	if (entry->len > TARGET_PAGE_SIZE) {
2307	entry->len -= TARGET_PAGE_SIZE;
2308	entry->offset += TARGET_PAGE_SIZE;
2309	} else {
2310	memory_region_unref(block->mr);
2311	QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2312	g_free(entry);
2313	migration_consume_urgent_request();
2314	}
2315	}
2316	qemu_mutex_unlock(&rs->src_page_req_mutex);
2317
2318	return block;
2319	}
2320
2321	/**
2322	* get_queued_page: unqueue a page from the postcopy requests
2323	*
2324	* Skips pages that are already sent (!dirty)
2325	*
2326	* Returns true if a queued page is found
2327	*
2328	* @rs: current RAM state
2329	* @pss: data about the state of the current dirty page scan
2330	*/
2331	static bool get_queued_page(RAMState rs, PageSearchStatus pss)
2332	{
2333	RAMBlock *block;
2334	ram_addr_t offset;
2335	bool dirty;
2336
2337	do {
2338	block = unqueue_page(rs, &offset);
2339	/*
2340	* We're sending this page, and since it's postcopy nothing else
2341	* will dirty it, and we must make sure it doesn't get sent again
2342	* even if this queue request was received after the background
2343	* search already sent it.
2344	*/
2345	if (block) {
2346	unsigned long page;
2347
2348	page = offset >> TARGET_PAGE_BITS;
2349	dirty = test_bit(page, block->bmap);
2350	if (!dirty) {
2351	trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2352	page, test_bit(page, block->unsentmap));
2353	} else {
2354	trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2355	}
2356	}
2357
2358	} while (block && !dirty);
2359
2360	if (block) {
2361	/*
2362	* As soon as we start servicing pages out of order, then we have
2363	* to kill the bulk stage, since the bulk stage assumes
2364	* in (migration_bitmap_find_and_reset_dirty) that every page is
2365	* dirty, that's no longer true.
2366	*/
2367	rs->ram_bulk_stage = false;
2368
2369	/*
2370	* We want the background search to continue from the queued page
2371	* since the guest is likely to want other pages near to the page
2372	* it just requested.
2373	*/
2374	pss->block = block;
2375	pss->page = offset >> TARGET_PAGE_BITS;
2376
2377	/*
2378	* This unqueued page would break the "one round" check, even is
2379	* really rare.
2380	*/
2381	pss->complete_round = false;
2382	}
2383
2384	return !!block;
2385	}
2386
2387	/**
2388	* migration_page_queue_free: drop any remaining pages in the ram
2389	* request queue
2390	*
2391	* It should be empty at the end anyway, but in error cases there may
2392	* be some left. in case that there is any page left, we drop it.
2393	*
2394	*/
2395	static void migration_page_queue_free(RAMState *rs)
2396	{
2397	struct RAMSrcPageRequest mspr, next_mspr;
2398	/ This queue generally should be empty - but in the case of a failed*
2399	* migration might have some droppings in.
2400	*/
2401	rcu_read_lock();
2402	QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2403	memory_region_unref(mspr->rb->mr);
2404	QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2405	g_free(mspr);
2406	}
2407	rcu_read_unlock();
2408	}
2409
2410	/**
2411	* ram_save_queue_pages: queue the page for transmission
2412	*
2413	* A request from postcopy destination for example.
2414	*
2415	* Returns zero on success or negative on error
2416	*
2417	* @rbname: Name of the RAMBLock of the request. NULL means the
2418	* same that last one.
2419	* @start: starting address from the start of the RAMBlock
2420	* @len: length (in bytes) to send
2421	*/
2422	int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2423	{
2424	RAMBlock *ramblock;
2425	RAMState *rs = ram_state;
2426
2427	ram_counters.postcopy_requests++;
2428	rcu_read_lock();
2429	if (!rbname) {
2430	/ Reuse last RAMBlock /
2431	ramblock = rs->last_req_rb;
2432
2433	if (!ramblock) {
2434	/*
2435	* Shouldn't happen, we can't reuse the last RAMBlock if
2436	* it's the 1st request.
2437	*/
2438	error_report("ram_save_queue_pages no previous block");
2439	goto err;
2440	}
2441	} else {
2442	ramblock = qemu_ram_block_by_name(rbname);
2443
2444	if (!ramblock) {
2445	/ We shouldn't be asked for a non-existent RAMBlock /
2446	error_report("ram_save_queue_pages no block '%s'", rbname);
2447	goto err;
2448	}
2449	rs->last_req_rb = ramblock;
2450	}
2451	trace_ram_save_queue_pages(ramblock->idstr, start, len);
2452	if (start+len > ramblock->used_length) {
2453	error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2454	RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2455	__func__, start, len, ramblock->used_length);
2456	goto err;
2457	}
2458
2459	struct RAMSrcPageRequest *new_entry =
2460	g_malloc0(sizeof(struct RAMSrcPageRequest));
2461	new_entry->rb = ramblock;
2462	new_entry->offset = start;
2463	new_entry->len = len;
2464
2465	memory_region_ref(ramblock->mr);
2466	qemu_mutex_lock(&rs->src_page_req_mutex);
2467	QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2468	migration_make_urgent_request();
2469	qemu_mutex_unlock(&rs->src_page_req_mutex);
2470	rcu_read_unlock();
2471
2472	return `0`;
2473
2474	err:
2475	rcu_read_unlock();
2476	return -`1`;
2477	}
2478
2479	static bool save_page_use_compression(RAMState *rs)
2480	{
2481	if (!migrate_use_compression()) {
2482	return false;
2483	}
2484
2485	/*
2486	* If xbzrle is on, stop using the data compression after first
2487	* round of migration even if compression is enabled. In theory,
2488	* xbzrle can do better than compression.
2489	*/
2490	if (rs->ram_bulk_stage \|\| !migrate_use_xbzrle()) {
2491	return true;
2492	}
2493
2494	return false;
2495	}
2496
2497	/*
2498	* try to compress the page before posting it out, return true if the page
2499	* has been properly handled by compression, otherwise needs other
2500	* paths to handle it
2501	*/
2502	static bool save_compress_page(RAMState rs, RAMBlock block, ram_addr_t offset)
2503	{
2504	if (!save_page_use_compression(rs)) {
2505	return false;
2506	}
2507
2508	/*
2509	* When starting the process of a new block, the first page of
2510	* the block should be sent out before other pages in the same
2511	* block, and all the pages in last block should have been sent
2512	* out, keeping this order is important, because the 'cont' flag
2513	* is used to avoid resending the block name.
2514	*
2515	* We post the fist page as normal page as compression will take
2516	* much CPU resource.
2517	*/
2518	if (block != rs->last_sent_block) {
2519	flush_compressed_data(rs);
2520	return false;
2521	}
2522
2523	if (compress_page_with_multi_thread(rs, block, offset) > `0`) {
2524	return true;
2525	}
2526
2527	compression_counters.busy++;
2528	return false;
2529	}
2530
2531	/**
2532	* ram_save_target_page: save one target page
2533	*
2534	* Returns the number of pages written
2535	*
2536	* @rs: current RAM state
2537	* @pss: data about the page we want to send
2538	* @last_stage: if we are at the completion stage
2539	*/
2540	static int ram_save_target_page(RAMState rs, PageSearchStatus pss,
2541	bool last_stage)
2542	{
2543	RAMBlock *block = pss->block;
2544	ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2545	int res;
2546
2547	if (control_save_page(rs, block, offset, &res)) {
2548	return res;
2549	}
2550
2551	if (save_compress_page(rs, block, offset)) {
2552	return `1`;
2553	}
2554
2555	res = save_zero_page(rs, block, offset);
2556	if (res > `0`) {
2557	/ Must let xbzrle know, otherwise a previous (now 0'd) cached*
2558	* page would be stale
2559	*/
2560	if (!save_page_use_compression(rs)) {
2561	XBZRLE_cache_lock();
2562	xbzrle_cache_zero_page(rs, block->offset + offset);
2563	XBZRLE_cache_unlock();
2564	}
2565	ram_release_pages(block->idstr, offset, res);
2566	return res;
2567	}
2568
2569	/*
2570	* do not use multifd for compression as the first page in the new
2571	* block should be posted out before sending the compressed page
2572	*/
2573	if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2574	return ram_save_multifd_page(rs, block, offset);
2575	}
2576
2577	return ram_save_page(rs, pss, last_stage);
2578	}
2579
2580	/**
2581	* ram_save_host_page: save a whole host page
2582	*
2583	* Starting at *offset send pages up to the end of the current host
2584	* page. It's valid for the initial offset to point into the middle of
2585	* a host page in which case the remainder of the hostpage is sent.
2586	* Only dirty target pages are sent. Note that the host page size may
2587	* be a huge page for this block.
2588	* The saving stops at the boundary of the used_length of the block
2589	* if the RAMBlock isn't a multiple of the host page size.
2590	*
2591	* Returns the number of pages written or negative on error
2592	*
2593	* @rs: current RAM state
2594	* @ms: current migration state
2595	* @pss: data about the page we want to send
2596	* @last_stage: if we are at the completion stage
2597	*/
2598	static int ram_save_host_page(RAMState rs, PageSearchStatus pss,
2599	bool last_stage)
2600	{
2601	int tmppages, pages = `0`;
2602	size_t pagesize_bits =
2603	qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2604
2605	if (ramblock_is_ignored(pss->block)) {
2606	error_report("block %s should not be migrated !", pss->block->idstr);
2607	return `0`;
2608	}
2609
2610	do {
2611	/ Check the pages is dirty and if it is send it /
2612	if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2613	pss->page++;
2614	continue;
2615	}
2616
2617	tmppages = ram_save_target_page(rs, pss, last_stage);
2618	if (tmppages < `0`) {
2619	return tmppages;
2620	}
2621
2622	pages += tmppages;
2623	if (pss->block->unsentmap) {
2624	clear_bit(pss->page, pss->block->unsentmap);
2625	}
2626
2627	pss->page++;
2628	} while ((pss->page & (pagesize_bits - `1`)) &&
2629	offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2630
2631	/ The offset we leave with is the last one we looked at /
2632	pss->page--;
2633	return pages;
2634	}
2635
2636	/**
2637	* ram_find_and_save_block: finds a dirty page and sends it to f
2638	*
2639	* Called within an RCU critical section.
2640	*
2641	* Returns the number of pages written where zero means no dirty pages,
2642	* or negative on error
2643	*
2644	* @rs: current RAM state
2645	* @last_stage: if we are at the completion stage
2646	*
2647	* On systems where host-page-size > target-page-size it will send all the
2648	* pages in a host page that are dirty.
2649	*/
2650
2651	static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2652	{
2653	PageSearchStatus pss;
2654	int pages = `0`;
2655	bool again, found;
2656
2657	/ No dirty page as there is zero RAM /
2658	if (!ram_bytes_total()) {
2659	return pages;
2660	}
2661
2662	pss.block = rs->last_seen_block;
2663	pss.page = rs->last_page;
2664	pss.complete_round = false;
2665
2666	if (!pss.block) {
2667	pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2668	}
2669
2670	do {
2671	again = true;
2672	found = get_queued_page(rs, &pss);
2673
2674	if (!found) {
2675	/ priority queue empty, so just search for something dirty /
2676	found = find_dirty_block(rs, &pss, &again);
2677	}
2678
2679	if (found) {
2680	pages = ram_save_host_page(rs, &pss, last_stage);
2681	}
2682	} while (!pages && again);
2683
2684	rs->last_seen_block = pss.block;
2685	rs->last_page = pss.page;
2686
2687	return pages;
2688	}
2689
2690	void acct_update_position(QEMUFile *f, size_t size, bool zero)
2691	{
2692	uint64_t pages = size / TARGET_PAGE_SIZE;
2693
2694	if (zero) {
2695	ram_counters.duplicate += pages;
2696	} else {
2697	ram_counters.normal += pages;
2698	ram_counters.transferred += size;
2699	qemu_update_position(f, size);
2700	}
2701	}
2702
2703	static uint64_t ram_bytes_total_common(bool count_ignored)
2704	{
2705	RAMBlock *block;
2706	uint64_t total = `0`;
2707
2708	rcu_read_lock();
2709	if (count_ignored) {
2710	RAMBLOCK_FOREACH_MIGRATABLE(block) {
2711	total += block->used_length;
2712	}
2713	} else {
2714	RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2715	total += block->used_length;
2716	}
2717	}
2718	rcu_read_unlock();
2719	return total;
2720	}
2721
/*
 * ram_bytes_total: total used length, in bytes, of the RAM blocks that are
 * not ignored for migration.
 */
uint64_t ram_bytes_total(void)
{
    const bool count_ignored = false;

    return ram_bytes_total_common(count_ignored);
}
2726
2727	static void xbzrle_load_setup(void)
2728	{
2729	XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2730	}
2731
2732	static void xbzrle_load_cleanup(void)
2733	{
2734	g_free(XBZRLE.decoded_buf);
2735	XBZRLE.decoded_buf = NULL;
2736	}
2737
2738	static void ram_state_cleanup(RAMState **rsp)
2739	{
2740	if (*rsp) {
2741	migration_page_queue_free(*rsp);
2742	qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2743	qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2744	g_free(*rsp);
2745	*rsp = NULL;
2746	}
2747	}
2748
2749	static void xbzrle_cleanup(void)
2750	{
2751	XBZRLE_cache_lock();
2752	if (XBZRLE.cache) {
2753	cache_fini(XBZRLE.cache);
2754	g_free(XBZRLE.encoded_buf);
2755	g_free(XBZRLE.current_buf);
2756	g_free(XBZRLE.zero_target_page);
2757	XBZRLE.cache = NULL;
2758	XBZRLE.encoded_buf = NULL;
2759	XBZRLE.current_buf = NULL;
2760	XBZRLE.zero_target_page = NULL;
2761	}
2762	XBZRLE_cache_unlock();
2763	}
2764
2765	static void ram_save_cleanup(void *opaque)
2766	{
2767	RAMState **rsp = opaque;
2768	RAMBlock *block;
2769
2770	/ caller have hold iothread lock or is in a bh, so there is*
2771	* no writing race against the migration bitmap
2772	*/
2773	memory_global_dirty_log_stop();
2774
2775	RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2776	g_free(block->clear_bmap);
2777	block->clear_bmap = NULL;
2778	g_free(block->bmap);
2779	block->bmap = NULL;
2780	g_free(block->unsentmap);
2781	block->unsentmap = NULL;
2782	}
2783
2784	xbzrle_cleanup();
2785	compress_threads_save_cleanup();
2786	ram_state_cleanup(rsp);
2787	}
2788
2789	static void ram_state_reset(RAMState *rs)
2790	{
2791	rs->last_seen_block = NULL;
2792	rs->last_sent_block = NULL;
2793	rs->last_page = `0`;
2794	rs->last_version = ram_list.version;
2795	rs->ram_bulk_stage = true;
2796	rs->fpo_enabled = false;
2797	}
2798
/* ms, half buffered_file limit */
#define MAX_WAIT 50
2800
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr, 128 bits per line,
 * as '1' / '.' characters prefixed by the index of the line's first bit.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 *
 * @todump: bitmap to dump; it is handed straight to test_bit(), so it has
 *          to be a valid bitmap of at least @pages bits (NULL is not
 *          handled here)
 * @expected: the common bit value; lines consisting only of it are skipped
 * @pages: number of bits to examine
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2834
/* **** functions for postcopy ***** */
2836
2837	void ram_postcopy_migrated_memory_release(MigrationState *ms)
2838	{
2839	struct RAMBlock *block;
2840
2841	RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2842	unsigned long *bitmap = block->bmap;
2843	unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2844	unsigned long run_start = find_next_zero_bit(bitmap, range, `0`);
2845
2846	while (run_start < range) {
2847	unsigned long run_end = find_next_bit(bitmap, range, run_start + `1`);
2848	ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2849	(run_end - run_start) << TARGET_PAGE_BITS);
2850	run_start = find_next_zero_bit(bitmap, range, run_end + `1`);
2851	}
2852	}
2853	}
2854
2855	/**
2856	* postcopy_send_discard_bm_ram: discard a RAMBlock
2857	*
2858	* Returns zero on success
2859	*
2860	* Callback from postcopy_each_ram_send_discard for each RAMBlock
2861	* Note: At this point the 'unsentmap' is the processed bitmap combined
2862	* with the dirtymap; so a '1' means it's either dirty or unsent.
2863	*
2864	* @ms: current migration state
2865	* @block: RAMBlock to discard
2866	*/
2867	static int postcopy_send_discard_bm_ram(MigrationState ms, RAMBlock block)
2868	{
2869	unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2870	unsigned long current;
2871	unsigned long *unsentmap = block->unsentmap;
2872
2873	for (current = `0`; current < end; ) {
2874	unsigned long one = find_next_bit(unsentmap, end, current);
2875	unsigned long zero, discard_length;
2876
2877	if (one >= end) {
2878	break;
2879	}
2880
2881	zero = find_next_zero_bit(unsentmap, end, one + `1`);
2882
2883	if (zero >= end) {
2884	discard_length = end - one;
2885	} else {
2886	discard_length = zero - one;
2887	}
2888	postcopy_discard_send_range(ms, one, discard_length);
2889	current = one + discard_length;
2890	}
2891
2892	return `0`;
2893	}
2894
2895	/**
2896	* postcopy_each_ram_send_discard: discard all RAMBlocks
2897	*
2898	* Returns 0 for success or negative for error
2899	*
2900	* Utility for the outgoing postcopy code.
2901	* Calls postcopy_send_discard_bm_ram for each RAMBlock
2902	* passing it bitmap indexes and name.
2903	* (qemu_ram_foreach_block ends up passing unscaled lengths
2904	* which would mean postcopy code would have to deal with target page)
2905	*
2906	* @ms: current migration state
2907	*/
2908	static int postcopy_each_ram_send_discard(MigrationState *ms)
2909	{
2910	struct RAMBlock *block;
2911	int ret;
2912
2913	RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2914	postcopy_discard_send_init(ms, block->idstr);
2915
2916	/*
2917	* Postcopy sends chunks of bitmap over the wire, but it
2918	* just needs indexes at this point, avoids it having
2919	* target page specific code.
2920	*/
2921	ret = postcopy_send_discard_bm_ram(ms, block);
2922	postcopy_discard_send_finish(ms);
2923	if (ret) {
2924	return ret;
2925	}
2926	}
2927
2928	return `0`;
2929	}
2930
2931	/**
2932	* postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
2933	*
2934	* Helper for postcopy_chunk_hostpages; it's called twice to
2935	* canonicalize the two bitmaps, that are similar, but one is
2936	* inverted.
2937	*
2938	* Postcopy requires that all target pages in a hostpage are dirty or
2939	* clean, not a mix. This function canonicalizes the bitmaps.
2940	*
2941	* @ms: current migration state
2942	* @unsent_pass: if true we need to canonicalize partially unsent host pages
2943	* otherwise we need to canonicalize partially dirty host pages
2944	* @block: block that contains the page we want to canonicalize
2945	*/
2946	static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2947	RAMBlock *block)
2948	{
2949	RAMState *rs = ram_state;
2950	unsigned long *bitmap = block->bmap;
2951	unsigned long *unsentmap = block->unsentmap;
2952	unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2953	unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2954	unsigned long run_start;
2955
2956	if (block->page_size == TARGET_PAGE_SIZE) {
2957	/ Easy case - TPS==HPS for a non-huge page RAMBlock /
2958	return;
2959	}
2960
2961	if (unsent_pass) {
2962	/ Find a sent page /
2963	run_start = find_next_zero_bit(unsentmap, pages, `0`);
2964	} else {
2965	/ Find a dirty page /
2966	run_start = find_next_bit(bitmap, pages, `0`);
2967	}
2968
2969	while (run_start < pages) {
2970
2971	/*
2972	* If the start of this run of pages is in the middle of a host
2973	* page, then we need to fixup this host page.
2974	*/
2975	if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2976	/ Find the end of this run /
2977	if (unsent_pass) {
2978	run_start = find_next_bit(unsentmap, pages, run_start + `1`);
2979	} else {
2980	run_start = find_next_zero_bit(bitmap, pages, run_start + `1`);
2981	}
2982	/*
2983	* If the end isn't at the start of a host page, then the
2984	* run doesn't finish at the end of a host page
2985	* and we need to discard.
2986	*/
2987	}
2988
2989	if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2990	unsigned long page;
2991	unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2992	host_ratio);
2993	run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2994
2995	/ Tell the destination to discard this page /
2996	if (unsent_pass \|\| !test_bit(fixup_start_addr, unsentmap)) {
2997	/ For the unsent_pass we:*
2998	* discard partially sent pages
2999	* For the !unsent_pass (dirty) we:
3000	* discard partially dirty pages that were sent
3001	* (any partially sent pages were already discarded
3002	* by the previous unsent_pass)
3003	*/
3004	postcopy_discard_send_range(ms, fixup_start_addr, host_ratio);
3005	}
3006
3007	/ Clean up the bitmap /
3008	for (page = fixup_start_addr;
3009	page < fixup_start_addr + host_ratio; page++) {
3010	/ All pages in this host page are now not sent /
3011	set_bit(page, unsentmap);
3012
3013	/*
3014	* Remark them as dirty, updating the count for any pages
3015	* that weren't previously dirty.
3016	*/
3017	rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
3018	}
3019	}
3020
3021	if (unsent_pass) {
3022	/ Find the next sent page for the next iteration /
3023	run_start = find_next_zero_bit(unsentmap, pages, run_start);
3024	} else {
3025	/ Find the next dirty page for the next iteration /
3026	run_start = find_next_bit(bitmap, pages, run_start);
3027	}
3028	}
3029	}
3030
3031	/**
3032	* postcopy_chunk_hostpages: discard any partially sent host page
3033	*
3034	* Utility for the outgoing postcopy code.
3035	*
3036	* Discard any partially sent host-page size chunks, mark any partially
3037	* dirty host-page size chunks as all dirty. In this case the host-page
3038	* is the host-page for the particular RAMBlock, i.e. it might be a huge page
3039	*
3040	* Returns zero on success
3041	*
3042	* @ms: current migration state
3043	* @block: block we want to work with
3044	*/
3045	static int postcopy_chunk_hostpages(MigrationState ms, RAMBlock block)
3046	{
3047	postcopy_discard_send_init(ms, block->idstr);
3048
3049	/ First pass: Discard all partially sent host pages /
3050	postcopy_chunk_hostpages_pass(ms, true, block);
3051	/*
3052	* Second pass: Ensure that all partially dirty host pages are made
3053	* fully dirty.
3054	*/
3055	postcopy_chunk_hostpages_pass(ms, false, block);
3056
3057	postcopy_discard_send_finish(ms);
3058	return `0`;
3059	}
3060
3061	/**
3062	* ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3063	*
3064	* Returns zero on success
3065	*
3066	* Transmit the set of pages to be discarded after precopy to the target
3067	* these are pages that:
3068	* a) Have been previously transmitted but are now dirty again
3069	* b) Pages that have never been transmitted, this ensures that
3070	* any pages on the destination that have been mapped by background
3071	* tasks get discarded (transparent huge pages is the specific concern)
3072	* Hopefully this is pretty sparse
3073	*
3074	* @ms: current migration state
3075	*/
3076	int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3077	{
3078	RAMState *rs = ram_state;
3079	RAMBlock *block;
3080	int ret;
3081
3082	rcu_read_lock();
3083
3084	/ This should be our last sync, the src is now paused /
3085	migration_bitmap_sync(rs);
3086
3087	/ Easiest way to make sure we don't resume in the middle of a host-page /
3088	rs->last_seen_block = NULL;
3089	rs->last_sent_block = NULL;
3090	rs->last_page = `0`;
3091
3092	RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3093	unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
3094	unsigned long *bitmap = block->bmap;
3095	unsigned long *unsentmap = block->unsentmap;
3096
3097	if (!unsentmap) {
3098	/ We don't have a safe way to resize the sentmap, so*
3099	* if the bitmap was resized it will be NULL at this
3100	* point.
3101	*/
3102	error_report("migration ram resized during precopy phase");
3103	rcu_read_unlock();
3104	return -EINVAL;
3105	}
3106	/ Deal with TPS != HPS and huge pages /
3107	ret = postcopy_chunk_hostpages(ms, block);
3108	if (ret) {
3109	rcu_read_unlock();
3110	return ret;
3111	}
3112
3113	/*
3114	* Update the unsentmap to be unsentmap = unsentmap \| dirty
3115	*/
3116	bitmap_or(unsentmap, unsentmap, bitmap, pages);
3117	#ifdef DEBUG_POSTCOPY
3118	ram_debug_dump_bitmap(unsentmap, true, pages);
3119	#endif
3120	}
3121	trace_ram_postcopy_send_discard_bitmap();
3122
3123	ret = postcopy_each_ram_send_discard(ms);
3124	rcu_read_unlock();
3125
3126	return ret;
3127	}
3128
3129	/**
3130	* ram_discard_range: discard dirtied pages at the beginning of postcopy
3131	*
3132	* Returns zero on success
3133	*
3134	* @rbname: name of the RAMBlock of the request. NULL means the
3135	* same that last one.
3136	* @start: RAMBlock starting page
3137	* @length: RAMBlock size
3138	*/
3139	int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3140	{
3141	int ret = -`1`;
3142
3143	trace_ram_discard_range(rbname, start, length);
3144
3145	rcu_read_lock();
3146	RAMBlock *rb = qemu_ram_block_by_name(rbname);
3147
3148	if (!rb) {
3149	error_report("ram_discard_range: Failed to find block '%s'", rbname);
3150	goto err;
3151	}
3152
3153	/*
3154	* On source VM, we don't need to update the received bitmap since
3155	* we don't even have one.
3156	*/
3157	if (rb->receivedmap) {
3158	bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3159	length >> qemu_target_page_bits());
3160	}
3161
3162	ret = ram_block_discard_range(rb, start, length);
3163
3164	err:
3165	rcu_read_unlock();
3166
3167	return ret;
3168	}
3169
3170	/*
3171	* For every allocation, we will try not to crash the VM if the
3172	* allocation failed.
3173	*/
3174	static int xbzrle_init(void)
3175	{
3176	Error *local_err = NULL;
3177
3178	if (!migrate_use_xbzrle()) {
3179	return `0`;
3180	}
3181
3182	XBZRLE_cache_lock();
3183
3184	XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3185	if (!XBZRLE.zero_target_page) {
3186	error_report("%s: Error allocating zero page", __func__);
3187	goto err_out;
3188	}
3189
3190	XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3191	TARGET_PAGE_SIZE, &local_err);
3192	if (!XBZRLE.cache) {
3193	error_report_err(local_err);
3194	goto free_zero_page;
3195	}
3196
3197	XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3198	if (!XBZRLE.encoded_buf) {
3199	error_report("%s: Error allocating encoded_buf", __func__);
3200	goto free_cache;
3201	}
3202
3203	XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3204	if (!XBZRLE.current_buf) {
3205	error_report("%s: Error allocating current_buf", __func__);
3206	goto free_encoded_buf;
3207	}
3208
3209	/ We are all good /
3210	XBZRLE_cache_unlock();
3211	return `0`;
3212
3213	free_encoded_buf:
3214	g_free(XBZRLE.encoded_buf);
3215	XBZRLE.encoded_buf = NULL;
3216	free_cache:
3217	cache_fini(XBZRLE.cache);
3218	XBZRLE.cache = NULL;
3219	free_zero_page:
3220	g_free(XBZRLE.zero_target_page);
3221	XBZRLE.zero_target_page = NULL;
3222	err_out:
3223	XBZRLE_cache_unlock();
3224	return -ENOMEM;
3225	}
3226
3227	static int ram_state_init(RAMState **rsp)
3228	{
3229	*rsp = g_try_new0(RAMState, `1`);
3230
3231	if (!*rsp) {
3232	error_report("%s: Init ramstate fail", __func__);
3233	return -`1`;
3234	}
3235
3236	qemu_mutex_init(&(*rsp)->bitmap_mutex);
3237	qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3238	QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3239
3240	/*
3241	* Count the total number of pages used by ram blocks not including any
3242	* gaps due to alignment or unplugs.
3243	* This must match with the initial values of dirty bitmap.
3244	*/
3245	(*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3246	ram_state_reset(*rsp);
3247
3248	return `0`;
3249	}
3250
3251	static void ram_list_init_bitmaps(void)
3252	{
3253	MigrationState *ms = migrate_get_current();
3254	RAMBlock *block;
3255	unsigned long pages;
3256	uint8_t shift;
3257
3258	/ Skip setting bitmap if there is no RAM /
3259	if (ram_bytes_total()) {
3260	shift = ms->clear_bitmap_shift;
3261	if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3262	error_report("clear_bitmap_shift (%u) too big, using "
3263	"max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3264	shift = CLEAR_BITMAP_SHIFT_MAX;
3265	} else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3266	error_report("clear_bitmap_shift (%u) too small, using "
3267	"min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3268	shift = CLEAR_BITMAP_SHIFT_MIN;
3269	}
3270
3271	RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3272	pages = block->max_length >> TARGET_PAGE_BITS;
3273	/*
3274	* The initial dirty bitmap for migration must be set with all
3275	* ones to make sure we'll migrate every guest RAM page to
3276	* destination.
3277	* Here we set RAMBlock.bmap all to 1 because when rebegin a
3278	* new migration after a failed migration, ram_list.
3279	* dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3280	* guest memory.
3281	*/
3282	block->bmap = bitmap_new(pages);
3283	bitmap_set(block->bmap, `0`, pages);
3284	block->clear_bmap_shift = shift;
3285	block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3286	if (migrate_postcopy_ram()) {
3287	block->unsentmap = bitmap_new(pages);
3288	bitmap_set(block->unsentmap, `0`, pages);
3289	}
3290	}
3291	}
3292	}
3293
3294	static void ram_init_bitmaps(RAMState *rs)
3295	{
3296	/ For memory_global_dirty_log_start below. /
3297	qemu_mutex_lock_iothread();
3298	qemu_mutex_lock_ramlist();
3299	rcu_read_lock();
3300
3301	ram_list_init_bitmaps();
3302	memory_global_dirty_log_start();
3303	migration_bitmap_sync_precopy(rs);
3304
3305	rcu_read_unlock();
3306	qemu_mutex_unlock_ramlist();
3307	qemu_mutex_unlock_iothread();
3308	}
3309
3310	static int ram_init_all(RAMState **rsp)
3311	{
3312	if (ram_state_init(rsp)) {
3313	return -`1`;
3314	}
3315
3316	if (xbzrle_init()) {
3317	ram_state_cleanup(rsp);
3318	return -`1`;
3319	}
3320
3321	ram_init_bitmaps(*rsp);
3322
3323	return `0`;
3324	}
3325
3326	static void ram_state_resume_prepare(RAMState rs, QEMUFile out)
3327	{
3328	RAMBlock *block;
3329	uint64_t pages = `0`;
3330
3331	/*
3332	* Postcopy is not using xbzrle/compression, so no need for that.
3333	* Also, since source are already halted, we don't need to care
3334	* about dirty page logging as well.
3335	*/
3336
3337	RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3338	pages += bitmap_count_one(block->bmap,
3339	block->used_length >> TARGET_PAGE_BITS);
3340	}
3341
3342	/ This may not be aligned with current bitmaps. Recalculate. /
3343	rs->migration_dirty_pages = pages;
3344
3345	rs->last_seen_block = NULL;
3346	rs->last_sent_block = NULL;
3347	rs->last_page = `0`;
3348	rs->last_version = ram_list.version;
3349	/*
3350	* Disable the bulk stage, otherwise we'll resend the whole RAM no
3351	* matter what we have sent.
3352	*/
3353	rs->ram_bulk_stage = false;
3354
3355	/ Update RAMState cache of output QEMUFile /
3356	rs->f = out;
3357
3358	trace_ram_state_resume_prepare(pages);
3359	}
3360
3361	/*
3362	* This function clears bits of the free pages reported by the caller from the
3363	* migration dirty bitmap. @addr is the host address corresponding to the
3364	* start of the continuous guest free pages, and @len is the total bytes of
3365	* those pages.
3366	*/
3367	void qemu_guest_free_page_hint(void *addr, size_t len)
3368	{
3369	RAMBlock *block;
3370	ram_addr_t offset;
3371	size_t used_len, start, npages;
3372	MigrationState *s = migrate_get_current();
3373
3374	/ This function is currently expected to be used during live migration /
3375	if (!migration_is_setup_or_active(s->state)) {
3376	return;
3377	}
3378
3379	for (; len > `0`; len -= used_len, addr += used_len) {
3380	block = qemu_ram_block_from_host(addr, false, &offset);
3381	if (unlikely(!block \|\| offset >= block->used_length)) {
3382	/*
3383	* The implementation might not support RAMBlock resize during
3384	* live migration, but it could happen in theory with future
3385	* updates. So we add a check here to capture that case.
3386	*/
3387	error_report_once("%s unexpected error", __func__);
3388	return;
3389	}
3390
3391	if (len <= block->used_length - offset) {
3392	used_len = len;
3393	} else {
3394	used_len = block->used_length - offset;
3395	}
3396
3397	start = offset >> TARGET_PAGE_BITS;
3398	npages = used_len >> TARGET_PAGE_BITS;
3399
3400	qemu_mutex_lock(&ram_state->bitmap_mutex);
3401	ram_state->migration_dirty_pages -=
3402	bitmap_count_one_with_offset(block->bmap, start, npages);
3403	bitmap_clear(block->bmap, start, npages);
3404	qemu_mutex_unlock(&ram_state->bitmap_mutex);
3405	}
3406	}
3407
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
3414
3415	/**
3416	* ram_save_setup: Setup RAM for migration
3417	*
3418	* Returns zero to indicate success and negative for error
3419	*
3420	* @f: QEMUFile where to send the data
3421	* @opaque: RAMState pointer
3422	*/
3423	static int ram_save_setup(QEMUFile f, void* *opaque)
3424	{
3425	RAMState **rsp = opaque;
3426	RAMBlock *block;
3427
3428	if (compress_threads_save_setup()) {
3429	return -`1`;
3430	}
3431
3432	/ migration has already setup the bitmap, reuse it. /
3433	if (!migration_in_colo_state()) {
3434	if (ram_init_all(rsp) != `0`) {
3435	compress_threads_save_cleanup();
3436	return -`1`;
3437	}
3438	}
3439	(*rsp)->f = f;
3440
3441	rcu_read_lock();
3442
3443	qemu_put_be64(f, ram_bytes_total_common(true) \| RAM_SAVE_FLAG_MEM_SIZE);
3444
3445	RAMBLOCK_FOREACH_MIGRATABLE(block) {
3446	qemu_put_byte(f, strlen(block->idstr));
3447	qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3448	qemu_put_be64(f, block->used_length);
3449	if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3450	qemu_put_be64(f, block->page_size);
3451	}
3452	if (migrate_ignore_shared()) {
3453	qemu_put_be64(f, block->mr->addr);
3454	}
3455	}
3456
3457	rcu_read_unlock();
3458
3459	ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3460	ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3461
3462	multifd_send_sync_main(*rsp);
3463	qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3464	qemu_fflush(f);
3465
3466	return `0`;
3467	}
3468
3469	/**
3470	* ram_save_iterate: iterative stage for migration
3471	*
3472	* Returns zero to indicate success and negative for error
3473	*
3474	* @f: QEMUFile where to send the data
3475	* @opaque: RAMState pointer
3476	*/
3477	static int ram_save_iterate(QEMUFile f, void* *opaque)
3478	{
3479	RAMState **temp = opaque;
3480	RAMState rs = temp;
3481	int ret;
3482	int i;
3483	int64_t t0;
3484	int done = `0`;
3485
3486	if (blk_mig_bulk_active()) {
3487	/ Avoid transferring ram during bulk phase of block migration as*
3488	* the bulk phase will usually take a long time and transferring
3489	* ram updates during that time is pointless. */
3490	goto out;
3491	}
3492
3493	rcu_read_lock();
3494	if (ram_list.version != rs->last_version) {
3495	ram_state_reset(rs);
3496	}
3497
3498	/ Read version before ram_list.blocks /
3499	smp_rmb();
3500
3501	ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3502
3503	t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3504	i = `0`;
3505	while ((ret = qemu_file_rate_limit(f)) == `0` \|\|
3506	!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3507	int pages;
3508
3509	if (qemu_file_get_error(f)) {
3510	break;
3511	}
3512
3513	pages = ram_find_and_save_block(rs, false);
3514	/ no more pages to sent /
3515	if (pages == `0`) {
3516	done = `1`;
3517	break;
3518	}
3519
3520	if (pages < `0`) {
3521	qemu_file_set_error(f, pages);
3522	break;
3523	}
3524
3525	rs->target_page_count += pages;
3526
3527	/ we want to check in the 1st loop, just in case it was the 1st time*
3528	and we had to sync the dirty bitmap.
3529	qemu_clock_get_ns() is a bit expensive, so we only check each some
3530	iterations
3531	*/
3532	if ((i & `63`) == `0`) {
3533	uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / `1000000`;
3534	if (t1 > MAX_WAIT) {
3535	trace_ram_save_iterate_big_wait(t1, i);
3536	break;
3537	}
3538	}
3539	i++;
3540	}
3541	rcu_read_unlock();
3542
3543	/*
3544	* Must occur before EOS (or any QEMUFile operation)
3545	* because of RDMA protocol.
3546	*/
3547	ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3548
3549	out:
3550	multifd_send_sync_main(rs);
3551	qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3552	qemu_fflush(f);
3553	ram_counters.transferred += `8`;
3554
3555	ret = qemu_file_get_error(f);
3556	if (ret < `0`) {
3557	return ret;
3558	}
3559
3560	return done;
3561	}
3562
3563	/**
3564	* ram_save_complete: function called to send the remaining amount of ram
3565	*
3566	* Returns zero to indicate success or negative on error
3567	*
3568	* Called with iothread lock
3569	*
3570	* @f: QEMUFile where to send the data
3571	* @opaque: RAMState pointer
3572	*/
3573	static int ram_save_complete(QEMUFile f, void* *opaque)
3574	{
3575	RAMState **temp = opaque;
3576	RAMState rs = temp;
3577	int ret = `0`;
3578
3579	rcu_read_lock();
3580
3581	if (!migration_in_postcopy()) {
3582	migration_bitmap_sync_precopy(rs);
3583	}
3584
3585	ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3586
3587	/ try transferring iterative blocks of memory /
3588
3589	/ flush all remaining blocks regardless of rate limiting /
3590	while (true) {
3591	int pages;
3592
3593	pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3594	/ no more blocks to sent /
3595	if (pages == `0`) {
3596	break;
3597	}
3598	if (pages < `0`) {
3599	ret = pages;
3600	break;
3601	}
3602	}
3603
3604	flush_compressed_data(rs);
3605	ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3606
3607	rcu_read_unlock();
3608
3609	multifd_send_sync_main(rs);
3610	qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3611	qemu_fflush(f);
3612
3613	return ret;
3614	}
3615
3616	static void ram_save_pending(QEMUFile f, void* *opaque, uint64_t max_size,
3617	uint64_t *res_precopy_only,
3618	uint64_t *res_compatible,
3619	uint64_t *res_postcopy_only)
3620	{
3621	RAMState **temp = opaque;
3622	RAMState rs = temp;
3623	uint64_t remaining_size;
3624
3625	remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3626
3627	if (!migration_in_postcopy() &&
3628	remaining_size < max_size) {
3629	qemu_mutex_lock_iothread();
3630	rcu_read_lock();
3631	migration_bitmap_sync_precopy(rs);
3632	rcu_read_unlock();
3633	qemu_mutex_unlock_iothread();
3634	remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3635	}
3636
3637	if (migrate_postcopy_ram()) {
3638	/ We can do postcopy, and all the data is postcopiable /
3639	*res_compatible += remaining_size;
3640	} else {
3641	*res_precopy_only += remaining_size;
3642	}
3643	}
3644
3645	static int load_xbzrle(QEMUFile f, ram_addr_t addr, void* *host)
3646	{
3647	unsigned int xh_len;
3648	int xh_flags;
3649	uint8_t *loaded_data;
3650
3651	/ extract RLE header /
3652	xh_flags = qemu_get_byte(f);
3653	xh_len = qemu_get_be16(f);
3654
3655	if (xh_flags != ENCODING_FLAG_XBZRLE) {
3656	error_report("Failed to load XBZRLE page - wrong compression!");
3657	return -`1`;
3658	}
3659
3660	if (xh_len > TARGET_PAGE_SIZE) {
3661	error_report("Failed to load XBZRLE page - len overflow!");
3662	return -`1`;
3663	}
3664	loaded_data = XBZRLE.decoded_buf;
3665	/ load data and decode /
3666	/ it can change loaded_data to point to an internal buffer /
3667	qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3668
3669	/ decode RLE /
3670	if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3671	TARGET_PAGE_SIZE) == -`1`) {
3672	error_report("Failed to load XBZRLE page - decode error!");
3673	return -`1`;
3674	}
3675
3676	return `0`;
3677	}
3678
3679	/**
3680	* ram_block_from_stream: read a RAMBlock id from the migration stream
3681	*
3682	* Must be called from within a rcu critical section.
3683	*
3684	* Returns a pointer from within the RCU-protected ram_list.
3685	*
3686	* @f: QEMUFile where to read the data from
3687	* @flags: Page flags (mostly to see if it's a continuation of previous block)
3688	*/
3689	static inline RAMBlock ram_block_from_stream(QEMUFile f, int flags)
3690	{
3691	static RAMBlock *block = NULL;
3692	char id[`256`];
3693	uint8_t len;
3694
3695	if (flags & RAM_SAVE_FLAG_CONTINUE) {
3696	if (!block) {
3697	error_report("Ack, bad migration stream!");
3698	return NULL;
3699	}
3700	return block;
3701	}
3702
3703	len = qemu_get_byte(f);
3704	qemu_get_buffer(f, (uint8_t *)id, len);
3705	id[len] = `0`;
3706
3707	block = qemu_ram_block_by_name(id);
3708	if (!block) {
3709	error_report("Can't find block %s", id);
3710	return NULL;
3711	}
3712
3713	if (ramblock_is_ignored(block)) {
3714	error_report("block %s should not be migrated !", id);
3715	return NULL;
3716	}
3717
3718	return block;
3719	}
3720
3721	static inline void host_from_ram_block_offset(RAMBlock block,
3722	ram_addr_t offset)
3723	{
3724	if (!offset_in_ramblock(block, offset)) {
3725	return NULL;
3726	}
3727
3728	return block->host + offset;
3729	}
3730
3731	static inline void colo_cache_from_block_offset(RAMBlock block,
3732	ram_addr_t offset)
3733	{
3734	if (!offset_in_ramblock(block, offset)) {
3735	return NULL;
3736	}
3737	if (!block->colo_cache) {
3738	error_report("%s: colo_cache is NULL in block :%s",
3739	__func__, block->idstr);
3740	return NULL;
3741	}
3742
3743	/*
3744	* During colo checkpoint, we need bitmap of these migrated pages.
3745	* It help us to decide which pages in ram cache should be flushed
3746	* into VM's RAM later.
3747	*/
3748	if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3749	ram_state->migration_dirty_pages++;
3750	}
3751	return block->colo_cache + offset;
3752	}
3753
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * The memset is skipped when @ch is zero and the range is already all
 * zeroes, so such memory is not written to.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}
3770
3771	/ return the size after decompression, or negative value on error /
3772	static int
3773	qemu_uncompress_data(z_stream stream, uint8_t dest, size_t dest_len,
3774	const uint8_t *source, size_t source_len)
3775	{
3776	int err;
3777
3778	err = inflateReset(stream);
3779	if (err != Z_OK) {
3780	return -`1`;
3781	}
3782
3783	stream->avail_in = source_len;
3784	stream->next_in = (uint8_t *)source;
3785	stream->avail_out = dest_len;
3786	stream->next_out = dest;
3787
3788	err = inflate(stream, Z_NO_FLUSH);
3789	if (err != Z_STREAM_END) {
3790	return -`1`;
3791	}
3792
3793	return stream->total_out;
3794	}
3795
3796	static void do_data_decompress(void* *opaque)
3797	{
3798	DecompressParam *param = opaque;
3799	unsigned long pagesize;
3800	uint8_t *des;
3801	int len, ret;
3802
3803	qemu_mutex_lock(&param->mutex);
3804	while (!param->quit) {
3805	if (param->des) {
3806	des = param->des;
3807	len = param->len;
3808	param->des = `0`;
3809	qemu_mutex_unlock(&param->mutex);
3810
3811	pagesize = TARGET_PAGE_SIZE;
3812
3813	ret = qemu_uncompress_data(&param->stream, des, pagesize,
3814	param->compbuf, len);
3815	if (ret < `0` && migrate_get_current()->decompress_error_check) {
3816	error_report("decompress data failed");
3817	qemu_file_set_error(decomp_file, ret);
3818	}
3819
3820	qemu_mutex_lock(&decomp_done_lock);
3821	param->done = true;
3822	qemu_cond_signal(&decomp_done_cond);
3823	qemu_mutex_unlock(&decomp_done_lock);
3824
3825	qemu_mutex_lock(&param->mutex);
3826	} else {
3827	qemu_cond_wait(&param->cond, &param->mutex);
3828	}
3829	}
3830	qemu_mutex_unlock(&param->mutex);
3831
3832	return NULL;
3833	}
3834
3835	static int wait_for_decompress_done(void)
3836	{
3837	int idx, thread_count;
3838
3839	if (!migrate_use_compression()) {
3840	return `0`;
3841	}
3842
3843	thread_count = migrate_decompress_threads();
3844	qemu_mutex_lock(&decomp_done_lock);
3845	for (idx = `0`; idx < thread_count; idx++) {
3846	while (!decomp_param[idx].done) {
3847	qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3848	}
3849	}
3850	qemu_mutex_unlock(&decomp_done_lock);
3851	return qemu_file_get_error(decomp_file);
3852	}
3853
3854	static void compress_threads_load_cleanup(void)
3855	{
3856	int i, thread_count;
3857
3858	if (!migrate_use_compression()) {
3859	return;
3860	}
3861	thread_count = migrate_decompress_threads();
3862	for (i = `0`; i < thread_count; i++) {
3863	/*
3864	* we use it as a indicator which shows if the thread is
3865	* properly init'd or not
3866	*/
3867	if (!decomp_param[i].compbuf) {
3868	break;
3869	}
3870
3871	qemu_mutex_lock(&decomp_param[i].mutex);
3872	decomp_param[i].quit = true;
3873	qemu_cond_signal(&decomp_param[i].cond);
3874	qemu_mutex_unlock(&decomp_param[i].mutex);
3875	}
3876	for (i = `0`; i < thread_count; i++) {
3877	if (!decomp_param[i].compbuf) {
3878	break;
3879	}
3880
3881	qemu_thread_join(decompress_threads + i);
3882	qemu_mutex_destroy(&decomp_param[i].mutex);
3883	qemu_cond_destroy(&decomp_param[i].cond);
3884	inflateEnd(&decomp_param[i].stream);
3885	g_free(decomp_param[i].compbuf);
3886	decomp_param[i].compbuf = NULL;
3887	}
3888	g_free(decompress_threads);
3889	g_free(decomp_param);
3890	decompress_threads = NULL;
3891	decomp_param = NULL;
3892	decomp_file = NULL;
3893	}
3894
3895	static int compress_threads_load_setup(QEMUFile *f)
3896	{
3897	int i, thread_count;
3898
3899	if (!migrate_use_compression()) {
3900	return `0`;
3901	}
3902
3903	thread_count = migrate_decompress_threads();
3904	decompress_threads = g_new0(QemuThread, thread_count);
3905	decomp_param = g_new0(DecompressParam, thread_count);
3906	qemu_mutex_init(&decomp_done_lock);
3907	qemu_cond_init(&decomp_done_cond);
3908	decomp_file = f;
3909	for (i = `0`; i < thread_count; i++) {
3910	if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3911	goto exit;
3912	}
3913
3914	decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3915	qemu_mutex_init(&decomp_param[i].mutex);
3916	qemu_cond_init(&decomp_param[i].cond);
3917	decomp_param[i].done = true;
3918	decomp_param[i].quit = false;
3919	qemu_thread_create(decompress_threads + i, "decompress",
3920	do_data_decompress, decomp_param + i,
3921	QEMU_THREAD_JOINABLE);
3922	}
3923	return `0`;
3924	exit:
3925	compress_threads_load_cleanup();
3926	return -`1`;
3927	}
3928
3929	static void decompress_data_with_multi_threads(QEMUFile *f,
3930	void host, int* len)
3931	{
3932	int idx, thread_count;
3933
3934	thread_count = migrate_decompress_threads();
3935	qemu_mutex_lock(&decomp_done_lock);
3936	while (true) {
3937	for (idx = `0`; idx < thread_count; idx++) {
3938	if (decomp_param[idx].done) {
3939	decomp_param[idx].done = false;
3940	qemu_mutex_lock(&decomp_param[idx].mutex);
3941	qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3942	decomp_param[idx].des = host;
3943	decomp_param[idx].len = len;
3944	qemu_cond_signal(&decomp_param[idx].cond);
3945	qemu_mutex_unlock(&decomp_param[idx].mutex);
3946	break;
3947	}
3948	}
3949	if (idx < thread_count) {
3950	break;
3951	} else {
3952	qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3953	}
3954	}
3955	qemu_mutex_unlock(&decomp_done_lock);
3956	}
3957
3958	/*
3959	* colo cache: this is for secondary VM, we cache the whole
3960	* memory of the secondary VM, it is need to hold the global lock
3961	* to call this helper.
3962	*/
3963	int colo_init_ram_cache(void)
3964	{
3965	RAMBlock *block;
3966
3967	rcu_read_lock();
3968	RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3969	block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3970	NULL,
3971	false);
3972	if (!block->colo_cache) {
3973	error_report("%s: Can't alloc memory for COLO cache of block %s,"
3974	"size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3975	block->used_length);
3976	goto out_locked;
3977	}
3978	memcpy(block->colo_cache, block->host, block->used_length);
3979	}
3980	rcu_read_unlock();
3981	/*
3982	* Record the dirty pages that sent by PVM, we use this dirty bitmap together
3983	* with to decide which page in cache should be flushed into SVM's RAM. Here
3984	* we use the same name 'ram_bitmap' as for migration.
3985	*/
3986	if (ram_bytes_total()) {
3987	RAMBlock *block;
3988
3989	RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3990	unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3991
3992	block->bmap = bitmap_new(pages);
3993	bitmap_set(block->bmap, `0`, pages);
3994	}
3995	}
3996	ram_state = g_new0(RAMState, `1`);
3997	ram_state->migration_dirty_pages = `0`;
3998	qemu_mutex_init(&ram_state->bitmap_mutex);
3999	memory_global_dirty_log_start();
4000
4001	return `0`;
4002
4003	out_locked:
4004
4005	RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4006	if (block->colo_cache) {
4007	qemu_anon_ram_free(block->colo_cache, block->used_length);
4008	block->colo_cache = NULL;
4009	}
4010	}
4011
4012	rcu_read_unlock();
4013	return -errno;
4014	}
4015
4016	/ It is need to hold the global lock to call this helper /
4017	void colo_release_ram_cache(void)
4018	{
4019	RAMBlock *block;
4020
4021	memory_global_dirty_log_stop();
4022	RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4023	g_free(block->bmap);
4024	block->bmap = NULL;
4025	}
4026
4027	rcu_read_lock();
4028
4029	RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4030	if (block->colo_cache) {
4031	qemu_anon_ram_free(block->colo_cache, block->used_length);
4032	block->colo_cache = NULL;
4033	}
4034	}
4035
4036	rcu_read_unlock();
4037	qemu_mutex_destroy(&ram_state->bitmap_mutex);
4038	g_free(ram_state);
4039	ram_state = NULL;
4040	}
4041
4042	/**
4043	* ram_load_setup: Setup RAM for migration incoming side
4044	*
4045	* Returns zero to indicate success and negative for error
4046	*
4047	* @f: QEMUFile where to receive the data
4048	* @opaque: RAMState pointer
4049	*/
4050	static int ram_load_setup(QEMUFile f, void* *opaque)
4051	{
4052	if (compress_threads_load_setup(f)) {
4053	return -`1`;
4054	}
4055
4056	xbzrle_load_setup();
4057	ramblock_recv_map_init();
4058
4059	return `0`;
4060	}
4061
4062	static int ram_load_cleanup(void *opaque)
4063	{
4064	RAMBlock *rb;
4065
4066	RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4067	if (ramblock_is_pmem(rb)) {
4068	pmem_persist(rb->host, rb->used_length);
4069	}
4070	}
4071
4072	xbzrle_load_cleanup();
4073	compress_threads_load_cleanup();
4074
4075	RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4076	g_free(rb->receivedmap);
4077	rb->receivedmap = NULL;
4078	}
4079
4080	return `0`;
4081	}
4082
4083	/**
4084	* ram_postcopy_incoming_init: allocate postcopy data structures
4085	*
4086	* Returns 0 for success and negative if there was one error
4087	*
4088	* @mis: current migration incoming state
4089	*
4090	* Allocate data structures etc needed by incoming migration with
4091	* postcopy-ram. postcopy-ram's similarly names
4092	* postcopy_ram_incoming_init does the work.
4093	*/
4094	int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4095	{
4096	return postcopy_ram_incoming_init(mis);
4097	}
4098
4099	/**
4100	* ram_load_postcopy: load a page in postcopy case
4101	*
4102	* Returns 0 for success or -errno in case of error
4103	*
4104	* Called in postcopy mode by ram_load().
4105	* rcu_read_lock is taken prior to this being called.
4106	*
4107	* @f: QEMUFile where to send the data
4108	*/
4109	static int ram_load_postcopy(QEMUFile *f)
4110	{
4111	int flags = `0`, ret = `0`;
4112	bool place_needed = false;
4113	bool matches_target_page_size = false;
4114	MigrationIncomingState *mis = migration_incoming_get_current();
4115	/ Temporary page that is later 'placed' /
4116	void *postcopy_host_page = postcopy_get_tmp_page(mis);
4117	void *last_host = NULL;
4118	bool all_zero = false;
4119
4120	while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4121	ram_addr_t addr;
4122	void *host = NULL;
4123	void *page_buffer = NULL;
4124	void *place_source = NULL;
4125	RAMBlock *block = NULL;
4126	uint8_t ch;
4127
4128	addr = qemu_get_be64(f);
4129
4130	/*
4131	* If qemu file error, we should stop here, and then "addr"
4132	* may be invalid
4133	*/
4134	ret = qemu_file_get_error(f);
4135	if (ret) {
4136	break;
4137	}
4138
4139	flags = addr & ~TARGET_PAGE_MASK;
4140	addr &= TARGET_PAGE_MASK;
4141
4142	trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4143	place_needed = false;
4144	if (flags & (RAM_SAVE_FLAG_ZERO \| RAM_SAVE_FLAG_PAGE)) {
4145	block = ram_block_from_stream(f, flags);
4146
4147	host = host_from_ram_block_offset(block, addr);
4148	if (!host) {
4149	error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4150	ret = -EINVAL;
4151	break;
4152	}
4153	matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4154	/*
4155	* Postcopy requires that we place whole host pages atomically;
4156	* these may be huge pages for RAMBlocks that are backed by
4157	* hugetlbfs.
4158	* To make it atomic, the data is read into a temporary page
4159	* that's moved into place later.
4160	* The migration protocol uses, possibly smaller, target-pages
4161	* however the source ensures it always sends all the components
4162	* of a host page in order.
4163	*/
4164	page_buffer = postcopy_host_page +
4165	((uintptr_t)host & (block->page_size - `1`));
4166	/ If all TP are zero then we can optimise the place /
4167	if (!((uintptr_t)host & (block->page_size - `1`))) {
4168	all_zero = true;
4169	} else {
4170	/ not the 1st TP within the HP /
4171	if (host != (last_host + TARGET_PAGE_SIZE)) {
4172	error_report("Non-sequential target page %p/%p",
4173	host, last_host);
4174	ret = -EINVAL;
4175	break;
4176	}
4177	}
4178
4179
4180	/*
4181	* If it's the last part of a host page then we place the host
4182	* page
4183	*/
4184	place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
4185	(block->page_size - `1`)) == `0`;
4186	place_source = postcopy_host_page;
4187	}
4188	last_host = host;
4189
4190	switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4191	case RAM_SAVE_FLAG_ZERO:
4192	ch = qemu_get_byte(f);
4193	memset(page_buffer, ch, TARGET_PAGE_SIZE);
4194	if (ch) {
4195	all_zero = false;
4196	}
4197	break;
4198
4199	case RAM_SAVE_FLAG_PAGE:
4200	all_zero = false;
4201	if (!matches_target_page_size) {
4202	/ For huge pages, we always use temporary buffer /
4203	qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4204	} else {
4205	/*
4206	* For small pages that matches target page size, we
4207	* avoid the qemu_file copy. Instead we directly use
4208	* the buffer of QEMUFile to place the page. Note: we
4209	* cannot do any QEMUFile operation before using that
4210	* buffer to make sure the buffer is valid when
4211	* placing the page.
4212	*/
4213	qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4214	TARGET_PAGE_SIZE);
4215	}
4216	break;
4217	case RAM_SAVE_FLAG_EOS:
4218	/ normal exit /
4219	multifd_recv_sync_main();
4220	break;
4221	default:
4222	error_report("Unknown combination of migration flags: %#x"
4223	" (postcopy mode)", flags);
4224	ret = -EINVAL;
4225	break;
4226	}
4227
4228	/ Detect for any possible file errors /
4229	if (!ret && qemu_file_get_error(f)) {
4230	ret = qemu_file_get_error(f);
4231	}
4232
4233	if (!ret && place_needed) {
4234	/ This gets called at the last target page in the host page /
4235	void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
4236
4237	if (all_zero) {
4238	ret = postcopy_place_page_zero(mis, place_dest,
4239	block);
4240	} else {
4241	ret = postcopy_place_page(mis, place_dest,
4242	place_source, block);
4243	}
4244	}
4245	}
4246
4247	return ret;
4248	}
4249
4250	static bool postcopy_is_advised(void)
4251	{
4252	PostcopyState ps = postcopy_state_get();
4253	return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4254	}
4255
4256	static bool postcopy_is_running(void)
4257	{
4258	PostcopyState ps = postcopy_state_get();
4259	return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4260	}
4261
4262	/*
4263	* Flush content of RAM cache into SVM's memory.
4264	* Only flush the pages that be dirtied by PVM or SVM or both.
4265	*/
4266	static void colo_flush_ram_cache(void)
4267	{
4268	RAMBlock *block = NULL;
4269	void *dst_host;
4270	void *src_host;
4271	unsigned long offset = `0`;
4272
4273	memory_global_dirty_log_sync();
4274	rcu_read_lock();
4275	RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4276	ramblock_sync_dirty_bitmap(ram_state, block);
4277	}
4278	rcu_read_unlock();
4279
4280	trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4281	rcu_read_lock();
4282	block = QLIST_FIRST_RCU(&ram_list.blocks);
4283
4284	while (block) {
4285	offset = migration_bitmap_find_dirty(ram_state, block, offset);
4286
4287	if (offset << TARGET_PAGE_BITS >= block->used_length) {
4288	offset = `0`;
4289	block = QLIST_NEXT_RCU(block, next);
4290	} else {
4291	migration_bitmap_clear_dirty(ram_state, block, offset);
4292	dst_host = block->host + (offset << TARGET_PAGE_BITS);
4293	src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4294	memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4295	}
4296	}
4297
4298	rcu_read_unlock();
4299	trace_colo_flush_ram_cache_end();
4300	}
4301
4302	/**
4303	* ram_load_precopy: load pages in precopy case
4304	*
4305	* Returns 0 for success or -errno in case of error
4306	*
4307	* Called in precopy mode by ram_load().
4308	* rcu_read_lock is taken prior to this being called.
4309	*
4310	* @f: QEMUFile where to send the data
4311	*/
4312	static int ram_load_precopy(QEMUFile *f)
4313	{
4314	int flags = `0`, ret = `0`, invalid_flags = `0`, len = `0`;
4315	/ ADVISE is earlier, it shows the source has the postcopy capability on /
4316	bool postcopy_advised = postcopy_is_advised();
4317	if (!migrate_use_compression()) {
4318	invalid_flags \|= RAM_SAVE_FLAG_COMPRESS_PAGE;
4319	}
4320
4321	while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4322	ram_addr_t addr, total_ram_bytes;
4323	void *host = NULL;
4324	uint8_t ch;
4325
4326	addr = qemu_get_be64(f);
4327	flags = addr & ~TARGET_PAGE_MASK;
4328	addr &= TARGET_PAGE_MASK;
4329
4330	if (flags & invalid_flags) {
4331	if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4332	error_report("Received an unexpected compressed page");
4333	}
4334
4335	ret = -EINVAL;
4336	break;
4337	}
4338
4339	if (flags & (RAM_SAVE_FLAG_ZERO \| RAM_SAVE_FLAG_PAGE \|
4340	RAM_SAVE_FLAG_COMPRESS_PAGE \| RAM_SAVE_FLAG_XBZRLE)) {
4341	RAMBlock *block = ram_block_from_stream(f, flags);
4342
4343	/*
4344	* After going into COLO, we should load the Page into colo_cache.
4345	*/
4346	if (migration_incoming_in_colo_state()) {
4347	host = colo_cache_from_block_offset(block, addr);
4348	} else {
4349	host = host_from_ram_block_offset(block, addr);
4350	}
4351	if (!host) {
4352	error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4353	ret = -EINVAL;
4354	break;
4355	}
4356
4357	if (!migration_incoming_in_colo_state()) {
4358	ramblock_recv_bitmap_set(block, host);
4359	}
4360
4361	trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4362	}
4363
4364	switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4365	case RAM_SAVE_FLAG_MEM_SIZE:
4366	/ Synchronize RAM block list /
4367	total_ram_bytes = addr;
4368	while (!ret && total_ram_bytes) {
4369	RAMBlock *block;
4370	char id[`256`];
4371	ram_addr_t length;
4372
4373	len = qemu_get_byte(f);
4374	qemu_get_buffer(f, (uint8_t *)id, len);
4375	id[len] = `0`;
4376	length = qemu_get_be64(f);
4377
4378	block = qemu_ram_block_by_name(id);
4379	if (block && !qemu_ram_is_migratable(block)) {
4380	error_report("block %s should not be migrated !", id);
4381	ret = -EINVAL;
4382	} else if (block) {
4383	if (length != block->used_length) {
4384	Error *local_err = NULL;
4385
4386	ret = qemu_ram_resize(block, length,
4387	&local_err);
4388	if (local_err) {
4389	error_report_err(local_err);
4390	}
4391	}
4392	/ For postcopy we need to check hugepage sizes match /
4393	if (postcopy_advised &&
4394	block->page_size != qemu_host_page_size) {
4395	uint64_t remote_page_size = qemu_get_be64(f);
4396	if (remote_page_size != block->page_size) {
4397	error_report("Mismatched RAM page size %s "
4398	"(local) %zd != %" PRId64,
4399	id, block->page_size,
4400	remote_page_size);
4401	ret = -EINVAL;
4402	}
4403	}
4404	if (migrate_ignore_shared()) {
4405	hwaddr addr = qemu_get_be64(f);
4406	if (ramblock_is_ignored(block) &&
4407	block->mr->addr != addr) {
4408	error_report("Mismatched GPAs for block %s "
4409	"%" PRId64 "!= %" PRId64,
4410	id, (uint64_t)addr,
4411	(uint64_t)block->mr->addr);
4412	ret = -EINVAL;
4413	}
4414	}
4415	ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4416	block->idstr);
4417	} else {
4418	error_report("Unknown ramblock \"%s\", cannot "
4419	"accept migration", id);
4420	ret = -EINVAL;
4421	}
4422
4423	total_ram_bytes -= length;
4424	}
4425	break;
4426
4427	case RAM_SAVE_FLAG_ZERO:
4428	ch = qemu_get_byte(f);
4429	ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4430	break;
4431
4432	case RAM_SAVE_FLAG_PAGE:
4433	qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4434	break;
4435
4436	case RAM_SAVE_FLAG_COMPRESS_PAGE:
4437	len = qemu_get_be32(f);
4438	if (len < `0` \|\| len > compressBound(TARGET_PAGE_SIZE)) {
4439	error_report("Invalid compressed data length: %d", len);
4440	ret = -EINVAL;
4441	break;
4442	}
4443	decompress_data_with_multi_threads(f, host, len);
4444	break;
4445
4446	case RAM_SAVE_FLAG_XBZRLE:
4447	if (load_xbzrle(f, addr, host) < `0`) {
4448	error_report("Failed to decompress XBZRLE page at "
4449	RAM_ADDR_FMT, addr);
4450	ret = -EINVAL;
4451	break;
4452	}
4453	break;
4454	case RAM_SAVE_FLAG_EOS:
4455	/ normal exit /
4456	multifd_recv_sync_main();
4457	break;
4458	default:
4459	if (flags & RAM_SAVE_FLAG_HOOK) {
4460	ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4461	} else {
4462	error_report("Unknown combination of migration flags: %#x",
4463	flags);
4464	ret = -EINVAL;
4465	}
4466	}
4467	if (!ret) {
4468	ret = qemu_file_get_error(f);
4469	}
4470	}
4471
4472	return ret;
4473	}
4474
4475	static int ram_load(QEMUFile f, void* opaque, int* version_id)
4476	{
4477	int ret = `0`;
4478	static uint64_t seq_iter;
4479	/*
4480	* If system is running in postcopy mode, page inserts to host memory must
4481	* be atomic
4482	*/
4483	bool postcopy_running = postcopy_is_running();
4484
4485	seq_iter++;
4486
4487	if (version_id != `4`) {
4488	return -EINVAL;
4489	}
4490
4491	/*
4492	* This RCU critical section can be very long running.
4493	* When RCU reclaims in the code start to become numerous,
4494	* it will be necessary to reduce the granularity of this
4495	* critical section.
4496	*/
4497	rcu_read_lock();
4498
4499	if (postcopy_running) {
4500	ret = ram_load_postcopy(f);
4501	} else {
4502	ret = ram_load_precopy(f);
4503	}
4504
4505	ret \|= wait_for_decompress_done();
4506	rcu_read_unlock();
4507	trace_ram_load_complete(ret, seq_iter);
4508
4509	if (!ret && migration_incoming_in_colo_state()) {
4510	colo_flush_ram_cache();
4511	}
4512	return ret;
4513	}
4514
4515	static bool ram_has_postcopy(void *opaque)
4516	{
4517	RAMBlock *rb;
4518	RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4519	if (ramblock_is_pmem(rb)) {
4520	info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4521	"is not supported now!", rb->idstr, rb->host);
4522	return false;
4523	}
4524	}
4525
4526	return migrate_postcopy_ram();
4527	}
4528
4529	/ Sync all the dirty bitmap with destination VM. /
4530	static int ram_dirty_bitmap_sync_all(MigrationState s, RAMState rs)
4531	{
4532	RAMBlock *block;
4533	QEMUFile *file = s->to_dst_file;
4534	int ramblock_count = `0`;
4535
4536	trace_ram_dirty_bitmap_sync_start();
4537
4538	RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4539	qemu_savevm_send_recv_bitmap(file, block->idstr);
4540	trace_ram_dirty_bitmap_request(block->idstr);
4541	ramblock_count++;
4542	}
4543
4544	trace_ram_dirty_bitmap_sync_wait();
4545
4546	/ Wait until all the ramblocks' dirty bitmap synced /
4547	while (ramblock_count--) {
4548	qemu_sem_wait(&s->rp_state.rp_sem);
4549	}
4550
4551	trace_ram_dirty_bitmap_sync_complete();
4552
4553	return `0`;
4554	}
4555
4556	static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4557	{
4558	qemu_sem_post(&s->rp_state.rp_sem);
4559	}
4560
4561	/*
4562	* Read the received bitmap, revert it as the initial dirty bitmap.
4563	* This is only used when the postcopy migration is paused but wants
4564	* to resume from a middle point.
4565	*/
4566	int ram_dirty_bitmap_reload(MigrationState s, RAMBlock block)
4567	{
4568	int ret = -EINVAL;
4569	QEMUFile *file = s->rp_state.from_dst_file;
4570	unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4571	uint64_t local_size = DIV_ROUND_UP(nbits, `8`);
4572	uint64_t size, end_mark;
4573
4574	trace_ram_dirty_bitmap_reload_begin(block->idstr);
4575
4576	if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4577	error_report("%s: incorrect state %s", __func__,
4578	MigrationStatus_str(s->state));
4579	return -EINVAL;
4580	}
4581
4582	/*
4583	* Note: see comments in ramblock_recv_bitmap_send() on why we
4584	* need the endianess convertion, and the paddings.
4585	*/
4586	local_size = ROUND_UP(local_size, `8`);
4587
4588	/ Add paddings /
4589	le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4590
4591	size = qemu_get_be64(file);
4592
4593	/ The size of the bitmap should match with our ramblock /
4594	if (size != local_size) {
4595	error_report("%s: ramblock '%s' bitmap size mismatch "
4596	"(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4597	block->idstr, size, local_size);
4598	ret = -EINVAL;
4599	goto out;
4600	}
4601
4602	size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4603	end_mark = qemu_get_be64(file);
4604
4605	ret = qemu_file_get_error(file);
4606	if (ret \|\| size != local_size) {
4607	error_report("%s: read bitmap failed for ramblock '%s': %d"
4608	" (size 0x%"PRIx64", got: 0x%"PRIx64")",
4609	__func__, block->idstr, ret, local_size, size);
4610	ret = -EIO;
4611	goto out;
4612	}
4613
4614	if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4615	error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4616	__func__, block->idstr, end_mark);
4617	ret = -EINVAL;
4618	goto out;
4619	}
4620
4621	/*
4622	* Endianess convertion. We are during postcopy (though paused).
4623	* The dirty bitmap won't change. We can directly modify it.
4624	*/
4625	bitmap_from_le(block->bmap, le_bitmap, nbits);
4626
4627	/*
4628	* What we received is "received bitmap". Revert it as the initial
4629	* dirty bitmap for this ramblock.
4630	*/
4631	bitmap_complement(block->bmap, block->bmap, nbits);
4632
4633	trace_ram_dirty_bitmap_reload_complete(block->idstr);
4634
4635	/*
4636	* We succeeded to sync bitmap for current ramblock. If this is
4637	* the last one to sync, we need to notify the main send thread.
4638	*/
4639	ram_dirty_bitmap_reload_notify(s);
4640
4641	ret = `0`;
4642	out:
4643	g_free(le_bitmap);
4644	return ret;
4645	}
4646
4647	static int ram_resume_prepare(MigrationState s, void* *opaque)
4648	{
4649	RAMState rs = (RAMState **)opaque;
4650	int ret;
4651
4652	ret = ram_dirty_bitmap_sync_all(s, rs);
4653	if (ret) {
4654	return ret;
4655	}
4656
4657	ram_state_resume_prepare(rs, s->to_dst_file);
4658
4659	return `0`;
4660	}
4661
4662	static SaveVMHandlers savevm_ram_handlers = {
4663	.save_setup = ram_save_setup,
4664	.save_live_iterate = ram_save_iterate,
4665	.save_live_complete_postcopy = ram_save_complete,
4666	.save_live_complete_precopy = ram_save_complete,
4667	.has_postcopy = ram_has_postcopy,
4668	.save_live_pending = ram_save_pending,
4669	.load_state = ram_load,
4670	.save_cleanup = ram_save_cleanup,
4671	.load_setup = ram_load_setup,
4672	.load_cleanup = ram_load_cleanup,
4673	.resume_prepare = ram_resume_prepare,
4674	};
4675
4676	void ram_mig_init(void)
4677	{
4678	qemu_mutex_init(&XBZRLE.lock);
4679	register_savevm_live(NULL, "ram", `0`, `4`, &savevm_ram_handlers, &ram_state);
4680	}
4681

Browse the source code of qemu/migration/ram.c