/*
 * Postcopy migration for RAM
 *
 * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *  Dave Gilbert <dgilbert@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

/*
 * Postcopy is a migration technique where the execution flips from the
 * source to the destination before all the data has been copied.
 */

#include "qemu/osdep.h"
#include "exec/target_page.h"
#include "migration.h"
#include "qemu-file.h"
#include "savevm.h"
#include "postcopy-ram.h"
#include "ram.h"
#include "qapi/error.h"
#include "qemu/notify.h"
#include "qemu/rcu.h"
#include "sysemu/sysemu.h"
#include "sysemu/balloon.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "hw/boards.h"

/* Arbitrary limit on the size of each discard command; it keeps each
 * command around ~200 bytes
 */
#define MAX_DISCARDS_PER_COMMAND 12
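
/*
 * Where the ~200 bytes comes from, roughly (the exact overhead depends on
 * the wire format used by qemu_savevm_send_postcopy_ram_discard):
 *
 *   12 entries * (8-byte start + 8-byte length) = 192 bytes of payload,
 *   plus the command header and the RAMBlock name.
 */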

struct PostcopyDiscardState {
    const char *ramblock_name;
    uint16_t cur_entry;
    /*
     * Start and length of a discard range (bytes)
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;
    unsigned int nsentcmds;
};

static NotifierWithReturnList postcopy_notifier_list;

void postcopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&postcopy_notifier_list);
}

void postcopy_add_notifier(NotifierWithReturn *nn)
{
    notifier_with_return_list_add(&postcopy_notifier_list, nn);
}

void postcopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
{
    struct PostcopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&postcopy_notifier_list,
                                            &pnd);
}
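
/*
 * Illustrative sketch of how a device can use the hooks above to veto
 * postcopy ("my_postcopy_notifier" and "my_device_supports_postcopy" are
 * hypothetical names, not part of this file). A non-zero return from a
 * notifier aborts the notification chain and fails the probe:
 *
 *   static int my_postcopy_notifier(NotifierWithReturn *n, void *opaque)
 *   {
 *       struct PostcopyNotifyData *pnd = opaque;
 *
 *       if (pnd->reason == POSTCOPY_NOTIFY_PROBE &&
 *           !my_device_supports_postcopy()) {
 *           error_setg(pnd->errp, "my-device: postcopy not supported");
 *           return -ENOENT;
 *       }
 *       return 0;
 *   }
 *
 *   static NotifierWithReturn my_notifier = {
 *       .notify = my_postcopy_notifier,
 *   };
 *   ...
 *   postcopy_add_notifier(&my_notifier);
 */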

/* Postcopy needs to detect accesses to pages that haven't yet been copied
 * across, and to efficiently map new pages in; the techniques for doing
 * this are target OS specific.
 */
#if defined(__linux__)

#include <poll.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <asm/types.h> /* for __u64 */
#endif

#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
#include <sys/eventfd.h>
#include <linux/userfaultfd.h>

typedef struct PostcopyBlocktimeContext {
    /* time when a page fault was initiated, per vCPU */
    uint32_t *page_fault_vcpu_time;
    /* faulting page address, per vCPU */
    uintptr_t *vcpu_addr;
    uint32_t total_blocktime;
    /* blocktime per vCPU */
    uint32_t *vcpu_blocktime;
    /* point in time when the last page fault was initiated */
    uint32_t last_begin;
    /* number of vCPUs currently suspended */
    int smp_cpus_down;
    uint64_t start_time;

    /*
     * Handler for the exit event, needed to release the whole
     * blocktime_ctx
     */
    Notifier exit_notifier;
} PostcopyBlocktimeContext;

static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
{
    g_free(ctx->page_fault_vcpu_time);
    g_free(ctx->vcpu_addr);
    g_free(ctx->vcpu_blocktime);
    g_free(ctx);
}

static void migration_exit_cb(Notifier *n, void *data)
{
    PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
                                                 exit_notifier);
    destroy_blocktime_context(ctx);
}

static struct PostcopyBlocktimeContext *blocktime_context_new(void)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
    ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
    ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
    ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);

    ctx->exit_notifier.notify = migration_exit_cb;
    ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    qemu_add_exit_notifier(&ctx->exit_notifier);
    return ctx;
}

static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    uint32List *list = NULL, *entry = NULL;
    int i;

    for (i = ms->smp.cpus - 1; i >= 0; i--) {
        entry = g_new0(uint32List, 1);
        entry->value = ctx->vcpu_blocktime[i];
        entry->next = list;
        list = entry;
    }

    return list;
}

/*
 * Populates MigrationInfo from postcopy's blocktime context. It does
 * nothing unless the postcopy-blocktime capability was set.
 *
 * @info: pointer to MigrationInfo to populate
 */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return;
    }

    info->has_postcopy_blocktime = true;
    info->postcopy_blocktime = bc->total_blocktime;
    info->has_postcopy_vcpu_blocktime = true;
    info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
}

static uint32_t get_postcopy_total_blocktime(void)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return 0;
    }

    return bc->total_blocktime;
}

/**
 * receive_ufd_features: check userfault fd features, so that later we only
 * request features we know are supported.
 *
 * Returns: true on success
 *
 * __NR_userfaultfd must have been checked before calling this.
 * @features: out parameter; on success contains the uffdio_api.features
 * provided by the kernel
 */
static bool receive_ufd_features(uint64_t *features)
{
    struct uffdio_api api_struct = {0};
    int ufd;
    bool ret = true;

    /* if we are here, __NR_userfaultfd should exist */
    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    /* ask for the features */
    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        ret = false;
        goto release_ufd;
    }

    *features = api_struct.features;

release_ufd:
    close(ufd);
    return ret;
}

/**
 * request_ufd_features: this function should be called only once on a newly
 * opened ufd; subsequent calls will lead to an error.
 *
 * Returns: true on success
 *
 * @ufd: fd obtained from the userfaultfd syscall
 * @features: bit mask, see UFFD_API_FEATURES
 */
static bool request_ufd_features(int ufd, uint64_t features)
{
    struct uffdio_api api_struct = {0};
    uint64_t ioctl_mask;

    api_struct.api = UFFD_API;
    api_struct.features = features;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s failed: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
                 (__u64)1 << _UFFDIO_UNREGISTER;
    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
        error_report("Missing userfault features: %" PRIx64,
                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
        return false;
    }

    return true;
}

static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
{
    uint64_t asked_features = 0;
    static uint64_t supported_features;

    /*
     * It's not possible to request UFFD_API twice on one fd, and the
     * feature set a userfault fd supports is persistent, so probe it
     * only once.
     */
    if (!supported_features) {
        if (!receive_ufd_features(&supported_features)) {
            error_report("%s failed", __func__);
            return false;
        }
    }

#ifdef UFFD_FEATURE_THREAD_ID
    if (migrate_postcopy_blocktime() && mis &&
        UFFD_FEATURE_THREAD_ID & supported_features) {
        /* kernel supports that feature */
        /* don't create blocktime_context if it exists */
        if (!mis->blocktime_ctx) {
            mis->blocktime_ctx = blocktime_context_new();
        }

        asked_features |= UFFD_FEATURE_THREAD_ID;
    }
#endif

    /*
     * Request the features, even if asked_features is 0, because the
     * kernel expects UFFD_API before UFFDIO_REGISTER on each userfault
     * file descriptor.
     */
    if (!request_ufd_features(ufd, asked_features)) {
        error_report("%s failed: features %" PRIu64, __func__,
                     asked_features);
        return false;
    }

    if (getpagesize() != ram_pagesize_summary()) {
        bool have_hp = false;
        /* We've got a huge page */
#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
        have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
#endif
        if (!have_hp) {
            error_report("Userfault on this host does not support huge pages");
            return false;
        }
    }
    return true;
}
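
/*
 * The negotiation above in miniature (an illustrative sketch only; error
 * handling elided, "real_ufd" stands for a freshly opened userfaultfd):
 *
 *   uint64_t supported, wanted = 0;
 *
 *   receive_ufd_features(&supported);        // probe on a throwaway fd
 * #ifdef UFFD_FEATURE_THREAD_ID
 *   wanted = supported & UFFD_FEATURE_THREAD_ID;
 * #endif
 *   request_ufd_features(real_ufd, wanted);  // UFFD_API, once per real fd
 */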

/* Callback from postcopy_ram_supported_by_host block iterator.
 */
static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    size_t pagesize = qemu_ram_pagesize(rb);

    if (length % pagesize) {
        error_report("Postcopy requires RAM blocks to be a page size multiple,"
                     " block %s is 0x" RAM_ADDR_FMT " bytes with a "
                     "page size of 0x%zx", block_name, length, pagesize);
        return 1;
    }
    return 0;
}

/*
 * Note: This has the side effect of munlock'ing all of RAM; that's
 * normally fine since if postcopy succeeds it gets turned back on at the
 * end.
 */
bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    long pagesize = getpagesize();
    int ufd = -1;
    bool ret = false; /* Error unless we change it */
    void *testarea = NULL;
    struct uffdio_register reg_struct;
    struct uffdio_range range_struct;
    uint64_t feature_mask;
    Error *local_err = NULL;

    if (qemu_target_page_size() > pagesize) {
        error_report("Target page size bigger than host page size");
        goto out;
    }

    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: userfaultfd not available: %s", __func__,
                     strerror(errno));
        goto out;
    }

    /* Give devices a chance to object */
    if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
        error_report_err(local_err);
        goto out;
    }

    /* Version and features check */
    if (!ufd_check_and_apply(ufd, mis)) {
        goto out;
    }

    /* We don't support postcopy with shared RAM yet */
    if (foreach_not_ignored_block(test_ramblock_postcopiable, NULL)) {
        goto out;
    }

    /*
     * userfault and mlock don't go together; we'll put it back later if
     * it was enabled.
     */
    if (munlockall()) {
        error_report("%s: munlockall: %s", __func__, strerror(errno));
        goto out;
    }

    /*
     * We need to check that the ops we need are supported on anon memory.
     * To do that we need to register a chunk and see the flags that
     * are returned.
     */
    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
                    MAP_ANONYMOUS, -1, 0);
    if (testarea == MAP_FAILED) {
        error_report("%s: Failed to map test area: %s", __func__,
                     strerror(errno));
        goto out;
    }
    g_assert(((size_t)testarea & (pagesize - 1)) == 0);

    reg_struct.range.start = (uintptr_t)testarea;
    reg_struct.range.len = pagesize;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        goto out;
    }

    range_struct.start = (uintptr_t)testarea;
    range_struct.len = pagesize;
    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s userfault unregister: %s", __func__, strerror(errno));
        goto out;
    }

    feature_mask = (__u64)1 << _UFFDIO_WAKE |
                   (__u64)1 << _UFFDIO_COPY |
                   (__u64)1 << _UFFDIO_ZEROPAGE;
    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
        error_report("Missing userfault map features: %" PRIx64,
                     (uint64_t)(~reg_struct.ioctls & feature_mask));
        goto out;
    }

    /* Success! */
    ret = true;
out:
    if (testarea) {
        munmap(testarea, pagesize);
    }
    if (ufd != -1) {
        close(ufd);
    }
    return ret;
}

/*
 * Set up an area of RAM so that it *can* be used for postcopy later; this
 * must be done right at the start prior to pre-copy.
 */
static int init_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    trace_postcopy_init_range(block_name, host_addr, offset, length);

    /*
     * We need the whole of RAM to be truly empty for postcopy, so things
     * like ROMs and any data tables built during init must be zero'd
     * - we're going to get the copy from the source anyway.
     * (Precopy will just overwrite this data, so doesn't need the discard)
     */
    if (ram_discard_range(block_name, 0, length)) {
        return -1;
    }

    return 0;
}

/*
 * At the end of migration, undo the effects of init_range.
 * opaque should be the MIS.
 */
static int cleanup_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * We turned off hugepage for the precopy stage with postcopy enabled;
     * we can turn it back on now.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);

    /*
     * We can also turn off userfault now since we should have all the
     * pages. It can be useful to leave it on to debug postcopy
     * if you're not sure it's always getting every page.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));

        return -1;
    }

    return 0;
}

/*
 * Initialise postcopy-ram, setting the RAM to a state where we can go into
 * postcopy later; must be called prior to any precopy.
 * Called from ram.c's similarly named ram_postcopy_incoming_init.
 */
int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(init_range, NULL)) {
        return -1;
    }

    return 0;
}

/*
 * Manage a single vote to the QEMU balloon inhibitor for all postcopy usage;
 * the last caller wins.
 */
static void postcopy_balloon_inhibit(bool state)
{
    static bool cur_state = false;

    if (state != cur_state) {
        qemu_balloon_inhibit(state);
        cur_state = state;
    }
}

/*
 * At the end of a migration where postcopy_ram_incoming_init was called.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        Error *local_err = NULL;

        /* Let the fault thread quit */
        atomic_set(&mis->fault_thread_quit, 1);
        postcopy_fault_thread_notify(mis);
        trace_postcopy_ram_incoming_cleanup_join();
        qemu_thread_join(&mis->fault_thread);

        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
            error_report_err(local_err);
            return -1;
        }

        if (foreach_not_ignored_block(cleanup_range, mis)) {
            return -1;
        }

        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_event_fd);
        mis->have_fault_thread = false;
    }

    postcopy_balloon_inhibit(false);

    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * It doesn't feel right to fail at this point; we have a valid
             * VM state.
             */
        }
    }

    postcopy_state_set(POSTCOPY_INCOMING_END);

    if (mis->postcopy_tmp_page) {
        munmap(mis->postcopy_tmp_page, mis->largest_page_size);
        mis->postcopy_tmp_page = NULL;
    }
    if (mis->postcopy_tmp_zero_page) {
        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
        mis->postcopy_tmp_zero_page = NULL;
    }
    trace_postcopy_ram_incoming_cleanup_blocktime(
            get_postcopy_total_blocktime());

    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}

/*
 * Disable huge pages on an area
 */
static int nhp_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Before we do discards we need to ensure those discards really
     * do delete areas of the page, even if THP thinks a hugepage would
     * be a good idea, so force hugepages off.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}

/*
 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard;
 * however, leaving it until after precopy means that most of the precopy
 * data is still THP'd.
 */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(nhp_range, mis)) {
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}

/*
 * Mark the given area of RAM as requiring notification to unwritten areas.
 * Used as a callback on foreach_not_ignored_block.
 * @rb: the RAMBlock to register
 * @opaque: MigrationIncomingState pointer
 * Returns 0 on success
 */
static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_register reg_struct;

    reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
    reg_struct.range.len = qemu_ram_get_used_length(rb);
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    /* Now tell our userfault_fd that it's responsible for this area */
    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        return -1;
    }
    if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
        error_report("%s userfault: Region doesn't support COPY", __func__);
        return -1;
    }
    if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
        qemu_ram_set_uf_zeroable(rb);
    }

    return 0;
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    struct uffdio_range range;
    int ret;
    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
    range.start = client_addr & ~(pagesize - 1);
    range.len = pagesize;
    ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
    if (ret) {
        error_report("%s: Failed to wake: %zx in %s (%s)",
                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
                     strerror(errno));
    }
    return ret;
}

/*
 * Callback from shared fault handlers to ask for a page: the page must be
 * specified by a RAMBlock and an offset in that rb.
 * Note: Only for use by shared fault handlers (in the fault thread)
 */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
    MigrationIncomingState *mis = migration_incoming_get_current();

    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
                                       rb_offset);
    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
        trace_postcopy_request_shared_page_present(pcfd->idstr,
                                        qemu_ram_get_idstr(rb), rb_offset);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }
    if (rb != mis->last_rb) {
        mis->last_rb = rb;
        migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
                                  aligned_rbo, pagesize);
    } else {
        /* Save some space */
        migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize);
    }
    return 0;
}

static int get_mem_fault_cpu_index(uint32_t pid)
{
    CPUState *cpu_iter;

    CPU_FOREACH(cpu_iter) {
        if (cpu_iter->thread_id == pid) {
            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
            return cpu_iter->cpu_index;
        }
    }
    trace_get_mem_fault_cpu_index(-1, pid);
    return -1;
}

static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
{
    int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
                                    dc->start_time;
    return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
}

/*
 * Called when a page fault occurs; tracks the time for which each vCPU is
 * blocked.
 *
 * @addr: faulted host virtual address
 * @ptid: faulted process thread id
 * @rb: ramblock appropriate to addr
 */
static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
                                          RAMBlock *rb)
{
    int cpu, already_received;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    uint32_t low_time_offset;

    if (!dc || ptid == 0) {
        return;
    }
    cpu = get_mem_fault_cpu_index(ptid);
    if (cpu < 0) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    if (dc->vcpu_addr[cpu] == 0) {
        atomic_inc(&dc->smp_cpus_down);
    }

    atomic_xchg(&dc->last_begin, low_time_offset);
    atomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
    atomic_xchg(&dc->vcpu_addr[cpu], addr);

    /*
     * Check it here, not at the beginning of the function, because
     * this check could occur earlier than bitmap_set in
     * qemu_ufd_copy_ioctl.
     */
    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
    if (already_received) {
        atomic_xchg(&dc->vcpu_addr[cpu], 0);
        atomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
        atomic_dec(&dc->smp_cpus_down);
    }
    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
                                        cpu, already_received);
}

/*
 * This function calculates the blocktime per vCPU and traces it; the total
 * blocktime (the time during which all vCPUs were blocked at once) is also
 * accumulated here.
 *
 *
 * Assume we have 3 CPUs:
 *
 *      S1        E1           S1               E1
 * -----***********------------xxx***************------------------------> CPU1
 *
 *             S2                E2
 * ------------****************xxx---------------------------------------> CPU2
 *
 *                         S3            E3
 * ------------------------****xxx********-------------------------------> CPU3
 *
 * We have the sequence S1,S2,E1,S3,S1,E2,E3,E1.
 * S2,E1 - doesn't match the condition, because the sequence S1,S2,E1
 *         doesn't include CPU3.
 * S3,S1,E2 - this sequence includes all CPUs; in this case the overlap is
 *            S1,E2 - it's a part of the total blocktime.
 * S1 - here is last_begin.
 * Legend of the picture:
 *   * - means blocktime per vCPU
 *   x - means overlapped blocktime (total blocktime)
 *
 * @addr: host virtual address
 */
static void mark_postcopy_blocktime_end(uintptr_t addr)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    int i, affected_cpu = 0;
    bool vcpu_total_blocktime = false;
    uint32_t read_vcpu_time, low_time_offset;

    if (!dc) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    /*
     * Look up the cpu in order to clear its entry. This algorithm is
     * straightforward but not optimal: a better one would keep a tree or
     * hash keyed by address, whose value is a list of the vCPUs blocked
     * on that address.
     */
    for (i = 0; i < smp_cpus; i++) {
        uint32_t vcpu_blocktime = 0;

        read_vcpu_time = atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
        if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
            read_vcpu_time == 0) {
            continue;
        }
        atomic_xchg(&dc->vcpu_addr[i], 0);
        vcpu_blocktime = low_time_offset - read_vcpu_time;
        affected_cpu += 1;
        /*
         * We need to know whether mark_postcopy_blocktime_end was called
         * for a faulted page; the other possible case is a prefetched
         * page, and for that we shouldn't be here.
         */
        if (!vcpu_total_blocktime &&
            atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
            vcpu_total_blocktime = true;
        }
        /* continue the loop, since one page could affect several vCPUs */
        dc->vcpu_blocktime[i] += vcpu_blocktime;
    }

    atomic_sub(&dc->smp_cpus_down, affected_cpu);
    if (vcpu_total_blocktime) {
        dc->total_blocktime += low_time_offset - atomic_fetch_add(
                &dc->last_begin, 0);
    }
    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
                                      affected_cpu);
}
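
/*
 * A worked example for the diagram above, with illustrative numbers:
 * suppose the second S1 lands at t=100ms, at which point all three vCPUs
 * are blocked, so last_begin becomes 100; when E2 arrives at t=130ms,
 * mark_postcopy_blocktime_end adds 130 - 100 = 30ms to total_blocktime.
 * The later E3 and final E1 add nothing more to the total (not all vCPUs
 * are blocked by then), although each vCPU's own vcpu_blocktime still
 * accumulates its full '*' spans.
 */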

static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
{
    trace_postcopy_pause_fault_thread();

    qemu_sem_wait(&mis->postcopy_pause_sem_fault);

    trace_postcopy_pause_fault_thread_continued();

    return true;
}

/*
 * Handle faults detected by the USERFAULT markings
 */
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffd_msg msg;
    int ret;
    size_t index;
    RAMBlock *rb = NULL;

    trace_postcopy_ram_fault_thread_entry();
    rcu_register_thread();
    mis->last_rb = NULL; /* last RAMBlock we sent part of */
    qemu_sem_post(&mis->fault_thread_sem);

    struct pollfd *pfd;
    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;

    pfd = g_new0(struct pollfd, pfd_len);

    pfd[0].fd = mis->userfault_fd;
    pfd[0].events = POLLIN;
    pfd[1].fd = mis->userfault_event_fd;
    pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
                                                 struct PostCopyFD, index);
        pfd[2 + index].fd = pcfd->fd;
        pfd[2 + index].events = POLLIN;
        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
                                                  pcfd->fd);
    }

    while (true) {
        ram_addr_t rb_offset;
        int poll_result;

        /*
         * We're mainly waiting for the kernel to give us a faulting HVA;
         * however we can be told to quit via userfault_event_fd, which is
         * an eventfd
         */

        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
        if (poll_result == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (!mis->to_src_file) {
            /*
             * Possibly someone tells us that the return path is
             * broken already using the event. We should hold until
             * the channel is rebuilt.
             */
            if (postcopy_pause_fault_thread(mis)) {
                mis->last_rb = NULL;
                /* Continue to read the userfaultfd */
            } else {
                error_report("%s: paused but not allowed to continue",
                             __func__);
                break;
            }
        }

        if (pfd[1].revents) {
            uint64_t tmp64 = 0;

            /* Consume the signal */
            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
                /* Nothing obviously nicer than posting this error. */
                error_report("%s: read() failed", __func__);
            }

            if (atomic_read(&mis->fault_thread_quit)) {
                trace_postcopy_ram_fault_thread_quit();
                break;
            }
        }

        if (pfd[0].revents) {
            poll_result--;
            ret = read(mis->userfault_fd, &msg, sizeof(msg));
            if (ret != sizeof(msg)) {
                if (errno == EAGAIN) {
                    /*
                     * if a wake up happens on the other thread just after
                     * the poll, there is nothing to read.
                     */
                    continue;
                }
                if (ret < 0) {
                    error_report("%s: Failed to read full userfault "
                                 "message: %s",
                                 __func__, strerror(errno));
                    break;
                } else {
                    error_report("%s: Read %d bytes from userfaultfd "
                                 "expected %zu",
                                 __func__, ret, sizeof(msg));
                    break; /* Lost alignment, don't know what we'd read next */
                }
            }
            if (msg.event != UFFD_EVENT_PAGEFAULT) {
                error_report("%s: Read unexpected event %u from userfaultfd",
                             __func__, msg.event);
                continue; /* It's not a page fault, shouldn't happen */
            }

            rb = qemu_ram_block_from_host(
                     (void *)(uintptr_t)msg.arg.pagefault.address,
                     true, &rb_offset);
            if (!rb) {
                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                             PRIx64, (uint64_t)msg.arg.pagefault.address);
                break;
            }

            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                    qemu_ram_get_idstr(rb),
                                                    rb_offset,
                                                    msg.arg.pagefault.feat.ptid);
            mark_postcopy_blocktime_begin(
                    (uintptr_t)(msg.arg.pagefault.address),
                    msg.arg.pagefault.feat.ptid, rb);

retry:
            /*
             * Send the request to the source - we want to request one
             * of our host page sizes (which is >= TPS)
             */
            if (rb != mis->last_rb) {
                mis->last_rb = rb;
                ret = migrate_send_rp_req_pages(mis,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                qemu_ram_pagesize(rb));
            } else {
                /* Save some space */
                ret = migrate_send_rp_req_pages(mis,
                                                NULL,
                                                rb_offset,
                                                qemu_ram_pagesize(rb));
            }

            if (ret) {
                /* May be a network failure; try to wait for recovery */
                if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
                    /* We got reconnected somehow, try to continue */
                    mis->last_rb = NULL;
                    goto retry;
                } else {
                    /* This is an unavoidable fault */
                    error_report("%s: migrate_send_rp_req_pages() returned %d",
                                 __func__, ret);
                    break;
                }
            }
        }

        /* Now handle any requests from external processes on shared memory */
        /* TODO: May need to handle devices deregistering during postcopy */
        for (index = 2; index < pfd_len && poll_result; index++) {
            if (pfd[index].revents) {
                struct PostCopyFD *pcfd =
                    &g_array_index(mis->postcopy_remote_fds,
                                   struct PostCopyFD, index - 2);

                poll_result--;
                if (pfd[index].revents & POLLERR) {
                    error_report("%s: POLLERR on poll %zu fd=%d",
                                 __func__, index, pcfd->fd);
                    pfd[index].events = 0;
                    continue;
                }

                ret = read(pcfd->fd, &msg, sizeof(msg));
                if (ret != sizeof(msg)) {
                    if (errno == EAGAIN) {
                        /*
                         * if a wake up happens on the other thread just after
                         * the poll, there is nothing to read.
                         */
                        continue;
                    }
                    if (ret < 0) {
                        error_report("%s: Failed to read full userfault "
                                     "message: %s (shared) revents=%d",
                                     __func__, strerror(errno),
                                     pfd[index].revents);
                        /* TODO: Could just disable this sharer */
                        break;
                    } else {
                        error_report("%s: Read %d bytes from userfaultfd "
                                     "expected %zu (shared)",
                                     __func__, ret, sizeof(msg));
                        /* TODO: Could just disable this sharer */
                        break; /* Lost alignment, don't know what we'd read next */
                    }
                }
                if (msg.event != UFFD_EVENT_PAGEFAULT) {
                    error_report("%s: Read unexpected event %u "
                                 "from userfaultfd (shared)",
                                 __func__, msg.event);
                    continue; /* It's not a page fault, shouldn't happen */
                }
                /* Call the device handler registered with us */
                ret = pcfd->handler(pcfd, &msg);
                if (ret) {
                    error_report("%s: Failed to resolve shared fault on %zu/%s",
                                 __func__, index, pcfd->idstr);
                    /* TODO: Fail? Disable this sharer? */
                }
            }
        }
    }
    rcu_unregister_thread();
    trace_postcopy_ram_fault_thread_exit();
    g_free(pfd);
    return NULL;
}

int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    /* Open the fd for the kernel to give us userfaults */
    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (mis->userfault_fd == -1) {
        error_report("%s: Failed to open userfault fd: %s", __func__,
                     strerror(errno));
        return -1;
    }

    /*
     * Although the host check already tested the API, we need to
     * do the check again as an ABI handshake on the new fd.
     */
    if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
        return -1;
    }

    /* Now an eventfd we use to tell the fault-thread to quit */
    mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
    if (mis->userfault_event_fd == -1) {
        error_report("%s: Opening userfault_event_fd: %s", __func__,
                     strerror(errno));
        close(mis->userfault_fd);
        return -1;
    }

    qemu_sem_init(&mis->fault_thread_sem, 0);
    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
    qemu_sem_wait(&mis->fault_thread_sem);
    qemu_sem_destroy(&mis->fault_thread_sem);
    mis->have_fault_thread = true;

    /* Mark so that we get notified of accesses to unwritten areas */
    if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
        error_report("ram_block_enable_notify failed");
        return -1;
    }

    /*
     * Ballooning can mark pages as absent while we're postcopying,
     * which would cause false userfaults.
     */
    postcopy_balloon_inhibit(true);

    trace_postcopy_ram_enable_notify();

    return 0;
}
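
/*
 * To summarise the destination-side sequence implemented above (a sketch
 * only; the callers live in the incoming-migration code, not in this file):
 *
 *   postcopy_ram_supported_by_host(mis);  // probe, at capability/advise time
 *   postcopy_ram_incoming_init(mis);      // discard all RAM before precopy
 *   postcopy_ram_prepare_discard(mis);    // NOHUGEPAGE before the discards
 *   postcopy_ram_enable_notify(mis);      // open ufd, start fault thread,
 *                                         // UFFDIO_REGISTER every block
 *   ...pages now arrive on demand...
 *   postcopy_ram_incoming_cleanup(mis);   // join thread, unregister, close
 */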

static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
                               void *from_addr, uint64_t pagesize, RAMBlock *rb)
{
    int ret;
    if (from_addr) {
        struct uffdio_copy copy_struct;
        copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
        copy_struct.src = (uint64_t)(uintptr_t)from_addr;
        copy_struct.len = pagesize;
        copy_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_COPY, &copy_struct);
    } else {
        struct uffdio_zeropage zero_struct;
        zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
        zero_struct.range.len = pagesize;
        zero_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
    }
    if (!ret) {
        ramblock_recv_bitmap_set_range(rb, host_addr,
                                       pagesize / qemu_target_page_size());
        mark_postcopy_blocktime_end((uintptr_t)host_addr);
    }
    return ret;
}

int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
{
    int i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        int ret = cur->waker(cur, rb, offset);
        if (ret) {
            return ret;
        }
    }
    return 0;
}

/*
 * Place a host page (from) at (host) atomically.
 * Returns 0 on success.
 */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);

    /*
     * The copy also acks to the kernel, waking up any stalled thread.
     * TODO: We can inhibit that ack and only do it if it was requested,
     * which would be slightly cheaper, but we'd have to be careful
     * about the order of updating our page state.
     */
    if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, from, pagesize, rb)) {
        int e = errno;
        error_report("%s: %s copy host: %p from: %p (size: %zu)",
                     __func__, strerror(e), host, from, pagesize);

        return -e;
    }

    trace_postcopy_place_page(host);
    return postcopy_notify_shared_wake(rb,
                                       qemu_ram_block_host_offset(rb, host));
}

/*
 * Place a zero page at (host) atomically.
 * Returns 0 on success.
 */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    trace_postcopy_place_page_zero(host);

    /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
     * but it's not available for everything (e.g. hugetlbpages)
     */
    if (qemu_ram_is_uf_zeroable(rb)) {
        if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) {
            int e = errno;
            error_report("%s: %s zero host: %p",
                         __func__, strerror(e), host);

            return -e;
        }
        return postcopy_notify_shared_wake(rb,
                                           qemu_ram_block_host_offset(rb,
                                                                      host));
    } else {
        /* The kernel can't use UFFDIO_ZEROPAGE for hugepages */
        if (!mis->postcopy_tmp_zero_page) {
            mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
                                               PROT_READ | PROT_WRITE,
                                               MAP_PRIVATE | MAP_ANONYMOUS,
                                               -1, 0);
            if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
                int e = errno;
                mis->postcopy_tmp_zero_page = NULL;
                error_report("%s: %s mapping large zero page",
                             __func__, strerror(e));
                return -e;
            }
            memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
        }
        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page,
                                   rb);
    }
}

/*
 * Returns a page of memory that can be mapped at a later point in time
 * using postcopy_place_page.
 * The same address is used repeatedly; postcopy_place_page just takes the
 * backing page away.
 * Returns: Pointer to allocated page
 */
void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    if (!mis->postcopy_tmp_page) {
        mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
                                      PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                      MAP_ANONYMOUS, -1, 0);
        if (mis->postcopy_tmp_page == MAP_FAILED) {
            mis->postcopy_tmp_page = NULL;
            error_report("%s: %s", __func__, strerror(errno));
            return NULL;
        }
    }

    return mis->postcopy_tmp_page;
}

#else
/* No target OS support, stubs just fail */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
}

bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    error_report("%s: No OS support", __func__);
    return false;
}

int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}

int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    assert(0);
    return -1;
}

int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    assert(0);
    return -1;
}

int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    assert(0);
    return -1;
}

void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    assert(0);
    return NULL;
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    assert(0);
    return -1;
}
#endif

/* ------------------------------------------------------------------------- */

void postcopy_fault_thread_notify(MigrationIncomingState *mis)
{
    uint64_t tmp64 = 1;

    /*
     * Wake up the fault_thread. It's an eventfd that should currently
     * be at 0; we're going to increment it to 1.
     */
    if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
        /* Not much we can do here, but may as well report it */
        error_report("%s: incrementing failed: %s", __func__,
                     strerror(errno));
    }
}

static PostcopyDiscardState pds = {0};

/**
 * postcopy_discard_send_init: Called at the start of each RAMBlock before
 *   asking to discard individual ranges.
 *
 * @ms: The current migration state.
 * @name: RAMBlock that discards will operate on.
 */
void postcopy_discard_send_init(MigrationState *ms, const char *name)
{
    pds.ramblock_name = name;
    pds.cur_entry = 0;
    pds.nsentwords = 0;
    pds.nsentcmds = 0;
}

/**
 * postcopy_discard_send_range: Called by the bitmap code for each chunk to
 *   discard. May send a discard message, may just leave it queued to
 *   be sent later.
 *
 * @ms: Current migration state.
 * @start,@length: a range of pages in the migration bitmap in the
 *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
 */
void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
                                 unsigned long length)
{
    size_t tp_size = qemu_target_page_size();
    /* Convert to byte offsets within the RAM block */
    pds.start_list[pds.cur_entry] = start * tp_size;
    pds.length_list[pds.cur_entry] = length * tp_size;
    trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
    pds.cur_entry++;
    pds.nsentwords++;

    if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
        /* Full set, ship it! */
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds.ramblock_name,
                                              pds.cur_entry,
                                              pds.start_list,
                                              pds.length_list);
        pds.nsentcmds++;
        pds.cur_entry = 0;
    }
}

/**
 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
 *   bitmap code. Sends any outstanding discard messages.
 *
 * @ms: Current migration state.
 */
void postcopy_discard_send_finish(MigrationState *ms)
{
    /* Anything unsent? */
    if (pds.cur_entry) {
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds.ramblock_name,
                                              pds.cur_entry,
                                              pds.start_list,
                                              pds.length_list);
        pds.nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
                                       pds.nsentcmds);
}
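
/*
 * Typical calling sequence for the three functions above, per RAMBlock on
 * the source side (an illustrative sketch; the real caller is the dirty
 * bitmap code in ram.c):
 *
 *   postcopy_discard_send_init(ms, block_name);
 *   for (each run of discardable pages) {
 *       postcopy_discard_send_range(ms, start_page, nr_pages);
 *   }
 *   postcopy_discard_send_finish(ms);   // flushes any partial command
 */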

/*
 * Current state of incoming postcopy; note this is not part of
 * MigrationIncomingState since its state is used during cleanup
 * at the end as MIS is being freed.
 */
static PostcopyState incoming_postcopy_state;

PostcopyState postcopy_state_get(void)
{
    return atomic_mb_read(&incoming_postcopy_state);
}

/* Set the state and return the old state */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return atomic_xchg(&incoming_postcopy_state, new_state);
}

/* Register a handler for external shared memory postcopy
 * called on the destination.
 */
void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
                                                  *pcfd);
}

/* Unregister a handler for external shared memory postcopy
 */
void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
{
    guint i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        if (cur->fd == pcfd->fd) {
            mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
            return;
        }
    }
}