/*
 * QEMU sPAPR PCI for NVLink2 pass through
 *
 * Copyright (c) 2019 Alexey Kardashevskiy, IBM Corporation.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu-common.h"
#include "hw/pci/pci.h"
#include "hw/pci-host/spapr.h"
#include "qemu/error-report.h"
#include "hw/ppc/fdt.h"
#include "hw/pci/pci_bridge.h"

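/*
 * Generators for the phandles used in the device tree below. Phandles only
 * need to be unique within the FDT, so each value mixes a magic base with
 * the vPHB index and a per-device discriminator (devfn, GPU RAM region
 * number, or GPU/link pair).
 */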
#define PHANDLE_PCIDEV(phb, pdev)    (0x12000000 | \
                                     (((phb)->index) << 16) | ((pdev)->devfn))
#define PHANDLE_GPURAM(phb, n)       (0x110000FF | ((n) << 8) | \
                                     (((phb)->index) << 16))
#define PHANDLE_NVLINK(phb, gn, nn)  (0x00130000 | (((phb)->index) << 8) | \
                                     ((gn) << 4) | (nn))

#define SPAPR_GPU_NUMA_ID            (cpu_to_be32(1))

typedef struct SpaprPhbPciNvGpuSlot {
    uint64_t tgt;
    uint64_t gpa;
    unsigned numa_id;
    PCIDevice *gpdev;
    int linknum;
    struct {
        uint64_t atsd_gpa;
        PCIDevice *npdev;
        uint32_t link_speed;
    } links[NVGPU_MAX_LINKS];
} SpaprPhbPciNvGpuSlot;

struct SpaprPhbPciNvGpuConfig {
    uint64_t nv2_ram_current;
    uint64_t nv2_atsd_current;
    int num; /* number of non-empty (i.e. tgt != 0) entries in slots[] */
    SpaprPhbPciNvGpuSlot slots[NVGPU_MAX_NUM];
    Error *errp;
};

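/*
 * Returns the slot for the given NVLink2 target address @tgt, allocating
 * a new one on first use; returns NULL if slots[] is already full.
 */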
static SpaprPhbPciNvGpuSlot *
spapr_nvgpu_get_slot(SpaprPhbPciNvGpuConfig *nvgpus, uint64_t tgt)
{
    int i;

    /* Search for partially collected "slot" */
    for (i = 0; i < nvgpus->num; ++i) {
        if (nvgpus->slots[i].tgt == tgt) {
            return &nvgpus->slots[i];
        }
    }

    if (nvgpus->num == ARRAY_SIZE(nvgpus->slots)) {
        return NULL;
    }

    i = nvgpus->num;
    nvgpus->slots[i].tgt = tgt;
    ++nvgpus->num;

    return &nvgpus->slots[i];
}

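/*
 * Records @pdev as the GPU of its slot and reserves the next chunk of the
 * GPU RAM window for the device's memory region @mr; the GPU also gets a
 * NUMA node id from the machine-wide counter.
 */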
static void spapr_pci_collect_nvgpu(SpaprPhbPciNvGpuConfig *nvgpus,
                                    PCIDevice *pdev, uint64_t tgt,
                                    MemoryRegion *mr, Error **errp)
{
    MachineState *machine = MACHINE(qdev_get_machine());
    SpaprMachineState *spapr = SPAPR_MACHINE(machine);
    SpaprPhbPciNvGpuSlot *nvslot = spapr_nvgpu_get_slot(nvgpus, tgt);

    if (!nvslot) {
        error_setg(errp, "Found too many GPUs per vPHB");
        return;
    }
    g_assert(!nvslot->gpdev);
    nvslot->gpdev = pdev;

    nvslot->gpa = nvgpus->nv2_ram_current;
    nvgpus->nv2_ram_current += memory_region_size(mr);
    nvslot->numa_id = spapr->gpu_numa_id;
    ++spapr->gpu_numa_id;
}

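/*
 * Records @pdev as one more NVLink bridge (NPU) of its slot (the bridge
 * carries the same "nvlink2-tgt" value as its GPU) and reserves room for
 * the bridge's ATSD register region @mr in the ATSD window.
 */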
static void spapr_pci_collect_nvnpu(SpaprPhbPciNvGpuConfig *nvgpus,
                                    PCIDevice *pdev, uint64_t tgt,
                                    MemoryRegion *mr, Error **errp)
{
    SpaprPhbPciNvGpuSlot *nvslot = spapr_nvgpu_get_slot(nvgpus, tgt);
    int j;

    if (!nvslot) {
        error_setg(errp, "Found too many NVLink bridges per vPHB");
        return;
    }

    j = nvslot->linknum;
    if (j == ARRAY_SIZE(nvslot->links)) {
        error_setg(errp, "Found too many NVLink bridges per GPU");
        return;
    }
    ++nvslot->linknum;

    g_assert(!nvslot->links[j].npdev);
    nvslot->links[j].npdev = pdev;
    nvslot->links[j].atsd_gpa = nvgpus->nv2_atsd_current;
    nvgpus->nv2_atsd_current += memory_region_size(mr);
    nvslot->links[j].link_speed =
        object_property_get_uint(OBJECT(pdev), "nvlink2-link-speed", NULL);
}

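/*
 * pci_for_each_device() callback: collects every device which exposes the
 * "nvlink2-tgt" property into the SpaprPhbPciNvGpuConfig passed via
 * @opaque, and recurses into PCI bridges. The first collection error is
 * kept in nvgpus->errp for the caller to report.
 */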
static void spapr_phb_pci_collect_nvgpu(PCIBus *bus, PCIDevice *pdev,
                                        void *opaque)
{
    PCIBus *sec_bus;
    Object *po = OBJECT(pdev);
    uint64_t tgt = object_property_get_uint(po, "nvlink2-tgt", NULL);

    if (tgt) {
        Error *local_err = NULL;
        SpaprPhbPciNvGpuConfig *nvgpus = opaque;
        Object *mr_gpu = object_property_get_link(po, "nvlink2-mr[0]", NULL);
        Object *mr_npu = object_property_get_link(po, "nvlink2-atsd-mr[0]",
                                                  NULL);

        g_assert(mr_gpu || mr_npu);
        if (mr_gpu) {
            spapr_pci_collect_nvgpu(nvgpus, pdev, tgt, MEMORY_REGION(mr_gpu),
                                    &local_err);
        } else {
            spapr_pci_collect_nvnpu(nvgpus, pdev, tgt, MEMORY_REGION(mr_npu),
                                    &local_err);
        }
        error_propagate(&nvgpus->errp, local_err);
    }
    if ((pci_default_read_config(pdev, PCI_HEADER_TYPE, 1) !=
         PCI_HEADER_TYPE_BRIDGE)) {
        return;
    }

    sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev));
    if (!sec_bus) {
        return;
    }

    pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
                        spapr_phb_pci_collect_nvgpu, opaque);
}

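/*
 * Walks the vPHB's PCI bus for NVLink2 GPUs and NPUs and maps the GPU RAM
 * and ATSD MRs found into system memory at addresses taken from the GPA
 * and ATSD windows. The config is freed again if nothing mappable was
 * found or collecting failed.
 */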
void spapr_phb_nvgpu_setup(SpaprPhbState *sphb, Error **errp)
{
    int i, j, valid_gpu_num;
    PCIBus *bus;

    /* Search for GPUs and NPUs */
    if (!sphb->nv2_gpa_win_addr || !sphb->nv2_atsd_win_addr) {
        return;
    }

    sphb->nvgpus = g_new0(SpaprPhbPciNvGpuConfig, 1);
    sphb->nvgpus->nv2_ram_current = sphb->nv2_gpa_win_addr;
    sphb->nvgpus->nv2_atsd_current = sphb->nv2_atsd_win_addr;

    bus = PCI_HOST_BRIDGE(sphb)->bus;
    pci_for_each_device(bus, pci_bus_num(bus),
                        spapr_phb_pci_collect_nvgpu, sphb->nvgpus);

    if (sphb->nvgpus->errp) {
        error_propagate(errp, sphb->nvgpus->errp);
        sphb->nvgpus->errp = NULL;
        goto cleanup_exit;
    }

    /* Map the GPU RAM and ATSD MRs that were found into system memory */
    for (i = 0, valid_gpu_num = 0; i < sphb->nvgpus->num; ++i) {
        Object *nvmrobj;
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];

        if (!nvslot->gpdev) {
            continue;
        }
        nvmrobj = object_property_get_link(OBJECT(nvslot->gpdev),
                                           "nvlink2-mr[0]", NULL);
        /* ATSD is pointless without GPU RAM MR so skip those */
        if (!nvmrobj) {
            continue;
        }

        ++valid_gpu_num;
        memory_region_add_subregion(get_system_memory(), nvslot->gpa,
                                    MEMORY_REGION(nvmrobj));

        for (j = 0; j < nvslot->linknum; ++j) {
            Object *atsdmrobj;

            atsdmrobj = object_property_get_link(OBJECT(nvslot->links[j].npdev),
                                                 "nvlink2-atsd-mr[0]", NULL);
            if (!atsdmrobj) {
                continue;
            }
            memory_region_add_subregion(get_system_memory(),
                                        nvslot->links[j].atsd_gpa,
                                        MEMORY_REGION(atsdmrobj));
        }
    }

    if (valid_gpu_num) {
        return;
    }
    /* We did not find any interesting GPU */
cleanup_exit:
    g_free(sphb->nvgpus);
    sphb->nvgpus = NULL;
}

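/* Undoes the system memory mappings established by spapr_phb_nvgpu_setup() */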
void spapr_phb_nvgpu_free(SpaprPhbState *sphb)
{
    int i, j;

    if (!sphb->nvgpus) {
        return;
    }

    for (i = 0; i < sphb->nvgpus->num; ++i) {
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];
        Object *nv_mrobj;

        /* Slots without a GPU were not mapped in setup, nothing to undo */
        if (!nvslot->gpdev) {
            continue;
        }
        nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev),
                                            "nvlink2-mr[0]", NULL);
        if (nv_mrobj) {
            memory_region_del_subregion(get_system_memory(),
                                        MEMORY_REGION(nv_mrobj));
        }
        for (j = 0; j < nvslot->linknum; ++j) {
            PCIDevice *npdev = nvslot->links[j].npdev;
            Object *atsd_mrobj;
            atsd_mrobj = object_property_get_link(OBJECT(npdev),
                                                  "nvlink2-atsd-mr[0]", NULL);
            if (atsd_mrobj) {
                memory_region_del_subregion(get_system_memory(),
                                            MEMORY_REGION(atsd_mrobj));
            }
        }
    }
    g_free(sphb->nvgpus);
    sphb->nvgpus = NULL;
}

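/*
 * Adds the "ibm,mmio-atsd" property, i.e. the guest physical addresses of
 * the collected ATSD registers, to the vPHB node at @bus_off.
 */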
void spapr_phb_nvgpu_populate_dt(SpaprPhbState *sphb, void *fdt, int bus_off,
                                 Error **errp)
{
    int i, j, atsdnum = 0;
    uint64_t atsd[8]; /* Known guests cannot use more than 8 ATSD registers */

    if (!sphb->nvgpus) {
        return;
    }

    for (i = 0; (i < sphb->nvgpus->num) && (atsdnum < ARRAY_SIZE(atsd)); ++i) {
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];

        if (!nvslot->gpdev) {
            continue;
        }
        for (j = 0; j < nvslot->linknum; ++j) {
            if (!nvslot->links[j].atsd_gpa) {
                continue;
            }

            if (atsdnum == ARRAY_SIZE(atsd)) {
                error_report("Only %" PRIuPTR " ATSD registers supported",
                             ARRAY_SIZE(atsd));
                break;
            }
            atsd[atsdnum] = cpu_to_be64(nvslot->links[j].atsd_gpa);
            ++atsdnum;
        }
    }

    if (!atsdnum) {
        error_setg(errp, "No ATSD registers found");
        return;
    }

    if (!spapr_phb_eeh_available(sphb)) {
        /*
         * ibm,mmio-atsd contains ATSD registers; these belong to an NPU PHB
         * which we do not emulate as a separate device. Instead we put
         * ibm,mmio-atsd to the vPHB with GPU and make sure that we do not
         * put GPUs from different IOMMU groups to the same vPHB to ensure
         * that the guest will use ATSDs from the corresponding NPU.
         */
        error_setg(errp, "ATSD requires separate vPHB per GPU IOMMU group");
        return;
    }

    _FDT((fdt_setprop(fdt, bus_off, "ibm,mmio-atsd", atsd,
                      atsdnum * sizeof(atsd[0]))));
}

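/*
 * Creates an NPU node with a "link@x" subnode per NVLink, and a memory
 * node per GPU RAM region; the memory nodes advertise a zero usable size
 * so the guest does not treat the GPU RAM as ordinary memory.
 */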
void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
{
    int i, j, linkidx, npuoff;
    char *npuname;

    if (!sphb->nvgpus) {
        return;
    }

    npuname = g_strdup_printf("npuphb%d", sphb->index);
    npuoff = fdt_add_subnode(fdt, 0, npuname);
    _FDT(npuoff);
    _FDT(fdt_setprop_cell(fdt, npuoff, "#address-cells", 1));
    _FDT(fdt_setprop_cell(fdt, npuoff, "#size-cells", 0));
    /* Advertise NPU as POWER9 so the guest can enable NPU2 contexts */
    _FDT((fdt_setprop_string(fdt, npuoff, "compatible", "ibm,power9-npu")));
    g_free(npuname);

    for (i = 0, linkidx = 0; i < sphb->nvgpus->num; ++i) {
        for (j = 0; j < sphb->nvgpus->slots[i].linknum; ++j) {
            char *linkname = g_strdup_printf("link@%d", linkidx);
            int off = fdt_add_subnode(fdt, npuoff, linkname);

            _FDT(off);
            /* _FDT((fdt_setprop_cell(fdt, off, "reg", linkidx))); */
            _FDT((fdt_setprop_string(fdt, off, "compatible",
                                     "ibm,npu-link")));
            _FDT((fdt_setprop_cell(fdt, off, "phandle",
                                   PHANDLE_NVLINK(sphb, i, j))));
            _FDT((fdt_setprop_cell(fdt, off, "ibm,npu-link-index", linkidx)));
            g_free(linkname);
            ++linkidx;
        }
    }

    /* Add memory nodes for GPU RAM and mark them unusable */
    for (i = 0; i < sphb->nvgpus->num; ++i) {
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];
        Object *nv_mrobj;
        uint32_t associativity[] = {
            cpu_to_be32(0x4),
            SPAPR_GPU_NUMA_ID,
            SPAPR_GPU_NUMA_ID,
            SPAPR_GPU_NUMA_ID,
            cpu_to_be32(nvslot->numa_id)
        };
        uint64_t size, mem_reg[2];
        char *mem_name;
        int off;

        /* Skip slots without a GPU: there is no GPU RAM MR to describe */
        if (!nvslot->gpdev) {
            continue;
        }

        nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev),
                                            "nvlink2-mr[0]", NULL);
        size = object_property_get_uint(nv_mrobj, "size", NULL);
        mem_reg[0] = cpu_to_be64(nvslot->gpa);
        mem_reg[1] = cpu_to_be64(size);
        mem_name = g_strdup_printf("memory@%" PRIx64, nvslot->gpa);
        off = fdt_add_subnode(fdt, 0, mem_name);

        _FDT(off);
        _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
        _FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg))));
        _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
                          sizeof(associativity))));

        _FDT((fdt_setprop_string(fdt, off, "compatible",
                                 "ibm,coherent-device-memory")));

        /* Zero usable size keeps the guest from using the GPU RAM as RAM */
        mem_reg[1] = cpu_to_be64(0);
        _FDT((fdt_setprop(fdt, off, "linux,usable-memory", mem_reg,
                          sizeof(mem_reg))));
        _FDT((fdt_setprop_cell(fdt, off, "phandle",
                               PHANDLE_GPURAM(sphb, i))));
        g_free(mem_name);
    }
}

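/*
 * Adds NVLink2 properties to the FDT node of @dev at @offset: a GPU gets
 * a phandle and the "ibm,npu" list of its bridges; an NPU bridge gets
 * references back to its GPU, link node and GPU RAM memory node, plus the
 * device target address and link speed.
 */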
void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int offset,
                                        SpaprPhbState *sphb)
{
    int i, j;

    if (!sphb->nvgpus) {
        return;
    }

    for (i = 0; i < sphb->nvgpus->num; ++i) {
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];

        /* Skip "slot" without attached GPU */
        if (!nvslot->gpdev) {
            continue;
        }
        if (dev == nvslot->gpdev) {
            uint32_t npus[nvslot->linknum];

            for (j = 0; j < nvslot->linknum; ++j) {
                PCIDevice *npdev = nvslot->links[j].npdev;

                npus[j] = cpu_to_be32(PHANDLE_PCIDEV(sphb, npdev));
            }
            _FDT(fdt_setprop(fdt, offset, "ibm,npu", npus,
                             j * sizeof(npus[0])));
            _FDT((fdt_setprop_cell(fdt, offset, "phandle",
                                   PHANDLE_PCIDEV(sphb, dev))));
            continue;
        }

        for (j = 0; j < nvslot->linknum; ++j) {
            if (dev != nvslot->links[j].npdev) {
                continue;
            }

            _FDT((fdt_setprop_cell(fdt, offset, "phandle",
                                   PHANDLE_PCIDEV(sphb, dev))));
            _FDT(fdt_setprop_cell(fdt, offset, "ibm,gpu",
                                  PHANDLE_PCIDEV(sphb, nvslot->gpdev)));
            _FDT((fdt_setprop_cell(fdt, offset, "ibm,nvlink",
                                   PHANDLE_NVLINK(sphb, i, j))));
            /*
             * If we ever want to emulate GPU RAM at the same location as on
             * the host - here is the encoding GPA->TGT:
             *
             * gta  = ((sphb->nv2_gpa >> 42) & 0x1) << 42;
             * gta |= ((sphb->nv2_gpa >> 45) & 0x3) << 43;
             * gta |= ((sphb->nv2_gpa >> 49) & 0x3) << 45;
             * gta |= sphb->nv2_gpa & ((1UL << 43) - 1);
             */
            _FDT(fdt_setprop_cell(fdt, offset, "memory-region",
                                  PHANDLE_GPURAM(sphb, i)));
            _FDT(fdt_setprop_u64(fdt, offset, "ibm,device-tgt-addr",
                                 nvslot->tgt));
            _FDT(fdt_setprop_cell(fdt, offset, "ibm,nvlink-speed",
                                  nvslot->links[j].link_speed));
        }
    }
}