/*
 * QEMU sPAPR PCI for NVLink2 pass through
 *
 * Copyright (c) 2019 Alexey Kardashevskiy, IBM Corporation.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu-common.h"
#include "hw/pci/pci.h"
#include "hw/pci-host/spapr.h"
#include "qemu/error-report.h"
#include "hw/ppc/fdt.h"
#include "hw/pci/pci_bridge.h"

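/*
 * Generators for the phandles used in the device tree below. Phandles only
 * need to be unique within the FDT, so each value mixes a magic base with
 * the vPHB index and a per-device discriminator (devfn, GPU RAM region
 * number, or GPU/link pair).
 */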
#define PHANDLE_PCIDEV(phb, pdev)    (0x12000000 | \
                                     (((phb)->index) << 16) | ((pdev)->devfn))
#define PHANDLE_GPURAM(phb, n)       (0x110000FF | ((n) << 8) | \
                                     (((phb)->index) << 16))
#define PHANDLE_NVLINK(phb, gn, nn)  (0x00130000 | (((phb)->index) << 8) | \
                                     ((gn) << 4) | (nn))

#define SPAPR_GPU_NUMA_ID            (cpu_to_be32(1))

typedef struct SpaprPhbPciNvGpuSlot {
    uint64_t tgt;
    uint64_t gpa;
    unsigned numa_id;
    PCIDevice *gpdev;
    int linknum;
    struct {
        uint64_t atsd_gpa;
        PCIDevice *npdev;
        uint32_t link_speed;
    } links[NVGPU_MAX_LINKS];
} SpaprPhbPciNvGpuSlot;

struct SpaprPhbPciNvGpuConfig {
    uint64_t nv2_ram_current;
    uint64_t nv2_atsd_current;
    int num; /* number of non-empty (i.e. tgt != 0) entries in slots[] */
    SpaprPhbPciNvGpuSlot slots[NVGPU_MAX_NUM];
    Error *errp;
};

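/*
 * Returns the slot for the given NVLink2 target address @tgt, allocating
 * a new one on first use; returns NULL if slots[] is already full.
 */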
static SpaprPhbPciNvGpuSlot *
spapr_nvgpu_get_slot(SpaprPhbPciNvGpuConfig *nvgpus, uint64_t tgt)
{
    int i;

    /* Search for partially collected "slot" */
    for (i = 0; i < nvgpus->num; ++i) {
        if (nvgpus->slots[i].tgt == tgt) {
            return &nvgpus->slots[i];
        }
    }

    if (nvgpus->num == ARRAY_SIZE(nvgpus->slots)) {
        return NULL;
    }

    i = nvgpus->num;
    nvgpus->slots[i].tgt = tgt;
    ++nvgpus->num;

    return &nvgpus->slots[i];
}

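/*
 * Records @pdev as the GPU of its slot and reserves the next chunk of the
 * GPU RAM window for the device's memory region @mr; the GPU also gets a
 * NUMA node id from the machine-wide counter.
 */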
static void spapr_pci_collect_nvgpu(SpaprPhbPciNvGpuConfig *nvgpus,
                                    PCIDevice *pdev, uint64_t tgt,
                                    MemoryRegion *mr, Error **errp)
{
    MachineState *machine = MACHINE(qdev_get_machine());
    SpaprMachineState *spapr = SPAPR_MACHINE(machine);
    SpaprPhbPciNvGpuSlot *nvslot = spapr_nvgpu_get_slot(nvgpus, tgt);

    if (!nvslot) {
        error_setg(errp, "Found too many GPUs per vPHB");
        return;
    }
    g_assert(!nvslot->gpdev);
    nvslot->gpdev = pdev;

    nvslot->gpa = nvgpus->nv2_ram_current;
    nvgpus->nv2_ram_current += memory_region_size(mr);
    nvslot->numa_id = spapr->gpu_numa_id;
    ++spapr->gpu_numa_id;
}

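/*
 * Records @pdev as one more NVLink bridge (NPU) of its slot (the bridge
 * carries the same "nvlink2-tgt" value as its GPU) and reserves room for
 * the bridge's ATSD register region @mr in the ATSD window.
 */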
static void spapr_pci_collect_nvnpu(SpaprPhbPciNvGpuConfig *nvgpus,
                                    PCIDevice *pdev, uint64_t tgt,
                                    MemoryRegion *mr, Error **errp)
{
    SpaprPhbPciNvGpuSlot *nvslot = spapr_nvgpu_get_slot(nvgpus, tgt);
    int j;

    if (!nvslot) {
        error_setg(errp, "Found too many NVLink bridges per vPHB");
        return;
    }

    j = nvslot->linknum;
    if (j == ARRAY_SIZE(nvslot->links)) {
        error_setg(errp, "Found too many NVLink bridges per GPU");
        return;
    }
    ++nvslot->linknum;

    g_assert(!nvslot->links[j].npdev);
    nvslot->links[j].npdev = pdev;
    nvslot->links[j].atsd_gpa = nvgpus->nv2_atsd_current;
    nvgpus->nv2_atsd_current += memory_region_size(mr);
    nvslot->links[j].link_speed =
        object_property_get_uint(OBJECT(pdev), "nvlink2-link-speed", NULL);
}

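/*
 * pci_for_each_device() callback: collects every device which exposes the
 * "nvlink2-tgt" property into the SpaprPhbPciNvGpuConfig passed via
 * @opaque, and recurses into PCI bridges. The first collection error is
 * kept in nvgpus->errp for the caller to report.
 */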
static void spapr_phb_pci_collect_nvgpu(PCIBus *bus, PCIDevice *pdev,
                                        void *opaque)
{
    PCIBus *sec_bus;
    Object *po = OBJECT(pdev);
    uint64_t tgt = object_property_get_uint(po, "nvlink2-tgt", NULL);

    if (tgt) {
        Error *local_err = NULL;
        SpaprPhbPciNvGpuConfig *nvgpus = opaque;
        Object *mr_gpu = object_property_get_link(po, "nvlink2-mr[0]", NULL);
        Object *mr_npu = object_property_get_link(po, "nvlink2-atsd-mr[0]",
                                                  NULL);

        g_assert(mr_gpu || mr_npu);
        if (mr_gpu) {
            spapr_pci_collect_nvgpu(nvgpus, pdev, tgt, MEMORY_REGION(mr_gpu),
                                    &local_err);
        } else {
            spapr_pci_collect_nvnpu(nvgpus, pdev, tgt, MEMORY_REGION(mr_npu),
                                    &local_err);
        }
        error_propagate(&nvgpus->errp, local_err);
    }
    if ((pci_default_read_config(pdev, PCI_HEADER_TYPE, 1) !=
         PCI_HEADER_TYPE_BRIDGE)) {
        return;
    }

    sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev));
    if (!sec_bus) {
        return;
    }

    pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
                        spapr_phb_pci_collect_nvgpu, opaque);
}

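/*
 * Walks the vPHB's PCI bus for NVLink2 GPUs and NPUs and maps the GPU RAM
 * and ATSD MRs found into system memory at addresses taken from the GPA
 * and ATSD windows. The config is freed again if nothing mappable was
 * found or collecting failed.
 */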
void spapr_phb_nvgpu_setup(SpaprPhbState *sphb, Error **errp)
{
    int i, j, valid_gpu_num;
    PCIBus *bus;

    /* Search for GPUs and NPUs */
    if (!sphb->nv2_gpa_win_addr || !sphb->nv2_atsd_win_addr) {
        return;
    }

    sphb->nvgpus = g_new0(SpaprPhbPciNvGpuConfig, 1);
    sphb->nvgpus->nv2_ram_current = sphb->nv2_gpa_win_addr;
    sphb->nvgpus->nv2_atsd_current = sphb->nv2_atsd_win_addr;

    bus = PCI_HOST_BRIDGE(sphb)->bus;
    pci_for_each_device(bus, pci_bus_num(bus),
                        spapr_phb_pci_collect_nvgpu, sphb->nvgpus);

    if (sphb->nvgpus->errp) {
        error_propagate(errp, sphb->nvgpus->errp);
        sphb->nvgpus->errp = NULL;
        goto cleanup_exit;
    }

    /* Map the GPU RAM and ATSD MRs that were found into system memory */
    for (i = 0, valid_gpu_num = 0; i < sphb->nvgpus->num; ++i) {
        Object *nvmrobj;
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];

        if (!nvslot->gpdev) {
            continue;
        }
        nvmrobj = object_property_get_link(OBJECT(nvslot->gpdev),
                                           "nvlink2-mr[0]", NULL);
        /* ATSD is pointless without GPU RAM MR so skip those */
        if (!nvmrobj) {
            continue;
        }

        ++valid_gpu_num;
        memory_region_add_subregion(get_system_memory(), nvslot->gpa,
                                    MEMORY_REGION(nvmrobj));

        for (j = 0; j < nvslot->linknum; ++j) {
            Object *atsdmrobj;

            atsdmrobj = object_property_get_link(OBJECT(nvslot->links[j].npdev),
                                                 "nvlink2-atsd-mr[0]", NULL);
            if (!atsdmrobj) {
                continue;
            }
            memory_region_add_subregion(get_system_memory(),
                                        nvslot->links[j].atsd_gpa,
                                        MEMORY_REGION(atsdmrobj));
        }
    }

    if (valid_gpu_num) {
        return;
    }
    /* We did not find any interesting GPU */
cleanup_exit:
    g_free(sphb->nvgpus);
    sphb->nvgpus = NULL;
}

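/* Undoes the system memory mappings established by spapr_phb_nvgpu_setup() */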
void spapr_phb_nvgpu_free(SpaprPhbState *sphb)
{
    int i, j;

    if (!sphb->nvgpus) {
        return;
    }

    for (i = 0; i < sphb->nvgpus->num; ++i) {
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];
        Object *nv_mrobj;

        /* Slots without a GPU were not mapped in setup, nothing to undo */
        if (!nvslot->gpdev) {
            continue;
        }
        nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev),
                                            "nvlink2-mr[0]", NULL);
        if (nv_mrobj) {
            memory_region_del_subregion(get_system_memory(),
                                        MEMORY_REGION(nv_mrobj));
        }
        for (j = 0; j < nvslot->linknum; ++j) {
            PCIDevice *npdev = nvslot->links[j].npdev;
            Object *atsd_mrobj;
            atsd_mrobj = object_property_get_link(OBJECT(npdev),
                                                  "nvlink2-atsd-mr[0]", NULL);
            if (atsd_mrobj) {
                memory_region_del_subregion(get_system_memory(),
                                            MEMORY_REGION(atsd_mrobj));
            }
        }
    }
    g_free(sphb->nvgpus);
    sphb->nvgpus = NULL;
}

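/*
 * Adds the "ibm,mmio-atsd" property, i.e. the guest physical addresses of
 * the collected ATSD registers, to the vPHB node at @bus_off.
 */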
void spapr_phb_nvgpu_populate_dt(SpaprPhbState *sphb, void *fdt, int bus_off,
                                 Error **errp)
{
    int i, j, atsdnum = 0;
    uint64_t atsd[8]; /* Known guests cannot use more than 8 ATSD registers */

    if (!sphb->nvgpus) {
        return;
    }

    for (i = 0; (i < sphb->nvgpus->num) && (atsdnum < ARRAY_SIZE(atsd)); ++i) {
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];

        if (!nvslot->gpdev) {
            continue;
        }
        for (j = 0; j < nvslot->linknum; ++j) {
            if (!nvslot->links[j].atsd_gpa) {
                continue;
            }

            if (atsdnum == ARRAY_SIZE(atsd)) {
                error_report("Only %" PRIuPTR " ATSD registers supported",
                             ARRAY_SIZE(atsd));
                break;
            }
            atsd[atsdnum] = cpu_to_be64(nvslot->links[j].atsd_gpa);
            ++atsdnum;
        }
    }

    if (!atsdnum) {
        error_setg(errp, "No ATSD registers found");
        return;
    }

    if (!spapr_phb_eeh_available(sphb)) {
        /*
         * ibm,mmio-atsd contains ATSD registers; these belong to an NPU PHB
         * which we do not emulate as a separate device. Instead we put
         * ibm,mmio-atsd to the vPHB with GPU and make sure that we do not
         * put GPUs from different IOMMU groups to the same vPHB to ensure
         * that the guest will use ATSDs from the corresponding NPU.
         */
        error_setg(errp, "ATSD requires separate vPHB per GPU IOMMU group");
        return;
    }

    _FDT((fdt_setprop(fdt, bus_off, "ibm,mmio-atsd", atsd,
                      atsdnum * sizeof(atsd[0]))));
}

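/*
 * Creates an NPU node with a "link@x" subnode per NVLink, and a memory
 * node per GPU RAM region; the memory nodes advertise a zero usable size
 * so the guest does not treat the GPU RAM as ordinary memory.
 */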
void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
{
    int i, j, linkidx, npuoff;
    char *npuname;

    if (!sphb->nvgpus) {
        return;
    }

    npuname = g_strdup_printf("npuphb%d", sphb->index);
    npuoff = fdt_add_subnode(fdt, 0, npuname);
    _FDT(npuoff);
    _FDT(fdt_setprop_cell(fdt, npuoff, "#address-cells", 1));
    _FDT(fdt_setprop_cell(fdt, npuoff, "#size-cells", 0));
    /* Advertise NPU as POWER9 so the guest can enable NPU2 contexts */
    _FDT((fdt_setprop_string(fdt, npuoff, "compatible", "ibm,power9-npu")));
    g_free(npuname);

    for (i = 0, linkidx = 0; i < sphb->nvgpus->num; ++i) {
        for (j = 0; j < sphb->nvgpus->slots[i].linknum; ++j) {
            char *linkname = g_strdup_printf("link@%d", linkidx);
            int off = fdt_add_subnode(fdt, npuoff, linkname);

            _FDT(off);
            /* _FDT((fdt_setprop_cell(fdt, off, "reg", linkidx))); */
            _FDT((fdt_setprop_string(fdt, off, "compatible",
                                     "ibm,npu-link")));
            _FDT((fdt_setprop_cell(fdt, off, "phandle",
                                   PHANDLE_NVLINK(sphb, i, j))));
            _FDT((fdt_setprop_cell(fdt, off, "ibm,npu-link-index", linkidx)));
            g_free(linkname);
            ++linkidx;
        }
    }

    /* Add memory nodes for GPU RAM and mark them unusable */
    for (i = 0; i < sphb->nvgpus->num; ++i) {
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];
        Object *nv_mrobj;
        uint32_t associativity[] = {
            cpu_to_be32(0x4),
            SPAPR_GPU_NUMA_ID,
            SPAPR_GPU_NUMA_ID,
            SPAPR_GPU_NUMA_ID,
            cpu_to_be32(nvslot->numa_id)
        };
        uint64_t size, mem_reg[2];
        char *mem_name;
        int off;

        /* Skip slots without a GPU: there is no GPU RAM MR to describe */
        if (!nvslot->gpdev) {
            continue;
        }

        nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev),
                                            "nvlink2-mr[0]", NULL);
        size = object_property_get_uint(nv_mrobj, "size", NULL);
        mem_reg[0] = cpu_to_be64(nvslot->gpa);
        mem_reg[1] = cpu_to_be64(size);
        mem_name = g_strdup_printf("memory@%" PRIx64, nvslot->gpa);
        off = fdt_add_subnode(fdt, 0, mem_name);

        _FDT(off);
        _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
        _FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg))));
        _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
                          sizeof(associativity))));

        _FDT((fdt_setprop_string(fdt, off, "compatible",
                                 "ibm,coherent-device-memory")));

        /* Zero usable size keeps the guest from using the GPU RAM as RAM */
        mem_reg[1] = cpu_to_be64(0);
        _FDT((fdt_setprop(fdt, off, "linux,usable-memory", mem_reg,
                          sizeof(mem_reg))));
        _FDT((fdt_setprop_cell(fdt, off, "phandle",
                               PHANDLE_GPURAM(sphb, i))));
        g_free(mem_name);
    }
}

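/*
 * Adds NVLink2 properties to the FDT node of @dev at @offset: a GPU gets
 * a phandle and the "ibm,npu" list of its bridges; an NPU bridge gets
 * references back to its GPU, link node and GPU RAM memory node, plus the
 * device target address and link speed.
 */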
void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int offset,
                                        SpaprPhbState *sphb)
{
    int i, j;

    if (!sphb->nvgpus) {
        return;
    }

    for (i = 0; i < sphb->nvgpus->num; ++i) {
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];

        /* Skip "slot" without attached GPU */
        if (!nvslot->gpdev) {
            continue;
        }
        if (dev == nvslot->gpdev) {
            uint32_t npus[nvslot->linknum];

            for (j = 0; j < nvslot->linknum; ++j) {
                PCIDevice *npdev = nvslot->links[j].npdev;

                npus[j] = cpu_to_be32(PHANDLE_PCIDEV(sphb, npdev));
            }
            _FDT(fdt_setprop(fdt, offset, "ibm,npu", npus,
                             j * sizeof(npus[0])));
            _FDT((fdt_setprop_cell(fdt, offset, "phandle",
                                   PHANDLE_PCIDEV(sphb, dev))));
            continue;
        }

        for (j = 0; j < nvslot->linknum; ++j) {
            if (dev != nvslot->links[j].npdev) {
                continue;
            }

            _FDT((fdt_setprop_cell(fdt, offset, "phandle",
                                   PHANDLE_PCIDEV(sphb, dev))));
            _FDT(fdt_setprop_cell(fdt, offset, "ibm,gpu",
                                  PHANDLE_PCIDEV(sphb, nvslot->gpdev)));
            _FDT((fdt_setprop_cell(fdt, offset, "ibm,nvlink",
                                   PHANDLE_NVLINK(sphb, i, j))));
            /*
             * If we ever want to emulate GPU RAM at the same location as on
             * the host - here is the encoding GPA->TGT:
             *
             * gta  = ((sphb->nv2_gpa >> 42) & 0x1) << 42;
             * gta |= ((sphb->nv2_gpa >> 45) & 0x3) << 43;
             * gta |= ((sphb->nv2_gpa >> 49) & 0x3) << 45;
             * gta |= sphb->nv2_gpa & ((1UL << 43) - 1);
             */
            _FDT(fdt_setprop_cell(fdt, offset, "memory-region",
                                  PHANDLE_GPURAM(sphb, i)));
            _FDT(fdt_setprop_u64(fdt, offset, "ibm,device-tgt-addr",
                                 nvslot->tgt));
            _FDT(fdt_setprop_cell(fdt, offset, "ibm,nvlink-speed",
                                  nvslot->links[j].link_speed));
        }
    }
}