1 | /** |
2 | * Copyright (c) 2006-2023 LOVE Development Team |
3 | * |
4 | * This software is provided 'as-is', without any express or implied |
5 | * warranty. In no event will the authors be held liable for any damages |
6 | * arising from the use of this software. |
7 | * |
8 | * Permission is granted to anyone to use this software for any purpose, |
9 | * including commercial applications, and to alter it and redistribute it |
10 | * freely, subject to the following restrictions: |
11 | * |
12 | * 1. The origin of this software must not be misrepresented; you must not |
13 | * claim that you wrote the original software. If you use this software |
14 | * in a product, an acknowledgment in the product documentation would be |
15 | * appreciated but is not required. |
16 | * 2. Altered source versions must be plainly marked as such, and must not be |
17 | * misrepresented as being the original software. |
18 | * 3. This notice may not be removed or altered from any source distribution. |
19 | **/ |
20 | |
21 | #include "common/config.h" |
22 | #include "StreamBuffer.h" |
23 | #include "OpenGL.h" |
24 | #include "FenceSync.h" |
25 | #include "graphics/Volatile.h" |
26 | #include "common/Exception.h" |
27 | #include "common/memory.h" |
28 | |
29 | #include <vector> |
30 | #include <algorithm> |
31 | |
32 | namespace love |
33 | { |
34 | namespace graphics |
35 | { |
36 | namespace opengl |
37 | { |
38 | |
// Number of per-frame sections kept in the fence-synchronized stream buffers.
// Three would normally suffice, but syncing currently happens only once per
// frame, so one more section is kept to lower the (small) chance of a stall.
static constexpr int BUFFER_FRAMES = 4;
42 | |
43 | class StreamBufferClientMemory final : public love::graphics::StreamBuffer |
44 | { |
45 | public: |
46 | |
47 | StreamBufferClientMemory(BufferType mode, size_t size) |
48 | : love::graphics::StreamBuffer(mode, size) |
49 | , data(nullptr) |
50 | { |
51 | try |
52 | { |
53 | data = new uint8[size]; |
54 | } |
55 | catch (std::exception &) |
56 | { |
57 | throw love::Exception("Out of memory." ); |
58 | } |
59 | } |
60 | |
61 | virtual ~StreamBufferClientMemory() |
62 | { |
63 | delete[] data; |
64 | } |
65 | |
66 | MapInfo map(size_t /*minsize*/) override |
67 | { |
68 | return MapInfo(data, bufferSize); |
69 | } |
70 | |
71 | size_t unmap(size_t /*usedsize*/) override |
72 | { |
73 | return (size_t) data; |
74 | } |
75 | |
76 | void markUsed(size_t /*usedsize*/) override { } |
77 | ptrdiff_t getHandle() const override { return 0; } |
78 | |
79 | private: |
80 | |
81 | uint8 *data; |
82 | |
83 | }; // StreamBufferClientMemory |
84 | |
85 | class StreamBufferSubDataOrphan final : public love::graphics::StreamBuffer, public Volatile |
86 | { |
87 | public: |
88 | |
89 | StreamBufferSubDataOrphan(BufferType mode, size_t size) |
90 | : love::graphics::StreamBuffer(mode, size) |
91 | , vbo(0) |
92 | , glMode(OpenGL::getGLBufferType(mode)) |
93 | , data(nullptr) |
94 | , orphan(false) |
95 | { |
96 | try |
97 | { |
98 | data = new uint8[size]; |
99 | } |
100 | catch (std::exception &) |
101 | { |
102 | throw love::Exception("Out of memory." ); |
103 | } |
104 | |
105 | loadVolatile(); |
106 | } |
107 | |
108 | virtual ~StreamBufferSubDataOrphan() |
109 | { |
110 | unloadVolatile(); |
111 | delete[] data; |
112 | } |
113 | |
114 | MapInfo map(size_t /*minsize*/) override |
115 | { |
116 | if (orphan) |
117 | { |
118 | orphan = false; |
119 | frameGPUReadOffset = 0; |
120 | gl.bindBuffer(mode, vbo); |
121 | glBufferData(glMode, bufferSize, nullptr, GL_STREAM_DRAW); |
122 | } |
123 | |
124 | return MapInfo(data, bufferSize - frameGPUReadOffset); |
125 | } |
126 | |
127 | size_t unmap(size_t usedsize) override |
128 | { |
129 | gl.bindBuffer(mode, vbo); |
130 | glBufferSubData(glMode, frameGPUReadOffset, usedsize, data); |
131 | return frameGPUReadOffset; |
132 | } |
133 | |
134 | void markUsed(size_t usedsize) override |
135 | { |
136 | frameGPUReadOffset += usedsize; |
137 | } |
138 | |
139 | void nextFrame() override |
140 | { |
141 | // Orphan the buffer before its first use in the next frame. |
142 | frameGPUReadOffset = 0; |
143 | orphan = true; |
144 | } |
145 | |
146 | ptrdiff_t getHandle() const override { return vbo; } |
147 | |
148 | bool loadVolatile() override |
149 | { |
150 | if (vbo != 0) |
151 | return true; |
152 | |
153 | glGenBuffers(1, &vbo); |
154 | gl.bindBuffer(mode, vbo); |
155 | glBufferData(glMode, bufferSize, nullptr, GL_STREAM_DRAW); |
156 | |
157 | frameGPUReadOffset = 0; |
158 | orphan = false; |
159 | |
160 | return true; |
161 | } |
162 | |
163 | void unloadVolatile() override |
164 | { |
165 | if (vbo == 0) |
166 | return; |
167 | |
168 | gl.deleteBuffer(vbo); |
169 | vbo = 0; |
170 | } |
171 | |
172 | protected: |
173 | |
174 | GLuint vbo; |
175 | GLenum glMode; |
176 | |
177 | uint8 *data; |
178 | |
179 | bool orphan; |
180 | |
181 | }; // StreamBufferSubDataOrphan |
182 | |
183 | class StreamBufferSync : public love::graphics::StreamBuffer |
184 | { |
185 | public: |
186 | |
187 | StreamBufferSync(BufferType type, size_t size) |
188 | : love::graphics::StreamBuffer(type, size) |
189 | , frameIndex(0) |
190 | , syncs() |
191 | {} |
192 | |
193 | virtual ~StreamBufferSync() {} |
194 | |
195 | void nextFrame() override |
196 | { |
197 | // Insert a GPU fence for this frame's section of the data, we'll wait |
198 | // for it when we try to map that data for writing in subsequent frames. |
199 | syncs[frameIndex].fence(); |
200 | |
201 | frameIndex = (frameIndex + 1) % BUFFER_FRAMES; |
202 | frameGPUReadOffset = 0; |
203 | } |
204 | |
205 | void markUsed(size_t usedsize) override |
206 | { |
207 | // We insert a fence for all data from this frame at the end of the |
208 | // frame (in nextFrame), rather than doing anything more fine-grained. |
209 | frameGPUReadOffset += usedsize; |
210 | } |
211 | |
212 | protected: |
213 | |
214 | int frameIndex; |
215 | FenceSync syncs[BUFFER_FRAMES]; |
216 | |
217 | }; // StreamBufferSync |
218 | |
219 | class StreamBufferMapSync final : public StreamBufferSync, public Volatile |
220 | { |
221 | public: |
222 | |
223 | StreamBufferMapSync(BufferType type, size_t size) |
224 | : StreamBufferSync(type, size) |
225 | , vbo(0) |
226 | , glMode(OpenGL::getGLBufferType(mode)) |
227 | { |
228 | loadVolatile(); |
229 | } |
230 | |
231 | ~StreamBufferMapSync() |
232 | { |
233 | unloadVolatile(); |
234 | } |
235 | |
236 | MapInfo map(size_t /*minsize*/) override |
237 | { |
238 | gl.bindBuffer(mode, vbo); |
239 | |
240 | // Make sure this frame's section of the buffer is done being used. |
241 | syncs[frameIndex].cpuWait(); |
242 | |
243 | MapInfo info; |
244 | info.size = bufferSize - frameGPUReadOffset; |
245 | |
246 | GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT; |
247 | |
248 | size_t mapoffset = (frameIndex * bufferSize) + frameGPUReadOffset; |
249 | info.data = (uint8 *) glMapBufferRange(glMode, mapoffset, info.size, flags); |
250 | |
251 | return info; |
252 | } |
253 | |
254 | size_t unmap(size_t usedsize) override |
255 | { |
256 | gl.bindBuffer(mode, vbo); |
257 | glFlushMappedBufferRange(glMode, 0, usedsize); |
258 | glUnmapBuffer(glMode); |
259 | |
260 | return (frameIndex * bufferSize) + frameGPUReadOffset; |
261 | } |
262 | |
263 | ptrdiff_t getHandle() const override { return vbo; } |
264 | |
265 | bool loadVolatile() override |
266 | { |
267 | if (vbo != 0) |
268 | return true; |
269 | |
270 | glGenBuffers(1, &vbo); |
271 | gl.bindBuffer(mode, vbo); |
272 | glBufferData(glMode, bufferSize * BUFFER_FRAMES, nullptr, GL_STREAM_DRAW); |
273 | |
274 | frameGPUReadOffset = 0; |
275 | frameIndex = 0; |
276 | |
277 | return true; |
278 | } |
279 | |
280 | void unloadVolatile() override |
281 | { |
282 | if (vbo != 0) |
283 | { |
284 | gl.deleteBuffer(vbo); |
285 | vbo = 0; |
286 | } |
287 | |
288 | for (FenceSync &sync : syncs) |
289 | sync.cleanup(); |
290 | } |
291 | |
292 | private: |
293 | |
294 | GLuint vbo; |
295 | GLenum glMode; |
296 | |
297 | }; // StreamBufferMapSync |
298 | |
299 | class StreamBufferPersistentMapSync final : public StreamBufferSync, public Volatile |
300 | { |
301 | public: |
302 | |
303 | // Coherent mapping is supposedly faster on intel/nvidia aside from a couple |
304 | // old nvidia GPUs. |
305 | StreamBufferPersistentMapSync(BufferType type, size_t size, bool coherent = true) |
306 | : StreamBufferSync(type, size) |
307 | , vbo(0) |
308 | , glMode(OpenGL::getGLBufferType(mode)) |
309 | , data(nullptr) |
310 | , coherent(coherent) |
311 | { |
312 | loadVolatile(); |
313 | } |
314 | |
315 | ~StreamBufferPersistentMapSync() |
316 | { |
317 | unloadVolatile(); |
318 | } |
319 | |
320 | MapInfo map(size_t /*minsize*/) override |
321 | { |
322 | // Make sure this frame's section of the buffer is done being used. |
323 | syncs[frameIndex].cpuWait(); |
324 | |
325 | MapInfo info; |
326 | info.size = bufferSize - frameGPUReadOffset; |
327 | info.data = data + (frameIndex * bufferSize) + frameGPUReadOffset; |
328 | return info; |
329 | } |
330 | |
331 | size_t unmap(size_t usedsize) override |
332 | { |
333 | size_t offset = (frameIndex * bufferSize) + frameGPUReadOffset; |
334 | |
335 | if (!coherent) |
336 | { |
337 | gl.bindBuffer(mode, vbo); |
338 | glFlushMappedBufferRange(glMode, offset, usedsize); |
339 | } |
340 | |
341 | return offset; |
342 | } |
343 | |
344 | ptrdiff_t getHandle() const override { return vbo; } |
345 | |
346 | bool loadVolatile() override |
347 | { |
348 | if (vbo != 0) |
349 | return true; |
350 | |
351 | glGenBuffers(1, &vbo); |
352 | gl.bindBuffer(mode, vbo); |
353 | |
354 | GLbitfield storageflags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT; |
355 | GLbitfield mapflags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT; |
356 | |
357 | storageflags |= (coherent ? GL_MAP_COHERENT_BIT : 0); |
358 | mapflags |= (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT); |
359 | |
360 | glBufferStorage(glMode, bufferSize * BUFFER_FRAMES, nullptr, storageflags); |
361 | data = (uint8 *) glMapBufferRange(glMode, 0, bufferSize * BUFFER_FRAMES, mapflags); |
362 | |
363 | frameGPUReadOffset = 0; |
364 | frameIndex = 0; |
365 | |
366 | return true; |
367 | } |
368 | |
369 | void unloadVolatile() override |
370 | { |
371 | if (vbo != 0) |
372 | { |
373 | gl.bindBuffer(mode, vbo); |
374 | glUnmapBuffer(glMode); |
375 | gl.deleteBuffer(vbo); |
376 | vbo = 0; |
377 | } |
378 | |
379 | for (FenceSync &sync : syncs) |
380 | sync.cleanup(); |
381 | } |
382 | |
383 | private: |
384 | |
385 | GLuint vbo; |
386 | GLenum glMode; |
387 | uint8 *data; |
388 | bool coherent; |
389 | |
390 | }; // StreamBufferPersistentMapSync |
391 | |
392 | class StreamBufferPinnedMemory final : public StreamBufferSync, public Volatile |
393 | { |
394 | public: |
395 | |
396 | StreamBufferPinnedMemory(BufferType type, size_t size) |
397 | : StreamBufferSync(type, size) |
398 | , vbo(0) |
399 | , glMode(OpenGL::getGLBufferType(mode)) |
400 | , data(nullptr) |
401 | , alignedSize(0) |
402 | { |
403 | size_t alignment = getPageSize(); |
404 | alignedSize = alignUp(size * BUFFER_FRAMES, alignment); |
405 | |
406 | if (!alignedMalloc((void **) &data, alignedSize, alignment)) |
407 | throw love::Exception("Out of memory." ); |
408 | |
409 | if (!loadVolatile()) |
410 | { |
411 | ptrdiff_t pointer = (ptrdiff_t) data; |
412 | alignedFree(data); |
413 | throw love::Exception("AMD Pinned Memory StreamBuffer implementation failed to create buffer (address: %p, alignment: %ld, aiigned size: %ld)" , pointer, alignment, alignedSize); |
414 | } |
415 | } |
416 | |
417 | ~StreamBufferPinnedMemory() |
418 | { |
419 | unloadVolatile(); |
420 | alignedFree(data); |
421 | } |
422 | |
423 | MapInfo map(size_t /*minsize*/) override |
424 | { |
425 | // Make sure this frame's section of the buffer is done being used. |
426 | syncs[frameIndex].cpuWait(); |
427 | |
428 | MapInfo info; |
429 | info.size = bufferSize - frameGPUReadOffset; |
430 | info.data = data + (frameIndex * bufferSize) + frameGPUReadOffset; |
431 | return info; |
432 | } |
433 | |
434 | size_t unmap(size_t /*usedsize*/) override |
435 | { |
436 | size_t offset = (frameIndex * bufferSize) + frameGPUReadOffset; |
437 | return offset; |
438 | } |
439 | |
440 | ptrdiff_t getHandle() const override { return vbo; } |
441 | |
442 | bool loadVolatile() override |
443 | { |
444 | if (vbo != 0) |
445 | return true; |
446 | |
447 | glGenBuffers(1, &vbo); |
448 | |
449 | while (glGetError() != GL_NO_ERROR) |
450 | /* Clear errors. */; |
451 | |
452 | glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, vbo); |
453 | glBufferData(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, alignedSize, data, GL_STREAM_DRAW); |
454 | |
455 | if (glGetError() != GL_NO_ERROR) |
456 | { |
457 | gl.deleteBuffer(vbo); |
458 | vbo = 0; |
459 | return false; |
460 | } |
461 | |
462 | frameGPUReadOffset = 0; |
463 | frameIndex = 0; |
464 | |
465 | return true; |
466 | } |
467 | |
468 | void unloadVolatile() override |
469 | { |
470 | if (vbo != 0) |
471 | { |
472 | // Make sure the GPU has completed all work before freeing the |
473 | // memory. glFlush+sync.cpuWait doesn't seem to be enough. |
474 | glFinish(); |
475 | |
476 | gl.bindBuffer(mode, vbo); |
477 | gl.deleteBuffer(vbo); |
478 | vbo = 0; |
479 | } |
480 | |
481 | for (FenceSync &sync : syncs) |
482 | sync.cleanup(); |
483 | } |
484 | |
485 | private: |
486 | |
487 | GLuint vbo; |
488 | GLenum glMode; |
489 | uint8 *data; |
490 | size_t alignedSize; |
491 | |
492 | }; // StreamBufferPinnedMemory |
493 | |
494 | love::graphics::StreamBuffer *CreateStreamBuffer(BufferType mode, size_t size) |
495 | { |
496 | if (gl.isCoreProfile()) |
497 | { |
498 | if (!gl.bugs.clientWaitSyncStalls) |
499 | { |
500 | // AMD's pinned memory seems to be faster than persistent mapping, |
501 | // on AMD GPUs. |
502 | if (GLAD_AMD_pinned_memory && gl.getVendor() == OpenGL::VENDOR_AMD) |
503 | { |
504 | try |
505 | { |
506 | return new StreamBufferPinnedMemory(mode, size); |
507 | } |
508 | catch (love::Exception &) |
509 | { |
510 | // According to the spec, pinned memory can fail if the RAM |
511 | // allocation can't be mapped to the GPU's address space. |
512 | // This seems to happen in practice on Mesa + amdgpu: |
513 | // https://bitbucket.org/rude/love/issues/1540 |
514 | // Fall through to other implementations when that happens. |
515 | } |
516 | } |
517 | |
518 | if (GLAD_VERSION_4_4 || GLAD_ARB_buffer_storage) |
519 | return new StreamBufferPersistentMapSync(mode, size); |
520 | |
521 | // Most modern drivers have a separate internal thread which queues |
522 | // GL commands for the GPU. The queue causes mapping to stall until |
523 | // the items in the queue are flushed, which makes this approach |
524 | // slow on most drivers. On macOS, having a separate driver thread |
525 | // is opt-in via an API, and we don't do it, so we can use this |
526 | // instead of the (potentially slower) SubData approach. |
527 | #ifdef LOVE_MACOSX |
528 | return new StreamBufferMapSync(mode, size); |
529 | #endif |
530 | } |
531 | |
532 | return new StreamBufferSubDataOrphan(mode, size); |
533 | } |
534 | else |
535 | return new StreamBufferClientMemory(mode, size); |
536 | } |
537 | |
538 | } // opengl |
539 | } // graphics |
540 | } // love |
541 | |