StreamBuffer.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399
  1. // Copyright 2013 Dolphin Emulator Project
  2. // Licensed under GPLv2+
  3. // Refer to the license.txt file included.
  4. #include "Common/MemoryUtil.h"
  5. #include "VideoBackends/OGL/GLUtil.h"
  6. #include "VideoBackends/OGL/Render.h"
  7. #include "VideoBackends/OGL/StreamBuffer.h"
  8. #include "VideoCommon/DriverDetails.h"
  9. #include "VideoCommon/OnScreenDisplay.h"
  10. namespace OGL
  11. {
  12. // moved out of constructor, so m_buffer is allowed to be const
  13. static u32 genBuffer()
  14. {
  15. u32 id;
  16. glGenBuffers(1, &id);
  17. return id;
  18. }
  19. StreamBuffer::StreamBuffer(u32 type, u32 size)
  20. : m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(IntLog2(ROUND_UP_POW2(size) / SYNC_POINTS))
  21. {
  22. m_iterator = 0;
  23. m_used_iterator = 0;
  24. m_free_iterator = 0;
  25. }
  26. StreamBuffer::~StreamBuffer()
  27. {
  28. glDeleteBuffers(1, &m_buffer);
  29. }
/* Shared synchronization code for ring buffers
 *
 * The next three functions are to create/delete/use the OpenGL synchronization.
 * ARB_sync (OpenGL 3.2) is used and required.
 *
 * To reduce overhead, the complete buffer is split up into SYNC_POINTS chunks.
 * For each of these chunks, there is a fence which checks if this chunk is still in use.
 *
 * As our API allows allocating more memory than it has to use, we have to track how much is already written.
 *
 * m_iterator      - writing position
 * m_free_iterator - last position checked if free
 * m_used_iterator - last position known to be written
 *
 * So on alloc, we have to wait for all slots between m_free_iterator and m_iterator (and set m_free_iterator to m_iterator afterwards).
 *
 * We also assume that this buffer is accessed by the GPU between the Unmap and Map function,
 * so we may create the fences on the start of mapping.
 * Same here, new fences for the chunks between m_used_iterator and m_iterator (also update m_used_iterator).
 *
 * As ring buffers have an ugly behavior on rollover, have fun reading this code ;)
 */
  52. void StreamBuffer::CreateFences()
  53. {
  54. for (int i=0; i<SYNC_POINTS; i++)
  55. {
  56. fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
  57. }
  58. }
  59. void StreamBuffer::DeleteFences()
  60. {
  61. for (int i = SLOT(m_free_iterator) + 1; i < SYNC_POINTS; i++)
  62. {
  63. glDeleteSync(fences[i]);
  64. }
  65. for (int i = 0; i < SLOT(m_iterator); i++)
  66. {
  67. glDeleteSync(fences[i]);
  68. }
  69. }
  70. void StreamBuffer::AllocMemory(u32 size)
  71. {
  72. // insert waiting slots for used memory
  73. for (int i = SLOT(m_used_iterator); i < SLOT(m_iterator); i++)
  74. {
  75. fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
  76. }
  77. m_used_iterator = m_iterator;
  78. // wait for new slots to end of buffer
  79. for (int i = SLOT(m_free_iterator) + 1; i <= SLOT(m_iterator + size) && i < SYNC_POINTS; i++)
  80. {
  81. glClientWaitSync(fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
  82. glDeleteSync(fences[i]);
  83. }
  84. m_free_iterator = m_iterator + size;
  85. // if buffer is full
  86. if (m_iterator + size >= m_size)
  87. {
  88. // insert waiting slots in unused space at the end of the buffer
  89. for (int i = SLOT(m_used_iterator); i < SYNC_POINTS; i++)
  90. {
  91. fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
  92. }
  93. // move to the start
  94. m_used_iterator = m_iterator = 0; // offset 0 is always aligned
  95. // wait for space at the start
  96. for (int i = 0; i <= SLOT(m_iterator + size); i++)
  97. {
  98. glClientWaitSync(fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
  99. glDeleteSync(fences[i]);
  100. }
  101. m_free_iterator = m_iterator + size;
  102. }
  103. }
  104. /* The usual way to stream data to the GPU.
  105. * Described here: https://www.opengl.org/wiki/Buffer_Object_Streaming#Unsynchronized_buffer_mapping
  106. * Just do unsync appends until the buffer is full.
  107. * When it's full, orphan (alloc a new buffer and free the old one)
  108. *
  109. * As reallocation is an overhead, this method isn't as fast as it is known to be.
  110. */
  111. class MapAndOrphan : public StreamBuffer
  112. {
  113. public:
  114. MapAndOrphan(u32 type, u32 size) : StreamBuffer(type, size)
  115. {
  116. glBindBuffer(m_buffertype, m_buffer);
  117. glBufferData(m_buffertype, m_size, nullptr, GL_STREAM_DRAW);
  118. }
  119. ~MapAndOrphan()
  120. {
  121. }
  122. std::pair<u8*, u32> Map(u32 size) override
  123. {
  124. if (m_iterator + size >= m_size)
  125. {
  126. glBufferData(m_buffertype, m_size, nullptr, GL_STREAM_DRAW);
  127. m_iterator = 0;
  128. }
  129. u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size,
  130. GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
  131. return std::make_pair(pointer, m_iterator);
  132. }
  133. void Unmap(u32 used_size) override
  134. {
  135. glFlushMappedBufferRange(m_buffertype, 0, used_size);
  136. glUnmapBuffer(m_buffertype);
  137. m_iterator += used_size;
  138. }
  139. };
/* A modified streaming way without reallocation
 * This one fixes the reallocation overhead of the MapAndOrphan one.
 * So it allocates a ring buffer on initialization.
 * But with this limited resource, we have to care about the CPU-GPU distance.
 * Else this fifo may overflow.
 * So we trade orphaning for syncing.
 */
  147. class MapAndSync : public StreamBuffer
  148. {
  149. public:
  150. MapAndSync(u32 type, u32 size) : StreamBuffer(type, size)
  151. {
  152. CreateFences();
  153. glBindBuffer(m_buffertype, m_buffer);
  154. glBufferData(m_buffertype, m_size, nullptr, GL_STREAM_DRAW);
  155. }
  156. ~MapAndSync()
  157. {
  158. DeleteFences();
  159. }
  160. std::pair<u8*, u32> Map(u32 size) override
  161. {
  162. AllocMemory(size);
  163. u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size,
  164. GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
  165. return std::make_pair(pointer, m_iterator);
  166. }
  167. void Unmap(u32 used_size) override
  168. {
  169. glFlushMappedBufferRange(m_buffertype, 0, used_size);
  170. glUnmapBuffer(m_buffertype);
  171. m_iterator += used_size;
  172. }
  173. };
/* Streaming fifo without mapping overhead.
 * This one usually requires ARB_buffer_storage (OpenGL 4.4).
 * And is usually not available on OpenGL3 GPUs.
 *
 * ARB_buffer_storage allows us to render from a mapped buffer.
 * So we map it persistently in the initialization.
 *
 * Unsync mapping sounds like an easy task, but it isn't for threaded drivers.
 * So every mapping on current closed-source drivers _will_ end in
 * at least a round trip time between two threads.
 *
 * As persistently mapped buffers can't use orphaning, we also have to sync.
 */
  187. class BufferStorage : public StreamBuffer
  188. {
  189. public:
  190. BufferStorage(u32 type, u32 size) : StreamBuffer(type, size)
  191. {
  192. CreateFences();
  193. glBindBuffer(m_buffertype, m_buffer);
  194. // PERSISTANT_BIT to make sure that the buffer can be used while mapped
  195. // COHERENT_BIT is set so we don't have to use a MemoryBarrier on write
  196. // CLIENT_STORAGE_BIT is set since we access the buffer more frequently on the client side then server side
  197. glBufferStorage(m_buffertype, m_size, nullptr,
  198. GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT);
  199. m_pointer = (u8*)glMapBufferRange(m_buffertype, 0, m_size,
  200. GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_FLUSH_EXPLICIT_BIT);
  201. }
  202. ~BufferStorage()
  203. {
  204. DeleteFences();
  205. glUnmapBuffer(m_buffertype);
  206. glBindBuffer(m_buffertype, 0);
  207. }
  208. std::pair<u8*, u32> Map(u32 size) override
  209. {
  210. AllocMemory(size);
  211. return std::make_pair(m_pointer + m_iterator, m_iterator);
  212. }
  213. void Unmap(u32 used_size) override
  214. {
  215. glFlushMappedBufferRange(m_buffertype, m_iterator, used_size);
  216. m_iterator += used_size;
  217. }
  218. u8* m_pointer;
  219. };
  220. /* --- AMD only ---
  221. * Another streaming fifo without mapping overhead.
  222. * As we can't orphan without mapping, we have to sync.
  223. *
  224. * This one uses AMD_pinned_memory which is available on all AMD GPUs.
  225. * OpenGL 4.4 drivers should use BufferStorage.
  226. */
  227. class PinnedMemory : public StreamBuffer
  228. {
  229. public:
  230. PinnedMemory(u32 type, u32 size) : StreamBuffer(type, size)
  231. {
  232. CreateFences();
  233. m_pointer = (u8*)AllocateAlignedMemory(ROUND_UP(m_size,ALIGN_PINNED_MEMORY), ALIGN_PINNED_MEMORY );
  234. glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, m_buffer);
  235. glBufferData(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, ROUND_UP(m_size,ALIGN_PINNED_MEMORY), m_pointer, GL_STREAM_COPY);
  236. glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, 0);
  237. glBindBuffer(m_buffertype, m_buffer);
  238. }
  239. ~PinnedMemory()
  240. {
  241. DeleteFences();
  242. glBindBuffer(m_buffertype, 0);
  243. glFinish(); // ogl pipeline must be flushed, else this buffer can be in use
  244. FreeAlignedMemory(m_pointer);
  245. m_pointer = nullptr;
  246. }
  247. std::pair<u8*, u32> Map(u32 size) override
  248. {
  249. AllocMemory(size);
  250. return std::make_pair(m_pointer + m_iterator, m_iterator);
  251. }
  252. void Unmap(u32 used_size) override
  253. {
  254. m_iterator += used_size;
  255. }
  256. u8* m_pointer;
  257. static const u32 ALIGN_PINNED_MEMORY = 4096;
  258. };
  259. /* Fifo based on the glBufferSubData call.
  260. * As everything must be copied before glBufferSubData returns,
  261. * an additional memcpy in the driver will be done.
  262. * So this is a huge overhead, only use it if required.
  263. */
  264. class BufferSubData : public StreamBuffer
  265. {
  266. public:
  267. BufferSubData(u32 type, u32 size) : StreamBuffer(type, size)
  268. {
  269. glBindBuffer(m_buffertype, m_buffer);
  270. glBufferData(m_buffertype, size, nullptr, GL_STATIC_DRAW);
  271. m_pointer = new u8[m_size];
  272. }
  273. ~BufferSubData()
  274. {
  275. delete [] m_pointer;
  276. }
  277. std::pair<u8*, u32> Map(u32 size) override
  278. {
  279. return std::make_pair(m_pointer, 0);
  280. }
  281. void Unmap(u32 used_size) override
  282. {
  283. glBufferSubData(m_buffertype, 0, used_size, m_pointer);
  284. }
  285. u8* m_pointer;
  286. };
/* Fifo based on the glBufferData call.
 * Some trashy drivers stall in BufferSubData.
 * So here we use glBufferData, which reallocates this buffer every time.
 * This may avoid stalls, but it is a bigger overhead than BufferSubData.
 */
  292. class BufferData : public StreamBuffer
  293. {
  294. public:
  295. BufferData(u32 type, u32 size) : StreamBuffer(type, size)
  296. {
  297. glBindBuffer(m_buffertype, m_buffer);
  298. m_pointer = new u8[m_size];
  299. }
  300. ~BufferData()
  301. {
  302. delete [] m_pointer;
  303. }
  304. std::pair<u8*, u32> Map(u32 size) override
  305. {
  306. return std::make_pair(m_pointer, 0);
  307. }
  308. void Unmap(u32 used_size) override
  309. {
  310. glBufferData(m_buffertype, used_size, m_pointer, GL_STREAM_DRAW);
  311. }
  312. u8* m_pointer;
  313. };
  314. // choose best streaming library based on the supported extensions and known issues
  315. StreamBuffer* StreamBuffer::Create(u32 type, u32 size)
  316. {
  317. // without basevertex support, only streaming methods whith uploads everything to zero works fine:
  318. if (!g_ogl_config.bSupportsGLBaseVertex)
  319. {
  320. if (!DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTREAM))
  321. return new BufferSubData(type, size);
  322. // BufferData is by far the worst way, only use it if needed
  323. return new BufferData(type, size);
  324. }
  325. // Prefer the syncing buffers over the orphaning one
  326. if (g_ogl_config.bSupportsGLSync)
  327. {
  328. // pinned memory is much faster than buffer storage on AMD cards
  329. if (g_ogl_config.bSupportsGLPinnedMemory &&
  330. !(DriverDetails::HasBug(DriverDetails::BUG_BROKENPINNEDMEMORY) && type == GL_ELEMENT_ARRAY_BUFFER))
  331. return new PinnedMemory(type, size);
  332. // buffer storage works well in most situations
  333. if (g_ogl_config.bSupportsGLBufferStorage &&
  334. !(DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTORAGE) && type == GL_ARRAY_BUFFER) &&
  335. !(DriverDetails::HasBug(DriverDetails::BUG_INTELBROKENBUFFERSTORAGE) && type == GL_ELEMENT_ARRAY_BUFFER))
  336. return new BufferStorage(type, size);
  337. // don't fall back to MapAnd* for Nvidia drivers
  338. if (DriverDetails::HasBug(DriverDetails::BUG_BROKENUNSYNCMAPPING))
  339. return new BufferSubData(type, size);
  340. // mapping fallback
  341. if (g_ogl_config.bSupportsGLSync)
  342. return new MapAndSync(type, size);
  343. }
  344. // default fallback, should work everywhere, but isn't the best way to do this job
  345. return new MapAndOrphan(type, size);
  346. }
  347. }