vcacheoptimizer.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468
  1. // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
  2. #include "meshoptimizer.h"
  3. #include <assert.h>
  4. #include <string.h>
  5. // This work is based on:
  6. // Tom Forsyth. Linear-Speed Vertex Cache Optimisation. 2006
  7. // Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
  8. namespace meshopt
  9. {
// Maximum simulated post-transform cache size supported by the score tables below
const size_t kCacheSizeMax = 16;
// Maximum number of remaining (live) triangles per vertex that affects the score
const size_t kValenceMax = 8;

// Score lookup tables used by the greedy optimizer; index 0 of each array is the
// score for "not in cache" / "no live triangles" respectively
struct VertexScoreTable
{
	float cache[1 + kCacheSizeMax]; // indexed by cache position + 1 (position -1 = not in cache)
	float live[1 + kValenceMax];    // indexed by live triangle count, clamped to kValenceMax
};

// Tuned to minimize the ACMR of a GPU that has a cache profile similar to NVidia and AMD
static const VertexScoreTable kVertexScoreTable = {
    {0.f, 0.779f, 0.791f, 0.789f, 0.981f, 0.843f, 0.726f, 0.847f, 0.882f, 0.867f, 0.799f, 0.642f, 0.613f, 0.600f, 0.568f, 0.372f, 0.234f},
    {0.f, 0.995f, 0.713f, 0.450f, 0.404f, 0.059f, 0.005f, 0.147f, 0.006f},
};

// Tuned to minimize the encoded index buffer size
static const VertexScoreTable kVertexScoreTableStrip = {
    {0.f, 1.000f, 1.000f, 1.000f, 0.453f, 0.561f, 0.490f, 0.459f, 0.179f, 0.526f, 0.000f, 0.227f, 0.184f, 0.490f, 0.112f, 0.050f, 0.131f},
    {0.f, 0.956f, 0.786f, 0.577f, 0.558f, 0.618f, 0.549f, 0.499f, 0.489f},
};
// Per-vertex lists of incident triangles in CSR form; arrays are allocated and
// filled by buildTriangleAdjacency from the caller-provided allocator
struct TriangleAdjacency
{
	unsigned int* counts;  // number of triangles incident to each vertex (vertex_count entries)
	unsigned int* offsets; // start of each vertex's triangle list inside data (vertex_count entries)
	unsigned int* data;    // concatenated triangle indices for all vertices (index_count entries)
};
  33. static void buildTriangleAdjacency(TriangleAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
  34. {
  35. size_t face_count = index_count / 3;
  36. // allocate arrays
  37. adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
  38. adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
  39. adjacency.data = allocator.allocate<unsigned int>(index_count);
  40. // fill triangle counts
  41. memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
  42. for (size_t i = 0; i < index_count; ++i)
  43. {
  44. assert(indices[i] < vertex_count);
  45. adjacency.counts[indices[i]]++;
  46. }
  47. // fill offset table
  48. unsigned int offset = 0;
  49. for (size_t i = 0; i < vertex_count; ++i)
  50. {
  51. adjacency.offsets[i] = offset;
  52. offset += adjacency.counts[i];
  53. }
  54. assert(offset == index_count);
  55. // fill triangle data
  56. for (size_t i = 0; i < face_count; ++i)
  57. {
  58. unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
  59. adjacency.data[adjacency.offsets[a]++] = unsigned(i);
  60. adjacency.data[adjacency.offsets[b]++] = unsigned(i);
  61. adjacency.data[adjacency.offsets[c]++] = unsigned(i);
  62. }
  63. // fix offsets that have been disturbed by the previous pass
  64. for (size_t i = 0; i < vertex_count; ++i)
  65. {
  66. assert(adjacency.offsets[i] >= adjacency.counts[i]);
  67. adjacency.offsets[i] -= adjacency.counts[i];
  68. }
  69. }
  70. static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned int& dead_end_top, unsigned int& input_cursor, const unsigned int* live_triangles, size_t vertex_count)
  71. {
  72. // check dead-end stack
  73. while (dead_end_top)
  74. {
  75. unsigned int vertex = dead_end[--dead_end_top];
  76. if (live_triangles[vertex] > 0)
  77. return vertex;
  78. }
  79. // input order
  80. while (input_cursor < vertex_count)
  81. {
  82. if (live_triangles[input_cursor] > 0)
  83. return input_cursor;
  84. ++input_cursor;
  85. }
  86. return ~0u;
  87. }
  88. static unsigned int getNextVertexNeighbor(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
  89. {
  90. unsigned int best_candidate = ~0u;
  91. int best_priority = -1;
  92. for (const unsigned int* next_candidate = next_candidates_begin; next_candidate != next_candidates_end; ++next_candidate)
  93. {
  94. unsigned int vertex = *next_candidate;
  95. // otherwise we don't need to process it
  96. if (live_triangles[vertex] > 0)
  97. {
  98. int priority = 0;
  99. // will it be in cache after fanning?
  100. if (2 * live_triangles[vertex] + timestamp - cache_timestamps[vertex] <= cache_size)
  101. {
  102. priority = timestamp - cache_timestamps[vertex]; // position in cache
  103. }
  104. if (priority > best_priority)
  105. {
  106. best_candidate = vertex;
  107. best_priority = priority;
  108. }
  109. }
  110. }
  111. return best_candidate;
  112. }
  113. static float vertexScore(const VertexScoreTable* table, int cache_position, unsigned int live_triangles)
  114. {
  115. assert(cache_position >= -1 && cache_position < int(kCacheSizeMax));
  116. unsigned int live_triangles_clamped = live_triangles < kValenceMax ? live_triangles : kValenceMax;
  117. return table->cache[1 + cache_position] + table->live[live_triangles_clamped];
  118. }
  119. static unsigned int getNextTriangleDeadEnd(unsigned int& input_cursor, const unsigned char* emitted_flags, size_t face_count)
  120. {
  121. // input order
  122. while (input_cursor < face_count)
  123. {
  124. if (!emitted_flags[input_cursor])
  125. return input_cursor;
  126. ++input_cursor;
  127. }
  128. return ~0u;
  129. }
  130. } // namespace meshopt
// Greedy vertex cache optimizer based on Forsyth's linear-speed algorithm:
// repeatedly emits the triangle with the highest score, where each vertex is scored
// by its position in a simulated LRU cache and by its count of remaining ("live")
// triangles, and a triangle's score is the sum of its three vertex scores.
// destination may alias indices (in-place operation is supported).
void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const meshopt::VertexScoreTable* table)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);

	meshopt_Allocator allocator;

	// guard for empty meshes
	if (index_count == 0 || vertex_count == 0)
		return;

	// support in-place optimization
	if (destination == indices)
	{
		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
		indices = indices_copy;
	}

	// simulated cache size; the score tables are sized for at most kCacheSizeMax entries
	unsigned int cache_size = 16;
	assert(cache_size <= kCacheSizeMax);

	size_t face_count = index_count / 3;

	// build adjacency information
	TriangleAdjacency adjacency = {};
	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);

	// live triangle counts; note, we alias adjacency.counts as we remove triangles after emitting them so the counts always match
	unsigned int* live_triangles = adjacency.counts;

	// emitted flags
	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
	memset(emitted_flags, 0, face_count);

	// compute initial vertex scores; cache position -1 means "not in cache"
	float* vertex_scores = allocator.allocate<float>(vertex_count);

	for (size_t i = 0; i < vertex_count; ++i)
		vertex_scores[i] = vertexScore(table, -1, live_triangles[i]);

	// compute triangle scores as the sum of the three corner vertex scores
	float* triangle_scores = allocator.allocate<float>(face_count);

	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int a = indices[i * 3 + 0];
		unsigned int b = indices[i * 3 + 1];
		unsigned int c = indices[i * 3 + 2];

		triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
	}

	// double-buffered simulated LRU cache; +4 leaves headroom for the 3 vertices of the emitted triangle
	unsigned int cache_holder[2 * (kCacheSizeMax + 4)];
	unsigned int* cache = cache_holder;
	unsigned int* cache_new = cache_holder + kCacheSizeMax + 4;
	size_t cache_count = 0;

	unsigned int current_triangle = 0;
	unsigned int input_cursor = 1; // triangle to restart from in case of dead-end

	unsigned int output_triangle = 0;

	while (current_triangle != ~0u)
	{
		assert(output_triangle < face_count);

		unsigned int a = indices[current_triangle * 3 + 0];
		unsigned int b = indices[current_triangle * 3 + 1];
		unsigned int c = indices[current_triangle * 3 + 2];

		// output indices
		destination[output_triangle * 3 + 0] = a;
		destination[output_triangle * 3 + 1] = b;
		destination[output_triangle * 3 + 2] = c;
		output_triangle++;

		// update emitted flags; zeroing the score keeps the emitted triangle out of best-score tracking
		emitted_flags[current_triangle] = true;
		triangle_scores[current_triangle] = 0;

		// new triangle: its three vertices move to the front of the cache
		size_t cache_write = 0;
		cache_new[cache_write++] = a;
		cache_new[cache_write++] = b;
		cache_new[cache_write++] = c;

		// old triangles: copy remaining cache entries, skipping a/b/c which moved to the front
		for (size_t i = 0; i < cache_count; ++i)
		{
			unsigned int index = cache[i];

			cache_new[cache_write] = index;
			cache_write += (index != a) & (index != b) & (index != c); // branchless conditional append
		}

		// swap the double-buffered caches
		unsigned int* cache_temp = cache;
		cache = cache_new, cache_new = cache_temp;

		// entries past cache_size have fallen out of the simulated cache
		cache_count = cache_write > cache_size ? cache_size : cache_write;

		// remove emitted triangle from adjacency data
		// this makes sure that we spend less time traversing these lists on subsequent iterations
		// live triangle counts are updated as a byproduct of these adjustments
		for (size_t k = 0; k < 3; ++k)
		{
			unsigned int index = indices[current_triangle * 3 + k];

			unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
			size_t neighbors_size = adjacency.counts[index];

			for (size_t i = 0; i < neighbors_size; ++i)
			{
				unsigned int tri = neighbors[i];

				if (tri == current_triangle)
				{
					// swap-and-pop; decrementing counts also updates live_triangles (aliased above)
					neighbors[i] = neighbors[neighbors_size - 1];
					adjacency.counts[index]--;
					break;
				}
			}
		}

		unsigned int best_triangle = ~0u;
		float best_score = 0;

		// update cache positions, vertex scores and triangle scores, and find next best triangle
		for (size_t i = 0; i < cache_write; ++i)
		{
			unsigned int index = cache[i];

			// no need to update scores if we are never going to use this vertex
			if (adjacency.counts[index] == 0)
				continue;

			// entries past cache_size are treated as no longer cached (-1)
			int cache_position = i >= cache_size ? -1 : int(i);

			// update vertex score
			float score = vertexScore(table, cache_position, live_triangles[index]);
			float score_diff = score - vertex_scores[index];

			vertex_scores[index] = score;

			// update scores of vertex triangles; only the delta needs to be applied
			const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[index];
			const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[index];

			for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
			{
				unsigned int tri = *it;
				assert(!emitted_flags[tri]);

				float tri_score = triangle_scores[tri] + score_diff;
				assert(tri_score > 0);

				// branchless best-triangle tracking
				best_triangle = best_score < tri_score ? tri : best_triangle;
				best_score = best_score < tri_score ? tri_score : best_score;

				triangle_scores[tri] = tri_score;
			}
		}

		// step through input triangles in order if we hit a dead-end
		current_triangle = best_triangle;

		if (current_triangle == ~0u)
		{
			current_triangle = getNextTriangleDeadEnd(input_cursor, &emitted_flags[0], face_count);
		}
	}

	assert(input_cursor == face_count);
	assert(output_triangle == face_count);
}
// Reorders indices to improve vertex cache locality using the GPU-tuned score table;
// see meshopt_optimizeVertexCacheTable for the algorithm details.
void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTable);
}
// Reorders indices using the score table tuned to minimize encoded index buffer size
// (strip-friendly ordering); see meshopt_optimizeVertexCacheTable for the algorithm.
void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTableStrip);
}
// FIFO-cache vertex optimizer based on Sander/Nehab/Barczak ("Tipsify"): fans out all
// remaining triangles around the current vertex, then picks the next vertex among the
// just-emitted ones that will stay in a simulated FIFO cache of cache_size entries,
// falling back to a dead-end stack and finally input order.
// destination may alias indices (in-place operation is supported).
void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(cache_size >= 3);

	meshopt_Allocator allocator;

	// guard for empty meshes
	if (index_count == 0 || vertex_count == 0)
		return;

	// support in-place optimization
	if (destination == indices)
	{
		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
		indices = indices_copy;
	}

	size_t face_count = index_count / 3;

	// build adjacency information
	TriangleAdjacency adjacency = {};
	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);

	// live triangle counts; kept separate from adjacency.counts (unlike the table optimizer)
	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));

	// cache time stamps; a vertex is "in cache" when timestamp - cache_timestamps[v] <= cache_size
	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

	// dead-end stack; each emitted triangle pushes 3 entries, so index_count is an upper bound
	unsigned int* dead_end = allocator.allocate<unsigned int>(index_count);
	unsigned int dead_end_top = 0;

	// emitted flags
	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
	memset(emitted_flags, 0, face_count);

	unsigned int current_vertex = 0;

	// start past cache_size so zero-initialized timestamps read as "not in cache"
	unsigned int timestamp = cache_size + 1;
	unsigned int input_cursor = 1; // vertex to restart from in case of dead-end

	unsigned int output_triangle = 0;

	while (current_vertex != ~0u)
	{
		const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;

		// emit all vertex neighbors
		const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
		const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[current_vertex];

		for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
		{
			unsigned int triangle = *it;

			if (!emitted_flags[triangle])
			{
				unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];

				// output indices
				destination[output_triangle * 3 + 0] = a;
				destination[output_triangle * 3 + 1] = b;
				destination[output_triangle * 3 + 2] = c;
				output_triangle++;

				// update dead-end stack
				dead_end[dead_end_top + 0] = a;
				dead_end[dead_end_top + 1] = b;
				dead_end[dead_end_top + 2] = c;
				dead_end_top += 3;

				// update live triangle counts
				live_triangles[a]--;
				live_triangles[b]--;
				live_triangles[c]--;

				// update cache info
				// if vertex is not in cache, put it in cache
				if (timestamp - cache_timestamps[a] > cache_size)
					cache_timestamps[a] = timestamp++;

				if (timestamp - cache_timestamps[b] > cache_size)
					cache_timestamps[b] = timestamp++;

				if (timestamp - cache_timestamps[c] > cache_size)
					cache_timestamps[c] = timestamp++;

				// update emitted flags
				emitted_flags[triangle] = true;
			}
		}

		// next candidates are the ones we pushed to dead-end stack just now
		const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;

		// get next vertex
		current_vertex = getNextVertexNeighbor(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);

		if (current_vertex == ~0u)
		{
			current_vertex = getNextVertexDeadEnd(&dead_end[0], dead_end_top, input_cursor, &live_triangles[0], vertex_count);
		}
	}

	assert(output_triangle == face_count);
}