// vcacheoptimizer.cpp
  1. // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
  2. #include "meshoptimizer.h"
  3. #include <assert.h>
  4. #include <string.h>
  5. // This work is based on:
  6. // Tom Forsyth. Linear-Speed Vertex Cache Optimisation. 2006
  7. // Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
  8. namespace meshopt
  9. {
// maximum FIFO cache size the score tables are defined for
const size_t kCacheSizeMax = 16;
// live-triangle counts are clamped to this value when indexing the score table
const size_t kValenceMax = 8;

struct VertexScoreTable
{
	// score contribution by cache position; slot 0 corresponds to position -1 ("not in cache")
	float cache[1 + kCacheSizeMax];
	// score contribution by number of remaining (live) triangles, clamped to kValenceMax
	float live[1 + kValenceMax];
};

// Tuned to minimize the ACMR of a GPU that has a cache profile similar to NVidia and AMD
static const VertexScoreTable kVertexScoreTable = {
    {0.f, 0.779f, 0.791f, 0.789f, 0.981f, 0.843f, 0.726f, 0.847f, 0.882f, 0.867f, 0.799f, 0.642f, 0.613f, 0.600f, 0.568f, 0.372f, 0.234f},
    {0.f, 0.995f, 0.713f, 0.450f, 0.404f, 0.059f, 0.005f, 0.147f, 0.006f},
};

// Tuned to minimize the encoded index buffer size
static const VertexScoreTable kVertexScoreTableStrip = {
    {0.f, 1.000f, 1.000f, 1.000f, 0.453f, 0.561f, 0.490f, 0.459f, 0.179f, 0.526f, 0.000f, 0.227f, 0.184f, 0.490f, 0.112f, 0.050f, 0.131f},
    {0.f, 0.956f, 0.786f, 0.577f, 0.558f, 0.618f, 0.549f, 0.499f, 0.489f},
};
// CSR-style vertex -> triangle adjacency: for vertex v, the adjacent triangles are
// data[offsets[v]] .. data[offsets[v] + counts[v] - 1]
struct TriangleAdjacency
{
	unsigned int* counts;  // number of adjacent triangles per vertex
	unsigned int* offsets; // start index into data for each vertex
	unsigned int* data;    // triangle indices, grouped by vertex
};
  33. static void buildTriangleAdjacency(TriangleAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
  34. {
  35. size_t face_count = index_count / 3;
  36. // allocate arrays
  37. adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
  38. adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
  39. adjacency.data = allocator.allocate<unsigned int>(index_count);
  40. // fill triangle counts
  41. memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
  42. for (size_t i = 0; i < index_count; ++i)
  43. {
  44. assert(indices[i] < vertex_count);
  45. adjacency.counts[indices[i]]++;
  46. }
  47. // fill offset table
  48. unsigned int offset = 0;
  49. for (size_t i = 0; i < vertex_count; ++i)
  50. {
  51. adjacency.offsets[i] = offset;
  52. offset += adjacency.counts[i];
  53. }
  54. assert(offset == index_count);
  55. // fill triangle data
  56. for (size_t i = 0; i < face_count; ++i)
  57. {
  58. unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
  59. adjacency.data[adjacency.offsets[a]++] = unsigned(i);
  60. adjacency.data[adjacency.offsets[b]++] = unsigned(i);
  61. adjacency.data[adjacency.offsets[c]++] = unsigned(i);
  62. }
  63. // fix offsets that have been disturbed by the previous pass
  64. for (size_t i = 0; i < vertex_count; ++i)
  65. {
  66. assert(adjacency.offsets[i] >= adjacency.counts[i]);
  67. adjacency.offsets[i] -= adjacency.counts[i];
  68. }
  69. }
  70. static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned int& dead_end_top, unsigned int& input_cursor, const unsigned int* live_triangles, size_t vertex_count)
  71. {
  72. // check dead-end stack
  73. while (dead_end_top)
  74. {
  75. unsigned int vertex = dead_end[--dead_end_top];
  76. if (live_triangles[vertex] > 0)
  77. return vertex;
  78. }
  79. // input order
  80. while (input_cursor < vertex_count)
  81. {
  82. if (live_triangles[input_cursor] > 0)
  83. return input_cursor;
  84. ++input_cursor;
  85. }
  86. return ~0u;
  87. }
  88. static unsigned int getNextVertexNeighbor(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
  89. {
  90. unsigned int best_candidate = ~0u;
  91. int best_priority = -1;
  92. for (const unsigned int* next_candidate = next_candidates_begin; next_candidate != next_candidates_end; ++next_candidate)
  93. {
  94. unsigned int vertex = *next_candidate;
  95. // otherwise we don't need to process it
  96. if (live_triangles[vertex] > 0)
  97. {
  98. int priority = 0;
  99. // will it be in cache after fanning?
  100. if (2 * live_triangles[vertex] + timestamp - cache_timestamps[vertex] <= cache_size)
  101. {
  102. priority = timestamp - cache_timestamps[vertex]; // position in cache
  103. }
  104. if (priority > best_priority)
  105. {
  106. best_candidate = vertex;
  107. best_priority = priority;
  108. }
  109. }
  110. }
  111. return best_candidate;
  112. }
  113. static float vertexScore(const VertexScoreTable* table, int cache_position, unsigned int live_triangles)
  114. {
  115. assert(cache_position >= -1 && cache_position < int(kCacheSizeMax));
  116. unsigned int live_triangles_clamped = live_triangles < kValenceMax ? live_triangles : kValenceMax;
  117. return table->cache[1 + cache_position] + table->live[live_triangles_clamped];
  118. }
  119. static unsigned int getNextTriangleDeadEnd(unsigned int& input_cursor, const unsigned char* emitted_flags, size_t face_count)
  120. {
  121. // input order
  122. while (input_cursor < face_count)
  123. {
  124. if (!emitted_flags[input_cursor])
  125. return input_cursor;
  126. ++input_cursor;
  127. }
  128. return ~0u;
  129. }
  130. } // namespace meshopt
// Reorders triangles (greedy Forsyth-style algorithm) to maximize hits in a
// simulated 16-entry LRU vertex cache, writing the result to destination.
// destination may alias indices (in-place operation); the score table selects
// the tuning target (ACMR vs. encoded index size).
void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const meshopt::VertexScoreTable* table)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);

	meshopt_Allocator allocator;

	// guard for empty meshes
	if (index_count == 0 || vertex_count == 0)
		return;

	// support in-place optimization
	if (destination == indices)
	{
		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
		indices = indices_copy;
	}

	unsigned int cache_size = 16;
	assert(cache_size <= kCacheSizeMax);

	size_t face_count = index_count / 3;

	// build adjacency information
	TriangleAdjacency adjacency = {};
	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);

	// live triangle counts; start as a copy of the per-vertex triangle counts
	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));

	// emitted flags
	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
	memset(emitted_flags, 0, face_count);

	// compute initial vertex scores (cache position -1 = not in cache yet)
	float* vertex_scores = allocator.allocate<float>(vertex_count);

	for (size_t i = 0; i < vertex_count; ++i)
		vertex_scores[i] = vertexScore(table, -1, live_triangles[i]);

	// compute triangle scores as the sum of their vertex scores
	float* triangle_scores = allocator.allocate<float>(face_count);

	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int a = indices[i * 3 + 0];
		unsigned int b = indices[i * 3 + 1];
		unsigned int c = indices[i * 3 + 2];

		triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
	}

	// double-buffered LRU cache model; +3 slack so a full cache plus one new triangle fits
	unsigned int cache_holder[2 * (kCacheSizeMax + 3)];
	unsigned int* cache = cache_holder;
	unsigned int* cache_new = cache_holder + kCacheSizeMax + 3;
	size_t cache_count = 0;

	unsigned int current_triangle = 0;
	unsigned int input_cursor = 1;

	unsigned int output_triangle = 0;

	while (current_triangle != ~0u)
	{
		assert(output_triangle < face_count);

		unsigned int a = indices[current_triangle * 3 + 0];
		unsigned int b = indices[current_triangle * 3 + 1];
		unsigned int c = indices[current_triangle * 3 + 2];

		// output indices
		destination[output_triangle * 3 + 0] = a;
		destination[output_triangle * 3 + 1] = b;
		destination[output_triangle * 3 + 2] = c;
		output_triangle++;

		// update emitted flags
		emitted_flags[current_triangle] = true;
		triangle_scores[current_triangle] = 0;

		// new triangle's vertices go to the front of the cache
		size_t cache_write = 0;
		cache_new[cache_write++] = a;
		cache_new[cache_write++] = b;
		cache_new[cache_write++] = c;

		// old triangles: keep remaining cache entries in order, skipping duplicates of a/b/c
		for (size_t i = 0; i < cache_count; ++i)
		{
			unsigned int index = cache[i];

			if (index != a && index != b && index != c)
			{
				cache_new[cache_write++] = index;
			}
		}

		// swap the double buffers and clamp the cache to its nominal size
		unsigned int* cache_temp = cache;
		cache = cache_new, cache_new = cache_temp;
		cache_count = cache_write > cache_size ? cache_size : cache_write;

		// update live triangle counts
		live_triangles[a]--;
		live_triangles[b]--;
		live_triangles[c]--;

		// remove emitted triangle from adjacency data
		// this makes sure that we spend less time traversing these lists on subsequent iterations
		for (size_t k = 0; k < 3; ++k)
		{
			unsigned int index = indices[current_triangle * 3 + k];

			unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
			size_t neighbors_size = adjacency.counts[index];

			for (size_t i = 0; i < neighbors_size; ++i)
			{
				unsigned int tri = neighbors[i];

				if (tri == current_triangle)
				{
					// swap-remove: replace with last entry and shrink the list
					neighbors[i] = neighbors[neighbors_size - 1];
					adjacency.counts[index]--;
					break;
				}
			}
		}

		unsigned int best_triangle = ~0u;
		float best_score = 0;

		// update cache positions, vertex scores and triangle scores, and find next best triangle
		// (cache_write may exceed cache_size; those overflow entries score as "not in cache")
		for (size_t i = 0; i < cache_write; ++i)
		{
			unsigned int index = cache[i];

			int cache_position = i >= cache_size ? -1 : int(i);

			// update vertex score
			float score = vertexScore(table, cache_position, live_triangles[index]);
			float score_diff = score - vertex_scores[index];

			vertex_scores[index] = score;

			// propagate the score delta to all remaining triangles using this vertex
			const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[index];
			const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[index];

			for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
			{
				unsigned int tri = *it;
				assert(!emitted_flags[tri]);

				float tri_score = triangle_scores[tri] + score_diff;
				assert(tri_score > 0);

				if (best_score < tri_score)
				{
					best_triangle = tri;
					best_score = tri_score;
				}

				triangle_scores[tri] = tri_score;
			}
		}

		// step through input triangles in order if we hit a dead-end
		current_triangle = best_triangle;

		if (current_triangle == ~0u)
		{
			current_triangle = getNextTriangleDeadEnd(input_cursor, &emitted_flags[0], face_count);
		}
	}

	assert(input_cursor == face_count);
	assert(output_triangle == face_count);
}
// Reorders triangles for vertex cache efficiency using the default score table
// (tuned to minimize ACMR); see meshopt_optimizeVertexCacheTable for details.
void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTable);
}
// Reorders triangles using the strip-oriented score table (tuned to minimize
// encoded index buffer size); see meshopt_optimizeVertexCacheTable for details.
void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTableStrip);
}
// Reorders triangles for a FIFO cache of the given size ("tipsify" algorithm):
// repeatedly fans out all unemitted triangles around a focus vertex, then picks
// the next focus from recently-touched neighbors or the dead-end stack.
// destination may alias indices (in-place operation).
void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(cache_size >= 3);

	meshopt_Allocator allocator;

	// guard for empty meshes
	if (index_count == 0 || vertex_count == 0)
		return;

	// support in-place optimization
	if (destination == indices)
	{
		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
		indices = indices_copy;
	}

	size_t face_count = index_count / 3;

	// build adjacency information
	TriangleAdjacency adjacency = {};
	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);

	// live triangle counts; start as a copy of the per-vertex triangle counts
	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));

	// cache time stamps; a vertex is "in cache" when timestamp - cache_timestamps[v] <= cache_size
	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

	// dead-end stack
	unsigned int* dead_end = allocator.allocate<unsigned int>(index_count);
	unsigned int dead_end_top = 0;

	// emitted flags
	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
	memset(emitted_flags, 0, face_count);

	unsigned int current_vertex = 0;

	// start past cache_size so the zero-initialized timestamps read as "not in cache"
	unsigned int timestamp = cache_size + 1;
	unsigned int input_cursor = 1; // vertex to restart from in case of dead-end

	unsigned int output_triangle = 0;

	while (current_vertex != ~0u)
	{
		const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;

		// emit all vertex neighbors
		const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
		const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[current_vertex];

		for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
		{
			unsigned int triangle = *it;

			if (!emitted_flags[triangle])
			{
				unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];

				// output indices
				destination[output_triangle * 3 + 0] = a;
				destination[output_triangle * 3 + 1] = b;
				destination[output_triangle * 3 + 2] = c;
				output_triangle++;

				// update dead-end stack
				dead_end[dead_end_top + 0] = a;
				dead_end[dead_end_top + 1] = b;
				dead_end[dead_end_top + 2] = c;
				dead_end_top += 3;

				// update live triangle counts
				live_triangles[a]--;
				live_triangles[b]--;
				live_triangles[c]--;

				// update cache info
				// if vertex is not in cache, put it in cache
				if (timestamp - cache_timestamps[a] > cache_size)
					cache_timestamps[a] = timestamp++;

				if (timestamp - cache_timestamps[b] > cache_size)
					cache_timestamps[b] = timestamp++;

				if (timestamp - cache_timestamps[c] > cache_size)
					cache_timestamps[c] = timestamp++;

				// update emitted flags
				emitted_flags[triangle] = true;
			}
		}

		// next candidates are the ones we pushed to dead-end stack just now
		const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;

		// get next vertex; prefer a cache-resident neighbor, fall back to the dead-end stack
		current_vertex = getNextVertexNeighbor(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);

		if (current_vertex == ~0u)
		{
			current_vertex = getNextVertexDeadEnd(&dead_end[0], dead_end_top, input_cursor, &live_triangles[0], vertex_count);
		}
	}

	assert(output_triangle == face_count);
}