astcenc_find_best_partitioning.cpp 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782
  1. // SPDX-License-Identifier: Apache-2.0
  2. // ----------------------------------------------------------------------------
  3. // Copyright 2011-2023 Arm Limited
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License"); you may not
  6. // use this file except in compliance with the License. You may obtain a copy
  7. // of the License at:
  8. //
  9. // http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  13. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  14. // License for the specific language governing permissions and limitations
  15. // under the License.
  16. // ----------------------------------------------------------------------------
  17. #if !defined(ASTCENC_DECOMPRESS_ONLY)
  18. /**
  19. * @brief Functions for finding best partition for a block.
  20. *
  21. * The partition search operates in two stages. The first pass uses kmeans clustering to group
  22. * texels into an ideal partitioning for the requested partition count, and then compares that
  23. * against the 1024 partitionings generated by the ASTC partition hash function. The generated
  24. * partitions are then ranked by the number of texels in the wrong partition, compared to the ideal
  25. * clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and
  26. * partitionings that actually generate fewer than the requested partition count, but only the top
  27. * N candidates are actually put through a more detailed search. N is determined by the compressor
  28. * quality preset.
  29. *
  30. * For the detailed search, each candidate is checked against two possible encoding methods:
  31. *
  32. * - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints).
  33. * - The best partitioning assuming same chroma colors (RGB + scale endpoints).
  34. *
  35. * This is implemented by computing the mean color and dominant direction for each
  36. * partition. This defines two lines, both of which go through the mean color value.
  37. *
  38. * - One line has a direction defined by the dominant direction; this is used to assess the error
  39. * from using an uncorrelated color representation.
  40. * - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma
  41. * (RGB + scale) color representation.
  42. *
  43. * The best candidate is selected by computing the squared-errors that result from using these
  44. * lines for endpoint selection.
  45. */
  46. #include <limits>
  47. #include "astcenc_internal.h"
  48. /**
  49. * @brief Pick some initial kmeans cluster centers.
  50. *
  51. * @param blk The image block color data to compress.
  52. * @param texel_count The number of texels in the block.
  53. * @param partition_count The number of partitions in the block.
  54. * @param[out] cluster_centers The initial partition cluster center colors.
  55. */
  56. static void kmeans_init(
  57. const image_block& blk,
  58. unsigned int texel_count,
  59. unsigned int partition_count,
  60. vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]
  61. ) {
  62. promise(texel_count > 0);
  63. promise(partition_count > 0);
  64. unsigned int clusters_selected = 0;
  65. float distances[BLOCK_MAX_TEXELS];
  66. // Pick a random sample as first cluster center; 145897 from random.org
  67. unsigned int sample = 145897 % texel_count;
  68. vfloat4 center_color = blk.texel(sample);
  69. cluster_centers[clusters_selected] = center_color;
  70. clusters_selected++;
  71. // Compute the distance to the first cluster center
  72. float distance_sum = 0.0f;
  73. for (unsigned int i = 0; i < texel_count; i++)
  74. {
  75. vfloat4 color = blk.texel(i);
  76. vfloat4 diff = color - center_color;
  77. float distance = dot_s(diff * diff, blk.channel_weight);
  78. distance_sum += distance;
  79. distances[i] = distance;
  80. }
  81. // More numbers from random.org for weighted-random center selection
  82. const float cluster_cutoffs[9] {
  83. 0.626220f, 0.932770f, 0.275454f,
  84. 0.318558f, 0.240113f, 0.009190f,
  85. 0.347661f, 0.731960f, 0.156391f
  86. };
  87. unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2);
  88. // Pick the remaining samples as needed
  89. while (true)
  90. {
  91. // Pick the next center in a weighted-random fashion.
  92. float summa = 0.0f;
  93. float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++];
  94. for (sample = 0; sample < texel_count; sample++)
  95. {
  96. summa += distances[sample];
  97. if (summa >= distance_cutoff)
  98. {
  99. break;
  100. }
  101. }
  102. // Clamp to a valid range and store the selected cluster center
  103. sample = astc::min(sample, texel_count - 1);
  104. center_color = blk.texel(sample);
  105. cluster_centers[clusters_selected++] = center_color;
  106. if (clusters_selected >= partition_count)
  107. {
  108. break;
  109. }
  110. // Compute the distance to the new cluster center, keep the min dist
  111. distance_sum = 0.0f;
  112. for (unsigned int i = 0; i < texel_count; i++)
  113. {
  114. vfloat4 color = blk.texel(i);
  115. vfloat4 diff = color - center_color;
  116. float distance = dot_s(diff * diff, blk.channel_weight);
  117. distance = astc::min(distance, distances[i]);
  118. distance_sum += distance;
  119. distances[i] = distance;
  120. }
  121. }
  122. }
  123. /**
  124. * @brief Assign texels to clusters, based on a set of chosen center points.
  125. *
  126. * @param blk The image block color data to compress.
  127. * @param texel_count The number of texels in the block.
  128. * @param partition_count The number of partitions in the block.
  129. * @param cluster_centers The partition cluster center colors.
  130. * @param[out] partition_of_texel The partition assigned for each texel.
  131. */
  132. static void kmeans_assign(
  133. const image_block& blk,
  134. unsigned int texel_count,
  135. unsigned int partition_count,
  136. const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
  137. uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
  138. ) {
  139. promise(texel_count > 0);
  140. promise(partition_count > 0);
  141. uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
  142. // Find the best partition for every texel
  143. for (unsigned int i = 0; i < texel_count; i++)
  144. {
  145. float best_distance = std::numeric_limits<float>::max();
  146. unsigned int best_partition = 0;
  147. vfloat4 color = blk.texel(i);
  148. for (unsigned int j = 0; j < partition_count; j++)
  149. {
  150. vfloat4 diff = color - cluster_centers[j];
  151. float distance = dot_s(diff * diff, blk.channel_weight);
  152. if (distance < best_distance)
  153. {
  154. best_distance = distance;
  155. best_partition = j;
  156. }
  157. }
  158. partition_of_texel[i] = static_cast<uint8_t>(best_partition);
  159. partition_texel_count[best_partition]++;
  160. }
  161. // It is possible to get a situation where a partition ends up without any texels. In this case,
  162. // assign texel N to partition N. This is silly, but ensures that every partition retains at
  163. // least one texel. Reassigning a texel in this manner may cause another partition to go empty,
  164. // so if we actually did a reassignment, run the whole loop over again.
  165. bool problem_case;
  166. do
  167. {
  168. problem_case = false;
  169. for (unsigned int i = 0; i < partition_count; i++)
  170. {
  171. if (partition_texel_count[i] == 0)
  172. {
  173. partition_texel_count[partition_of_texel[i]]--;
  174. partition_texel_count[i]++;
  175. partition_of_texel[i] = static_cast<uint8_t>(i);
  176. problem_case = true;
  177. }
  178. }
  179. } while (problem_case);
  180. }
  181. /**
  182. * @brief Compute new cluster centers based on their center of gravity.
  183. *
  184. * @param blk The image block color data to compress.
  185. * @param texel_count The number of texels in the block.
  186. * @param partition_count The number of partitions in the block.
  187. * @param[out] cluster_centers The new cluster center colors.
  188. * @param partition_of_texel The partition assigned for each texel.
  189. */
  190. static void kmeans_update(
  191. const image_block& blk,
  192. unsigned int texel_count,
  193. unsigned int partition_count,
  194. vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
  195. const uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
  196. ) {
  197. promise(texel_count > 0);
  198. promise(partition_count > 0);
  199. vfloat4 color_sum[BLOCK_MAX_PARTITIONS] {
  200. vfloat4::zero(),
  201. vfloat4::zero(),
  202. vfloat4::zero(),
  203. vfloat4::zero()
  204. };
  205. uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
  206. // Find the center-of-gravity in each cluster
  207. for (unsigned int i = 0; i < texel_count; i++)
  208. {
  209. uint8_t partition = partition_of_texel[i];
  210. color_sum[partition] += blk.texel(i);
  211. partition_texel_count[partition]++;
  212. }
  213. // Set the center of gravity to be the new cluster center
  214. for (unsigned int i = 0; i < partition_count; i++)
  215. {
  216. float scale = 1.0f / static_cast<float>(partition_texel_count[i]);
  217. cluster_centers[i] = color_sum[i] * scale;
  218. }
  219. }
  220. /**
  221. * @brief Compute bit-mismatch for partitioning in 2-partition mode.
  222. *
  223. * @param a The texel assignment bitvector for the block.
  224. * @param b The texel assignment bitvector for the partition table.
  225. *
  226. * @return The number of bit mismatches.
  227. */
  228. static inline uint8_t partition_mismatch2(
  229. const uint64_t a[2],
  230. const uint64_t b[2]
  231. ) {
  232. int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
  233. int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);
  234. // Divide by 2 because XOR always counts errors twice, once when missing
  235. // in the expected position, and again when present in the wrong partition
  236. return static_cast<uint8_t>(astc::min(v1, v2) / 2);
  237. }
  238. /**
  239. * @brief Compute bit-mismatch for partitioning in 3-partition mode.
  240. *
  241. * @param a The texel assignment bitvector for the block.
  242. * @param b The texel assignment bitvector for the partition table.
  243. *
  244. * @return The number of bit mismatches.
  245. */
  246. static inline uint8_t partition_mismatch3(
  247. const uint64_t a[3],
  248. const uint64_t b[3]
  249. ) {
  250. int p00 = popcount(a[0] ^ b[0]);
  251. int p01 = popcount(a[0] ^ b[1]);
  252. int p02 = popcount(a[0] ^ b[2]);
  253. int p10 = popcount(a[1] ^ b[0]);
  254. int p11 = popcount(a[1] ^ b[1]);
  255. int p12 = popcount(a[1] ^ b[2]);
  256. int p20 = popcount(a[2] ^ b[0]);
  257. int p21 = popcount(a[2] ^ b[1]);
  258. int p22 = popcount(a[2] ^ b[2]);
  259. int s0 = p11 + p22;
  260. int s1 = p12 + p21;
  261. int v0 = astc::min(s0, s1) + p00;
  262. int s2 = p10 + p22;
  263. int s3 = p12 + p20;
  264. int v1 = astc::min(s2, s3) + p01;
  265. int s4 = p10 + p21;
  266. int s5 = p11 + p20;
  267. int v2 = astc::min(s4, s5) + p02;
  268. // Divide by 2 because XOR always counts errors twice, once when missing
  269. // in the expected position, and again when present in the wrong partition
  270. return static_cast<uint8_t>(astc::min(v0, v1, v2) / 2);
  271. }
  272. /**
  273. * @brief Compute bit-mismatch for partitioning in 4-partition mode.
  274. *
  275. * @param a The texel assignment bitvector for the block.
  276. * @param b The texel assignment bitvector for the partition table.
  277. *
  278. * @return The number of bit mismatches.
  279. */
  280. static inline uint8_t partition_mismatch4(
  281. const uint64_t a[4],
  282. const uint64_t b[4]
  283. ) {
  284. int p00 = popcount(a[0] ^ b[0]);
  285. int p01 = popcount(a[0] ^ b[1]);
  286. int p02 = popcount(a[0] ^ b[2]);
  287. int p03 = popcount(a[0] ^ b[3]);
  288. int p10 = popcount(a[1] ^ b[0]);
  289. int p11 = popcount(a[1] ^ b[1]);
  290. int p12 = popcount(a[1] ^ b[2]);
  291. int p13 = popcount(a[1] ^ b[3]);
  292. int p20 = popcount(a[2] ^ b[0]);
  293. int p21 = popcount(a[2] ^ b[1]);
  294. int p22 = popcount(a[2] ^ b[2]);
  295. int p23 = popcount(a[2] ^ b[3]);
  296. int p30 = popcount(a[3] ^ b[0]);
  297. int p31 = popcount(a[3] ^ b[1]);
  298. int p32 = popcount(a[3] ^ b[2]);
  299. int p33 = popcount(a[3] ^ b[3]);
  300. int mx23 = astc::min(p22 + p33, p23 + p32);
  301. int mx13 = astc::min(p21 + p33, p23 + p31);
  302. int mx12 = astc::min(p21 + p32, p22 + p31);
  303. int mx03 = astc::min(p20 + p33, p23 + p30);
  304. int mx02 = astc::min(p20 + p32, p22 + p30);
  305. int mx01 = astc::min(p21 + p30, p20 + p31);
  306. int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12);
  307. int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02);
  308. int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01);
  309. int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12);
  310. // Divide by 2 because XOR always counts errors twice, once when missing
  311. // in the expected position, and again when present in the wrong partition
  312. return static_cast<uint8_t>(astc::min(v0, v1, v2, v3) / 2);
  313. }
// Function pointer type for a routine that takes two bitvector arrays and returns a count.
// NOTE(review): nothing in the visible source references this alias, and its unsigned int return
// type differs from the uint8_t returned by partition_mismatch2/3/4 - confirm it is still needed.
using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*);
  315. /**
  316. * @brief Count the partition table mismatches vs the data clustering.
  317. *
  318. * @param bsd The block size information.
  319. * @param partition_count The number of partitions in the block.
  320. * @param bitmaps The block texel partition assignment patterns.
  321. * @param[out] mismatch_counts The array storing per partitioning mismatch counts.
  322. */
  323. static void count_partition_mismatch_bits(
  324. const block_size_descriptor& bsd,
  325. unsigned int partition_count,
  326. const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
  327. uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]
  328. ) {
  329. unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
  330. promise(active_count > 0);
  331. if (partition_count == 2)
  332. {
  333. for (unsigned int i = 0; i < active_count; i++)
  334. {
  335. mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
  336. assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
  337. assert(mismatch_counts[i] < bsd.texel_count);
  338. }
  339. }
  340. else if (partition_count == 3)
  341. {
  342. for (unsigned int i = 0; i < active_count; i++)
  343. {
  344. mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
  345. assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
  346. assert(mismatch_counts[i] < bsd.texel_count);
  347. }
  348. }
  349. else
  350. {
  351. for (unsigned int i = 0; i < active_count; i++)
  352. {
  353. mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
  354. assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
  355. assert(mismatch_counts[i] < bsd.texel_count);
  356. }
  357. }
  358. }
  359. /**
  360. * @brief Use counting sort on the mismatch array to sort partition candidates.
  361. *
  362. * @param partitioning_count The number of packed partitionings.
  363. * @param mismatch_count Partitioning mismatch counts, in index order.
  364. * @param[out] partition_ordering Partition index values, in mismatch order.
  365. *
  366. * @return The number of active partitions in this selection.
  367. */
  368. static unsigned int get_partition_ordering_by_mismatch_bits(
  369. unsigned int texel_count,
  370. unsigned int partitioning_count,
  371. const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS],
  372. uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
  373. ) {
  374. promise(partitioning_count > 0);
  375. uint16_t mscount[BLOCK_MAX_KMEANS_TEXELS] { 0 };
  376. // Create the histogram of mismatch counts
  377. for (unsigned int i = 0; i < partitioning_count; i++)
  378. {
  379. mscount[mismatch_count[i]]++;
  380. }
  381. // Create a running sum from the histogram array
  382. // Cells store previous values only; i.e. exclude self after sum
  383. unsigned int sum = 0;
  384. for (unsigned int i = 0; i < texel_count; i++)
  385. {
  386. uint16_t cnt = mscount[i];
  387. mscount[i] = sum;
  388. sum += cnt;
  389. }
  390. // Use the running sum as the index, incrementing after read to allow
  391. // sequential entries with the same count
  392. for (unsigned int i = 0; i < partitioning_count; i++)
  393. {
  394. unsigned int idx = mscount[mismatch_count[i]]++;
  395. partition_ordering[idx] = static_cast<uint16_t>(i);
  396. }
  397. return partitioning_count;
  398. }
  399. /**
  400. * @brief Use k-means clustering to compute a partition ordering for a block..
  401. *
  402. * @param bsd The block size information.
  403. * @param blk The image block color data to compress.
  404. * @param partition_count The desired number of partitions in the block.
  405. * @param[out] partition_ordering The list of recommended partition indices, in priority order.
  406. *
  407. * @return The number of active partitionings in this selection.
  408. */
  409. static unsigned int compute_kmeans_partition_ordering(
  410. const block_size_descriptor& bsd,
  411. const image_block& blk,
  412. unsigned int partition_count,
  413. uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
  414. ) {
  415. vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];
  416. uint8_t texel_partitions[BLOCK_MAX_TEXELS];
  417. // Use three passes of k-means clustering to partition the block data
  418. for (unsigned int i = 0; i < 3; i++)
  419. {
  420. if (i == 0)
  421. {
  422. kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers);
  423. }
  424. else
  425. {
  426. kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
  427. }
  428. kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
  429. }
  430. // Construct the block bitmaps of texel assignments to each partition
  431. uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 };
  432. unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
  433. promise(texels_to_process > 0);
  434. for (unsigned int i = 0; i < texels_to_process; i++)
  435. {
  436. unsigned int idx = bsd.kmeans_texels[i];
  437. bitmaps[texel_partitions[idx]] |= 1ULL << i;
  438. }
  439. // Count the mismatch between the block and the format's partition tables
  440. uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS];
  441. count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
  442. // Sort the partitions based on the number of mismatched bits
  443. return get_partition_ordering_by_mismatch_bits(
  444. texels_to_process,
  445. bsd.partitioning_count_selected[partition_count - 1],
  446. mismatch_counts, partition_ordering);
  447. }
  448. /**
  449. * @brief Insert a partitioning into an order list of results, sorted by error.
  450. *
  451. * @param max_values The max number of entries in the best result arrays.
  452. * @param this_error The error of the new entry.
  453. * @param this_partition The partition ID of the new entry.
  454. * @param[out] best_errors The array of best error values.
  455. * @param[out] best_partitions The array of best partition values.
  456. */
  457. static void insert_result(
  458. unsigned int max_values,
  459. float this_error,
  460. unsigned int this_partition,
  461. float* best_errors,
  462. unsigned int* best_partitions)
  463. {
  464. promise(max_values > 0);
  465. // Don't bother searching if the current worst error beats the new error
  466. if (this_error >= best_errors[max_values - 1])
  467. {
  468. return;
  469. }
  470. // Else insert into the list in error-order
  471. for (unsigned int i = 0; i < max_values; i++)
  472. {
  473. // Existing result is better - move on ...
  474. if (this_error > best_errors[i])
  475. {
  476. continue;
  477. }
  478. // Move existing results down one
  479. for (unsigned int j = max_values - 1; j > i; j--)
  480. {
  481. best_errors[j] = best_errors[j - 1];
  482. best_partitions[j] = best_partitions[j - 1];
  483. }
  484. // Insert new result
  485. best_errors[i] = this_error;
  486. best_partitions[i] = this_partition;
  487. break;
  488. }
  489. }
  490. /* See header for documentation. */
  491. unsigned int find_best_partition_candidates(
  492. const block_size_descriptor& bsd,
  493. const image_block& blk,
  494. unsigned int partition_count,
  495. unsigned int partition_search_limit,
  496. unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
  497. unsigned int requested_candidates
  498. ) {
  499. // Constant used to estimate quantization error for a given partitioning; the optimal value for
  500. // this depends on bitrate. These values have been determined empirically.
  501. unsigned int texels_per_block = bsd.texel_count;
  502. float weight_imprecision_estim = 0.055f;
  503. if (texels_per_block <= 20)
  504. {
  505. weight_imprecision_estim = 0.03f;
  506. }
  507. else if (texels_per_block <= 31)
  508. {
  509. weight_imprecision_estim = 0.04f;
  510. }
  511. else if (texels_per_block <= 41)
  512. {
  513. weight_imprecision_estim = 0.05f;
  514. }
  515. promise(partition_count > 0);
  516. promise(partition_search_limit > 0);
  517. weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
  518. uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS];
  519. unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
  520. partition_search_limit = astc::min(partition_search_limit, sequence_len);
  521. requested_candidates = astc::min(partition_search_limit, requested_candidates);
  522. bool uses_alpha = !blk.is_constant_channel(3);
  523. // Partitioning errors assuming uncorrelated-chrominance endpoints
  524. float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
  525. unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
  526. // Partitioning errors assuming same-chrominance endpoints
  527. float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
  528. unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
  529. for (unsigned int i = 0; i < requested_candidates; i++)
  530. {
  531. uncor_best_errors[i] = ERROR_CALC_DEFAULT;
  532. samec_best_errors[i] = ERROR_CALC_DEFAULT;
  533. }
  534. if (uses_alpha)
  535. {
  536. for (unsigned int i = 0; i < partition_search_limit; i++)
  537. {
  538. unsigned int partition = partition_sequence[i];
  539. const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
  540. // Compute weighting to give to each component in each partition
  541. partition_metrics pms[BLOCK_MAX_PARTITIONS];
  542. compute_avgs_and_dirs_4_comp(pi, blk, pms);
  543. line4 uncor_lines[BLOCK_MAX_PARTITIONS];
  544. line4 samec_lines[BLOCK_MAX_PARTITIONS];
  545. processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS];
  546. processed_line4 samec_plines[BLOCK_MAX_PARTITIONS];
  547. float line_lengths[BLOCK_MAX_PARTITIONS];
  548. for (unsigned int j = 0; j < partition_count; j++)
  549. {
  550. partition_metrics& pm = pms[j];
  551. uncor_lines[j].a = pm.avg;
  552. uncor_lines[j].b = normalize_safe(pm.dir, unit4());
  553. uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);
  554. uncor_plines[j].bs = uncor_lines[j].b;
  555. samec_lines[j].a = vfloat4::zero();
  556. samec_lines[j].b = normalize_safe(pm.avg, unit4());
  557. samec_plines[j].amod = vfloat4::zero();
  558. samec_plines[j].bs = samec_lines[j].b;
  559. }
  560. float uncor_error = 0.0f;
  561. float samec_error = 0.0f;
  562. compute_error_squared_rgba(pi,
  563. blk,
  564. uncor_plines,
  565. samec_plines,
  566. line_lengths,
  567. uncor_error,
  568. samec_error);
  569. // Compute an estimate of error introduced by weight quantization imprecision.
  570. // This error is computed as follows, for each partition
  571. // 1: compute the principal-axis vector (full length) in error-space
  572. // 2: convert the principal-axis vector to regular RGB-space
  573. // 3: scale the vector by a constant that estimates average quantization error
  574. // 4: for each texel, square the vector, then do a dot-product with the texel's
  575. // error weight; sum up the results across all texels.
  576. // 4(optimized): square the vector once, then do a dot-product with the average
  577. // texel error, then multiply by the number of texels.
  578. for (unsigned int j = 0; j < partition_count; j++)
  579. {
  580. float tpp = static_cast<float>(pi.partition_texel_count[j]);
  581. vfloat4 error_weights(tpp * weight_imprecision_estim);
  582. vfloat4 uncor_vector = uncor_lines[j].b * line_lengths[j];
  583. vfloat4 samec_vector = samec_lines[j].b * line_lengths[j];
  584. uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);
  585. samec_error += dot_s(samec_vector * samec_vector, error_weights);
  586. }
  587. insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
  588. insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
  589. }
  590. }
  591. else
  592. {
  593. for (unsigned int i = 0; i < partition_search_limit; i++)
  594. {
  595. unsigned int partition = partition_sequence[i];
  596. const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
  597. // Compute weighting to give to each component in each partition
  598. partition_metrics pms[BLOCK_MAX_PARTITIONS];
  599. compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
  600. partition_lines3 plines[BLOCK_MAX_PARTITIONS];
  601. for (unsigned int j = 0; j < partition_count; j++)
  602. {
  603. partition_metrics& pm = pms[j];
  604. partition_lines3& pl = plines[j];
  605. pl.uncor_line.a = pm.avg;
  606. pl.uncor_line.b = normalize_safe(pm.dir, unit3());
  607. pl.samec_line.a = vfloat4::zero();
  608. pl.samec_line.b = normalize_safe(pm.avg, unit3());
  609. pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);
  610. pl.uncor_pline.bs = pl.uncor_line.b;
  611. pl.samec_pline.amod = vfloat4::zero();
  612. pl.samec_pline.bs = pl.samec_line.b;
  613. }
  614. float uncor_error = 0.0f;
  615. float samec_error = 0.0f;
  616. compute_error_squared_rgb(pi,
  617. blk,
  618. plines,
  619. uncor_error,
  620. samec_error);
  621. // Compute an estimate of error introduced by weight quantization imprecision.
  622. // This error is computed as follows, for each partition
  623. // 1: compute the principal-axis vector (full length) in error-space
  624. // 2: convert the principal-axis vector to regular RGB-space
  625. // 3: scale the vector by a constant that estimates average quantization error
  626. // 4: for each texel, square the vector, then do a dot-product with the texel's
  627. // error weight; sum up the results across all texels.
  628. // 4(optimized): square the vector once, then do a dot-product with the average
  629. // texel error, then multiply by the number of texels.
  630. for (unsigned int j = 0; j < partition_count; j++)
  631. {
  632. partition_lines3& pl = plines[j];
  633. float tpp = static_cast<float>(pi.partition_texel_count[j]);
  634. vfloat4 error_weights(tpp * weight_imprecision_estim);
  635. vfloat4 uncor_vector = pl.uncor_line.b * pl.line_length;
  636. vfloat4 samec_vector = pl.samec_line.b * pl.line_length;
  637. uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);
  638. samec_error += dot3_s(samec_vector * samec_vector, error_weights);
  639. }
  640. insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
  641. insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
  642. }
  643. }
  644. unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES];
  645. for (unsigned int i = 0; i < requested_candidates; i++)
  646. {
  647. interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
  648. interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
  649. }
  650. uint64_t bitmasks[1024/64] { 0 };
  651. unsigned int emitted = 0;
  652. // Deduplicate the first "requested" entries
  653. for (unsigned int i = 0; i < requested_candidates * 2; i++)
  654. {
  655. unsigned int partition = interleave[i];
  656. unsigned int word = partition / 64;
  657. unsigned int bit = partition % 64;
  658. bool written = bitmasks[word] & (1ull << bit);
  659. if (!written)
  660. {
  661. best_partitions[emitted] = partition;
  662. bitmasks[word] |= 1ull << bit;
  663. emitted++;
  664. if (emitted == requested_candidates)
  665. {
  666. break;
  667. }
  668. }
  669. }
  670. return emitted;
  671. }
  672. #endif