ConvectionKernels_BC67.cpp 159 KB


  1. /*
  2. Convection Texture Tools
  3. Copyright (c) 2018-2019 Eric Lasota
  4. Permission is hereby granted, free of charge, to any person obtaining
  5. a copy of this software and associated documentation files (the
  6. "Software"), to deal in the Software without restriction, including
  7. without limitation the rights to use, copy, modify, merge, publish,
  8. distribute, sublicense, and/or sell copies of the Software, and to
  9. permit persons to whom the Software is furnished to do so, subject
  10. to the following conditions:
  11. The above copyright notice and this permission notice shall be included
  12. in all copies or substantial portions of the Software.
  13. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  14. OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  15. MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  16. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  17. CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  18. TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  19. SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  20. -------------------------------------------------------------------------------------
  21. Portions based on DirectX Texture Library (DirectXTex)
  22. Copyright (c) Microsoft Corporation. All rights reserved.
  23. Licensed under the MIT License.
  24. http://go.microsoft.com/fwlink/?LinkId=248926
  25. */
  26. #include "ConvectionKernels_Config.h"
  27. #if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
  28. #include "ConvectionKernels_BC67.h"
  29. #include "ConvectionKernels_AggregatedError.h"
  30. #include "ConvectionKernels_BCCommon.h"
  31. #include "ConvectionKernels_BC7_Prio.h"
  32. #include "ConvectionKernels_BC7_SingleColor.h"
  33. #include "ConvectionKernels_BC6H_IO.h"
  34. #include "ConvectionKernels_EndpointRefiner.h"
  35. #include "ConvectionKernels_EndpointSelector.h"
  36. #include "ConvectionKernels_IndexSelectorHDR.h"
  37. #include "ConvectionKernels_ParallelMath.h"
  38. #include "ConvectionKernels_UnfinishedEndpoints.h"
  39. namespace cvtt
  40. {
  41. namespace Internal
  42. {
  43. namespace BC67
  44. {
// Short local aliases for ParallelMath::Float and ParallelMath::UInt15,
// used by the declarations in this namespace.
typedef ParallelMath::Float MFloat;
typedef ParallelMath::UInt15 MUInt15;
// Record holding one candidate encoding: a mode, an error value, endpoint
// values and two sets of 16 indexes, plus mode-dependent extra data in m_u.
// NOTE(review): the per-field meanings below are inferred from the member
// names only; confirm against the code that fills this struct.
struct WorkInfo
{
MUInt15 m_mode; // mode value for this candidate
MFloat m_error; // error value associated with this candidate
MUInt15 m_ep[3][2][4]; // endpoints; presumably [subset][endpoint][channel] -- TODO confirm
MUInt15 m_indexes[16]; // 16 index values; presumably one per pixel of a 4x4 block
MUInt15 m_indexes2[16]; // second set of 16 index values
// m_partition and m_isr share storage (union); only one of them is
// meaningful at a time. Presumably which one depends on m_mode -- confirm.
union
{
MUInt15 m_partition;
struct IndexSelectorAndRotation
{
MUInt15 m_indexSelector;
MUInt15 m_rotation;
} m_isr;
} m_u;
};
  64. }
  65. namespace BC6HData
  66. {
  67. enum EField
  68. {
  69. NA, // N/A
  70. M, // Mode
  71. D, // Shape
  72. RW,
  73. RX,
  74. RY,
  75. RZ,
  76. GW,
  77. GX,
  78. GY,
  79. GZ,
  80. BW,
  81. BX,
  82. BY,
  83. BZ,
  84. };
// One entry of a mode's bit layout: a field tag plus a bit number.
// In g_modeDescriptors the bit number counts up within each field
// (e.g. { RW, 0 } ... { RW, 9 }), i.e. it is the bit index inside that field.
struct ModeDescriptor
{
EField m_eField; // field this entry belongs to (NA for unused entries)
uint8_t m_uBit; // bit index within m_eField
};
// Bit-layout table: 14 rows, one per mode in the order given by the
// "Mode N (id) - precisions" comments, each holding 82 { field, bit } entries.
// The rows for modes 11-14 use fewer entries and pad the tail with { NA, 0 }.
// NOTE(review): presumably entry i of a row describes header bit i of the
// encoded block -- confirm against the packing/unpacking code.
const ModeDescriptor g_modeDescriptors[14][82] =
{
{ // Mode 1 (0x00) - 10 5 5 5
{ M, 0 },{ M, 1 },{ GY, 4 },{ BY, 4 },{ BZ, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
{ RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
{ GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
{ BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
{ GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
{ BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
{ BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
{ BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
{ D, 3 },{ D, 4 },
},
{ // Mode 2 (0x01) - 7 6 6 6
{ M, 0 },{ M, 1 },{ GY, 5 },{ GZ, 4 },{ GZ, 5 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
{ RW, 5 },{ RW, 6 },{ BZ, 0 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
{ GW, 5 },{ GW, 6 },{ BY, 5 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
{ BW, 5 },{ BW, 6 },{ BZ, 3 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
{ RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
{ GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
{ BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
{ RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
{ D, 3 },{ D, 4 },
},
{ // Mode 3 (0x02) - 11 5 4 4
{ M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
{ RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
{ GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
{ BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
{ RW,10 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,10 },
{ BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,10 },
{ BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
{ BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
{ D, 3 },{ D, 4 },
},
{ // Mode 4 (0x06) - 11 4 5 4
{ M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
{ RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
{ GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
{ BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,10 },
{ GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
{ GW,10 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,10 },
{ BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ BZ, 0 },
{ BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ GY, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
{ D, 3 },{ D, 4 },
},
{ // Mode 5 (0x0a) - 11 4 4 5
{ M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
{ RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
{ GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
{ BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,10 },
{ BY, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,10 },
{ BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
{ BW,10 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ BZ, 1 },
{ BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ BZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
{ D, 3 },{ D, 4 },
},
{ // Mode 6 (0x0e) - 9 5 5 5
{ M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
{ RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
{ GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
{ BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
{ GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
{ BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
{ BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
{ BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
{ D, 3 },{ D, 4 },
},
{ // Mode 7 (0x12) - 8 6 5 5
{ M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
{ RW, 5 },{ RW, 6 },{ RW, 7 },{ GZ, 4 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
{ GW, 5 },{ GW, 6 },{ GW, 7 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
{ BW, 5 },{ BW, 6 },{ BW, 7 },{ BZ, 3 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
{ RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
{ BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
{ BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
{ RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
{ D, 3 },{ D, 4 },
},
{ // Mode 8 (0x16) - 8 5 6 5
{ M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
{ RW, 5 },{ RW, 6 },{ RW, 7 },{ BZ, 0 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
{ GW, 5 },{ GW, 6 },{ GW, 7 },{ GY, 5 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
{ BW, 5 },{ BW, 6 },{ BW, 7 },{ GZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
{ GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
{ GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
{ BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
{ BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
{ D, 3 },{ D, 4 },
},
{ // Mode 9 (0x1a) - 8 5 5 6
{ M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
{ RW, 5 },{ RW, 6 },{ RW, 7 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
{ GW, 5 },{ GW, 6 },{ GW, 7 },{ BY, 5 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
{ BW, 5 },{ BW, 6 },{ BW, 7 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
{ GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
{ BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
{ BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
{ BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
{ D, 3 },{ D, 4 },
},
{ // Mode 10 (0x1e) - 6 6 6 6
{ M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
{ RW, 5 },{ GZ, 4 },{ BZ, 0 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
{ GW, 5 },{ GY, 5 },{ BY, 5 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
{ BW, 5 },{ GZ, 5 },{ BZ, 3 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
{ RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
{ GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
{ BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
{ RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
{ D, 3 },{ D, 4 },
},
{ // Mode 11 (0x03) - 10 10
{ M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
{ RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
{ GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
{ BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
{ RX, 5 },{ RX, 6 },{ RX, 7 },{ RX, 8 },{ RX, 9 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
{ GX, 5 },{ GX, 6 },{ GX, 7 },{ GX, 8 },{ GX, 9 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
{ BX, 5 },{ BX, 6 },{ BX, 7 },{ BX, 8 },{ BX, 9 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
{ NA, 0 },{ NA, 0 },
},
{ // Mode 12 (0x07) - 11 9
{ M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
{ RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
{ GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
{ BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
{ RX, 5 },{ RX, 6 },{ RX, 7 },{ RX, 8 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
{ GX, 5 },{ GX, 6 },{ GX, 7 },{ GX, 8 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
{ BX, 5 },{ BX, 6 },{ BX, 7 },{ BX, 8 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
{ NA, 0 },{ NA, 0 },
},
{ // Mode 13 (0x0b) - 12 8
{ M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
{ RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
{ GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
{ BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
{ RX, 5 },{ RX, 6 },{ RX, 7 },{ RW,11 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
{ GX, 5 },{ GX, 6 },{ GX, 7 },{ GW,11 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
{ BX, 5 },{ BX, 6 },{ BX, 7 },{ BW,11 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
{ NA, 0 },{ NA, 0 },
},
{ // Mode 14 (0x0f) - 16 4
{ M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
{ RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
{ GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
{ BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,15 },
{ RW,14 },{ RW,13 },{ RW,12 },{ RW,11 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,15 },
{ GW,14 },{ GW,13 },{ GW,12 },{ GW,11 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,15 },
{ BW,14 },{ BW,13 },{ BW,12 },{ BW,11 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
{ NA, 0 },{ NA, 0 },
},
};
  247. }
  248. namespace BC7Data
  249. {
  250. enum AlphaMode
  251. {
  252. AlphaMode_Combined,
  253. AlphaMode_Separate,
  254. AlphaMode_None,
  255. };
  256. enum PBitMode
  257. {
  258. PBitMode_PerEndpoint,
  259. PBitMode_PerSubset,
  260. PBitMode_None
  261. };
// Static description of one BC7 mode; g_modes holds one of these per mode 0-7.
// Member order matters: g_modes initializes these positionally.
struct BC7ModeInfo
{
PBitMode m_pBitMode; // P-bit arrangement (per endpoint, per subset, or none)
AlphaMode m_alphaMode; // how alpha is handled (combined, separate, or none)
int m_rgbBits; // bit count for RGB endpoint components
int m_alphaBits; // bit count for alpha endpoint components (0 when there is no alpha)
int m_partitionBits; // bit count of the partition field (0 in the single-subset modes)
int m_numSubsets; // number of subsets (1, 2 or 3 in g_modes)
int m_indexBits; // bits per index
int m_alphaIndexBits; // bits per index of the second index set (0 when unused)
bool m_hasIndexSelector; // true only for mode 4 in g_modes
};
  274. BC7ModeInfo g_modes[] =
  275. {
  276. { PBitMode_PerEndpoint, AlphaMode_None, 4, 0, 4, 3, 3, 0, false }, // 0
  277. { PBitMode_PerSubset, AlphaMode_None, 6, 0, 6, 2, 3, 0, false }, // 1
  278. { PBitMode_None, AlphaMode_None, 5, 0, 6, 3, 2, 0, false }, // 2
  279. { PBitMode_PerEndpoint, AlphaMode_None, 7, 0, 6, 2, 2, 0, false }, // 3 (Mode reference has an error, P-bit is really per-endpoint)
  280. { PBitMode_None, AlphaMode_Separate, 5, 6, 0, 1, 2, 3, true }, // 4
  281. { PBitMode_None, AlphaMode_Separate, 7, 8, 0, 1, 2, 2, false }, // 5
  282. { PBitMode_PerEndpoint, AlphaMode_Combined, 7, 7, 0, 1, 4, 0, false }, // 6
  283. { PBitMode_PerEndpoint, AlphaMode_Combined, 5, 5, 6, 2, 2, 0, false } // 7
  284. };
  285. const int g_weight2[] = { 0, 21, 43, 64 };
  286. const int g_weight3[] = { 0, 9, 18, 27, 37, 46, 55, 64 };
  287. const int g_weight4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
  288. const int *g_weightTables[] =
  289. {
  290. NULL,
  291. NULL,
  292. g_weight2,
  293. g_weight3,
  294. g_weight4
  295. };
// Static description of one BC6H mode; g_hdrModes holds one of these per mode.
// The values in g_hdrModes line up with the "Mode N (id) - precisions"
// comments on BC6HData::g_modeDescriptors: m_modeID is the id in parentheses,
// m_aPrec the first precision listed and m_bPrec the remaining precision(s).
struct BC6HModeInfo
{
uint16_t m_modeID; // mode identifier value (e.g. 0x00, 0x1e)
bool m_partitioned; // true for the first ten entries of g_hdrModes, false for the last four
bool m_transformed; // per-mode flag; NOTE(review): presumably "endpoints stored as deltas" -- confirm
int m_aPrec; // first (largest) precision of the mode, in bits
int m_bPrec[3]; // three further precisions; presumably one per color channel -- confirm
};
  304. // [partitioned][precision]
  305. bool g_hdrModesExistForPrecision[2][17] =
  306. {
  307. //0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
  308. { false, false, false, false, false, false, false, false, false, false, true, true, true, false, false, false, true },
  309. { false, false, false, false, false, false, true, true, true, true, true, true, false, false, false, false, false },
  310. };
  311. BC6HModeInfo g_hdrModes[] =
  312. {
  313. { 0x00, true, true, 10,{ 5, 5, 5 } },
  314. { 0x01, true, true, 7,{ 6, 6, 6 } },
  315. { 0x02, true, true, 11,{ 5, 4, 4 } },
  316. { 0x06, true, true, 11,{ 4, 5, 4 } },
  317. { 0x0a, true, true, 11,{ 4, 4, 5 } },
  318. { 0x0e, true, true, 9,{ 5, 5, 5 } },
  319. { 0x12, true, true, 8,{ 6, 5, 5 } },
  320. { 0x16, true, true, 8,{ 5, 6, 5 } },
  321. { 0x1a, true, true, 8,{ 5, 5, 6 } },
  322. { 0x1e, true, false, 6,{ 6, 6, 6 } },
  323. { 0x03, false, false, 10,{ 10, 10, 10 } },
  324. { 0x07, false, true, 11,{ 9, 9, 9 } },
  325. { 0x0b, false, true, 12,{ 8, 8, 8 } },
  326. { 0x0f, false, true, 16,{ 4, 4, 4 } },
  327. };
  328. const int g_maxHDRPrecision = 16;
  329. static const size_t g_numHDRModes = sizeof(g_hdrModes) / sizeof(g_hdrModes[0]);
  330. static uint16_t g_partitionMap[64] =
  331. {
  332. 0xCCCC, 0x8888, 0xEEEE, 0xECC8,
  333. 0xC880, 0xFEEC, 0xFEC8, 0xEC80,
  334. 0xC800, 0xFFEC, 0xFE80, 0xE800,
  335. 0xFFE8, 0xFF00, 0xFFF0, 0xF000,
  336. 0xF710, 0x008E, 0x7100, 0x08CE,
  337. 0x008C, 0x7310, 0x3100, 0x8CCE,
  338. 0x088C, 0x3110, 0x6666, 0x366C,
  339. 0x17E8, 0x0FF0, 0x718E, 0x399C,
  340. 0xaaaa, 0xf0f0, 0x5a5a, 0x33cc,
  341. 0x3c3c, 0x55aa, 0x9696, 0xa55a,
  342. 0x73ce, 0x13c8, 0x324c, 0x3bdc,
  343. 0x6996, 0xc33c, 0x9966, 0x660,
  344. 0x272, 0x4e4, 0x4e40, 0x2720,
  345. 0xc936, 0x936c, 0x39c6, 0x639c,
  346. 0x9336, 0x9cc6, 0x817e, 0xe718,
  347. 0xccf0, 0xfcc, 0x7744, 0xee22,
  348. };
  349. static uint32_t g_partitionMap2[64] =
  350. {
  351. 0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8,
  352. 0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050,
  353. 0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090,
  354. 0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250,
  355. 0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0,
  356. 0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500,
  357. 0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400,
  358. 0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200,
  359. 0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424,
  360. 0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50,
  361. 0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0,
  362. 0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600,
  363. 0xaa444444, 0x54a854a8, 0x95809580, 0x96969600,
  364. 0xa85454a8, 0x80959580, 0xaa141414, 0x96960000,
  365. 0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000,
  366. 0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254,
  367. };
  368. static int g_fixupIndexes2[64] =
  369. {
  370. 15,15,15,15,
  371. 15,15,15,15,
  372. 15,15,15,15,
  373. 15,15,15,15,
  374. 15, 2, 8, 2,
  375. 2, 8, 8,15,
  376. 2, 8, 2, 2,
  377. 8, 8, 2, 2,
  378. 15,15, 6, 8,
  379. 2, 8,15,15,
  380. 2, 8, 2, 2,
  381. 2,15,15, 6,
  382. 6, 2, 6, 8,
  383. 15,15, 2, 2,
  384. 15,15,15,15,
  385. 15, 2, 2,15,
  386. };
  387. static int g_fixupIndexes3[64][2] =
  388. {
  389. { 3,15 },{ 3, 8 },{ 15, 8 },{ 15, 3 },
  390. { 8,15 },{ 3,15 },{ 15, 3 },{ 15, 8 },
  391. { 8,15 },{ 8,15 },{ 6,15 },{ 6,15 },
  392. { 6,15 },{ 5,15 },{ 3,15 },{ 3, 8 },
  393. { 3,15 },{ 3, 8 },{ 8,15 },{ 15, 3 },
  394. { 3,15 },{ 3, 8 },{ 6,15 },{ 10, 8 },
  395. { 5, 3 },{ 8,15 },{ 8, 6 },{ 6,10 },
  396. { 8,15 },{ 5,15 },{ 15,10 },{ 15, 8 },
  397. { 8,15 },{ 15, 3 },{ 3,15 },{ 5,10 },
  398. { 6,10 },{ 10, 8 },{ 8, 9 },{ 15,10 },
  399. { 15, 6 },{ 3,15 },{ 15, 8 },{ 5,15 },
  400. { 15, 3 },{ 15, 6 },{ 15, 6 },{ 15, 8 },
  401. { 3,15 },{ 15, 3 },{ 5,15 },{ 5,15 },
  402. { 5,15 },{ 8,15 },{ 5,15 },{ 10,15 },
  403. { 5,15 },{ 10,15 },{ 8,15 },{ 13,15 },
  404. { 15, 3 },{ 12,15 },{ 3,15 },{ 3, 8 },
  405. };
// Flat pool of index runs. Each source row is one run of ascending values in
// 0..15; its trailing comment gives "start offset in this array, run length".
// g_shapeRanges lists the same { offset, length } pairs, so the offsets in
// these comments must stay in sync with the data when rows are edited.
// NOTE(review): the values are presumably pixel positions within a 4x4 block
// -- confirm against the code that walks g_shapeRanges.
static const unsigned char g_fragments[] =
{
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 0, 16
0, 1, 2, 3, // 16, 4
0, 1, 4, // 20, 3
0, 1, 2, 4, // 23, 4
2, 3, 7, // 27, 3
1, 2, 3, 7, // 30, 4
0, 1, 2, 3, 4, 5, 6, 7, // 34, 8
0, 1, 4, 8, // 42, 4
0, 1, 2, 4, 5, 8, // 46, 6
0, 1, 2, 3, 4, 5, 6, 8, // 52, 8
1, 4, 5, 6, 9, // 60, 5
2, 5, 6, 7, 10, // 65, 5
5, 6, 9, 10, // 70, 4
2, 3, 7, 11, // 74, 4
1, 2, 3, 6, 7, 11, // 78, 6
0, 1, 2, 3, 5, 6, 7, 11, // 84, 8
0, 1, 2, 3, 8, 9, 10, 11, // 92, 8
2, 3, 6, 7, 8, 9, 10, 11, // 100, 8
4, 5, 6, 7, 8, 9, 10, 11, // 108, 8
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // 116, 12
0, 4, 8, 12, // 128, 4
0, 2, 3, 4, 6, 7, 8, 12, // 132, 8
0, 1, 2, 4, 5, 8, 9, 12, // 140, 8
0, 1, 2, 3, 4, 5, 6, 8, 9, 12, // 148, 10
3, 6, 7, 8, 9, 12, // 158, 6
3, 5, 6, 7, 8, 9, 10, 12, // 164, 8
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, // 172, 12
0, 1, 2, 5, 6, 7, 11, 12, // 184, 8
5, 8, 9, 10, 13, // 192, 5
8, 12, 13, // 197, 3
4, 8, 12, 13, // 200, 4
2, 3, 6, 9, 12, 13, // 204, 6
0, 1, 2, 3, 8, 9, 12, 13, // 210, 8
0, 1, 4, 5, 8, 9, 12, 13, // 218, 8
2, 3, 6, 7, 8, 9, 12, 13, // 226, 8
2, 3, 5, 6, 9, 10, 12, 13, // 234, 8
0, 3, 6, 7, 9, 10, 12, 13, // 242, 8
0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, // 250, 12
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, // 262, 13
2, 3, 4, 7, 8, 11, 12, 13, // 275, 8
1, 2, 6, 7, 8, 11, 12, 13, // 283, 8
2, 3, 4, 6, 7, 8, 9, 11, 12, 13, // 291, 10
2, 3, 4, 5, 10, 11, 12, 13, // 301, 8
0, 1, 6, 7, 10, 11, 12, 13, // 309, 8
6, 9, 10, 11, 14, // 317, 5
0, 2, 4, 6, 8, 10, 12, 14, // 322, 8
1, 3, 5, 7, 8, 10, 12, 14, // 330, 8
1, 3, 4, 6, 9, 11, 12, 14, // 338, 8
0, 2, 5, 7, 9, 11, 12, 14, // 346, 8
0, 3, 4, 5, 8, 9, 13, 14, // 354, 8
2, 3, 4, 7, 8, 9, 13, 14, // 362, 8
1, 2, 5, 6, 9, 10, 13, 14, // 370, 8
0, 3, 4, 7, 9, 10, 13, 14, // 378, 8
0, 3, 5, 6, 8, 11, 13, 14, // 386, 8
1, 2, 4, 7, 8, 11, 13, 14, // 394, 8
0, 1, 4, 7, 10, 11, 13, 14, // 402, 8
0, 3, 6, 7, 10, 11, 13, 14, // 410, 8
8, 12, 13, 14, // 418, 4
1, 2, 3, 7, 8, 12, 13, 14, // 422, 8
4, 8, 9, 12, 13, 14, // 430, 6
0, 4, 5, 8, 9, 12, 13, 14, // 436, 8
1, 2, 3, 6, 7, 8, 9, 12, 13, 14, // 444, 10
2, 6, 8, 9, 10, 12, 13, 14, // 454, 8
0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, // 462, 12
0, 7, 9, 10, 11, 12, 13, 14, // 474, 8
1, 2, 3, 4, 5, 6, 8, 15, // 482, 8
3, 7, 11, 15, // 490, 4
0, 1, 3, 4, 5, 7, 11, 15, // 494, 8
0, 4, 5, 10, 11, 15, // 502, 6
1, 2, 3, 6, 7, 10, 11, 15, // 508, 8
0, 1, 2, 3, 5, 6, 7, 10, 11, 15, // 516, 10
0, 4, 5, 6, 9, 10, 11, 15, // 526, 8
0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 15, // 534, 12
1, 2, 4, 5, 8, 9, 12, 15, // 546, 8
2, 3, 5, 6, 8, 9, 12, 15, // 554, 8
0, 3, 5, 6, 9, 10, 12, 15, // 562, 8
1, 2, 4, 7, 9, 10, 12, 15, // 570, 8
1, 2, 5, 6, 8, 11, 12, 15, // 578, 8
0, 3, 4, 7, 8, 11, 12, 15, // 586, 8
0, 1, 5, 6, 10, 11, 12, 15, // 594, 8
1, 2, 6, 7, 10, 11, 12, 15, // 602, 8
1, 3, 4, 6, 8, 10, 13, 15, // 610, 8
0, 2, 5, 7, 8, 10, 13, 15, // 618, 8
0, 2, 4, 6, 9, 11, 13, 15, // 626, 8
1, 3, 5, 7, 9, 11, 13, 15, // 634, 8
0, 1, 2, 3, 4, 5, 7, 8, 12, 13, 15, // 642, 11
2, 3, 4, 5, 8, 9, 14, 15, // 653, 8
0, 1, 6, 7, 8, 9, 14, 15, // 661, 8
0, 1, 5, 10, 14, 15, // 669, 6
0, 3, 4, 5, 9, 10, 14, 15, // 675, 8
0, 1, 5, 6, 9, 10, 14, 15, // 683, 8
11, 14, 15, // 691, 3
7, 11, 14, 15, // 694, 4
1, 2, 4, 5, 8, 11, 14, 15, // 698, 8
0, 1, 4, 7, 8, 11, 14, 15, // 706, 8
0, 1, 4, 5, 10, 11, 14, 15, // 714, 8
2, 3, 6, 7, 10, 11, 14, 15, // 722, 8
4, 5, 6, 7, 10, 11, 14, 15, // 730, 8
0, 1, 4, 5, 7, 8, 10, 11, 14, 15, // 738, 10
0, 1, 2, 3, 5, 6, 7, 9, 10, 11, 14, 15, // 748, 12
0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 14, 15, // 760, 13
0, 1, 2, 3, 4, 6, 7, 11, 12, 14, 15, // 773, 11
3, 4, 8, 9, 10, 13, 14, 15, // 784, 8
11, 13, 14, 15, // 792, 4
0, 1, 2, 4, 11, 13, 14, 15, // 796, 8
0, 1, 2, 4, 5, 10, 11, 13, 14, 15, // 804, 10
7, 10, 11, 13, 14, 15, // 814, 6
3, 6, 7, 10, 11, 13, 14, 15, // 820, 8
1, 5, 9, 10, 11, 13, 14, 15, // 828, 8
1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, // 836, 12
12, 13, 14, 15, // 848, 4
0, 1, 2, 3, 12, 13, 14, 15, // 852, 8
0, 1, 4, 5, 12, 13, 14, 15, // 860, 8
4, 5, 6, 7, 12, 13, 14, 15, // 868, 8
4, 8, 9, 10, 12, 13, 14, 15, // 876, 8
0, 4, 5, 8, 9, 10, 12, 13, 14, 15, // 884, 10
0, 1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, // 894, 12
0, 1, 2, 3, 4, 7, 8, 11, 12, 13, 14, 15, // 906, 12
0, 1, 3, 4, 8, 9, 11, 12, 13, 14, 15, // 918, 11
0, 2, 3, 7, 8, 10, 11, 12, 13, 14, 15, // 929, 11
7, 9, 10, 11, 12, 13, 14, 15, // 940, 8
3, 6, 7, 9, 10, 11, 12, 13, 14, 15, // 948, 10
2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, // 958, 12
8, 9, 10, 11, 12, 13, 14, 15, // 970, 8
0, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, // 978, 12
0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, // 990, 13
3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 1003, 12
2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 1015, 13
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 1028, 12
0, 2, // 1040, 2
1, 3, // 1042, 2
0, 1, 4, 5, // 1044, 4
0, 1, 2, 4, 5, // 1048, 5
2, 3, 6, // 1053, 3
0, 2, 4, 6, // 1056, 4
1, 2, 5, 6, // 1060, 4
0, 1, 2, 3, 5, 6, // 1064, 6
0, 1, 2, 4, 5, 6, // 1070, 6
0, 1, 2, 3, 4, 5, 6, // 1076, 7
0, 3, 4, 7, // 1083, 4
0, 1, 2, 3, 4, 7, // 1087, 6
1, 3, 5, 7, // 1093, 4
2, 3, 6, 7, // 1097, 4
1, 2, 3, 6, 7, // 1101, 5
1, 2, 3, 5, 6, 7, // 1106, 6
0, 1, 2, 3, 5, 6, 7, // 1112, 7
4, 5, 6, 7, // 1119, 4
0, 8, // 1123, 2
0, 1, 4, 5, 8, // 1125, 5
0, 1, 8, 9, // 1130, 4
4, 5, 8, 9, // 1134, 4
0, 1, 4, 5, 8, 9, // 1138, 6
2, 6, 8, 9, // 1144, 4
6, 7, 8, 9, // 1148, 4
0, 2, 4, 6, 8, 10, // 1152, 6
1, 2, 5, 6, 9, 10, // 1158, 6
0, 3, 4, 7, 9, 10, // 1164, 6
0, 1, 2, 8, 9, 10, // 1170, 6
4, 5, 6, 8, 9, 10, // 1176, 6
3, 11, // 1182, 2
2, 3, 6, 7, 11, // 1184, 5
0, 3, 8, 11, // 1189, 4
0, 3, 4, 7, 8, 11, // 1193, 6
1, 3, 5, 7, 9, 11, // 1199, 6
2, 3, 10, 11, // 1205, 4
1, 5, 10, 11, // 1209, 4
4, 5, 10, 11, // 1213, 4
6, 7, 10, 11, // 1217, 4
2, 3, 6, 7, 10, 11, // 1221, 6
1, 2, 3, 9, 10, 11, // 1227, 6
5, 6, 7, 9, 10, 11, // 1233, 6
8, 9, 10, 11, // 1239, 4
4, 12, // 1243, 2
0, 1, 2, 3, 4, 5, 8, 12, // 1245, 8
8, 9, 12, // 1253, 3
0, 4, 5, 8, 9, 12, // 1256, 6
0, 1, 4, 5, 8, 9, 12, // 1262, 7
2, 3, 5, 6, 8, 9, 12, // 1269, 7
1, 5, 9, 13, // 1276, 4
6, 7, 9, 13, // 1280, 4
1, 4, 7, 10, 13, // 1284, 5
1, 6, 8, 11, 13, // 1289, 5
0, 1, 12, 13, // 1294, 4
4, 5, 12, 13, // 1298, 4
0, 1, 6, 7, 12, 13, // 1302, 6
0, 1, 4, 8, 12, 13, // 1308, 6
8, 9, 12, 13, // 1314, 4
4, 8, 9, 12, 13, // 1318, 5
4, 5, 8, 9, 12, 13, // 1323, 6
0, 4, 5, 8, 9, 12, 13, // 1329, 7
0, 1, 6, 10, 12, 13, // 1336, 6
3, 6, 7, 9, 10, 12, 13, // 1342, 7
0, 1, 10, 11, 12, 13, // 1349, 6
2, 4, 7, 9, 14, // 1355, 5
4, 5, 10, 14, // 1360, 4
2, 6, 10, 14, // 1364, 4
2, 5, 8, 11, 14, // 1368, 5
0, 2, 12, 14, // 1373, 4
8, 10, 12, 14, // 1377, 4
4, 6, 8, 10, 12, 14, // 1381, 6
13, 14, // 1387, 2
9, 10, 13, 14, // 1389, 4
5, 6, 9, 10, 13, 14, // 1393, 6
0, 1, 2, 12, 13, 14, // 1399, 6
4, 5, 6, 12, 13, 14, // 1405, 6
8, 9, 12, 13, 14, // 1411, 5
8, 9, 10, 12, 13, 14, // 1416, 6
7, 15, // 1422, 2
0, 5, 10, 15, // 1424, 4
0, 1, 2, 3, 6, 7, 11, 15, // 1428, 8
10, 11, 15, // 1436, 3
0, 1, 5, 6, 10, 11, 15, // 1439, 7
3, 6, 7, 10, 11, 15, // 1446, 6
12, 15, // 1452, 2
0, 3, 12, 15, // 1454, 4
4, 7, 12, 15, // 1458, 4
0, 3, 6, 9, 12, 15, // 1462, 6
0, 3, 5, 10, 12, 15, // 1468, 6
8, 11, 12, 15, // 1474, 4
5, 6, 8, 11, 12, 15, // 1478, 6
4, 7, 8, 11, 12, 15, // 1484, 6
1, 3, 13, 15, // 1490, 4
9, 11, 13, 15, // 1494, 4
5, 7, 9, 11, 13, 15, // 1498, 6
2, 3, 14, 15, // 1504, 4
2, 3, 4, 5, 14, 15, // 1508, 6
6, 7, 14, 15, // 1514, 4
2, 3, 5, 9, 14, 15, // 1518, 6
2, 3, 8, 9, 14, 15, // 1524, 6
10, 14, 15, // 1530, 3
0, 4, 5, 9, 10, 14, 15, // 1533, 7
2, 3, 7, 11, 14, 15, // 1540, 6
10, 11, 14, 15, // 1546, 4
7, 10, 11, 14, 15, // 1550, 5
6, 7, 10, 11, 14, 15, // 1555, 6
1, 2, 3, 13, 14, 15, // 1561, 6
5, 6, 7, 13, 14, 15, // 1567, 6
10, 11, 13, 14, 15, // 1573, 5
9, 10, 11, 13, 14, 15, // 1578, 6
0, 4, 8, 9, 12, 13, 14, 15, // 1584, 8
9, 10, 12, 13, 14, 15, // 1592, 6
8, 11, 12, 13, 14, 15, // 1598, 6
3, 7, 10, 11, 12, 13, 14, 15, // 1604, 8
};
  // Shape table, indexed by shape ID. Each entry is
  // { index of the shape's first pixel in g_fragments, number of pixels in the shape };
  // callers read [shape][0] as the start and [shape][1] as the length.
  652. static const int g_shapeRanges[][2] =
  653. {
  654. { 0, 16 },{ 16, 4 },{ 20, 3 },{ 23, 4 },{ 27, 3 },{ 30, 4 },{ 34, 8 },{ 42, 4 },{ 46, 6 },{ 52, 8 },{ 60, 5 },
  655. { 65, 5 },{ 70, 4 },{ 74, 4 },{ 78, 6 },{ 84, 8 },{ 92, 8 },{ 100, 8 },{ 108, 8 },{ 116, 12 },{ 128, 4 },{ 132, 8 },
  656. { 140, 8 },{ 148, 10 },{ 158, 6 },{ 164, 8 },{ 172, 12 },{ 184, 8 },{ 192, 5 },{ 197, 3 },{ 200, 4 },{ 204, 6 },{ 210, 8 },
  657. { 218, 8 },{ 226, 8 },{ 234, 8 },{ 242, 8 },{ 250, 12 },{ 262, 13 },{ 275, 8 },{ 283, 8 },{ 291, 10 },{ 301, 8 },{ 309, 8 },
  658. { 317, 5 },{ 322, 8 },{ 330, 8 },{ 338, 8 },{ 346, 8 },{ 354, 8 },{ 362, 8 },{ 370, 8 },{ 378, 8 },{ 386, 8 },{ 394, 8 },
  659. { 402, 8 },{ 410, 8 },{ 418, 4 },{ 422, 8 },{ 430, 6 },{ 436, 8 },{ 444, 10 },{ 454, 8 },{ 462, 12 },{ 474, 8 },{ 482, 8 },
  660. { 490, 4 },{ 494, 8 },{ 502, 6 },{ 508, 8 },{ 516, 10 },{ 526, 8 },{ 534, 12 },{ 546, 8 },{ 554, 8 },{ 562, 8 },{ 570, 8 },
  661. { 578, 8 },{ 586, 8 },{ 594, 8 },{ 602, 8 },{ 610, 8 },{ 618, 8 },{ 626, 8 },{ 634, 8 },{ 642, 11 },{ 653, 8 },{ 661, 8 },
  662. { 669, 6 },{ 675, 8 },{ 683, 8 },{ 691, 3 },{ 694, 4 },{ 698, 8 },{ 706, 8 },{ 714, 8 },{ 722, 8 },{ 730, 8 },{ 738, 10 },
  663. { 748, 12 },{ 760, 13 },{ 773, 11 },{ 784, 8 },{ 792, 4 },{ 796, 8 },{ 804, 10 },{ 814, 6 },{ 820, 8 },{ 828, 8 },{ 836, 12 },
  664. { 848, 4 },{ 852, 8 },{ 860, 8 },{ 868, 8 },{ 876, 8 },{ 884, 10 },{ 894, 12 },{ 906, 12 },{ 918, 11 },{ 929, 11 },{ 940, 8 },
  665. { 948, 10 },{ 958, 12 },{ 970, 8 },{ 978, 12 },{ 990, 13 },{ 1003, 12 },{ 1015, 13 },{ 1028, 12 },{ 1040, 2 },{ 1042, 2 },{ 1044, 4 },
  666. { 1048, 5 },{ 1053, 3 },{ 1056, 4 },{ 1060, 4 },{ 1064, 6 },{ 1070, 6 },{ 1076, 7 },{ 1083, 4 },{ 1087, 6 },{ 1093, 4 },{ 1097, 4 },
  667. { 1101, 5 },{ 1106, 6 },{ 1112, 7 },{ 1119, 4 },{ 1123, 2 },{ 1125, 5 },{ 1130, 4 },{ 1134, 4 },{ 1138, 6 },{ 1144, 4 },{ 1148, 4 },
  668. { 1152, 6 },{ 1158, 6 },{ 1164, 6 },{ 1170, 6 },{ 1176, 6 },{ 1182, 2 },{ 1184, 5 },{ 1189, 4 },{ 1193, 6 },{ 1199, 6 },{ 1205, 4 },
  669. { 1209, 4 },{ 1213, 4 },{ 1217, 4 },{ 1221, 6 },{ 1227, 6 },{ 1233, 6 },{ 1239, 4 },{ 1243, 2 },{ 1245, 8 },{ 1253, 3 },{ 1256, 6 },
  670. { 1262, 7 },{ 1269, 7 },{ 1276, 4 },{ 1280, 4 },{ 1284, 5 },{ 1289, 5 },{ 1294, 4 },{ 1298, 4 },{ 1302, 6 },{ 1308, 6 },{ 1314, 4 },
  671. { 1318, 5 },{ 1323, 6 },{ 1329, 7 },{ 1336, 6 },{ 1342, 7 },{ 1349, 6 },{ 1355, 5 },{ 1360, 4 },{ 1364, 4 },{ 1368, 5 },{ 1373, 4 },
  672. { 1377, 4 },{ 1381, 6 },{ 1387, 2 },{ 1389, 4 },{ 1393, 6 },{ 1399, 6 },{ 1405, 6 },{ 1411, 5 },{ 1416, 6 },{ 1422, 2 },{ 1424, 4 },
  673. { 1428, 8 },{ 1436, 3 },{ 1439, 7 },{ 1446, 6 },{ 1452, 2 },{ 1454, 4 },{ 1458, 4 },{ 1462, 6 },{ 1468, 6 },{ 1474, 4 },{ 1478, 6 },
  674. { 1484, 6 },{ 1490, 4 },{ 1494, 4 },{ 1498, 6 },{ 1504, 4 },{ 1508, 6 },{ 1514, 4 },{ 1518, 6 },{ 1524, 6 },{ 1530, 3 },{ 1533, 7 },
  675. { 1540, 6 },{ 1546, 4 },{ 1550, 5 },{ 1555, 6 },{ 1561, 6 },{ 1567, 6 },{ 1573, 5 },{ 1578, 6 },{ 1584, 8 },{ 1592, 6 },{ 1598, 6 },
  676. { 1604, 8 },
  677. };
  // Single-row table; its one entry has the same values as g_shapeRanges[0].
  // NOTE(review): no reader of this table is visible in this section — confirm how it is consumed.
  678. static const int g_shapes1[][2] =
  679. {
  680. { 0, 16 }
  681. };
  // 64 rows of two integers each.
  // NOTE(review): presumably one row per two-subset partition, listing the shape ID of each
  // subset — no reader is visible in this section, so confirm against the code that indexes it.
  682. static const int g_shapes2[64][2] =
  683. {
  684. { 33, 96 },{ 63, 66 },{ 20, 109 },{ 22, 107 },{ 37, 92 },{ 7, 122 },{ 8, 121 },{ 23, 106 },
  685. { 38, 91 },{ 2, 127 },{ 9, 120 },{ 26, 103 },{ 3, 126 },{ 6, 123 },{ 1, 128 },{ 19, 110 },
  686. { 15, 114 },{ 124, 5 },{ 72, 57 },{ 115, 14 },{ 125, 4 },{ 70, 59 },{ 100, 29 },{ 60, 69 },
  687. { 116, 13 },{ 99, 30 },{ 78, 51 },{ 94, 35 },{ 104, 25 },{ 111, 18 },{ 71, 58 },{ 90, 39 },
  688. { 45, 84 },{ 16, 113 },{ 82, 47 },{ 95, 34 },{ 87, 42 },{ 83, 46 },{ 53, 76 },{ 48, 81 },
  689. { 68, 61 },{ 105, 24 },{ 98, 31 },{ 88, 41 },{ 75, 54 },{ 43, 86 },{ 52, 77 },{ 117, 12 },
  690. { 119, 10 },{ 118, 11 },{ 85, 44 },{ 101, 28 },{ 36, 93 },{ 55, 74 },{ 89, 40 },{ 79, 50 },
  691. { 56, 73 },{ 49, 80 },{ 64, 65 },{ 27, 102 },{ 32, 97 },{ 112, 17 },{ 67, 62 },{ 21, 108 },
  692. };
  // 64 rows of three integers each.
  // NOTE(review): presumably one row per three-subset partition, listing the shape ID of each
  // subset — no reader is visible in this section, so confirm against the code that indexes it.
  693. static const int g_shapes3[64][3] =
  694. {
  695. { 148, 160, 240 },{ 132, 212, 205 },{ 136, 233, 187 },{ 175, 237, 143 },{ 6, 186, 232 },{ 33, 142, 232 },{ 131, 123, 142 },{ 131, 96, 186 },
  696. { 6, 171, 110 },{ 1, 18, 110 },{ 1, 146, 123 },{ 33, 195, 66 },{ 20, 51, 66 },{ 20, 178, 96 },{ 2, 177, 106 },{ 211, 4, 59 },
  697. { 8, 191, 91 },{ 230, 14, 29 },{ 1, 188, 234 },{ 151, 110, 168 },{ 20, 144, 238 },{ 137, 66, 206 },{ 173, 179, 232 },{ 209, 194, 186 },
  698. { 239, 165, 142 },{ 131, 152, 242 },{ 214, 54, 12 },{ 140, 219, 201 },{ 190, 150, 231 },{ 156, 135, 241 },{ 185, 227, 167 },{ 145, 210, 59 },
  699. { 138, 174, 106 },{ 189, 229, 14 },{ 176, 133, 106 },{ 78, 178, 195 },{ 111, 146, 171 },{ 216, 180, 196 },{ 217, 181, 193 },{ 184, 228, 166 },
  700. { 192, 225, 153 },{ 134, 141, 123 },{ 6, 222, 198 },{ 149, 183, 96 },{ 33, 226, 164 },{ 161, 215, 51 },{ 197, 221, 18 },{ 1, 223, 199 },
  701. { 154, 163, 110 },{ 20, 236, 169 },{ 157, 204, 66 },{ 1, 202, 220 },{ 20, 170, 235 },{ 203, 158, 66 },{ 162, 155, 110 },{ 6, 201, 218 },
  702. { 139, 135, 123 },{ 33, 167, 224 },{ 182, 150, 96 },{ 19, 200, 213 },{ 63, 207, 159 },{ 147, 172, 109 },{ 129, 130, 128 },{ 208, 14, 59 },
  703. };
  // Shape IDs iterated by TrySinglePlane for modes with one subset (only shape 0).
  704. static const int g_shapeList1[] =
  705. {
  706. 0,
  707. };
  // Shape IDs iterated by TrySinglePlane for modes with two subsets (IDs 1 through 128).
  708. static const int g_shapeList2[] =
  709. {
  710. 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
  711. 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
  712. 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
  713. 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
  714. 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
  715. 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
  716. 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
  717. 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88,
  718. 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
  719. 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
  720. 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
  721. 122, 123, 124, 125, 126, 127, 128,
  722. };
  // Shape IDs 0 through 128, i.e. the contents of g_shapeList1 followed by g_shapeList2.
  // Its element count (g_numShapes12) sizes SinglePlaneTemporaries::unfinishedRGBA.
  723. static const int g_shapeList12[] =
  724. {
  725. 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
  726. 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
  727. 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
  728. 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
  729. 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
  730. 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
  731. 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
  732. 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
  733. 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
  734. 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
  735. 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
  736. 121, 122, 123, 124, 125, 126, 127, 128,
  737. };
  // Shape IDs iterated by TrySinglePlane for three-subset modes that have 64 partitions.
  738. static const int g_shapeList3[] =
  739. {
  740. 1, 2, 4, 6, 8, 12, 14, 18, 19, 20, 29,
  741. 33, 51, 54, 59, 63, 66, 78, 91, 96, 106, 109,
  742. 110, 111, 123, 128, 129, 130, 131, 132, 133, 134, 135,
  743. 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
  744. 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
  745. 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
  746. 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
  747. 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
  748. 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201,
  749. 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212,
  750. 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
  751. 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
  752. 235, 236, 237, 238, 239, 240, 241, 242,
  753. };
  // Shape IDs iterated by TrySinglePlane for three-subset modes that have only 16 partitions.
  754. static const int g_shapeList3Short[] =
  755. {
  756. 1, 2, 4, 6, 18, 20, 33, 51, 59, 66, 96,
  757. 106, 110, 123, 131, 132, 136, 142, 143, 146, 148, 160,
  758. 171, 175, 177, 178, 186, 187, 195, 205, 211, 212, 232,
  759. 233, 237, 240,
  760. };
  // Every shape ID, 0 through 242, in ascending order.
  // Its element count (g_numShapesAll) sizes the per-shape arrays in SinglePlaneTemporaries.
  761. static const int g_shapeListAll[] =
  762. {
  763. 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
  764. 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
  765. 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
  766. 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
  767. 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
  768. 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
  769. 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
  770. 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
  771. 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
  772. 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
  773. 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
  774. 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
  775. 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
  776. 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
  777. 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
  778. 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
  779. 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
  780. 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
  781. 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
  782. 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
  783. 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230,
  784. 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
  785. 242,
  786. };
  787. static const int g_numShapes1 = sizeof(g_shapeList1) / sizeof(g_shapeList1[0]);
  788. static const int g_numShapes2 = sizeof(g_shapeList2) / sizeof(g_shapeList2[0]);
  789. static const int g_numShapes12 = sizeof(g_shapeList12) / sizeof(g_shapeList12[0]);
  790. static const int g_numShapes3 = sizeof(g_shapeList3) / sizeof(g_shapeList3[0]);
  791. static const int g_numShapes3Short = sizeof(g_shapeList3Short) / sizeof(g_shapeList3Short[0]);
  792. static const int g_numShapesAll = sizeof(g_shapeListAll) / sizeof(g_shapeListAll[0]);
  793. static const int g_numFragments = sizeof(g_fragments) / sizeof(g_fragments[0]);
  794. }
  795. struct PackingVector
  796. {
  797. uint32_t m_vector[4];
  798. int m_offset;
  799. void Init()
  800. {
  801. for (int i = 0; i < 4; i++)
  802. m_vector[i] = 0;
  803. m_offset = 0;
  804. }
  805. void InitPacked(const uint32_t *v, int bits)
  806. {
  807. for (int b = 0; b < bits; b += 32)
  808. m_vector[b / 32] = v[b / 32];
  809. m_offset = bits;
  810. }
  811. inline void Pack(ParallelMath::ScalarUInt16 value, int bits)
  812. {
  813. int vOffset = m_offset >> 5;
  814. int bitOffset = m_offset & 0x1f;
  815. m_vector[vOffset] |= (static_cast<uint32_t>(value) << bitOffset) & static_cast<uint32_t>(0xffffffff);
  816. int overflowBits = bitOffset + bits - 32;
  817. if (overflowBits > 0)
  818. m_vector[vOffset + 1] |= (static_cast<uint32_t>(value) >> (bits - overflowBits));
  819. m_offset += bits;
  820. }
  821. inline void Flush(uint8_t* output)
  822. {
  823. assert(m_offset == 128);
  824. for (int v = 0; v < 4; v++)
  825. {
  826. uint32_t chunk = m_vector[v];
  827. for (int b = 0; b < 4; b++)
  828. output[v * 4 + b] = static_cast<uint8_t>((chunk >> (b * 8)) & 0xff);
  829. }
  830. }
  831. };
  832. struct UnpackingVector
  833. {
  834. uint32_t m_vector[4];
  835. void Init(const uint8_t *bytes)
  836. {
  837. for (int i = 0; i < 4; i++)
  838. m_vector[i] = 0;
  839. for (int b = 0; b < 16; b++)
  840. m_vector[b / 4] |= (bytes[b] << ((b % 4) * 8));
  841. }
  842. inline void UnpackStart(uint32_t *v, int bits)
  843. {
  844. for (int b = 0; b < bits; b += 32)
  845. v[b / 32] = m_vector[b / 32];
  846. int entriesShifted = bits / 32;
  847. int carry = bits % 32;
  848. for (int i = entriesShifted; i < 4; i++)
  849. m_vector[i - entriesShifted] = m_vector[i];
  850. int entriesRemaining = 4 - entriesShifted;
  851. if (carry)
  852. {
  853. uint32_t bitMask = (1 << carry) - 1;
  854. for (int i = 0; i < entriesRemaining; i++)
  855. {
  856. m_vector[i] >>= carry;
  857. if (i != entriesRemaining - 1)
  858. m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - carry);
  859. }
  860. }
  861. }
  862. inline ParallelMath::ScalarUInt16 Unpack(int bits)
  863. {
  864. uint32_t bitMask = (1 << bits) - 1;
  865. ParallelMath::ScalarUInt16 result = static_cast<ParallelMath::ScalarUInt16>(m_vector[0] & bitMask);
  866. for (int i = 0; i < 4; i++)
  867. {
  868. m_vector[i] >>= bits;
  869. if (i != 3)
  870. m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - bits);
  871. }
  872. return result;
  873. }
  874. };
  875. ParallelMath::Float ScaleHDRValue(const ParallelMath::Float &v, bool isSigned)
  876. {
  877. if (isSigned)
  878. {
  879. ParallelMath::Float offset = ParallelMath::Select(ParallelMath::Less(v, ParallelMath::MakeFloatZero()), ParallelMath::MakeFloat(-30.0f), ParallelMath::MakeFloat(30.0f));
  880. return (v * 32.0f + offset) / 31.0f;
  881. }
  882. else
  883. return (v * 64.0f + 30.0f) / 31.0f;
  884. }
  885. ParallelMath::SInt16 UnscaleHDRValueSigned(const ParallelMath::SInt16 &v)
  886. {
  887. #ifdef CVTT_ENABLE_ASSERTS
  888. for (int i = 0; i < ParallelMath::ParallelSize; i++)
  889. assert(ParallelMath::Extract(v, i) != -32768)
  890. #endif
  891. ParallelMath::Int16CompFlag negative = ParallelMath::Less(v, ParallelMath::MakeSInt16(0));
  892. ParallelMath::UInt15 absComp = ParallelMath::LosslessCast<ParallelMath::UInt15>::Cast(ParallelMath::Select(negative, ParallelMath::SInt16(ParallelMath::MakeSInt16(0) - v), v));
  893. ParallelMath::UInt31 multiplied = ParallelMath::XMultiply(absComp, ParallelMath::MakeUInt15(31));
  894. ParallelMath::UInt31 shifted = ParallelMath::RightShift(multiplied, 5);
  895. ParallelMath::UInt15 absCompScaled = ParallelMath::ToUInt15(shifted);
  896. ParallelMath::SInt16 signBits = ParallelMath::SelectOrZero(negative, ParallelMath::MakeSInt16(-32768));
  897. return ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(absCompScaled) | signBits;
  898. }
  899. ParallelMath::UInt15 UnscaleHDRValueUnsigned(const ParallelMath::UInt16 &v)
  900. {
  901. return ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(v, ParallelMath::MakeUInt15(31)), 6));
  902. }
  903. void UnscaleHDREndpoints(const ParallelMath::AInt16 inEP[2][3], ParallelMath::AInt16 outEP[2][3], bool isSigned)
  904. {
  905. for (int epi = 0; epi < 2; epi++)
  906. {
  907. for (int ch = 0; ch < 3; ch++)
  908. {
  909. if (isSigned)
  910. outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueSigned(ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(inEP[epi][ch])));
  911. else
  912. outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::LosslessCast<ParallelMath::UInt16>::Cast(inEP[epi][ch])));
  913. }
  914. }
  915. }
  // Scratch storage that TrySinglePlane declares as a local variable.
  916. struct SinglePlaneTemporaries
  917. {
  // Initial RGB endpoint estimates, indexed by shape ID (written from EndpointSelector<3, 8>::GetEndpoints).
  918. UnfinishedEndpoints<3> unfinishedRGB[BC7Data::g_numShapesAll];
  // Initial RGBA endpoint estimates, indexed by shape ID (either selected directly or expanded from unfinishedRGB with 255).
  919. UnfinishedEndpoints<4> unfinishedRGBA[BC7Data::g_numShapes12];
  // One entry per element of g_fragments.
  // NOTE(review): presumably the best selector index found for each fragment pixel — writers are outside this section.
  920. ParallelMath::UInt15 fragmentBestIndexes[BC7Data::g_numFragments];
  // NOTE(review): presumably the best endpoints per shape as [shape][endpoint][channel] — writers are outside this section.
  921. ParallelMath::UInt15 shapeBestEP[BC7Data::g_numShapesAll][2][4];
  // Per-shape error; reset to FLT_MAX for every slot at the start of each mode in TrySinglePlane.
  922. ParallelMath::Float shapeBestError[BC7Data::g_numShapesAll];
  923. };
  924. }
  925. }
  926. void cvtt::Internal::BC7Computer::TweakAlpha(const MUInt15 original[2], int tweak, int range, MUInt15 result[2])
  927. {
  928. ParallelMath::RoundTowardNearestForScope roundingMode;
  929. float tf[2];
  930. Util::ComputeTweakFactors(tweak, range, tf);
  931. MFloat base = ParallelMath::ToFloat(original[0]);
  932. MFloat offs = ParallelMath::ToFloat(original[1]) - base;
  933. result[0] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[0], 0.0f, 255.0f), &roundingMode);
  934. result[1] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[1], 0.0f, 255.0f), &roundingMode);
  935. }
  936. void cvtt::Internal::BC7Computer::Quantize(MUInt15* color, int bits, int channels)
  937. {
  938. for (int ch = 0; ch < channels; ch++)
  939. color[ch] = ParallelMath::RightShift(((color[ch] << bits) - color[ch]) + ParallelMath::MakeUInt15(127 + (1 << (7 - bits))), 8);
  940. }
  941. void cvtt::Internal::BC7Computer::QuantizeP(MUInt15* color, int bits, uint16_t p, int channels)
  942. {
  943. int16_t addend;
  944. if (p)
  945. addend = ((1 << (8 - bits)) - 1);
  946. else
  947. addend = 255;
  948. for (int ch = 0; ch < channels; ch++)
  949. {
  950. MUInt16 ch16 = ParallelMath::LosslessCast<MUInt16>::Cast(color[ch]);
  951. ch16 = ParallelMath::RightShift((ch16 << (bits + 1)) - ch16 + addend, 9);
  952. ch16 = (ch16 << 1) | ParallelMath::MakeUInt16(p);
  953. color[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ch16);
  954. }
  955. }
  956. void cvtt::Internal::BC7Computer::Unquantize(MUInt15* color, int bits, int channels)
  957. {
  958. for (int ch = 0; ch < channels; ch++)
  959. {
  960. MUInt15 clr = color[ch];
  961. clr = clr << (8 - bits);
  962. color[ch] = clr | ParallelMath::RightShift(clr, bits);
  963. }
  964. }
  965. void cvtt::Internal::BC7Computer::CompressEndpoints0(MUInt15 ep[2][4], uint16_t p[2])
  966. {
  967. for (int j = 0; j < 2; j++)
  968. {
  969. QuantizeP(ep[j], 4, p[j], 3);
  970. Unquantize(ep[j], 5, 3);
  971. ep[j][3] = ParallelMath::MakeUInt15(255);
  972. }
  973. }
  974. void cvtt::Internal::BC7Computer::CompressEndpoints1(MUInt15 ep[2][4], uint16_t p)
  975. {
  976. for (int j = 0; j < 2; j++)
  977. {
  978. QuantizeP(ep[j], 6, p, 3);
  979. Unquantize(ep[j], 7, 3);
  980. ep[j][3] = ParallelMath::MakeUInt15(255);
  981. }
  982. }
  983. void cvtt::Internal::BC7Computer::CompressEndpoints2(MUInt15 ep[2][4])
  984. {
  985. for (int j = 0; j < 2; j++)
  986. {
  987. Quantize(ep[j], 5, 3);
  988. Unquantize(ep[j], 5, 3);
  989. ep[j][3] = ParallelMath::MakeUInt15(255);
  990. }
  991. }
  992. void cvtt::Internal::BC7Computer::CompressEndpoints3(MUInt15 ep[2][4], uint16_t p[2])
  993. {
  994. for (int j = 0; j < 2; j++)
  995. {
  996. QuantizeP(ep[j], 7, p[j], 3);
  997. ep[j][3] = ParallelMath::MakeUInt15(255);
  998. }
  999. }
  1000. void cvtt::Internal::BC7Computer::CompressEndpoints4(MUInt15 epRGB[2][3], MUInt15 epA[2])
  1001. {
  1002. for (int j = 0; j < 2; j++)
  1003. {
  1004. Quantize(epRGB[j], 5, 3);
  1005. Unquantize(epRGB[j], 5, 3);
  1006. Quantize(epA + j, 6, 1);
  1007. Unquantize(epA + j, 6, 1);
  1008. }
  1009. }
  1010. void cvtt::Internal::BC7Computer::CompressEndpoints5(MUInt15 epRGB[2][3], MUInt15 epA[2])
  1011. {
  1012. for (int j = 0; j < 2; j++)
  1013. {
  1014. Quantize(epRGB[j], 7, 3);
  1015. Unquantize(epRGB[j], 7, 3);
  1016. }
  1017. // Alpha is full precision
  1018. (void)epA;
  1019. }
  1020. void cvtt::Internal::BC7Computer::CompressEndpoints6(MUInt15 ep[2][4], uint16_t p[2])
  1021. {
  1022. for (int j = 0; j < 2; j++)
  1023. QuantizeP(ep[j], 7, p[j], 4);
  1024. }
  1025. void cvtt::Internal::BC7Computer::CompressEndpoints7(MUInt15 ep[2][4], uint16_t p[2])
  1026. {
  1027. for (int j = 0; j < 2; j++)
  1028. {
  1029. QuantizeP(ep[j], 5, p[j], 4);
  1030. Unquantize(ep[j], 6, 4);
  1031. }
  1032. }
  1033. void cvtt::Internal::BC7Computer::TrySingleColorRGBAMultiTable(uint32_t flags, const MUInt15 pixels[16][4], const MFloat average[4], int numRealChannels, const uint8_t *fragmentStart, int shapeLength, const MFloat &staticAlphaError, const ParallelMath::Int16CompFlag punchThroughInvalid[4], MFloat& shapeBestError, MUInt15 shapeBestEP[2][4], MUInt15 *fragmentBestIndexes, const float *channelWeightsSq, const cvtt::Tables::BC7SC::Table*const* tables, int numTables, const ParallelMath::RoundTowardNearestForScope *rtn)
  1034. {
  1035. MFloat bestAverageError = ParallelMath::MakeFloat(FLT_MAX);
  1036. MUInt15 intAverage[4];
  1037. for (int ch = 0; ch < 4; ch++)
  1038. intAverage[ch] = ParallelMath::RoundAndConvertToU15(average[ch], rtn);
  1039. MUInt15 eps[2][4];
  1040. MUInt15 reconstructed[4];
  1041. MUInt15 index = ParallelMath::MakeUInt15(0);
  1042. for (int epi = 0; epi < 2; epi++)
  1043. {
  1044. for (int ch = 0; ch < 3; ch++)
  1045. eps[epi][ch] = ParallelMath::MakeUInt15(0);
  1046. eps[epi][3] = ParallelMath::MakeUInt15(255);
  1047. }
  1048. for (int ch = 0; ch < 3; ch++)
  1049. reconstructed[ch] = ParallelMath::MakeUInt15(0);
  1050. reconstructed[3] = ParallelMath::MakeUInt15(255);
  1051. // Depending on the target index and parity bits, there are multiple valid solid colors.
  1052. // We want to find the one closest to the actual average.
  1053. MFloat epsAverageDiff = ParallelMath::MakeFloat(FLT_MAX);
  1054. for (int t = 0; t < numTables; t++)
  1055. {
  1056. const cvtt::Tables::BC7SC::Table& table = *(tables[t]);
  1057. ParallelMath::Int16CompFlag pti = punchThroughInvalid[table.m_pBits];
  1058. MUInt15 candidateReconstructed[4];
  1059. MUInt15 candidateEPs[2][4];
  1060. for (int i = 0; i < ParallelMath::ParallelSize; i++)
  1061. {
  1062. for (int ch = 0; ch < numRealChannels; ch++)
  1063. {
  1064. ParallelMath::ScalarUInt16 avgValue = ParallelMath::Extract(intAverage[ch], i);
  1065. assert(avgValue >= 0 && avgValue <= 255);
  1066. const cvtt::Tables::BC7SC::TableEntry &entry = table.m_entries[avgValue];
  1067. ParallelMath::PutUInt15(candidateEPs[0][ch], i, entry.m_min);
  1068. ParallelMath::PutUInt15(candidateEPs[1][ch], i, entry.m_max);
  1069. ParallelMath::PutUInt15(candidateReconstructed[ch], i, entry.m_actualColor);
  1070. }
  1071. }
  1072. MFloat avgError = ParallelMath::MakeFloatZero();
  1073. for (int ch = 0; ch < numRealChannels; ch++)
  1074. {
  1075. MFloat delta = ParallelMath::ToFloat(candidateReconstructed[ch]) - average[ch];
  1076. avgError = avgError + delta * delta * channelWeightsSq[ch];
  1077. }
  1078. ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(avgError, bestAverageError));
  1079. better = ParallelMath::AndNot(pti, better); // Mask out punch-through invalidations
  1080. if (ParallelMath::AnySet(better))
  1081. {
  1082. ParallelMath::ConditionalSet(bestAverageError, ParallelMath::Int16FlagToFloat(better), avgError);
  1083. MUInt15 candidateIndex = ParallelMath::MakeUInt15(table.m_index);
  1084. ParallelMath::ConditionalSet(index, better, candidateIndex);
  1085. for (int ch = 0; ch < numRealChannels; ch++)
  1086. ParallelMath::ConditionalSet(reconstructed[ch], better, candidateReconstructed[ch]);
  1087. for (int epi = 0; epi < 2; epi++)
  1088. for (int ch = 0; ch < numRealChannels; ch++)
  1089. ParallelMath::ConditionalSet(eps[epi][ch], better, candidateEPs[epi][ch]);
  1090. }
  1091. }
  1092. AggregatedError<4> aggError;
  1093. for (int pxi = 0; pxi < shapeLength; pxi++)
  1094. {
  1095. int px = fragmentStart[pxi];
  1096. BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
  1097. }
  1098. MFloat error = aggError.Finalize(flags, channelWeightsSq) + staticAlphaError;
  1099. ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, shapeBestError));
  1100. if (ParallelMath::AnySet(better))
  1101. {
  1102. shapeBestError = ParallelMath::Min(shapeBestError, error);
  1103. for (int epi = 0; epi < 2; epi++)
  1104. {
  1105. for (int ch = 0; ch < numRealChannels; ch++)
  1106. ParallelMath::ConditionalSet(shapeBestEP[epi][ch], better, eps[epi][ch]);
  1107. }
  1108. for (int pxi = 0; pxi < shapeLength; pxi++)
  1109. ParallelMath::ConditionalSet(fragmentBestIndexes[pxi], better, index);
  1110. }
  1111. }
  1112. void cvtt::Internal::BC7Computer::TrySinglePlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
  1113. {
  1114. if (numRefineRounds < 1)
  1115. numRefineRounds = 1;
  1116. float channelWeightsSq[4];
  1117. for (int ch = 0; ch < 4; ch++)
  1118. channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
  1119. SinglePlaneTemporaries temps;
  1120. MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);
  1121. MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
  1122. ParallelMath::Int16CompFlag isPunchThrough = ParallelMath::MakeBoolInt16(true);
  1123. for (int px = 0; px < 16; px++)
  1124. {
  1125. MUInt15 a = pixels[px][3];
  1126. maxAlpha = ParallelMath::Max(maxAlpha, a);
  1127. minAlpha = ParallelMath::Min(minAlpha, a);
  1128. isPunchThrough = (isPunchThrough & (ParallelMath::Equal(a, ParallelMath::MakeUInt15(0)) | ParallelMath::Equal(a, ParallelMath::MakeUInt15(255))));
  1129. }
  1130. ParallelMath::Int16CompFlag blockHasNonMaxAlpha = ParallelMath::Less(minAlpha, ParallelMath::MakeUInt15(255));
  1131. ParallelMath::Int16CompFlag blockHasNonZeroAlpha = ParallelMath::Less(ParallelMath::MakeUInt15(0), maxAlpha);
  1132. bool anyBlockHasAlpha = ParallelMath::AnySet(blockHasNonMaxAlpha);
  1133. // Try RGB modes if any block has a min alpha 251 or higher
  1134. bool allowRGBModes = ParallelMath::AnySet(ParallelMath::Less(ParallelMath::MakeUInt15(250), minAlpha));
  1135. // Try mode 7 if any block has alpha.
  1136. // Mode 7 is almost never selected for RGB blocks because mode 4 has very accurate 7.7.7.1 endpoints
  1137. // and its parity bit doesn't affect alpha, meaning mode 7 can only be better in extremely specific
  1138. // situations, and only by at most 1 unit of error per pixel.
  1139. bool allowMode7 = anyBlockHasAlpha || (encodingPlan.mode7RGBPartitionEnabled != 0);
  1140. MFloat preWeightedPixels[16][4];
  1141. BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
  1142. // Get initial RGB endpoints
  1143. if (allowRGBModes)
  1144. {
  1145. const uint8_t *shapeList = encodingPlan.rgbShapeList;
  1146. int numShapesToEvaluate = encodingPlan.rgbNumShapesToEvaluate;
  1147. for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
  1148. {
  1149. int shape = shapeList[shapeIter];
  1150. int shapeStart = BC7Data::g_shapeRanges[shape][0];
  1151. int shapeSize = BC7Data::g_shapeRanges[shape][1];
  1152. EndpointSelector<3, 8> epSelector;
  1153. for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
  1154. {
  1155. for (int spx = 0; spx < shapeSize; spx++)
  1156. {
  1157. int px = BC7Data::g_fragments[shapeStart + spx];
  1158. epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
  1159. }
  1160. epSelector.FinishPass(epPass);
  1161. }
  1162. temps.unfinishedRGB[shape] = epSelector.GetEndpoints(channelWeights);
  1163. }
  1164. }
  1165. // Get initial RGBA endpoints
  1166. {
  1167. const uint8_t *shapeList = encodingPlan.rgbaShapeList;
  1168. int numShapesToEvaluate = encodingPlan.rgbaNumShapesToEvaluate;
  1169. for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
  1170. {
  1171. int shape = shapeList[shapeIter];
  1172. if (anyBlockHasAlpha || !allowRGBModes)
  1173. {
  1174. int shapeStart = BC7Data::g_shapeRanges[shape][0];
  1175. int shapeSize = BC7Data::g_shapeRanges[shape][1];
  1176. EndpointSelector<4, 8> epSelector;
  1177. for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
  1178. {
  1179. for (int spx = 0; spx < shapeSize; spx++)
  1180. {
  1181. int px = BC7Data::g_fragments[shapeStart + spx];
  1182. epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
  1183. }
  1184. epSelector.FinishPass(epPass);
  1185. }
  1186. temps.unfinishedRGBA[shape] = epSelector.GetEndpoints(channelWeights);
  1187. }
  1188. else
  1189. {
  1190. temps.unfinishedRGBA[shape] = temps.unfinishedRGB[shape].ExpandTo<4>(255);
  1191. }
  1192. }
  1193. }
  1194. for (uint16_t mode = 0; mode <= 7; mode++)
  1195. {
  1196. if (mode == 4 || mode == 5)
  1197. continue;
  1198. if (mode < 4 && !allowRGBModes)
  1199. continue;
  1200. if (mode == 7 && !allowMode7)
  1201. continue;
  1202. uint64_t partitionEnabledBits = 0;
  1203. switch (mode)
  1204. {
  1205. case 0:
  1206. partitionEnabledBits = encodingPlan.mode0PartitionEnabled;
  1207. break;
  1208. case 1:
  1209. partitionEnabledBits = encodingPlan.mode1PartitionEnabled;
  1210. break;
  1211. case 2:
  1212. partitionEnabledBits = encodingPlan.mode2PartitionEnabled;
  1213. break;
  1214. case 3:
  1215. partitionEnabledBits = encodingPlan.mode3PartitionEnabled;
  1216. break;
  1217. case 6:
  1218. partitionEnabledBits = encodingPlan.mode6Enabled ? 1 : 0;
  1219. break;
  1220. case 7:
  1221. if (anyBlockHasAlpha)
  1222. partitionEnabledBits = encodingPlan.mode7RGBAPartitionEnabled;
  1223. else
  1224. partitionEnabledBits = encodingPlan.mode7RGBPartitionEnabled;
  1225. break;
  1226. default:
  1227. break;
  1228. }
  1229. bool isRGB = (mode < 4);
  1230. unsigned int numPartitions = 1 << BC7Data::g_modes[mode].m_partitionBits;
  1231. int numSubsets = BC7Data::g_modes[mode].m_numSubsets;
  1232. int indexPrec = BC7Data::g_modes[mode].m_indexBits;
  1233. int parityBitMax = 1;
  1234. if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerEndpoint)
  1235. parityBitMax = 4;
  1236. else if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerSubset)
  1237. parityBitMax = 2;
  1238. int numRealChannels = isRGB ? 3 : 4;
  1239. int numShapes;
  1240. const int *shapeList;
  1241. if (numSubsets == 1)
  1242. {
  1243. numShapes = BC7Data::g_numShapes1;
  1244. shapeList = BC7Data::g_shapeList1;
  1245. }
  1246. else if (numSubsets == 2)
  1247. {
  1248. numShapes = BC7Data::g_numShapes2;
  1249. shapeList = BC7Data::g_shapeList2;
  1250. }
  1251. else
  1252. {
  1253. assert(numSubsets == 3);
  1254. if (numPartitions == 16)
  1255. {
  1256. numShapes = BC7Data::g_numShapes3Short;
  1257. shapeList = BC7Data::g_shapeList3Short;
  1258. }
  1259. else
  1260. {
  1261. assert(numPartitions == 64);
  1262. numShapes = BC7Data::g_numShapes3;
  1263. shapeList = BC7Data::g_shapeList3;
  1264. }
  1265. }
  1266. for (int slot = 0; slot < BC7Data::g_numShapesAll; slot++)
  1267. temps.shapeBestError[slot] = ParallelMath::MakeFloat(FLT_MAX);
  1268. for (int shapeIter = 0; shapeIter < numShapes; shapeIter++)
  1269. {
  1270. int shape = shapeList[shapeIter];
  1271. int numTweakRounds = 0;
  1272. if (isRGB)
  1273. numTweakRounds = encodingPlan.seedPointsForShapeRGB[shape];
  1274. else
  1275. numTweakRounds = encodingPlan.seedPointsForShapeRGBA[shape];
  1276. if (numTweakRounds == 0)
  1277. continue;
  1278. if (numTweakRounds > MaxTweakRounds)
  1279. numTweakRounds = MaxTweakRounds;
  1280. int shapeStart = BC7Data::g_shapeRanges[shape][0];
  1281. int shapeLength = BC7Data::g_shapeRanges[shape][1];
  1282. AggregatedError<1> alphaAggError;
  1283. if (isRGB && anyBlockHasAlpha)
  1284. {
  1285. MUInt15 filledAlpha[1] = { ParallelMath::MakeUInt15(255) };
  1286. for (int pxi = 0; pxi < shapeLength; pxi++)
  1287. {
  1288. int px = BC7Data::g_fragments[shapeStart + pxi];
  1289. MUInt15 original[1] = { pixels[px][3] };
  1290. BCCommon::ComputeErrorLDR<1>(flags, filledAlpha, original, alphaAggError);
  1291. }
  1292. }
  1293. float alphaWeightsSq[1] = { channelWeightsSq[3] };
  1294. MFloat staticAlphaError = alphaAggError.Finalize(flags, alphaWeightsSq);
  1295. MUInt15 tweakBaseEP[MaxTweakRounds][2][4];
  1296. for (int tweak = 0; tweak < numTweakRounds; tweak++)
  1297. {
  1298. if (isRGB)
  1299. {
  1300. temps.unfinishedRGB[shape].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
  1301. tweakBaseEP[tweak][0][3] = tweakBaseEP[tweak][1][3] = ParallelMath::MakeUInt15(255);
  1302. }
  1303. else
  1304. {
  1305. temps.unfinishedRGBA[shape].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
  1306. }
  1307. }
  1308. ParallelMath::Int16CompFlag punchThroughInvalid[4];
  1309. for (int pIter = 0; pIter < parityBitMax; pIter++)
  1310. {
  1311. punchThroughInvalid[pIter] = ParallelMath::MakeBoolInt16(false);
  1312. if ((flags & Flags::BC7_RespectPunchThrough) && (mode == 6 || mode == 7))
  1313. {
  1314. // Modes 6 and 7 have parity bits that affect alpha
  1315. if (pIter == 0)
  1316. punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonZeroAlpha);
  1317. else if (pIter == parityBitMax - 1)
  1318. punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonMaxAlpha);
  1319. else
  1320. punchThroughInvalid[pIter] = isPunchThrough;
  1321. }
  1322. }
  1323. for (int pIter = 0; pIter < parityBitMax; pIter++)
  1324. {
  1325. if (ParallelMath::AllSet(punchThroughInvalid[pIter]))
  1326. continue;
  1327. bool needPunchThroughCheck = ParallelMath::AnySet(punchThroughInvalid[pIter]);
  1328. for (int tweak = 0; tweak < numTweakRounds; tweak++)
  1329. {
  1330. uint16_t p[2];
  1331. p[0] = (pIter & 1);
  1332. p[1] = ((pIter >> 1) & 1);
  1333. MUInt15 ep[2][4];
  1334. for (int epi = 0; epi < 2; epi++)
  1335. for (int ch = 0; ch < 4; ch++)
  1336. ep[epi][ch] = tweakBaseEP[tweak][epi][ch];
  1337. for (int refine = 0; refine < numRefineRounds; refine++)
  1338. {
  1339. switch (mode)
  1340. {
  1341. case 0:
  1342. CompressEndpoints0(ep, p);
  1343. break;
  1344. case 1:
  1345. CompressEndpoints1(ep, p[0]);
  1346. break;
  1347. case 2:
  1348. CompressEndpoints2(ep);
  1349. break;
  1350. case 3:
  1351. CompressEndpoints3(ep, p);
  1352. break;
  1353. case 6:
  1354. CompressEndpoints6(ep, p);
  1355. break;
  1356. case 7:
  1357. CompressEndpoints7(ep, p);
  1358. break;
  1359. default:
  1360. assert(false);
  1361. break;
  1362. };
  1363. MFloat shapeError = ParallelMath::MakeFloatZero();
  1364. IndexSelector<4> indexSelector;
  1365. indexSelector.Init<false>(channelWeights, ep, 1 << indexPrec);
  1366. EndpointRefiner<4> epRefiner;
  1367. epRefiner.Init(1 << indexPrec, channelWeights);
  1368. MUInt15 indexes[16];
  1369. AggregatedError<4> aggError;
  1370. for (int pxi = 0; pxi < shapeLength; pxi++)
  1371. {
  1372. int px = BC7Data::g_fragments[shapeStart + pxi];
  1373. MUInt15 index;
  1374. MUInt15 reconstructed[4];
  1375. index = indexSelector.SelectIndexLDR(floatPixels[px], rtn);
  1376. indexSelector.ReconstructLDR_BC7(index, reconstructed, numRealChannels);
  1377. if (flags & cvtt::Flags::BC7_FastIndexing)
  1378. BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
  1379. else
  1380. {
  1381. MFloat error = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
  1382. MUInt15 altIndexes[2];
  1383. altIndexes[0] = ParallelMath::Max(index, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
  1384. altIndexes[1] = ParallelMath::Min(index + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << indexPrec) - 1)));
  1385. for (int ii = 0; ii < 2; ii++)
  1386. {
  1387. indexSelector.ReconstructLDR_BC7(altIndexes[ii], reconstructed, numRealChannels);
  1388. MFloat altError = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
  1389. ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altError, error));
  1390. error = ParallelMath::Min(error, altError);
  1391. ParallelMath::ConditionalSet(index, better, altIndexes[ii]);
  1392. }
  1393. shapeError = shapeError + error;
  1394. }
  1395. if (refine != numRefineRounds - 1)
  1396. epRefiner.ContributeUnweightedPW(preWeightedPixels[px], index, numRealChannels);
  1397. indexes[pxi] = index;
  1398. }
  1399. if (flags & cvtt::Flags::BC7_FastIndexing)
  1400. shapeError = aggError.Finalize(flags, channelWeightsSq);
  1401. if (isRGB)
  1402. shapeError = shapeError + staticAlphaError;
  1403. ParallelMath::FloatCompFlag shapeErrorBetter;
  1404. ParallelMath::Int16CompFlag shapeErrorBetter16;
  1405. shapeErrorBetter = ParallelMath::Less(shapeError, temps.shapeBestError[shape]);
  1406. shapeErrorBetter16 = ParallelMath::FloatFlagToInt16(shapeErrorBetter);
  1407. if (ParallelMath::AnySet(shapeErrorBetter16))
  1408. {
  1409. bool punchThroughOK = true;
  1410. if (needPunchThroughCheck)
  1411. {
  1412. shapeErrorBetter16 = ParallelMath::AndNot(punchThroughInvalid[pIter], shapeErrorBetter16);
  1413. shapeErrorBetter = ParallelMath::Int16FlagToFloat(shapeErrorBetter16);
  1414. if (!ParallelMath::AnySet(shapeErrorBetter16))
  1415. punchThroughOK = false;
  1416. }
  1417. if (punchThroughOK)
  1418. {
  1419. ParallelMath::ConditionalSet(temps.shapeBestError[shape], shapeErrorBetter, shapeError);
  1420. for (int epi = 0; epi < 2; epi++)
  1421. for (int ch = 0; ch < numRealChannels; ch++)
  1422. ParallelMath::ConditionalSet(temps.shapeBestEP[shape][epi][ch], shapeErrorBetter16, ep[epi][ch]);
  1423. for (int pxi = 0; pxi < shapeLength; pxi++)
  1424. ParallelMath::ConditionalSet(temps.fragmentBestIndexes[shapeStart + pxi], shapeErrorBetter16, indexes[pxi]);
  1425. }
  1426. }
  1427. if (refine != numRefineRounds - 1)
  1428. epRefiner.GetRefinedEndpointsLDR(ep, numRealChannels, rtn);
  1429. } // refine
  1430. } // tweak
  1431. } // p
  1432. if (flags & cvtt::Flags::BC7_TrySingleColor)
  1433. {
  1434. MUInt15 total[4];
  1435. for (int ch = 0; ch < 4; ch++)
  1436. total[ch] = ParallelMath::MakeUInt15(0);
  1437. for (int pxi = 0; pxi < shapeLength; pxi++)
  1438. {
  1439. int px = BC7Data::g_fragments[shapeStart + pxi];
  1440. for (int ch = 0; ch < 4; ch++)
  1441. total[ch] = total[ch] + pixels[pxi][ch];
  1442. }
  1443. MFloat rcpShapeLength = ParallelMath::MakeFloat(1.0f / static_cast<float>(shapeLength));
  1444. MFloat average[4];
  1445. for (int ch = 0; ch < 4; ch++)
  1446. average[ch] = ParallelMath::ToFloat(total[ch]) * rcpShapeLength;
  1447. const uint8_t *fragment = BC7Data::g_fragments + shapeStart;
  1448. MFloat &shapeBestError = temps.shapeBestError[shape];
  1449. MUInt15 (&shapeBestEP)[2][4] = temps.shapeBestEP[shape];
  1450. MUInt15 *fragmentBestIndexes = temps.fragmentBestIndexes + shapeStart;
  1451. const cvtt::Tables::BC7SC::Table **scTables = NULL;
  1452. int numSCTables = 0;
  1453. const cvtt::Tables::BC7SC::Table *tables0[] =
  1454. {
  1455. &cvtt::Tables::BC7SC::g_mode0_p00_i1,
  1456. &cvtt::Tables::BC7SC::g_mode0_p00_i2,
  1457. &cvtt::Tables::BC7SC::g_mode0_p00_i3,
  1458. &cvtt::Tables::BC7SC::g_mode0_p01_i1,
  1459. &cvtt::Tables::BC7SC::g_mode0_p01_i2,
  1460. &cvtt::Tables::BC7SC::g_mode0_p01_i3,
  1461. &cvtt::Tables::BC7SC::g_mode0_p10_i1,
  1462. &cvtt::Tables::BC7SC::g_mode0_p10_i2,
  1463. &cvtt::Tables::BC7SC::g_mode0_p10_i3,
  1464. &cvtt::Tables::BC7SC::g_mode0_p11_i1,
  1465. &cvtt::Tables::BC7SC::g_mode0_p11_i2,
  1466. &cvtt::Tables::BC7SC::g_mode0_p11_i3,
  1467. };
  1468. const cvtt::Tables::BC7SC::Table *tables1[] =
  1469. {
  1470. &cvtt::Tables::BC7SC::g_mode1_p0_i1,
  1471. &cvtt::Tables::BC7SC::g_mode1_p0_i2,
  1472. &cvtt::Tables::BC7SC::g_mode1_p0_i3,
  1473. &cvtt::Tables::BC7SC::g_mode1_p1_i1,
  1474. &cvtt::Tables::BC7SC::g_mode1_p1_i2,
  1475. &cvtt::Tables::BC7SC::g_mode1_p1_i3,
  1476. };
  1477. const cvtt::Tables::BC7SC::Table *tables2[] =
  1478. {
  1479. &cvtt::Tables::BC7SC::g_mode2,
  1480. };
  1481. const cvtt::Tables::BC7SC::Table *tables3[] =
  1482. {
  1483. &cvtt::Tables::BC7SC::g_mode3_p0,
  1484. &cvtt::Tables::BC7SC::g_mode3_p1,
  1485. };
  1486. const cvtt::Tables::BC7SC::Table *tables6[] =
  1487. {
  1488. &cvtt::Tables::BC7SC::g_mode6_p0_i1,
  1489. &cvtt::Tables::BC7SC::g_mode6_p0_i2,
  1490. &cvtt::Tables::BC7SC::g_mode6_p0_i3,
  1491. &cvtt::Tables::BC7SC::g_mode6_p0_i4,
  1492. &cvtt::Tables::BC7SC::g_mode6_p0_i5,
  1493. &cvtt::Tables::BC7SC::g_mode6_p0_i6,
  1494. &cvtt::Tables::BC7SC::g_mode6_p0_i7,
  1495. &cvtt::Tables::BC7SC::g_mode6_p1_i1,
  1496. &cvtt::Tables::BC7SC::g_mode6_p1_i2,
  1497. &cvtt::Tables::BC7SC::g_mode6_p1_i3,
  1498. &cvtt::Tables::BC7SC::g_mode6_p1_i4,
  1499. &cvtt::Tables::BC7SC::g_mode6_p1_i5,
  1500. &cvtt::Tables::BC7SC::g_mode6_p1_i6,
  1501. &cvtt::Tables::BC7SC::g_mode6_p1_i7,
  1502. };
  1503. const cvtt::Tables::BC7SC::Table *tables7[] =
  1504. {
  1505. &cvtt::Tables::BC7SC::g_mode7_p00,
  1506. &cvtt::Tables::BC7SC::g_mode7_p01,
  1507. &cvtt::Tables::BC7SC::g_mode7_p10,
  1508. &cvtt::Tables::BC7SC::g_mode7_p11,
  1509. };
  1510. switch (mode)
  1511. {
  1512. case 0:
  1513. {
  1514. scTables = tables0;
  1515. numSCTables = sizeof(tables0) / sizeof(tables0[0]);
  1516. }
  1517. break;
  1518. case 1:
  1519. {
  1520. scTables = tables1;
  1521. numSCTables = sizeof(tables1) / sizeof(tables1[0]);
  1522. }
  1523. break;
  1524. case 2:
  1525. {
  1526. scTables = tables2;
  1527. numSCTables = sizeof(tables2) / sizeof(tables2[0]);
  1528. }
  1529. break;
  1530. case 3:
  1531. {
  1532. scTables = tables3;
  1533. numSCTables = sizeof(tables3) / sizeof(tables3[0]);
  1534. }
  1535. break;
  1536. case 6:
  1537. {
  1538. scTables = tables6;
  1539. numSCTables = sizeof(tables6) / sizeof(tables6[0]);
  1540. }
  1541. break;
  1542. case 7:
  1543. {
  1544. scTables = tables7;
  1545. numSCTables = sizeof(tables7) / sizeof(tables7[0]);
  1546. }
  1547. break;
  1548. default:
  1549. assert(false);
  1550. break;
  1551. }
  1552. TrySingleColorRGBAMultiTable(flags, pixels, average, numRealChannels, fragment, shapeLength, staticAlphaError, punchThroughInvalid, shapeBestError, shapeBestEP, fragmentBestIndexes, channelWeightsSq, scTables, numSCTables, rtn);
  1553. }
  1554. } // shapeIter
  1555. uint64_t partitionsEnabledBits = 0xffffffffffffffffULL;
  1556. switch (mode)
  1557. {
  1558. case 0:
  1559. partitionsEnabledBits = encodingPlan.mode0PartitionEnabled;
  1560. break;
  1561. case 1:
  1562. partitionsEnabledBits = encodingPlan.mode1PartitionEnabled;
  1563. break;
  1564. case 2:
  1565. partitionsEnabledBits = encodingPlan.mode2PartitionEnabled;
  1566. break;
  1567. case 3:
  1568. partitionsEnabledBits = encodingPlan.mode3PartitionEnabled;
  1569. break;
  1570. case 6:
  1571. partitionsEnabledBits = encodingPlan.mode6Enabled ? 1 : 0;
  1572. break;
  1573. case 7:
  1574. if (anyBlockHasAlpha)
  1575. partitionEnabledBits = encodingPlan.mode7RGBAPartitionEnabled;
  1576. else
  1577. partitionEnabledBits = encodingPlan.mode7RGBPartitionEnabled;
  1578. break;
  1579. default:
  1580. break;
  1581. };
  1582. for (uint16_t partition = 0; partition < numPartitions; partition++)
  1583. {
  1584. if (((partitionsEnabledBits >> partition) & 1) == 0)
  1585. continue;
  1586. const int *partitionShapes;
  1587. if (numSubsets == 1)
  1588. partitionShapes = BC7Data::g_shapes1[partition];
  1589. else if (numSubsets == 2)
  1590. partitionShapes = BC7Data::g_shapes2[partition];
  1591. else
  1592. {
  1593. assert(numSubsets == 3);
  1594. partitionShapes = BC7Data::g_shapes3[partition];
  1595. }
  1596. MFloat totalError = ParallelMath::MakeFloatZero();
  1597. for (int subset = 0; subset < numSubsets; subset++)
  1598. totalError = totalError + temps.shapeBestError[partitionShapes[subset]];
  1599. ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(totalError, work.m_error);
  1600. ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
  1601. if (mode == 7 && anyBlockHasAlpha)
  1602. {
  1603. // Some lanes could be better, but we filter them out to ensure consistency with scalar
  1604. bool isRGBAllowedForThisPartition = (((encodingPlan.mode7RGBPartitionEnabled >> partition) & 1) != 0);
  1605. if (!isRGBAllowedForThisPartition)
  1606. {
  1607. errorBetter16 = (errorBetter16 & blockHasNonMaxAlpha);
  1608. errorBetter = ParallelMath::Int16FlagToFloat(errorBetter16);
  1609. }
  1610. }
  1611. if (ParallelMath::AnySet(errorBetter16))
  1612. {
  1613. for (int subset = 0; subset < numSubsets; subset++)
  1614. {
  1615. int shape = partitionShapes[subset];
  1616. int shapeStart = BC7Data::g_shapeRanges[shape][0];
  1617. int shapeLength = BC7Data::g_shapeRanges[shape][1];
  1618. for (int epi = 0; epi < 2; epi++)
  1619. for (int ch = 0; ch < 4; ch++)
  1620. ParallelMath::ConditionalSet(work.m_ep[subset][epi][ch], errorBetter16, temps.shapeBestEP[shape][epi][ch]);
  1621. for (int pxi = 0; pxi < shapeLength; pxi++)
  1622. {
  1623. int px = BC7Data::g_fragments[shapeStart + pxi];
  1624. ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, temps.fragmentBestIndexes[shapeStart + pxi]);
  1625. }
  1626. }
  1627. ParallelMath::ConditionalSet(work.m_error, errorBetter, totalError);
  1628. ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
  1629. ParallelMath::ConditionalSet(work.m_u.m_partition, errorBetter16, ParallelMath::MakeUInt15(partition));
  1630. }
  1631. }
  1632. }
  1633. }
  1634. void cvtt::Internal::BC7Computer::TryDualPlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
  1635. {
  1636. // TODO: These error calculations are not optimal for weight-by-alpha, but this routine needs to be mostly rewritten for that.
  1637. // The alpha/color solutions are co-dependent in that case, but a good way to solve it would probably be to
  1638. // solve the alpha channel first, then solve the RGB channels, which in turn breaks down into two cases:
  1639. // - Separate alpha channel, then weighted RGB
  1640. // - Alpha+2 other channels, then the independent channel
  1641. if (numRefineRounds < 1)
  1642. numRefineRounds = 1;
  1643. float channelWeightsSq[4];
  1644. for (int ch = 0; ch < 4; ch++)
  1645. channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
  1646. for (uint16_t mode = 4; mode <= 5; mode++)
  1647. {
  1648. int numSP[2] = { 0, 0 };
  1649. for (uint16_t rotation = 0; rotation < 4; rotation++)
  1650. {
  1651. if (mode == 4)
  1652. {
  1653. numSP[0] = encodingPlan.mode4SP[rotation][0];
  1654. numSP[1] = encodingPlan.mode4SP[rotation][1];
  1655. }
  1656. else
  1657. numSP[0] = numSP[1] = encodingPlan.mode5SP[rotation];
  1658. if (numSP[0] == 0 && numSP[1] == 0)
  1659. continue;
  1660. int alphaChannel = (rotation + 3) & 3;
  1661. int redChannel = (rotation == 1) ? 3 : 0;
  1662. int greenChannel = (rotation == 2) ? 3 : 1;
  1663. int blueChannel = (rotation == 3) ? 3 : 2;
  1664. MUInt15 rotatedRGB[16][3];
  1665. MFloat floatRotatedRGB[16][3];
  1666. for (int px = 0; px < 16; px++)
  1667. {
  1668. rotatedRGB[px][0] = pixels[px][redChannel];
  1669. rotatedRGB[px][1] = pixels[px][greenChannel];
  1670. rotatedRGB[px][2] = pixels[px][blueChannel];
  1671. for (int ch = 0; ch < 3; ch++)
  1672. floatRotatedRGB[px][ch] = ParallelMath::ToFloat(rotatedRGB[px][ch]);
  1673. }
  1674. uint16_t maxIndexSelector = (mode == 4) ? 2 : 1;
  1675. float rotatedRGBWeights[3] = { channelWeights[redChannel], channelWeights[greenChannel], channelWeights[blueChannel] };
  1676. float rotatedRGBWeightsSq[3] = { channelWeightsSq[redChannel], channelWeightsSq[greenChannel], channelWeightsSq[blueChannel] };
  1677. float rotatedAlphaWeight[1] = { channelWeights[alphaChannel] };
  1678. float rotatedAlphaWeightSq[1] = { channelWeightsSq[alphaChannel] };
  1679. float uniformWeight[1] = { 1.0f }; // Since the alpha channel is independent, there's no need to bother with weights when doing refinement or selection, only error
  1680. MFloat preWeightedRotatedRGB[16][3];
  1681. BCCommon::PreWeightPixelsLDR<3>(preWeightedRotatedRGB, rotatedRGB, rotatedRGBWeights);
  1682. for (uint16_t indexSelector = 0; indexSelector < maxIndexSelector; indexSelector++)
  1683. {
  1684. int numTweakRounds = numSP[indexSelector];
  1685. if (numTweakRounds <= 0)
  1686. continue;
  1687. if (numTweakRounds > MaxTweakRounds)
  1688. numTweakRounds = MaxTweakRounds;
  1689. EndpointSelector<3, 8> rgbSelector;
  1690. for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
  1691. {
  1692. for (int px = 0; px < 16; px++)
  1693. rgbSelector.ContributePass(preWeightedRotatedRGB[px], epPass, ParallelMath::MakeFloat(1.0f));
  1694. rgbSelector.FinishPass(epPass);
  1695. }
  1696. MUInt15 alphaRange[2];
  1697. alphaRange[0] = alphaRange[1] = pixels[0][alphaChannel];
  1698. for (int px = 1; px < 16; px++)
  1699. {
  1700. alphaRange[0] = ParallelMath::Min(pixels[px][alphaChannel], alphaRange[0]);
  1701. alphaRange[1] = ParallelMath::Max(pixels[px][alphaChannel], alphaRange[1]);
  1702. }
  1703. int rgbPrec = 0;
  1704. int alphaPrec = 0;
  1705. if (mode == 4)
  1706. {
  1707. rgbPrec = indexSelector ? 3 : 2;
  1708. alphaPrec = indexSelector ? 2 : 3;
  1709. }
  1710. else
  1711. rgbPrec = alphaPrec = 2;
  1712. UnfinishedEndpoints<3> unfinishedRGB = rgbSelector.GetEndpoints(rotatedRGBWeights);
  1713. MFloat bestRGBError = ParallelMath::MakeFloat(FLT_MAX);
  1714. MFloat bestAlphaError = ParallelMath::MakeFloat(FLT_MAX);
  1715. MUInt15 bestRGBIndexes[16];
  1716. MUInt15 bestAlphaIndexes[16];
  1717. MUInt15 bestEP[2][4];
  1718. for (int px = 0; px < 16; px++)
  1719. bestRGBIndexes[px] = bestAlphaIndexes[px] = ParallelMath::MakeUInt15(0);
  1720. for (int tweak = 0; tweak < numTweakRounds; tweak++)
  1721. {
  1722. MUInt15 rgbEP[2][3];
  1723. MUInt15 alphaEP[2];
  1724. unfinishedRGB.FinishLDR(tweak, 1 << rgbPrec, rgbEP[0], rgbEP[1]);
  1725. TweakAlpha(alphaRange, tweak, 1 << alphaPrec, alphaEP);
  1726. for (int refine = 0; refine < numRefineRounds; refine++)
  1727. {
  1728. if (mode == 4)
  1729. CompressEndpoints4(rgbEP, alphaEP);
  1730. else
  1731. CompressEndpoints5(rgbEP, alphaEP);
  1732. IndexSelector<1> alphaIndexSelector;
  1733. IndexSelector<3> rgbIndexSelector;
  1734. {
  1735. MUInt15 alphaEPTemp[2][1] = { { alphaEP[0] },{ alphaEP[1] } };
  1736. alphaIndexSelector.Init<false>(uniformWeight, alphaEPTemp, 1 << alphaPrec);
  1737. }
  1738. rgbIndexSelector.Init<false>(rotatedRGBWeights, rgbEP, 1 << rgbPrec);
  1739. EndpointRefiner<3> rgbRefiner;
  1740. EndpointRefiner<1> alphaRefiner;
  1741. rgbRefiner.Init(1 << rgbPrec, rotatedRGBWeights);
  1742. alphaRefiner.Init(1 << alphaPrec, uniformWeight);
  1743. MFloat errorRGB = ParallelMath::MakeFloatZero();
  1744. MFloat errorA = ParallelMath::MakeFloatZero();
  1745. MUInt15 rgbIndexes[16];
  1746. MUInt15 alphaIndexes[16];
  1747. AggregatedError<3> rgbAggError;
  1748. AggregatedError<1> alphaAggError;
  1749. for (int px = 0; px < 16; px++)
  1750. {
  1751. MUInt15 rgbIndex = rgbIndexSelector.SelectIndexLDR(floatRotatedRGB[px], rtn);
  1752. MUInt15 alphaIndex = alphaIndexSelector.SelectIndexLDR(floatPixels[px] + alphaChannel, rtn);
  1753. MUInt15 reconstructedRGB[3];
  1754. MUInt15 reconstructedAlpha[1];
  1755. rgbIndexSelector.ReconstructLDR_BC7(rgbIndex, reconstructedRGB);
  1756. alphaIndexSelector.ReconstructLDR_BC7(alphaIndex, reconstructedAlpha);
  1757. if (flags & cvtt::Flags::BC7_FastIndexing)
  1758. {
  1759. BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], rgbAggError);
  1760. BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, alphaAggError);
  1761. }
  1762. else
  1763. {
  1764. AggregatedError<3> baseRGBAggError;
  1765. AggregatedError<1> baseAlphaAggError;
  1766. BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], baseRGBAggError);
  1767. BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, baseAlphaAggError);
  1768. MFloat rgbError = baseRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
  1769. MFloat alphaError = baseAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
  1770. MUInt15 altRGBIndexes[2];
  1771. MUInt15 altAlphaIndexes[2];
  1772. altRGBIndexes[0] = ParallelMath::Max(rgbIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
  1773. altRGBIndexes[1] = ParallelMath::Min(rgbIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << rgbPrec) - 1)));
  1774. altAlphaIndexes[0] = ParallelMath::Max(alphaIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
  1775. altAlphaIndexes[1] = ParallelMath::Min(alphaIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << alphaPrec) - 1)));
  1776. for (int ii = 0; ii < 2; ii++)
  1777. {
  1778. rgbIndexSelector.ReconstructLDR_BC7(altRGBIndexes[ii], reconstructedRGB);
  1779. alphaIndexSelector.ReconstructLDR_BC7(altAlphaIndexes[ii], reconstructedAlpha);
  1780. AggregatedError<3> altRGBAggError;
  1781. AggregatedError<1> altAlphaAggError;
  1782. BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], altRGBAggError);
  1783. BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, altAlphaAggError);
  1784. MFloat altRGBError = altRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
  1785. MFloat altAlphaError = altAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
  1786. ParallelMath::Int16CompFlag rgbBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altRGBError, rgbError));
  1787. ParallelMath::Int16CompFlag alphaBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altAlphaError, alphaError));
  1788. rgbError = ParallelMath::Min(altRGBError, rgbError);
  1789. alphaError = ParallelMath::Min(altAlphaError, alphaError);
  1790. ParallelMath::ConditionalSet(rgbIndex, rgbBetter, altRGBIndexes[ii]);
  1791. ParallelMath::ConditionalSet(alphaIndex, alphaBetter, altAlphaIndexes[ii]);
  1792. }
  1793. errorRGB = errorRGB + rgbError;
  1794. errorA = errorA + alphaError;
  1795. }
  1796. if (refine != numRefineRounds - 1)
  1797. {
  1798. rgbRefiner.ContributeUnweightedPW(preWeightedRotatedRGB[px], rgbIndex);
  1799. alphaRefiner.ContributeUnweightedPW(floatPixels[px] + alphaChannel, alphaIndex);
  1800. }
  1801. if (flags & Flags::BC7_FastIndexing)
  1802. {
  1803. errorRGB = rgbAggError.Finalize(flags, rotatedRGBWeightsSq);
  1804. errorA = alphaAggError.Finalize(flags, rotatedAlphaWeightSq);
  1805. }
  1806. rgbIndexes[px] = rgbIndex;
  1807. alphaIndexes[px] = alphaIndex;
  1808. }
  1809. ParallelMath::FloatCompFlag rgbBetter = ParallelMath::Less(errorRGB, bestRGBError);
  1810. ParallelMath::FloatCompFlag alphaBetter = ParallelMath::Less(errorA, bestAlphaError);
  1811. ParallelMath::Int16CompFlag rgbBetterInt16 = ParallelMath::FloatFlagToInt16(rgbBetter);
  1812. ParallelMath::Int16CompFlag alphaBetterInt16 = ParallelMath::FloatFlagToInt16(alphaBetter);
  1813. if (ParallelMath::AnySet(rgbBetterInt16))
  1814. {
  1815. bestRGBError = ParallelMath::Min(errorRGB, bestRGBError);
  1816. for (int px = 0; px < 16; px++)
  1817. ParallelMath::ConditionalSet(bestRGBIndexes[px], rgbBetterInt16, rgbIndexes[px]);
  1818. for (int ep = 0; ep < 2; ep++)
  1819. {
  1820. for (int ch = 0; ch < 3; ch++)
  1821. ParallelMath::ConditionalSet(bestEP[ep][ch], rgbBetterInt16, rgbEP[ep][ch]);
  1822. }
  1823. }
  1824. if (ParallelMath::AnySet(alphaBetterInt16))
  1825. {
  1826. bestAlphaError = ParallelMath::Min(errorA, bestAlphaError);
  1827. for (int px = 0; px < 16; px++)
  1828. ParallelMath::ConditionalSet(bestAlphaIndexes[px], alphaBetterInt16, alphaIndexes[px]);
  1829. for (int ep = 0; ep < 2; ep++)
  1830. ParallelMath::ConditionalSet(bestEP[ep][3], alphaBetterInt16, alphaEP[ep]);
  1831. }
  1832. if (refine != numRefineRounds - 1)
  1833. {
  1834. rgbRefiner.GetRefinedEndpointsLDR(rgbEP, rtn);
  1835. MUInt15 alphaEPTemp[2][1];
  1836. alphaRefiner.GetRefinedEndpointsLDR(alphaEPTemp, rtn);
  1837. for (int i = 0; i < 2; i++)
  1838. alphaEP[i] = alphaEPTemp[i][0];
  1839. }
  1840. } // refine
  1841. } // tweak
  1842. MFloat combinedError = bestRGBError + bestAlphaError;
  1843. ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, work.m_error);
  1844. ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
  1845. work.m_error = ParallelMath::Min(combinedError, work.m_error);
  1846. ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
  1847. ParallelMath::ConditionalSet(work.m_u.m_isr.m_rotation, errorBetter16, ParallelMath::MakeUInt15(rotation));
  1848. ParallelMath::ConditionalSet(work.m_u.m_isr.m_indexSelector, errorBetter16, ParallelMath::MakeUInt15(indexSelector));
  1849. for (int px = 0; px < 16; px++)
  1850. {
  1851. ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, indexSelector ? bestAlphaIndexes[px] : bestRGBIndexes[px]);
  1852. ParallelMath::ConditionalSet(work.m_indexes2[px], errorBetter16, indexSelector ? bestRGBIndexes[px] : bestAlphaIndexes[px]);
  1853. }
  1854. for (int ep = 0; ep < 2; ep++)
  1855. for (int ch = 0; ch < 4; ch++)
  1856. ParallelMath::ConditionalSet(work.m_ep[0][ep][ch], errorBetter16, bestEP[ep][ch]);
  1857. }
  1858. }
  1859. }
  1860. }
  1861. template<class T>
  1862. void cvtt::Internal::BC7Computer::Swap(T& a, T& b)
  1863. {
  1864. T temp = a;
  1865. a = b;
  1866. b = temp;
  1867. }
  1868. void cvtt::Internal::BC7Computer::Pack(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds)
  1869. {
  1870. MUInt15 pixels[16][4];
  1871. MFloat floatPixels[16][4];
  1872. for (int px = 0; px < 16; px++)
  1873. {
  1874. for (int ch = 0; ch < 4; ch++)
  1875. ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
  1876. }
  1877. for (int px = 0; px < 16; px++)
  1878. {
  1879. for (int ch = 0; ch < 4; ch++)
  1880. floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
  1881. }
  1882. BC67::WorkInfo work;
  1883. memset(&work, 0, sizeof(work));
  1884. work.m_error = ParallelMath::MakeFloat(FLT_MAX);
  1885. {
  1886. ParallelMath::RoundTowardNearestForScope rtn;
  1887. TrySinglePlane(flags, pixels, floatPixels, channelWeights, encodingPlan, numRefineRounds, work, &rtn);
  1888. TryDualPlane(flags, pixels, floatPixels, channelWeights, encodingPlan, numRefineRounds, work, &rtn);
  1889. }
  1890. for (int block = 0; block < ParallelMath::ParallelSize; block++)
  1891. {
  1892. PackingVector pv;
  1893. pv.Init();
  1894. ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(work.m_mode, block);
  1895. ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(work.m_u.m_partition, block);
  1896. ParallelMath::ScalarUInt16 indexSelector = ParallelMath::Extract(work.m_u.m_isr.m_indexSelector, block);
  1897. const BC7Data::BC7ModeInfo& modeInfo = BC7Data::g_modes[mode];
  1898. ParallelMath::ScalarUInt16 indexes[16];
  1899. ParallelMath::ScalarUInt16 indexes2[16];
  1900. ParallelMath::ScalarUInt16 endPoints[3][2][4];
  1901. for (int i = 0; i < 16; i++)
  1902. {
  1903. indexes[i] = ParallelMath::Extract(work.m_indexes[i], block);
  1904. if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
  1905. indexes2[i] = ParallelMath::Extract(work.m_indexes2[i], block);
  1906. }
  1907. for (int subset = 0; subset < 3; subset++)
  1908. {
  1909. for (int ep = 0; ep < 2; ep++)
  1910. {
  1911. for (int ch = 0; ch < 4; ch++)
  1912. endPoints[subset][ep][ch] = ParallelMath::Extract(work.m_ep[subset][ep][ch], block);
  1913. }
  1914. }
  1915. int fixups[3] = { 0, 0, 0 };
  1916. if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
  1917. {
  1918. bool flipRGB = ((indexes[0] & (1 << (modeInfo.m_indexBits - 1))) != 0);
  1919. bool flipAlpha = ((indexes2[0] & (1 << (modeInfo.m_alphaIndexBits - 1))) != 0);
  1920. if (flipRGB)
  1921. {
  1922. uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
  1923. for (int px = 0; px < 16; px++)
  1924. indexes[px] = highIndex - indexes[px];
  1925. }
  1926. if (flipAlpha)
  1927. {
  1928. uint16_t highIndex = (1 << modeInfo.m_alphaIndexBits) - 1;
  1929. for (int px = 0; px < 16; px++)
  1930. indexes2[px] = highIndex - indexes2[px];
  1931. }
  1932. if (indexSelector)
  1933. Swap(flipRGB, flipAlpha);
  1934. if (flipRGB)
  1935. {
  1936. for (int ch = 0; ch < 3; ch++)
  1937. Swap(endPoints[0][0][ch], endPoints[0][1][ch]);
  1938. }
  1939. if (flipAlpha)
  1940. Swap(endPoints[0][0][3], endPoints[0][1][3]);
  1941. }
  1942. else
  1943. {
  1944. if (modeInfo.m_numSubsets == 2)
  1945. fixups[1] = BC7Data::g_fixupIndexes2[partition];
  1946. else if (modeInfo.m_numSubsets == 3)
  1947. {
  1948. fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
  1949. fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
  1950. }
  1951. bool flip[3] = { false, false, false };
  1952. for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
  1953. flip[subset] = ((indexes[fixups[subset]] & (1 << (modeInfo.m_indexBits - 1))) != 0);
  1954. if (flip[0] || flip[1] || flip[2])
  1955. {
  1956. uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
  1957. for (int px = 0; px < 16; px++)
  1958. {
  1959. int subset = 0;
  1960. if (modeInfo.m_numSubsets == 2)
  1961. subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
  1962. else if (modeInfo.m_numSubsets == 3)
  1963. subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
  1964. if (flip[subset])
  1965. indexes[px] = highIndex - indexes[px];
  1966. }
  1967. int maxCH = (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined) ? 4 : 3;
  1968. for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
  1969. {
  1970. if (flip[subset])
  1971. for (int ch = 0; ch < maxCH; ch++)
  1972. Swap(endPoints[subset][0][ch], endPoints[subset][1][ch]);
  1973. }
  1974. }
  1975. }
  1976. pv.Pack(static_cast<uint8_t>(1 << mode), mode + 1);
  1977. if (modeInfo.m_partitionBits)
  1978. pv.Pack(partition, modeInfo.m_partitionBits);
  1979. if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
  1980. {
  1981. ParallelMath::ScalarUInt16 rotation = ParallelMath::Extract(work.m_u.m_isr.m_rotation, block);
  1982. pv.Pack(rotation, 2);
  1983. }
  1984. if (modeInfo.m_hasIndexSelector)
  1985. pv.Pack(indexSelector, 1);
  1986. // Encode RGB
  1987. for (int ch = 0; ch < 3; ch++)
  1988. {
  1989. for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
  1990. {
  1991. for (int ep = 0; ep < 2; ep++)
  1992. {
  1993. ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][ch];
  1994. epPart >>= (8 - modeInfo.m_rgbBits);
  1995. pv.Pack(epPart, modeInfo.m_rgbBits);
  1996. }
  1997. }
  1998. }
  1999. // Encode alpha
  2000. if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
  2001. {
  2002. for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
  2003. {
  2004. for (int ep = 0; ep < 2; ep++)
  2005. {
  2006. ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][3];
  2007. epPart >>= (8 - modeInfo.m_alphaBits);
  2008. pv.Pack(epPart, modeInfo.m_alphaBits);
  2009. }
  2010. }
  2011. }
  2012. // Encode parity bits
  2013. if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
  2014. {
  2015. for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
  2016. {
  2017. ParallelMath::ScalarUInt16 epPart = endPoints[subset][0][0];
  2018. epPart >>= (7 - modeInfo.m_rgbBits);
  2019. epPart &= 1;
  2020. pv.Pack(epPart, 1);
  2021. }
  2022. }
  2023. else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
  2024. {
  2025. for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
  2026. {
  2027. for (int ep = 0; ep < 2; ep++)
  2028. {
  2029. ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][0];
  2030. epPart >>= (7 - modeInfo.m_rgbBits);
  2031. epPart &= 1;
  2032. pv.Pack(epPart, 1);
  2033. }
  2034. }
  2035. }
  2036. // Encode indexes
  2037. for (int px = 0; px < 16; px++)
  2038. {
  2039. int bits = modeInfo.m_indexBits;
  2040. if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
  2041. bits--;
  2042. pv.Pack(indexes[px], bits);
  2043. }
  2044. // Encode secondary indexes
  2045. if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
  2046. {
  2047. for (int px = 0; px < 16; px++)
  2048. {
  2049. int bits = modeInfo.m_alphaIndexBits;
  2050. if (px == 0)
  2051. bits--;
  2052. pv.Pack(indexes2[px], bits);
  2053. }
  2054. }
  2055. pv.Flush(packedBlocks);
  2056. packedBlocks += 16;
  2057. }
  2058. }
  2059. void cvtt::Internal::BC7Computer::UnpackOne(PixelBlockU8 &output, const uint8_t* packedBlock)
  2060. {
  2061. UnpackingVector pv;
  2062. pv.Init(packedBlock);
  2063. int mode = 8;
  2064. for (int i = 0; i < 8; i++)
  2065. {
  2066. if (pv.Unpack(1) == 1)
  2067. {
  2068. mode = i;
  2069. break;
  2070. }
  2071. }
  2072. if (mode > 7)
  2073. {
  2074. for (int px = 0; px < 16; px++)
  2075. for (int ch = 0; ch < 4; ch++)
  2076. output.m_pixels[px][ch] = 0;
  2077. return;
  2078. }
  2079. const BC7Data::BC7ModeInfo &modeInfo = BC7Data::g_modes[mode];
  2080. int partition = 0;
  2081. if (modeInfo.m_partitionBits)
  2082. partition = pv.Unpack(modeInfo.m_partitionBits);
  2083. int rotation = 0;
  2084. if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
  2085. rotation = pv.Unpack(2);
  2086. int indexSelector = 0;
  2087. if (modeInfo.m_hasIndexSelector)
  2088. indexSelector = pv.Unpack(1);
  2089. // Resolve fixups
  2090. int fixups[3] = { 0, 0, 0 };
  2091. if (modeInfo.m_alphaMode != BC7Data::AlphaMode_Separate)
  2092. {
  2093. if (modeInfo.m_numSubsets == 2)
  2094. fixups[1] = BC7Data::g_fixupIndexes2[partition];
  2095. else if (modeInfo.m_numSubsets == 3)
  2096. {
  2097. fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
  2098. fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
  2099. }
  2100. }
  2101. int endPoints[3][2][4];
  2102. // Decode RGB
  2103. for (int ch = 0; ch < 3; ch++)
  2104. {
  2105. for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
  2106. {
  2107. for (int ep = 0; ep < 2; ep++)
  2108. endPoints[subset][ep][ch] = (pv.Unpack(modeInfo.m_rgbBits) << (8 - modeInfo.m_rgbBits));
  2109. }
  2110. }
  2111. // Decode alpha
  2112. if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
  2113. {
  2114. for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
  2115. {
  2116. for (int ep = 0; ep < 2; ep++)
  2117. endPoints[subset][ep][3] = (pv.Unpack(modeInfo.m_alphaBits) << (8 - modeInfo.m_alphaBits));
  2118. }
  2119. }
  2120. else
  2121. {
  2122. for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
  2123. {
  2124. for (int ep = 0; ep < 2; ep++)
  2125. endPoints[subset][ep][3] = 255;
  2126. }
  2127. }
  2128. int parityBits = 0;
  2129. // Decode parity bits
  2130. if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
  2131. {
  2132. for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
  2133. {
  2134. int p = pv.Unpack(1);
  2135. for (int ep = 0; ep < 2; ep++)
  2136. {
  2137. for (int ch = 0; ch < 3; ch++)
  2138. endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);
  2139. if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
  2140. endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
  2141. }
  2142. }
  2143. parityBits = 1;
  2144. }
  2145. else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
  2146. {
  2147. for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
  2148. {
  2149. for (int ep = 0; ep < 2; ep++)
  2150. {
  2151. int p = pv.Unpack(1);
  2152. for (int ch = 0; ch < 3; ch++)
  2153. endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);
  2154. if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
  2155. endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
  2156. }
  2157. }
  2158. parityBits = 1;
  2159. }
  2160. // Fill endpoint bits
  2161. for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
  2162. {
  2163. for (int ep = 0; ep < 2; ep++)
  2164. {
  2165. for (int ch = 0; ch < 3; ch++)
  2166. endPoints[subset][ep][ch] |= (endPoints[subset][ep][ch] >> (modeInfo.m_rgbBits + parityBits));
  2167. if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
  2168. endPoints[subset][ep][3] |= (endPoints[subset][ep][3] >> (modeInfo.m_alphaBits + parityBits));
  2169. }
  2170. }
  2171. int indexes[16];
  2172. int indexes2[16];
  2173. // Decode indexes
  2174. for (int px = 0; px < 16; px++)
  2175. {
  2176. int bits = modeInfo.m_indexBits;
  2177. if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
  2178. bits--;
  2179. indexes[px] = pv.Unpack(bits);
  2180. }
  2181. // Decode secondary indexes
  2182. if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
  2183. {
  2184. for (int px = 0; px < 16; px++)
  2185. {
  2186. int bits = modeInfo.m_alphaIndexBits;
  2187. if (px == 0)
  2188. bits--;
  2189. indexes2[px] = pv.Unpack(bits);
  2190. }
  2191. }
  2192. else
  2193. {
  2194. for (int px = 0; px < 16; px++)
  2195. indexes2[px] = 0;
  2196. }
  2197. const int *alphaWeights = BC7Data::g_weightTables[modeInfo.m_alphaIndexBits];
  2198. const int *rgbWeights = BC7Data::g_weightTables[modeInfo.m_indexBits];
  2199. // Decode each pixel
  2200. for (int px = 0; px < 16; px++)
  2201. {
  2202. int rgbWeight = 0;
  2203. int alphaWeight = 0;
  2204. int rgbIndex = indexes[px];
  2205. rgbWeight = rgbWeights[indexes[px]];
  2206. if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined)
  2207. alphaWeight = rgbWeight;
  2208. else if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
  2209. alphaWeight = alphaWeights[indexes2[px]];
  2210. if (indexSelector == 1)
  2211. {
  2212. int temp = rgbWeight;
  2213. rgbWeight = alphaWeight;
  2214. alphaWeight = temp;
  2215. }
  2216. int pixel[4] = { 0, 0, 0, 255 };
  2217. int subset = 0;
  2218. if (modeInfo.m_numSubsets == 2)
  2219. subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
  2220. else if (modeInfo.m_numSubsets == 3)
  2221. subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
  2222. for (int ch = 0; ch < 3; ch++)
  2223. pixel[ch] = ((64 - rgbWeight) * endPoints[subset][0][ch] + rgbWeight * endPoints[subset][1][ch] + 32) >> 6;
  2224. if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
  2225. pixel[3] = ((64 - alphaWeight) * endPoints[subset][0][3] + alphaWeight * endPoints[subset][1][3] + 32) >> 6;
  2226. if (rotation != 0)
  2227. {
  2228. int ch = rotation - 1;
  2229. int temp = pixel[ch];
  2230. pixel[ch] = pixel[3];
  2231. pixel[3] = temp;
  2232. }
  2233. for (int ch = 0; ch < 4; ch++)
  2234. output.m_pixels[px][ch] = static_cast<uint8_t>(pixel[ch]);
  2235. }
  2236. }
  2237. cvtt::ParallelMath::SInt16 cvtt::Internal::BC6HComputer::QuantizeSingleEndpointElementSigned(const MSInt16 &elem2CL, int precision, const ParallelMath::RoundUpForScope* ru)
  2238. {
  2239. assert(ParallelMath::AllSet(ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(31744))));
  2240. assert(ParallelMath::AllSet(ParallelMath::Less(ParallelMath::MakeSInt16(-31744), elem2CL)));
  2241. // Expand to full range
  2242. ParallelMath::Int16CompFlag isNegative = ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(0));
  2243. MUInt15 absElem = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - elem2CL, elem2CL));
  2244. absElem = ParallelMath::RightShift(ParallelMath::RoundAndConvertToU15(ParallelMath::ToFloat(absElem) * 32.0f / 31.0f, ru), 16 - precision);
  2245. MSInt16 absElemS16 = ParallelMath::LosslessCast<MSInt16>::Cast(absElem);
  2246. return ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - absElemS16, absElemS16);
  2247. }
  2248. cvtt::ParallelMath::UInt15 cvtt::Internal::BC6HComputer::QuantizeSingleEndpointElementUnsigned(const MUInt15 &elem, int precision, const ParallelMath::RoundUpForScope* ru)
  2249. {
  2250. MUInt16 expandedElem = ParallelMath::RoundAndConvertToU16(ParallelMath::Min(ParallelMath::ToFloat(elem) * 64.0f / 31.0f, ParallelMath::MakeFloat(65535.0f)), ru);
  2251. return ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(expandedElem, 16 - precision));
  2252. }
  2253. void cvtt::Internal::BC6HComputer::UnquantizeSingleEndpointElementSigned(const MSInt16 &comp, int precision, MSInt16 &outUnquantized, MSInt16 &outUnquantizedFinished2CL)
  2254. {
  2255. MSInt16 zero = ParallelMath::MakeSInt16(0);
  2256. ParallelMath::Int16CompFlag negative = ParallelMath::Less(comp, zero);
  2257. MUInt15 absComp = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(negative, MSInt16(zero - comp), comp));
  2258. MSInt16 unq;
  2259. MUInt15 absUnq;
  2260. if (precision >= 16)
  2261. {
  2262. unq = comp;
  2263. absUnq = absComp;
  2264. }
  2265. else
  2266. {
  2267. MSInt16 maxCompMinusOne = ParallelMath::MakeSInt16(static_cast<int16_t>((1 << (precision - 1)) - 2));
  2268. ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
  2269. ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp);
  2270. absUnq = (absComp << (16 - precision)) + ParallelMath::MakeUInt15(static_cast<uint16_t>(0x4000 >> (precision - 1)));
  2271. ParallelMath::ConditionalSet(absUnq, isZero, ParallelMath::MakeUInt15(0));
  2272. ParallelMath::ConditionalSet(absUnq, isMax, ParallelMath::MakeUInt15(0x7fff));
  2273. unq = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(absUnq));
  2274. }
  2275. outUnquantized = unq;
  2276. MUInt15 funq = ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(absUnq, ParallelMath::MakeUInt15(31)), 5));
  2277. outUnquantizedFinished2CL = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(funq));
  2278. }
  2279. void cvtt::Internal::BC6HComputer::UnquantizeSingleEndpointElementUnsigned(const MUInt15 &comp, int precision, MUInt16 &outUnquantized, MUInt16 &outUnquantizedFinished)
  2280. {
  2281. MUInt16 unq = ParallelMath::LosslessCast<MUInt16>::Cast(comp);
  2282. if (precision < 15)
  2283. {
  2284. MUInt15 zero = ParallelMath::MakeUInt15(0);
  2285. MUInt15 maxCompMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << precision) - 2));
  2286. ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
  2287. ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp);
  2288. unq = (ParallelMath::LosslessCast<MUInt16>::Cast(comp) << (16 - precision)) + ParallelMath::MakeUInt16(static_cast<uint16_t>(0x8000 >> precision));
  2289. ParallelMath::ConditionalSet(unq, isZero, ParallelMath::MakeUInt16(0));
  2290. ParallelMath::ConditionalSet(unq, isMax, ParallelMath::MakeUInt16(0xffff));
  2291. }
  2292. outUnquantized = unq;
  2293. outUnquantizedFinished = ParallelMath::ToUInt16(ParallelMath::RightShift(ParallelMath::XMultiply(unq, ParallelMath::MakeUInt15(31)), 6));
  2294. }
  2295. void cvtt::Internal::BC6HComputer::QuantizeEndpointsSigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
  2296. {
  2297. MSInt16 unquantizedEP[2][3];
  2298. MSInt16 finishedUnquantizedEP[2][3];
  2299. {
  2300. ParallelMath::RoundUpForScope ru;
  2301. for (int epi = 0; epi < 2; epi++)
  2302. {
  2303. for (int ch = 0; ch < 3; ch++)
  2304. {
  2305. MSInt16 qee = QuantizeSingleEndpointElementSigned(endPoints[epi][ch], precision, &ru);
  2306. UnquantizeSingleEndpointElementSigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]);
  2307. quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee);
  2308. }
  2309. }
  2310. }
  2311. indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange);
  2312. indexSelector.InitHDR(indexRange, true, fastIndexing, channelWeights);
  2313. MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
  2314. MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
  2315. ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);
  2316. if (ParallelMath::AnySet(invert))
  2317. {
  2318. ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));
  2319. indexSelector.ConditionalInvert(invert);
  2320. for (int ch = 0; ch < 3; ch++)
  2321. {
  2322. MAInt16 firstEP = quantizedEndPoints[0][ch];
  2323. MAInt16 secondEP = quantizedEndPoints[1][ch];
  2324. quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP);
  2325. quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP);
  2326. }
  2327. }
  2328. indexes[fixupIndex] = index;
  2329. }
  2330. void cvtt::Internal::BC6HComputer::QuantizeEndpointsUnsigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
  2331. {
  2332. MUInt16 unquantizedEP[2][3];
  2333. MUInt16 finishedUnquantizedEP[2][3];
  2334. {
  2335. ParallelMath::RoundUpForScope ru;
  2336. for (int epi = 0; epi < 2; epi++)
  2337. {
  2338. for (int ch = 0; ch < 3; ch++)
  2339. {
  2340. MUInt15 qee = QuantizeSingleEndpointElementUnsigned(ParallelMath::LosslessCast<MUInt15>::Cast(endPoints[epi][ch]), precision, &ru);
  2341. UnquantizeSingleEndpointElementUnsigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]);
  2342. quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee);
  2343. }
  2344. }
  2345. }
  2346. indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange);
  2347. indexSelector.InitHDR(indexRange, false, fastIndexing, channelWeights);
  2348. MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
  2349. MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
  2350. ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);
  2351. if (ParallelMath::AnySet(invert))
  2352. {
  2353. ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));
  2354. indexSelector.ConditionalInvert(invert);
  2355. for (int ch = 0; ch < 3; ch++)
  2356. {
  2357. MAInt16 firstEP = quantizedEndPoints[0][ch];
  2358. MAInt16 secondEP = quantizedEndPoints[1][ch];
  2359. quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP);
  2360. quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP);
  2361. }
  2362. }
  2363. indexes[fixupIndex] = index;
  2364. }
  2365. void cvtt::Internal::BC6HComputer::EvaluatePartitionedLegality(const MAInt16 ep0[2][3], const MAInt16 ep1[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][2][3], ParallelMath::Int16CompFlag& outIsLegal)
  2366. {
  2367. ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true);
  2368. MAInt16 aSignificantMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
  2369. for (int ch = 0; ch < 3; ch++)
  2370. {
  2371. outEncodedEPs[0][0][ch] = ep0[0][ch];
  2372. outEncodedEPs[0][1][ch] = ep0[1][ch];
  2373. outEncodedEPs[1][0][ch] = ep1[0][ch];
  2374. outEncodedEPs[1][1][ch] = ep1[1][ch];
  2375. if (isTransformed)
  2376. {
  2377. for (int subset = 0; subset < 2; subset++)
  2378. {
  2379. for (int epi = 0; epi < 2; epi++)
  2380. {
  2381. if (epi == 0 && subset == 0)
  2382. continue;
  2383. MAInt16 bReduced = (outEncodedEPs[subset][epi][ch] & aSignificantMask);
  2384. MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch])), bPrec[ch]);
  2385. outEncodedEPs[subset][epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
  2386. MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch]) & aSignificantMask);
  2387. allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced);
  2388. }
  2389. }
  2390. }
  2391. if (!ParallelMath::AnySet(allLegal))
  2392. break;
  2393. }
  2394. outIsLegal = allLegal;
  2395. }
  2396. void cvtt::Internal::BC6HComputer::EvaluateSingleLegality(const MAInt16 ep[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][3], ParallelMath::Int16CompFlag& outIsLegal)
  2397. {
  2398. ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true);
  2399. MAInt16 aSignificantMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
  2400. for (int ch = 0; ch < 3; ch++)
  2401. {
  2402. outEncodedEPs[0][ch] = ep[0][ch];
  2403. outEncodedEPs[1][ch] = ep[1][ch];
  2404. if (isTransformed)
  2405. {
  2406. MAInt16 bReduced = (outEncodedEPs[1][ch] & aSignificantMask);
  2407. MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[1][ch], outEncodedEPs[0][ch])), bPrec[ch]);
  2408. outEncodedEPs[1][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
  2409. MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[1][ch], outEncodedEPs[0][ch]) & aSignificantMask);
  2410. allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced);
  2411. }
  2412. }
  2413. outIsLegal = allLegal;
  2414. }
  2415. void cvtt::Internal::BC6HComputer::Pack(uint32_t flags, const PixelBlockF16* inputs, uint8_t* packedBlocks, const float channelWeights[4], bool isSigned, int numTweakRounds, int numRefineRounds)
  2416. {
  2417. if (numTweakRounds < 1)
  2418. numTweakRounds = 1;
  2419. else if (numTweakRounds > MaxTweakRounds)
  2420. numTweakRounds = MaxTweakRounds;
  2421. if (numRefineRounds < 1)
  2422. numRefineRounds = 1;
  2423. else if (numRefineRounds > MaxRefineRounds)
  2424. numRefineRounds = MaxRefineRounds;
  2425. bool fastIndexing = ((flags & cvtt::Flags::BC6H_FastIndexing) != 0);
  2426. float channelWeightsSq[3];
  2427. ParallelMath::RoundTowardNearestForScope rtn;
  2428. MSInt16 pixels[16][3];
  2429. MFloat floatPixels2CL[16][3];
  2430. MFloat floatPixelsLinearWeighted[16][3];
  2431. MSInt16 low15Bits = ParallelMath::MakeSInt16(32767);
  2432. for (int ch = 0; ch < 3; ch++)
  2433. channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
  2434. for (int px = 0; px < 16; px++)
  2435. {
  2436. for (int ch = 0; ch < 3; ch++)
  2437. {
  2438. MSInt16 pixelValue;
  2439. ParallelMath::ConvertHDRInputs(inputs, px, ch, pixelValue);
  2440. // Convert from sign+magnitude to 2CL
  2441. if (isSigned)
  2442. {
  2443. ParallelMath::Int16CompFlag negative = ParallelMath::Less(pixelValue, ParallelMath::MakeSInt16(0));
  2444. MSInt16 magnitude = (pixelValue & low15Bits);
  2445. ParallelMath::ConditionalSet(pixelValue, negative, ParallelMath::MakeSInt16(0) - magnitude);
  2446. pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(-31743));
  2447. }
  2448. else
  2449. pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(0));
  2450. pixelValue = ParallelMath::Min(pixelValue, ParallelMath::MakeSInt16(31743));
  2451. pixels[px][ch] = pixelValue;
  2452. floatPixels2CL[px][ch] = ParallelMath::ToFloat(pixelValue);
  2453. floatPixelsLinearWeighted[px][ch] = ParallelMath::TwosCLHalfToFloat(pixelValue) * channelWeights[ch];
  2454. }
  2455. }
  2456. MFloat preWeightedPixels[16][3];
  2457. BCCommon::PreWeightPixelsHDR<3>(preWeightedPixels, pixels, channelWeights);
  2458. MAInt16 bestEndPoints[2][2][3];
  2459. MUInt15 bestIndexes[16];
  2460. MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
  2461. MUInt15 bestMode = ParallelMath::MakeUInt15(0);
  2462. MUInt15 bestPartition = ParallelMath::MakeUInt15(0);
  2463. for (int px = 0; px < 16; px++)
  2464. bestIndexes[px] = ParallelMath::MakeUInt15(0);
  2465. for (int subset = 0; subset < 2; subset++)
  2466. for (int epi = 0; epi < 2; epi++)
  2467. for (int ch = 0; ch < 3; ch++)
  2468. bestEndPoints[subset][epi][ch] = ParallelMath::MakeAInt16(0);
  2469. UnfinishedEndpoints<3> partitionedUFEP[32][2];
  2470. UnfinishedEndpoints<3> singleUFEP;
  2471. // Generate UFEP for partitions
  2472. for (int p = 0; p < 32; p++)
  2473. {
  2474. int partitionMask = BC7Data::g_partitionMap[p];
  2475. EndpointSelector<3, 8> epSelectors[2];
  2476. for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
  2477. {
  2478. for (int px = 0; px < 16; px++)
  2479. {
  2480. int subset = (partitionMask >> px) & 1;
  2481. epSelectors[subset].ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
  2482. }
  2483. for (int subset = 0; subset < 2; subset++)
  2484. epSelectors[subset].FinishPass(pass);
  2485. }
  2486. for (int subset = 0; subset < 2; subset++)
  2487. partitionedUFEP[p][subset] = epSelectors[subset].GetEndpoints(channelWeights);
  2488. }
  2489. // Generate UFEP for single
  2490. {
  2491. EndpointSelector<3, 8> epSelector;
  2492. for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
  2493. {
  2494. for (int px = 0; px < 16; px++)
  2495. epSelector.ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
  2496. epSelector.FinishPass(pass);
  2497. }
  2498. singleUFEP = epSelector.GetEndpoints(channelWeights);
  2499. }
  2500. for (int partitionedInt = 0; partitionedInt < 2; partitionedInt++)
  2501. {
  2502. bool partitioned = (partitionedInt == 1);
  2503. for (int aPrec = BC7Data::g_maxHDRPrecision; aPrec >= 0; aPrec--)
  2504. {
  2505. if (!BC7Data::g_hdrModesExistForPrecision[partitionedInt][aPrec])
  2506. continue;
  2507. int numPartitions = partitioned ? 32 : 1;
  2508. int numSubsets = partitioned ? 2 : 1;
  2509. int indexBits = partitioned ? 3 : 4;
  2510. int indexRange = (1 << indexBits);
  2511. for (int p = 0; p < numPartitions; p++)
  2512. {
  2513. int partitionMask = partitioned ? BC7Data::g_partitionMap[p] : 0;
  2514. const int MaxMetaRounds = MaxTweakRounds * MaxRefineRounds;
  2515. MAInt16 metaEndPointsQuantized[MaxMetaRounds][2][2][3];
  2516. MUInt15 metaIndexes[MaxMetaRounds][16];
  2517. MFloat metaError[MaxMetaRounds][2];
  2518. bool roundValid[MaxMetaRounds][2];
  2519. for (int r = 0; r < MaxMetaRounds; r++)
  2520. for (int subset = 0; subset < 2; subset++)
  2521. roundValid[r][subset] = true;
  2522. for (int subset = 0; subset < numSubsets; subset++)
  2523. {
  2524. for (int tweak = 0; tweak < MaxTweakRounds; tweak++)
  2525. {
  2526. EndpointRefiner<3> refiners[2];
  2527. bool abortRemainingRefines = false;
  2528. for (int refinePass = 0; refinePass < MaxRefineRounds; refinePass++)
  2529. {
  2530. int metaRound = tweak * MaxRefineRounds + refinePass;
  2531. if (tweak >= numTweakRounds || refinePass >= numRefineRounds)
  2532. abortRemainingRefines = true;
  2533. if (abortRemainingRefines)
  2534. {
  2535. roundValid[metaRound][subset] = false;
  2536. continue;
  2537. }
  2538. MAInt16(&mrQuantizedEndPoints)[2][2][3] = metaEndPointsQuantized[metaRound];
  2539. MUInt15(&mrIndexes)[16] = metaIndexes[metaRound];
  2540. MSInt16 endPointsColorSpace[2][3];
  2541. if (refinePass == 0)
  2542. {
  2543. UnfinishedEndpoints<3> ufep = partitioned ? partitionedUFEP[p][subset] : singleUFEP;
  2544. if (isSigned)
  2545. ufep.FinishHDRSigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
  2546. else
  2547. ufep.FinishHDRUnsigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
  2548. }
  2549. else
  2550. refiners[subset].GetRefinedEndpointsHDR(endPointsColorSpace, isSigned, &rtn);
  2551. refiners[subset].Init(indexRange, channelWeights);
  2552. int fixupIndex = (subset == 0) ? 0 : BC7Data::g_fixupIndexes2[p];
  2553. IndexSelectorHDR<3> indexSelector;
  2554. if (isSigned)
  2555. QuantizeEndpointsSigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
  2556. else
  2557. QuantizeEndpointsUnsigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
  2558. if (metaRound > 0)
  2559. {
  2560. ParallelMath::Int16CompFlag anySame = ParallelMath::MakeBoolInt16(false);
  2561. for (int prevRound = 0; prevRound < metaRound; prevRound++)
  2562. {
  2563. MAInt16(&prevRoundEPs)[2][3] = metaEndPointsQuantized[prevRound][subset];
  2564. ParallelMath::Int16CompFlag same = ParallelMath::MakeBoolInt16(true);
  2565. for (int epi = 0; epi < 2; epi++)
  2566. for (int ch = 0; ch < 3; ch++)
  2567. same = (same & ParallelMath::Equal(prevRoundEPs[epi][ch], mrQuantizedEndPoints[subset][epi][ch]));
  2568. anySame = (anySame | same);
  2569. if (ParallelMath::AllSet(anySame))
  2570. break;
  2571. }
  2572. if (ParallelMath::AllSet(anySame))
  2573. {
  2574. roundValid[metaRound][subset] = false;
  2575. continue;
  2576. }
  2577. }
  2578. MFloat subsetError = ParallelMath::MakeFloatZero();
  2579. {
  2580. for (int px = 0; px < 16; px++)
  2581. {
  2582. if (subset != ((partitionMask >> px) & 1))
  2583. continue;
  2584. MUInt15 index;
  2585. if (px == fixupIndex)
  2586. index = mrIndexes[px];
  2587. else
  2588. {
  2589. index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixels2CL[px], &rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[px], &rtn);
  2590. mrIndexes[px] = index;
  2591. }
  2592. MSInt16 reconstructed[3];
  2593. if (isSigned)
  2594. indexSelector.ReconstructHDRSigned(mrIndexes[px], reconstructed);
  2595. else
  2596. indexSelector.ReconstructHDRUnsigned(mrIndexes[px], reconstructed);
  2597. subsetError = subsetError + (fastIndexing ? BCCommon::ComputeErrorHDRFast<3>(flags, reconstructed, pixels[px], channelWeightsSq) : BCCommon::ComputeErrorHDRSlow<3>(flags, reconstructed, pixels[px], channelWeightsSq));
  2598. if (refinePass != numRefineRounds - 1)
  2599. refiners[subset].ContributeUnweightedPW(preWeightedPixels[px], index);
  2600. }
  2601. }
  2602. metaError[metaRound][subset] = subsetError;
  2603. }
  2604. }
  2605. }
  2606. // Now we have a bunch of attempts, but not all of them will fit in the delta coding scheme
  2607. int numMeta1 = partitioned ? MaxMetaRounds : 1;
  2608. for (int meta0 = 0; meta0 < MaxMetaRounds; meta0++)
  2609. {
  2610. if (!roundValid[meta0][0])
  2611. continue;
  2612. for (int meta1 = 0; meta1 < numMeta1; meta1++)
  2613. {
  2614. MFloat combinedError = metaError[meta0][0];
  2615. if (partitioned)
  2616. {
  2617. if (!roundValid[meta1][1])
  2618. continue;
  2619. combinedError = combinedError + metaError[meta1][1];
  2620. }
  2621. ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, bestError);
  2622. if (!ParallelMath::AnySet(errorBetter))
  2623. continue;
  2624. ParallelMath::Int16CompFlag needsCommit = ParallelMath::FloatFlagToInt16(errorBetter);
  2625. // Figure out if this is encodable
  2626. for (int mode = 0; mode < BC7Data::g_numHDRModes; mode++)
  2627. {
  2628. const BC7Data::BC6HModeInfo &modeInfo = BC7Data::g_hdrModes[mode];
  2629. if (modeInfo.m_partitioned != partitioned || modeInfo.m_aPrec != aPrec)
  2630. continue;
  2631. MAInt16 encodedEPs[2][2][3];
  2632. ParallelMath::Int16CompFlag isLegal;
  2633. if (partitioned)
  2634. EvaluatePartitionedLegality(metaEndPointsQuantized[meta0][0], metaEndPointsQuantized[meta1][1], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs, isLegal);
  2635. else
  2636. EvaluateSingleLegality(metaEndPointsQuantized[meta0][0], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs[0], isLegal);
  2637. ParallelMath::Int16CompFlag isLegalAndBetter = (ParallelMath::FloatFlagToInt16(errorBetter) & isLegal);
  2638. if (!ParallelMath::AnySet(isLegalAndBetter))
  2639. continue;
  2640. ParallelMath::FloatCompFlag isLegalAndBetterFloat = ParallelMath::Int16FlagToFloat(isLegalAndBetter);
  2641. ParallelMath::ConditionalSet(bestError, isLegalAndBetterFloat, combinedError);
  2642. ParallelMath::ConditionalSet(bestMode, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(mode)));
  2643. ParallelMath::ConditionalSet(bestPartition, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(p)));
  2644. for (int subset = 0; subset < numSubsets; subset++)
  2645. {
  2646. for (int epi = 0; epi < 2; epi++)
  2647. {
  2648. for (int ch = 0; ch < 3; ch++)
  2649. ParallelMath::ConditionalSet(bestEndPoints[subset][epi][ch], isLegalAndBetter, encodedEPs[subset][epi][ch]);
  2650. }
  2651. }
  2652. for (int px = 0; px < 16; px++)
  2653. {
  2654. int subset = ((partitionMask >> px) & 1);
  2655. if (subset == 0)
  2656. ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta0][px]);
  2657. else
  2658. ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta1][px]);
  2659. }
  2660. needsCommit = ParallelMath::AndNot(needsCommit, isLegalAndBetter);
  2661. if (!ParallelMath::AnySet(needsCommit))
  2662. break;
  2663. }
  2664. }
  2665. }
  2666. }
  2667. }
  2668. }
  2669. // At this point, everything should be set
  2670. for (int block = 0; block < ParallelMath::ParallelSize; block++)
  2671. {
  2672. ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(bestMode, block);
  2673. ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(bestPartition, block);
  2674. int32_t eps[2][2][3];
  2675. ParallelMath::ScalarUInt16 indexes[16];
  2676. const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
  2677. const BC6HData::ModeDescriptor *desc = BC6HData::g_modeDescriptors[mode];
  2678. const size_t headerBits = modeInfo.m_partitioned ? 82 : 65;
  2679. for (int subset = 0; subset < 2; subset++)
  2680. {
  2681. for (int epi = 0; epi < 2; epi++)
  2682. {
  2683. for (int ch = 0; ch < 3; ch++)
  2684. eps[subset][epi][ch] = ParallelMath::Extract(bestEndPoints[subset][epi][ch], block);
  2685. }
  2686. }
  2687. for (int px = 0; px < 16; px++)
  2688. indexes[px] = ParallelMath::Extract(bestIndexes[px], block);
  2689. uint16_t modeID = modeInfo.m_modeID;
  2690. PackingVector pv;
  2691. pv.Init();
  2692. for (size_t i = 0; i < headerBits; i++) {
  2693. int32_t codedValue = 0;
  2694. switch (desc[i].m_eField) {
  2695. case BC6HData::M:
  2696. codedValue = modeID;
  2697. break;
  2698. case BC6HData::D:
  2699. codedValue = partition;
  2700. break;
  2701. case BC6HData::RW:
  2702. codedValue = eps[0][0][0];
  2703. break;
  2704. case BC6HData::RX:
  2705. codedValue = eps[0][1][0];
  2706. break;
  2707. case BC6HData::RY:
  2708. codedValue = eps[1][0][0];
  2709. break;
  2710. case BC6HData::RZ:
  2711. codedValue = eps[1][1][0];
  2712. break;
  2713. case BC6HData::GW:
  2714. codedValue = eps[0][0][1];
  2715. break;
  2716. case BC6HData::GX:
  2717. codedValue = eps[0][1][1];
  2718. break;
  2719. case BC6HData::GY:
  2720. codedValue = eps[1][0][1];
  2721. break;
  2722. case BC6HData::GZ:
  2723. codedValue = eps[1][1][1];
  2724. break;
  2725. case BC6HData::BW:
  2726. codedValue = eps[0][0][2];
  2727. break;
  2728. case BC6HData::BX:
  2729. codedValue = eps[0][1][2];
  2730. break;
  2731. case BC6HData::BY:
  2732. codedValue = eps[1][0][2];
  2733. break;
  2734. case BC6HData::BZ:
  2735. codedValue = eps[1][1][2];
  2736. break;
  2737. default:
  2738. assert(false);
  2739. break;
  2740. }
  2741. pv.Pack(static_cast<uint16_t>((codedValue >> desc[i].m_uBit) & 1), 1);
  2742. }
  2743. int fixupIndex1 = 0;
  2744. int indexBits = 4;
  2745. if (modeInfo.m_partitioned)
  2746. {
  2747. fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
  2748. indexBits = 3;
  2749. }
  2750. for (int px = 0; px < 16; px++)
  2751. {
  2752. ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[px], block);
  2753. if (px == 0 || px == fixupIndex1)
  2754. pv.Pack(index, indexBits - 1);
  2755. else
  2756. pv.Pack(index, indexBits);
  2757. }
  2758. pv.Flush(packedBlocks + 16 * block);
  2759. }
  2760. }
  2761. void cvtt::Internal::BC6HComputer::SignExtendSingle(int &v, int bits)
  2762. {
  2763. if (v & (1 << (bits - 1)))
  2764. v |= -(1 << bits);
  2765. }
  2766. void cvtt::Internal::BC6HComputer::UnpackOne(PixelBlockF16 &output, const uint8_t *pBC, bool isSigned)
  2767. {
  2768. UnpackingVector pv;
  2769. pv.Init(pBC);
  2770. int numModeBits = 2;
  2771. int modeBits = pv.Unpack(2);
  2772. if (modeBits != 0 && modeBits != 1)
  2773. {
  2774. modeBits |= pv.Unpack(3) << 2;
  2775. numModeBits += 3;
  2776. }
  2777. int mode = -1;
  2778. for (int possibleMode = 0; possibleMode < BC7Data::g_numHDRModes; possibleMode++)
  2779. {
  2780. if (BC7Data::g_hdrModes[possibleMode].m_modeID == modeBits)
  2781. {
  2782. mode = possibleMode;
  2783. break;
  2784. }
  2785. }
  2786. if (mode < 0)
  2787. {
  2788. for (int px = 0; px < 16; px++)
  2789. {
  2790. for (int ch = 0; ch < 3; ch++)
  2791. output.m_pixels[px][ch] = 0;
  2792. output.m_pixels[px][3] = 0x3c00; // 1.0
  2793. }
  2794. return;
  2795. }
  2796. const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
  2797. const size_t headerBits = modeInfo.m_partitioned ? 82 : 65;
  2798. const BC6HData::ModeDescriptor *desc = BC6HData::g_modeDescriptors[mode];
  2799. int32_t partition = 0;
  2800. int32_t eps[2][2][3];
  2801. for (int subset = 0; subset < 2; subset++)
  2802. for (int epi = 0; epi < 2; epi++)
  2803. for (int ch = 0; ch < 3; ch++)
  2804. eps[subset][epi][ch] = 0;
  2805. for (size_t i = numModeBits; i < headerBits; i++) {
  2806. int32_t *pCodedValue = NULL;
  2807. switch (desc[i].m_eField) {
  2808. case BC6HData::D:
  2809. pCodedValue = &partition;
  2810. break;
  2811. case BC6HData::RW:
  2812. pCodedValue = &eps[0][0][0];
  2813. break;
  2814. case BC6HData::RX:
  2815. pCodedValue = &eps[0][1][0];
  2816. break;
  2817. case BC6HData::RY:
  2818. pCodedValue = &eps[1][0][0];
  2819. break;
  2820. case BC6HData::RZ:
  2821. pCodedValue = &eps[1][1][0];
  2822. break;
  2823. case BC6HData::GW:
  2824. pCodedValue = &eps[0][0][1];
  2825. break;
  2826. case BC6HData::GX:
  2827. pCodedValue = &eps[0][1][1];
  2828. break;
  2829. case BC6HData::GY:
  2830. pCodedValue = &eps[1][0][1];
  2831. break;
  2832. case BC6HData::GZ:
  2833. pCodedValue = &eps[1][1][1];
  2834. break;
  2835. case BC6HData::BW:
  2836. pCodedValue = &eps[0][0][2];
  2837. break;
  2838. case BC6HData::BX:
  2839. pCodedValue = &eps[0][1][2];
  2840. break;
  2841. case BC6HData::BY:
  2842. pCodedValue = &eps[1][0][2];
  2843. break;
  2844. case BC6HData::BZ:
  2845. pCodedValue = &eps[1][1][2];
  2846. break;
  2847. default:
  2848. assert(false);
  2849. break;
  2850. }
  2851. (*pCodedValue) |= pv.Unpack(1) << desc[i].m_uBit;
  2852. }
  2853. uint16_t modeID = modeInfo.m_modeID;
  2854. int fixupIndex1 = 0;
  2855. int indexBits = 4;
  2856. int numSubsets = 1;
  2857. if (modeInfo.m_partitioned)
  2858. {
  2859. fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
  2860. indexBits = 3;
  2861. numSubsets = 2;
  2862. }
  2863. int indexes[16];
  2864. for (int px = 0; px < 16; px++)
  2865. {
  2866. if (px == 0 || px == fixupIndex1)
  2867. indexes[px] = pv.Unpack(indexBits - 1);
  2868. else
  2869. indexes[px] = pv.Unpack(indexBits);
  2870. }
  2871. if (modeInfo.m_partitioned)
  2872. {
  2873. for (int ch = 0; ch < 3; ch++)
  2874. {
  2875. if (isSigned)
  2876. SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
  2877. if (modeInfo.m_transformed || isSigned)
  2878. {
  2879. SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
  2880. SignExtendSingle(eps[1][0][ch], modeInfo.m_bPrec[ch]);
  2881. SignExtendSingle(eps[1][1][ch], modeInfo.m_bPrec[ch]);
  2882. }
  2883. }
  2884. }
  2885. else
  2886. {
  2887. for (int ch = 0; ch < 3; ch++)
  2888. {
  2889. if (isSigned)
  2890. SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
  2891. if (modeInfo.m_transformed || isSigned)
  2892. SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
  2893. }
  2894. }
  2895. int aPrec = modeInfo.m_aPrec;
  2896. if (modeInfo.m_transformed)
  2897. {
  2898. for (int ch = 0; ch < 3; ch++)
  2899. {
  2900. int wrapMask = (1 << aPrec) - 1;
  2901. eps[0][1][ch] = ((eps[0][0][ch] + eps[0][1][ch]) & wrapMask);
  2902. if (isSigned)
  2903. SignExtendSingle(eps[0][1][ch], aPrec);
  2904. if (modeInfo.m_partitioned)
  2905. {
  2906. eps[1][0][ch] = ((eps[0][0][ch] + eps[1][0][ch]) & wrapMask);
  2907. eps[1][1][ch] = ((eps[0][0][ch] + eps[1][1][ch]) & wrapMask);
  2908. if (isSigned)
  2909. {
  2910. SignExtendSingle(eps[1][0][ch], aPrec);
  2911. SignExtendSingle(eps[1][1][ch], aPrec);
  2912. }
  2913. }
  2914. }
  2915. }
  2916. // Unquantize endpoints
  2917. for (int subset = 0; subset < numSubsets; subset++)
  2918. {
  2919. for (int epi = 0; epi < 2; epi++)
  2920. {
  2921. for (int ch = 0; ch < 3; ch++)
  2922. {
  2923. int &v = eps[subset][epi][ch];
  2924. if (isSigned)
  2925. {
  2926. if (aPrec >= 16)
  2927. {
  2928. // Nothing
  2929. }
  2930. else
  2931. {
  2932. bool s = false;
  2933. int comp = v;
  2934. if (v < 0)
  2935. {
  2936. s = true;
  2937. comp = -comp;
  2938. }
  2939. int unq = 0;
  2940. if (comp == 0)
  2941. unq = 0;
  2942. else if (comp >= ((1 << (aPrec - 1)) - 1))
  2943. unq = 0x7fff;
  2944. else
  2945. unq = ((comp << 15) + 0x4000) >> (aPrec - 1);
  2946. if (s)
  2947. unq = -unq;
  2948. v = unq;
  2949. }
  2950. }
  2951. else
  2952. {
  2953. if (aPrec >= 15)
  2954. {
  2955. // Nothing
  2956. }
  2957. else if (v == 0)
  2958. {
  2959. // Nothing
  2960. }
  2961. else if (v == ((1 << aPrec) - 1))
  2962. v = 0xffff;
  2963. else
  2964. v = ((v << 16) + 0x8000) >> aPrec;
  2965. }
  2966. }
  2967. }
  2968. }
  2969. const int *weights = BC7Data::g_weightTables[indexBits];
  2970. for (int px = 0; px < 16; px++)
  2971. {
  2972. int subset = 0;
  2973. if (modeInfo.m_partitioned)
  2974. subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
  2975. int w = weights[indexes[px]];
  2976. for (int ch = 0; ch < 3; ch++)
  2977. {
  2978. int comp = ((64 - w) * eps[subset][0][ch] + w * eps[subset][1][ch] + 32) >> 6;
  2979. if (isSigned)
  2980. {
  2981. if (comp < 0)
  2982. comp = -(((-comp) * 31) >> 5);
  2983. else
  2984. comp = (comp * 31) >> 5;
  2985. int s = 0;
  2986. if (comp < 0)
  2987. {
  2988. s = 0x8000;
  2989. comp = -comp;
  2990. }
  2991. output.m_pixels[px][ch] = static_cast<uint16_t>(s | comp);
  2992. }
  2993. else
  2994. {
  2995. comp = (comp * 31) >> 6;
  2996. output.m_pixels[px][ch] = static_cast<uint16_t>(comp);
  2997. }
  2998. }
  2999. output.m_pixels[px][3] = 0x3c00; // 1.0
  3000. }
  3001. }
  3002. void cvtt::Kernels::ConfigureBC7EncodingPlanFromQuality(BC7EncodingPlan &encodingPlan, int quality)
  3003. {
  3004. static const int kMaxQuality = 100;
  3005. if (quality < 1)
  3006. quality = 1;
  3007. else if (quality > kMaxQuality)
  3008. quality = kMaxQuality;
  3009. const int numRGBModes = cvtt::Tables::BC7Prio::g_bc7NumPrioCodesRGB * quality / kMaxQuality;
  3010. const int numRGBAModes = cvtt::Tables::BC7Prio::g_bc7NumPrioCodesRGBA * quality / kMaxQuality;
  3011. const uint16_t *prioLists[] = { cvtt::Tables::BC7Prio::g_bc7PrioCodesRGB, cvtt::Tables::BC7Prio::g_bc7PrioCodesRGBA };
  3012. const int prioListSizes[] = { numRGBModes, numRGBAModes };
  3013. BC7FineTuningParams ftParams;
  3014. memset(&ftParams, 0, sizeof(ftParams));
  3015. for (int listIndex = 0; listIndex < 2; listIndex++)
  3016. {
  3017. int prioListSize = prioListSizes[listIndex];
  3018. const uint16_t *prioList = prioLists[listIndex];
  3019. for (int prioIndex = 0; prioIndex < prioListSize; prioIndex++)
  3020. {
  3021. const uint16_t packedMode = prioList[prioIndex];
  3022. uint8_t seedPoints = static_cast<uint8_t>(cvtt::Tables::BC7Prio::UnpackSeedPointCount(packedMode));
  3023. int mode = cvtt::Tables::BC7Prio::UnpackMode(packedMode);
  3024. switch (mode)
  3025. {
  3026. case 0:
  3027. ftParams.mode0SP[cvtt::Tables::BC7Prio::UnpackPartition(packedMode)] = seedPoints;
  3028. break;
  3029. case 1:
  3030. ftParams.mode1SP[cvtt::Tables::BC7Prio::UnpackPartition(packedMode)] = seedPoints;
  3031. break;
  3032. case 2:
  3033. ftParams.mode2SP[cvtt::Tables::BC7Prio::UnpackPartition(packedMode)] = seedPoints;
  3034. break;
  3035. case 3:
  3036. ftParams.mode3SP[cvtt::Tables::BC7Prio::UnpackPartition(packedMode)] = seedPoints;
  3037. break;
  3038. case 4:
  3039. ftParams.mode4SP[cvtt::Tables::BC7Prio::UnpackRotation(packedMode)][cvtt::Tables::BC7Prio::UnpackIndexSelector(packedMode)] = seedPoints;
  3040. break;
  3041. case 5:
  3042. ftParams.mode5SP[cvtt::Tables::BC7Prio::UnpackRotation(packedMode)] = seedPoints;
  3043. break;
  3044. case 6:
  3045. ftParams.mode6SP = seedPoints;
  3046. break;
  3047. case 7:
  3048. ftParams.mode7SP[cvtt::Tables::BC7Prio::UnpackPartition(packedMode)] = seedPoints;
  3049. break;
  3050. }
  3051. }
  3052. }
  3053. ConfigureBC7EncodingPlanFromFineTuningParams(encodingPlan, ftParams);
  3054. }
  3055. // Generates a BC7 encoding plan from fine-tuning parameters.
  3056. bool cvtt::Kernels::ConfigureBC7EncodingPlanFromFineTuningParams(BC7EncodingPlan &encodingPlan, const BC7FineTuningParams &params)
  3057. {
  3058. memset(&encodingPlan, 0, sizeof(encodingPlan));
  3059. // Mode 0
  3060. for (int partition = 0; partition < 16; partition++)
  3061. {
  3062. uint8_t sp = params.mode0SP[partition];
  3063. if (sp == 0)
  3064. continue;
  3065. encodingPlan.mode0PartitionEnabled |= static_cast<uint16_t>(1) << partition;
  3066. for (int subset = 0; subset < 3; subset++)
  3067. {
  3068. int shape = cvtt::Internal::BC7Data::g_shapes3[partition][subset];
  3069. encodingPlan.seedPointsForShapeRGB[shape] = std::max(encodingPlan.seedPointsForShapeRGB[shape], sp);
  3070. }
  3071. }
  3072. // Mode 1
  3073. for (int partition = 0; partition < 64; partition++)
  3074. {
  3075. uint8_t sp = params.mode1SP[partition];
  3076. if (sp == 0)
  3077. continue;
  3078. encodingPlan.mode1PartitionEnabled |= static_cast<uint64_t>(1) << partition;
  3079. for (int subset = 0; subset < 2; subset++)
  3080. {
  3081. int shape = cvtt::Internal::BC7Data::g_shapes2[partition][subset];
  3082. encodingPlan.seedPointsForShapeRGB[shape] = std::max(encodingPlan.seedPointsForShapeRGB[shape], sp);
  3083. }
  3084. }
  3085. // Mode 2
  3086. for (int partition = 0; partition < 64; partition++)
  3087. {
  3088. uint8_t sp = params.mode2SP[partition];
  3089. if (sp == 0)
  3090. continue;
  3091. encodingPlan.mode2PartitionEnabled |= static_cast<uint64_t>(1) << partition;
  3092. for (int subset = 0; subset < 3; subset++)
  3093. {
  3094. int shape = cvtt::Internal::BC7Data::g_shapes3[partition][subset];
  3095. encodingPlan.seedPointsForShapeRGB[shape] = std::max(encodingPlan.seedPointsForShapeRGB[shape], sp);
  3096. }
  3097. }
  3098. // Mode 3
  3099. for (int partition = 0; partition < 64; partition++)
  3100. {
  3101. uint8_t sp = params.mode3SP[partition];
  3102. if (sp == 0)
  3103. continue;
  3104. encodingPlan.mode3PartitionEnabled |= static_cast<uint64_t>(1) << partition;
  3105. for (int subset = 0; subset < 2; subset++)
  3106. {
  3107. int shape = cvtt::Internal::BC7Data::g_shapes2[partition][subset];
  3108. encodingPlan.seedPointsForShapeRGB[shape] = std::max(encodingPlan.seedPointsForShapeRGB[shape], sp);
  3109. }
  3110. }
  3111. // Mode 4
  3112. for (int rotation = 0; rotation < 4; rotation++)
  3113. {
  3114. for (int indexMode = 0; indexMode < 2; indexMode++)
  3115. encodingPlan.mode4SP[rotation][indexMode] = params.mode4SP[rotation][indexMode];
  3116. }
  3117. // Mode 5
  3118. for (int rotation = 0; rotation < 4; rotation++)
  3119. encodingPlan.mode5SP[rotation] = params.mode5SP[rotation];
  3120. // Mode 6
  3121. {
  3122. uint8_t sp = params.mode6SP;
  3123. if (sp != 0)
  3124. {
  3125. encodingPlan.mode6Enabled = true;
  3126. int shape = cvtt::Internal::BC7Data::g_shapes1[0][0];
  3127. encodingPlan.seedPointsForShapeRGBA[shape] = std::max(encodingPlan.seedPointsForShapeRGBA[shape], sp);
  3128. }
  3129. }
  3130. // Mode 7
  3131. for (int partition = 0; partition < 64; partition++)
  3132. {
  3133. uint8_t sp = params.mode7SP[partition];
  3134. if (sp == 0)
  3135. continue;
  3136. encodingPlan.mode7RGBAPartitionEnabled |= static_cast<uint64_t>(1) << partition;
  3137. for (int subset = 0; subset < 2; subset++)
  3138. {
  3139. int shape = cvtt::Internal::BC7Data::g_shapes2[partition][subset];
  3140. encodingPlan.seedPointsForShapeRGBA[shape] = std::max(encodingPlan.seedPointsForShapeRGBA[shape], sp);
  3141. }
  3142. }
  3143. for (int i = 0; i < BC7EncodingPlan::kNumRGBShapes; i++)
  3144. {
  3145. if (encodingPlan.seedPointsForShapeRGB[i] > 0)
  3146. {
  3147. encodingPlan.rgbShapeList[encodingPlan.rgbNumShapesToEvaluate] = i;
  3148. encodingPlan.rgbNumShapesToEvaluate++;
  3149. }
  3150. }
  3151. for (int i = 0; i < BC7EncodingPlan::kNumRGBAShapes; i++)
  3152. {
  3153. if (encodingPlan.seedPointsForShapeRGBA[i] > 0)
  3154. {
  3155. encodingPlan.rgbaShapeList[encodingPlan.rgbaNumShapesToEvaluate] = i;
  3156. encodingPlan.rgbaNumShapesToEvaluate++;
  3157. }
  3158. }
  3159. encodingPlan.mode7RGBPartitionEnabled = (encodingPlan.mode7RGBAPartitionEnabled & ~encodingPlan.mode3PartitionEnabled);
  3160. return true;
  3161. }
  3162. #endif