/* qbvh_nodes.h */
/*
 * Copyright 2011-2014, Blender Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Aligned nodes intersection SSE code is adapted from Embree.
 */
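
/* Traversal stack entry: the node address to visit and the ray distance at
 * which that node is entered, used to sort entries front-to-back. */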
struct QBVHStackItem {
  int addr;
  float dist;
};
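
/* Compute per-axis indices of the near and far bounding planes in an aligned
 * node, from the sign of the inverse ray direction: for a positive direction
 * the lower plane is the near one, otherwise the two swap. The SSE path
 * fetches all three sign bits at once with movemask(). */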
ccl_device_inline void qbvh_near_far_idx_calc(const float3 &idir,
                                              int *ccl_restrict near_x,
                                              int *ccl_restrict near_y,
                                              int *ccl_restrict near_z,
                                              int *ccl_restrict far_x,
                                              int *ccl_restrict far_y,
                                              int *ccl_restrict far_z)
{
#ifdef __KERNEL_SSE__
  *near_x = 0;
  *far_x = 1;
  *near_y = 2;
  *far_y = 3;
  *near_z = 4;
  *far_z = 5;

  const size_t mask = movemask(ssef(idir.m128));

  const int mask_x = mask & 1;
  const int mask_y = (mask & 2) >> 1;
  const int mask_z = (mask & 4) >> 2;

  *near_x += mask_x;
  *far_x -= mask_x;
  *near_y += mask_y;
  *far_y -= mask_y;
  *near_z += mask_z;
  *far_z -= mask_z;
#else
  if (idir.x >= 0.0f) {
    *near_x = 0;
    *far_x = 1;
  }
  else {
    *near_x = 1;
    *far_x = 0;
  }
  if (idir.y >= 0.0f) {
    *near_y = 2;
    *far_y = 3;
  }
  else {
    *near_y = 3;
    *far_y = 2;
  }
  if (idir.z >= 0.0f) {
    *near_z = 4;
    *far_z = 5;
  }
  else {
    *near_z = 5;
    *far_z = 4;
  }
#endif
}

/* TODO(sergey): Investigate if using intrinsics helps for both
 * stack item swap and float comparison.
 */
ccl_device_inline void qbvh_item_swap(QBVHStackItem *ccl_restrict a, QBVHStackItem *ccl_restrict b)
{
  QBVHStackItem tmp = *a;
  *a = *b;
  *b = tmp;
}
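
/* Three-entry sorting network: leaves s1->dist <= s2->dist <= s3->dist. */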
ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1,
                                       QBVHStackItem *ccl_restrict s2,
                                       QBVHStackItem *ccl_restrict s3)
{
  if (s2->dist < s1->dist) {
    qbvh_item_swap(s2, s1);
  }
  if (s3->dist < s2->dist) {
    qbvh_item_swap(s3, s2);
  }
  if (s2->dist < s1->dist) {
    qbvh_item_swap(s2, s1);
  }
}
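
/* Four-entry sorting network (five compare-and-swaps): leaves
 * s1->dist <= s2->dist <= s3->dist <= s4->dist. */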
ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1,
                                       QBVHStackItem *ccl_restrict s2,
                                       QBVHStackItem *ccl_restrict s3,
                                       QBVHStackItem *ccl_restrict s4)
{
  if (s2->dist < s1->dist) {
    qbvh_item_swap(s2, s1);
  }
  if (s4->dist < s3->dist) {
    qbvh_item_swap(s4, s3);
  }
  if (s3->dist < s1->dist) {
    qbvh_item_swap(s3, s1);
  }
  if (s4->dist < s2->dist) {
    qbvh_item_swap(s4, s2);
  }
  if (s3->dist < s2->dist) {
    qbvh_item_swap(s3, s2);
  }
}

/* Axis-aligned nodes intersection */
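
/* An aligned node stores its four children's bounds as six SIMD rows starting
 * at node_addr + 1: lower and upper X planes, then Y, then Z, one lane per
 * child. The near/far indices computed by qbvh_near_far_idx_calc() select
 * which plane of each pair is the near one for the current ray. */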
ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
                                                  const ssef &isect_near,
                                                  const ssef &isect_far,
#ifdef __KERNEL_AVX2__
                                                  const sse3f &org_idir,
#else
                                                  const sse3f &org,
#endif
                                                  const sse3f &idir,
                                                  const int near_x,
                                                  const int near_y,
                                                  const int near_z,
                                                  const int far_x,
                                                  const int far_y,
                                                  const int far_z,
                                                  const int node_addr,
                                                  ssef *ccl_restrict dist)
{
  const int offset = node_addr + 1;
#ifdef __KERNEL_AVX2__
  /* Fused multiply-subtract slab test: t = plane * idir - org_idir, with
   * org_idir = org * idir precomputed by the caller. */
  const ssef tnear_x = msub(
      kernel_tex_fetch_ssef(__bvh_nodes, offset + near_x), idir.x, org_idir.x);
  const ssef tnear_y = msub(
      kernel_tex_fetch_ssef(__bvh_nodes, offset + near_y), idir.y, org_idir.y);
  const ssef tnear_z = msub(
      kernel_tex_fetch_ssef(__bvh_nodes, offset + near_z), idir.z, org_idir.z);
  const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_x), idir.x, org_idir.x);
  const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_y), idir.y, org_idir.y);
  const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_z), idir.z, org_idir.z);
#else
  const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_x) - org.x) * idir.x;
  const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_y) - org.y) * idir.y;
  const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_z) - org.z) * idir.z;
  const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_x) - org.x) * idir.x;
  const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_y) - org.y) * idir.y;
  const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_z) - org.z) * idir.z;
#endif

#ifdef __KERNEL_SSE41__
  /* Embree trick: compare the float bits as integers and invert the result,
   * which yields the tnear <= tfar lanes for the values that occur here. */
  const ssef tnear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, isect_near));
  const ssef tfar = mini(mini(tfar_x, tfar_y), mini(tfar_z, isect_far));
  const sseb vmask = cast(tnear) > cast(tfar);
  int mask = (int)movemask(vmask) ^ 0xf;
#else
  const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
  const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
  const sseb vmask = tnear <= tfar;
  int mask = (int)movemask(vmask);
#endif
  *dist = tnear;
  return mask;
}

/* Unaligned nodes intersection */
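
/* An unaligned node stores, per child, a 3x4 transform (rows node_addr + 1 ..
 * node_addr + 12, one matrix element per row, one lane per child) that maps
 * the ray into the space of a unit box, where a regular slab test is done. */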
ccl_device_inline int qbvh_unaligned_node_intersect(KernelGlobals *ccl_restrict kg,
                                                    const ssef &isect_near,
                                                    const ssef &isect_far,
#ifdef __KERNEL_AVX2__
                                                    const sse3f &org_idir,
#endif
                                                    const sse3f &org,
                                                    const sse3f &dir,
                                                    const sse3f &idir,
                                                    const int near_x,
                                                    const int near_y,
                                                    const int near_z,
                                                    const int far_x,
                                                    const int far_y,
                                                    const int far_z,
                                                    const int node_addr,
                                                    ssef *ccl_restrict dist)
{
  const int offset = node_addr;
  const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 1);
  const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 2);
  const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 3);
  const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 4);
  const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 5);
  const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 6);
  const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 7);
  const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 8);
  const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 9);
  const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 10);
  const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 11);
  const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 12);

  /* Transform ray direction and origin into node-local space. */
  const ssef aligned_dir_x = dir.x * tfm_x_x + dir.y * tfm_x_y + dir.z * tfm_x_z,
             aligned_dir_y = dir.x * tfm_y_x + dir.y * tfm_y_y + dir.z * tfm_y_z,
             aligned_dir_z = dir.x * tfm_z_x + dir.y * tfm_z_y + dir.z * tfm_z_z;

  const ssef aligned_P_x = org.x * tfm_x_x + org.y * tfm_x_y + org.z * tfm_x_z + tfm_t_x,
             aligned_P_y = org.x * tfm_y_x + org.y * tfm_y_y + org.z * tfm_y_z + tfm_t_y,
             aligned_P_z = org.x * tfm_z_x + org.y * tfm_z_y + org.z * tfm_z_z + tfm_t_z;

  /* Slab test against the unit box: tlower hits the plane at 0,
   * tupper the plane at 1. */
  const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f);
  const ssef nrdir_x = neg_one / aligned_dir_x, nrdir_y = neg_one / aligned_dir_y,
             nrdir_z = neg_one / aligned_dir_z;

  const ssef tlower_x = aligned_P_x * nrdir_x, tlower_y = aligned_P_y * nrdir_y,
             tlower_z = aligned_P_z * nrdir_z;

  const ssef tupper_x = tlower_x - nrdir_x, tupper_y = tlower_y - nrdir_y,
             tupper_z = tlower_z - nrdir_z;

#ifdef __KERNEL_SSE41__
  const ssef tnear_x = mini(tlower_x, tupper_x);
  const ssef tnear_y = mini(tlower_y, tupper_y);
  const ssef tnear_z = mini(tlower_z, tupper_z);
  const ssef tfar_x = maxi(tlower_x, tupper_x);
  const ssef tfar_y = maxi(tlower_y, tupper_y);
  const ssef tfar_z = maxi(tlower_z, tupper_z);
  const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
  const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
  const sseb vmask = tnear <= tfar;
  *dist = tnear;
  return movemask(vmask);
#else
  const ssef tnear_x = min(tlower_x, tupper_x);
  const ssef tnear_y = min(tlower_y, tupper_y);
  const ssef tnear_z = min(tlower_z, tupper_z);
  const ssef tfar_x = max(tlower_x, tupper_x);
  const ssef tfar_y = max(tlower_y, tupper_y);
  const ssef tfar_z = max(tlower_z, tupper_z);
  const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
  const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
  const sseb vmask = tnear <= tfar;
  *dist = tnear;
  return movemask(vmask);
#endif
}

/* Intersector wrappers.
 *
 * They check the node type and call the appropriate intersection code.
 */

ccl_device_inline int qbvh_node_intersect(KernelGlobals *ccl_restrict kg,
                                          const ssef &isect_near,
                                          const ssef &isect_far,
#ifdef __KERNEL_AVX2__
                                          const sse3f &org_idir,
#endif
                                          const sse3f &org,
                                          const sse3f &dir,
                                          const sse3f &idir,
                                          const int near_x,
                                          const int near_y,
                                          const int near_z,
                                          const int far_x,
                                          const int far_y,
                                          const int far_z,
                                          const int node_addr,
                                          ssef *ccl_restrict dist)
{
  const int offset = node_addr;
  const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
  if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
    return qbvh_unaligned_node_intersect(kg,
                                         isect_near,
                                         isect_far,
#ifdef __KERNEL_AVX2__
                                         org_idir,
#endif
                                         org,
                                         dir,
                                         idir,
                                         near_x,
                                         near_y,
                                         near_z,
                                         far_x,
                                         far_y,
                                         far_z,
                                         node_addr,
                                         dist);
  }
  else {
    return qbvh_aligned_node_intersect(kg,
                                       isect_near,
                                       isect_far,
#ifdef __KERNEL_AVX2__
                                       org_idir,
#else
                                       org,
#endif
                                       idir,
                                       near_x,
                                       near_y,
                                       near_z,
                                       far_x,
                                       far_y,
                                       far_z,
                                       node_addr,
                                       dist);
  }
}