bvh_nodes.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273
  /*
   * Copyright 2011-2016, Blender Foundation.
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   * http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  16. // TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
  17. // 3-vector which might be faster.
  18. ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
  19. int node_addr,
  20. int child)
  21. {
  22. Transform space;
  23. const int child_addr = node_addr + child * 3;
  24. space.x = kernel_tex_fetch(__bvh_nodes, child_addr + 1);
  25. space.y = kernel_tex_fetch(__bvh_nodes, child_addr + 2);
  26. space.z = kernel_tex_fetch(__bvh_nodes, child_addr + 3);
  27. return space;
  28. }
  29. #if !defined(__KERNEL_SSE2__)
  30. ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
  31. const float3 P,
  32. const float3 idir,
  33. const float t,
  34. const int node_addr,
  35. const uint visibility,
  36. float dist[2])
  37. {
  38. /* fetch node data */
  39. float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
  40. float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr + 1);
  41. float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr + 2);
  42. float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr + 3);
  43. /* intersect ray against child nodes */
  44. float c0lox = (node0.x - P.x) * idir.x;
  45. float c0hix = (node0.z - P.x) * idir.x;
  46. float c0loy = (node1.x - P.y) * idir.y;
  47. float c0hiy = (node1.z - P.y) * idir.y;
  48. float c0loz = (node2.x - P.z) * idir.z;
  49. float c0hiz = (node2.z - P.z) * idir.z;
  50. float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
  51. float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
  52. float c1lox = (node0.y - P.x) * idir.x;
  53. float c1hix = (node0.w - P.x) * idir.x;
  54. float c1loy = (node1.y - P.y) * idir.y;
  55. float c1hiy = (node1.w - P.y) * idir.y;
  56. float c1loz = (node2.y - P.z) * idir.z;
  57. float c1hiz = (node2.w - P.z) * idir.z;
  58. float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
  59. float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
  60. dist[0] = c0min;
  61. dist[1] = c1min;
  62. # ifdef __VISIBILITY_FLAG__
  63. /* this visibility test gives a 5% performance hit, how to solve? */
  64. return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
  65. (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
  66. # else
  67. return ((c0max >= c0min) ? 1 : 0) | ((c1max >= c1min) ? 2 : 0);
  68. # endif
  69. }
  70. ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg,
  71. const float3 P,
  72. const float3 dir,
  73. const float t,
  74. int node_addr,
  75. int child,
  76. float dist[2])
  77. {
  78. Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child);
  79. float3 aligned_dir = transform_direction(&space, dir);
  80. float3 aligned_P = transform_point(&space, P);
  81. float3 nrdir = -bvh_inverse_direction(aligned_dir);
  82. float3 lower_xyz = aligned_P * nrdir;
  83. float3 upper_xyz = lower_xyz - nrdir;
  84. const float near_x = min(lower_xyz.x, upper_xyz.x);
  85. const float near_y = min(lower_xyz.y, upper_xyz.y);
  86. const float near_z = min(lower_xyz.z, upper_xyz.z);
  87. const float far_x = max(lower_xyz.x, upper_xyz.x);
  88. const float far_y = max(lower_xyz.y, upper_xyz.y);
  89. const float far_z = max(lower_xyz.z, upper_xyz.z);
  90. const float tnear = max4(0.0f, near_x, near_y, near_z);
  91. const float tfar = min4(t, far_x, far_y, far_z);
  92. *dist = tnear;
  93. return tnear <= tfar;
  94. }
  95. ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
  96. const float3 P,
  97. const float3 dir,
  98. const float3 idir,
  99. const float t,
  100. const int node_addr,
  101. const uint visibility,
  102. float dist[2])
  103. {
  104. int mask = 0;
  105. float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
  106. if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) {
  107. # ifdef __VISIBILITY_FLAG__
  108. if ((__float_as_uint(cnodes.x) & visibility))
  109. # endif
  110. {
  111. mask |= 1;
  112. }
  113. }
  114. if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) {
  115. # ifdef __VISIBILITY_FLAG__
  116. if ((__float_as_uint(cnodes.y) & visibility))
  117. # endif
  118. {
  119. mask |= 2;
  120. }
  121. }
  122. return mask;
  123. }
  124. ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
  125. const float3 P,
  126. const float3 dir,
  127. const float3 idir,
  128. const float t,
  129. const int node_addr,
  130. const uint visibility,
  131. float dist[2])
  132. {
  133. float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
  134. if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
  135. return bvh_unaligned_node_intersect(kg, P, dir, idir, t, node_addr, visibility, dist);
  136. }
  137. else {
  138. return bvh_aligned_node_intersect(kg, P, idir, t, node_addr, visibility, dist);
  139. }
  140. }
  141. #else /* !defined(__KERNEL_SSE2__) */
  142. int ccl_device_forceinline bvh_aligned_node_intersect(KernelGlobals *kg,
  143. const float3 &P,
  144. const float3 &dir,
  145. const ssef &tsplat,
  146. const ssef Psplat[3],
  147. const ssef idirsplat[3],
  148. const shuffle_swap_t shufflexyz[3],
  149. const int node_addr,
  150. const uint visibility,
  151. float dist[2])
  152. {
  153. /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
  154. const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
  155. /* fetch node data */
  156. const ssef *bvh_nodes = (ssef *)kg->__bvh_nodes.data + node_addr;
  157. /* intersect ray against child nodes */
  158. const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
  159. const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
  160. const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
  161. /* calculate { c0min, c1min, -c0max, -c1max} */
  162. ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
  163. const ssef tminmax = minmax ^ pn;
  164. const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
  165. dist[0] = tminmax[0];
  166. dist[1] = tminmax[1];
  167. int mask = movemask(lrhit);
  168. # ifdef __VISIBILITY_FLAG__
  169. /* this visibility test gives a 5% performance hit, how to solve? */
  170. float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
  171. int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
  172. (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
  173. return cmask;
  174. # else
  175. return mask & 3;
  176. # endif
  177. }
  178. ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
  179. const float3 P,
  180. const float3 dir,
  181. const ssef &isect_near,
  182. const ssef &isect_far,
  183. const int node_addr,
  184. const uint visibility,
  185. float dist[2])
  186. {
  187. Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
  188. Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
  189. float3 aligned_dir0 = transform_direction(&space0, dir),
  190. aligned_dir1 = transform_direction(&space1, dir);
  191. float3 aligned_P0 = transform_point(&space0, P), aligned_P1 = transform_point(&space1, P);
  192. float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
  193. nrdir1 = -bvh_inverse_direction(aligned_dir1);
  194. ssef lower_x = ssef(aligned_P0.x * nrdir0.x, aligned_P1.x * nrdir1.x, 0.0f, 0.0f),
  195. lower_y = ssef(aligned_P0.y * nrdir0.y, aligned_P1.y * nrdir1.y, 0.0f, 0.0f),
  196. lower_z = ssef(aligned_P0.z * nrdir0.z, aligned_P1.z * nrdir1.z, 0.0f, 0.0f);
  197. ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
  198. upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
  199. upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
  200. ssef tnear_x = min(lower_x, upper_x);
  201. ssef tnear_y = min(lower_y, upper_y);
  202. ssef tnear_z = min(lower_z, upper_z);
  203. ssef tfar_x = max(lower_x, upper_x);
  204. ssef tfar_y = max(lower_y, upper_y);
  205. ssef tfar_z = max(lower_z, upper_z);
  206. const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
  207. const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
  208. sseb vmask = tnear <= tfar;
  209. dist[0] = tnear.f[0];
  210. dist[1] = tnear.f[1];
  211. int mask = (int)movemask(vmask);
  212. # ifdef __VISIBILITY_FLAG__
  213. /* this visibility test gives a 5% performance hit, how to solve? */
  214. float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
  215. int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
  216. (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
  217. return cmask;
  218. # else
  219. return mask & 3;
  220. # endif
  221. }
  222. ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
  223. const float3 &P,
  224. const float3 &dir,
  225. const ssef &isect_near,
  226. const ssef &isect_far,
  227. const ssef &tsplat,
  228. const ssef Psplat[3],
  229. const ssef idirsplat[3],
  230. const shuffle_swap_t shufflexyz[3],
  231. const int node_addr,
  232. const uint visibility,
  233. float dist[2])
  234. {
  235. float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
  236. if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
  237. return bvh_unaligned_node_intersect(
  238. kg, P, dir, isect_near, isect_far, node_addr, visibility, dist);
  239. }
  240. else {
  241. return bvh_aligned_node_intersect(
  242. kg, P, dir, tsplat, Psplat, idirsplat, shufflexyz, node_addr, visibility, dist);
  243. }
  244. }
  245. #endif /* !defined(__KERNEL_SSE2__) */