kernel_work_stealing.h

/*
 * Copyright 2011-2015 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __KERNEL_WORK_STEALING_H__
#define __KERNEL_WORK_STEALING_H__

CCL_NAMESPACE_BEGIN

/*
 * Utility functions for work stealing
 */

#ifdef __KERNEL_OPENCL__
#  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif

#ifdef __SPLIT_KERNEL__
/* Returns true if there is work */
ccl_device bool get_next_work(KernelGlobals *kg,
                              ccl_global uint *work_pools,
                              uint total_work_size,
                              uint ray_index,
                              ccl_private uint *global_work_index)
{
  /* With a small amount of work there may be more threads than work due to
   * rounding up of global size, stop such threads immediately. */
  if (ray_index >= total_work_size) {
    return false;
  }

  /* Increase atomic work index counter in pool. */
  uint pool = ray_index / WORK_POOL_SIZE;
  uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]);

  /* Map per-pool work index to a global work index. */
  uint global_size = ccl_global_size(0) * ccl_global_size(1);
  kernel_assert(global_size % WORK_POOL_SIZE == 0);
  kernel_assert(ray_index < global_size);

  *global_work_index = (work_index / WORK_POOL_SIZE) * global_size + (pool * WORK_POOL_SIZE) +
                       (work_index % WORK_POOL_SIZE);
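
  /* Illustrative example (assumed values): with WORK_POOL_SIZE = 64 and
   * global_size = 256 there are four pools; a thread in pool 2 that draws
   * work_index = 70 is assigned (70 / 64) * 256 + 2 * 64 + (70 % 64)
   * = 256 + 128 + 6 = 390, i.e. the second pass over that pool's 64-wide
   * slice of the global index range. */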

  /* Test if all work for this pool is done. */
  return (*global_work_index < total_work_size);
}
#endif

/* Map global work index to tile, pixel X/Y and sample. */
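/* Illustrative example (assumed values): for a 4x2 tile at (tile->x, tile->y) =
 * (16, 32) with start_sample = 0 and num_samples = 8, global_work_index = 11
 * gives sample_offset = 1 and pixel_offset = 3 on non-CUDA devices, so
 * (x, y, sample) = (19, 32, 1); on CUDA it gives sample_offset = 3 and
 * pixel_offset = 1, so (x, y, sample) = (17, 32, 3). */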
ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
                                      uint global_work_index,
                                      ccl_private uint *x,
                                      ccl_private uint *y,
                                      ccl_private uint *sample)
{
#ifdef __KERNEL_CUDA__
  /* Keeping threads for the same pixel together improves performance on CUDA. */
  uint sample_offset = global_work_index % tile->num_samples;
  uint pixel_offset = global_work_index / tile->num_samples;
#else  /* __KERNEL_CUDA__ */
  uint tile_pixels = tile->w * tile->h;
  uint sample_offset = global_work_index / tile_pixels;
  uint pixel_offset = global_work_index - sample_offset * tile_pixels;
#endif /* __KERNEL_CUDA__ */
  uint y_offset = pixel_offset / tile->w;
  uint x_offset = pixel_offset - y_offset * tile->w;

  *x = tile->x + x_offset;
  *y = tile->y + y_offset;
  *sample = tile->start_sample + sample_offset;
}

CCL_NAMESPACE_END

#endif /* __KERNEL_WORK_STEALING_H__ */
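
The pool-to-global index mapping in get_next_work() can be checked outside the kernel. The following is a minimal standalone C++ sketch, not part of the Cycles sources; WORK_POOL_SIZE = 64, global_size = 256 and total_work_size = 1000 are assumed example values. It replays the per-pool counter to global index mapping sequentially and verifies that every work item in [0, total_work_size) is claimed exactly once.

#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
  const uint32_t WORK_POOL_SIZE = 64;    /* assumed pool width */
  const uint32_t global_size = 256;      /* four pools of 64 simulated threads */
  const uint32_t total_work_size = 1000; /* e.g. tile pixels * samples */

  std::vector<uint32_t> work_pools(global_size / WORK_POOL_SIZE, 0);
  std::vector<uint32_t> claimed(total_work_size, 0);

  /* Each simulated thread keeps pulling work from its pool until the mapped
   * global index falls outside the total amount of work, mirroring the loop
   * a split kernel would run around get_next_work(). */
  for (uint32_t ray_index = 0; ray_index < global_size; ray_index++) {
    const uint32_t pool = ray_index / WORK_POOL_SIZE;
    for (;;) {
      const uint32_t work_index = work_pools[pool]++; /* atomic in the kernel */
      const uint32_t global_work_index = (work_index / WORK_POOL_SIZE) * global_size +
                                         (pool * WORK_POOL_SIZE) +
                                         (work_index % WORK_POOL_SIZE);
      if (global_work_index >= total_work_size) {
        break;
      }
      claimed[global_work_index]++;
    }
  }

  /* Every work item was handed out exactly once. */
  for (uint32_t i = 0; i < total_work_size; i++) {
    assert(claimed[i] == 1);
  }
  return 0;
}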