kernel_config.h 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. /*
  2. * Copyright 2011-2013 Blender Foundation
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. /* device data taken from CUDA occupancy calculator */
  17. /* 3.0 and 3.5 */
  18. #if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
  19. # define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
  20. # define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
  21. # define CUDA_BLOCK_MAX_THREADS 1024
  22. # define CUDA_THREAD_MAX_REGISTERS 63
  23. /* tunable parameters */
  24. # define CUDA_THREADS_BLOCK_WIDTH 16
  25. # define CUDA_KERNEL_MAX_REGISTERS 63
  26. # define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
  27. /* 3.2 */
  28. #elif __CUDA_ARCH__ == 320
  29. # define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
  30. # define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
  31. # define CUDA_BLOCK_MAX_THREADS 1024
  32. # define CUDA_THREAD_MAX_REGISTERS 63
  33. /* tunable parameters */
  34. # define CUDA_THREADS_BLOCK_WIDTH 16
  35. # define CUDA_KERNEL_MAX_REGISTERS 63
  36. # define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
  37. /* 3.7 */
  38. #elif __CUDA_ARCH__ == 370
  39. # define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
  40. # define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
  41. # define CUDA_BLOCK_MAX_THREADS 1024
  42. # define CUDA_THREAD_MAX_REGISTERS 255
  43. /* tunable parameters */
  44. # define CUDA_THREADS_BLOCK_WIDTH 16
  45. # define CUDA_KERNEL_MAX_REGISTERS 63
  46. # define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
  47. /* 5.x, 6.x */
  48. #elif __CUDA_ARCH__ <= 699
  49. # define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
  50. # define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
  51. # define CUDA_BLOCK_MAX_THREADS 1024
  52. # define CUDA_THREAD_MAX_REGISTERS 255
  53. /* tunable parameters */
  54. # define CUDA_THREADS_BLOCK_WIDTH 16
  55. /* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
  56. * registers */
  57. # if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
  58. # define CUDA_KERNEL_MAX_REGISTERS 64
  59. # else
  60. # define CUDA_KERNEL_MAX_REGISTERS 48
  61. # endif
  62. # define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
  63. /* 7.x */
  64. #elif __CUDA_ARCH__ <= 799
  65. # define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
  66. # define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
  67. # define CUDA_BLOCK_MAX_THREADS 1024
  68. # define CUDA_THREAD_MAX_REGISTERS 255
  69. /* tunable parameters */
  70. # define CUDA_THREADS_BLOCK_WIDTH 16
  71. # define CUDA_KERNEL_MAX_REGISTERS 64
  72. # define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 72
  73. /* unknown architecture */
  74. #else
  75. # error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
  76. #endif
  77. /* For split kernel using all registers seems fastest for now, but this
  78. * is unlikely to be optimal once we resolve other bottlenecks. */
  79. #define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS
  80. /* Compute number of threads per block and minimum blocks per multiprocessor
  81. * given the maximum number of registers per thread. */
  82. #define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
  83. __launch_bounds__(threads_block_width *threads_block_width, \
  84. CUDA_MULTIPRESSOR_MAX_REGISTERS / \
  85. (threads_block_width * threads_block_width * thread_num_registers))
  86. /* sanity checks */
  87. #if CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
  88. # error "Maximum number of threads per block exceeded"
  89. #endif
  90. #if CUDA_MULTIPRESSOR_MAX_REGISTERS / \
  91. (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH * CUDA_KERNEL_MAX_REGISTERS) > \
  92. CUDA_MULTIPROCESSOR_MAX_BLOCKS
  93. # error "Maximum number of blocks per multiprocessor exceeded"
  94. #endif
  95. #if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
  96. # error "Maximum number of registers per thread exceeded"
  97. #endif
  98. #if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
  99. # error "Maximum number of registers per thread exceeded"
  100. #endif