0002-OpenCL-Simplify-LLVM-IR-generated-for-OpenCL-blocks.patch 16 KB


  1. From 46c42e19eb6ad426c07f4e33408c7288ddd5d053 Mon Sep 17 00:00:00 2001
  2. From: Andrew Savonichev <andrew.savonichev@intel.com>
  3. Date: Thu, 21 Feb 2019 11:02:10 +0000
  4. Subject: [PATCH 2/3] [OpenCL] Simplify LLVM IR generated for OpenCL blocks
  5. Summary:
  6. Emit direct call of block invoke functions when possible, i.e. in case the
  7. block is not passed as a function argument.
  8. Also doing some refactoring of `CodeGenFunction::EmitBlockCallExpr()`
  9. Reviewers: Anastasia, yaxunl, svenvh
  10. Reviewed By: Anastasia
  11. Subscribers: cfe-commits
  12. Tags: #clang
  13. Differential Revision: https://reviews.llvm.org/D58388
  14. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@354568 91177308-0d34-0410-b5e6-96231b3b80d8
  15. ---
  16. lib/CodeGen/CGBlocks.cpp | 77 +++++++++----------
  17. lib/CodeGen/CGOpenCLRuntime.cpp | 30 ++++++--
  18. lib/CodeGen/CGOpenCLRuntime.h | 4 +
  19. test/CodeGenOpenCL/blocks.cl | 10 +--
  20. .../CodeGenOpenCL/cl20-device-side-enqueue.cl | 34 ++++++--
  21. 5 files changed, 91 insertions(+), 64 deletions(-)
  22. diff --git a/lib/CodeGen/CGBlocks.cpp b/lib/CodeGen/CGBlocks.cpp
  23. index fa3c3ee861..10a0238d91 100644
  24. --- a/lib/CodeGen/CGBlocks.cpp
  25. +++ b/lib/CodeGen/CGBlocks.cpp
  26. @@ -1261,52 +1261,49 @@ RValue CodeGenFunction::EmitBlockCallExpr(const CallExpr *E,
  27. ReturnValueSlot ReturnValue) {
  28. const BlockPointerType *BPT =
  29. E->getCallee()->getType()->getAs<BlockPointerType>();
  30. -
  31. llvm::Value *BlockPtr = EmitScalarExpr(E->getCallee());
  32. -
  33. - // Get a pointer to the generic block literal.
  34. - // For OpenCL we generate generic AS void ptr to be able to reuse the same
  35. - // block definition for blocks with captures generated as private AS local
  36. - // variables and without captures generated as global AS program scope
  37. - // variables.
  38. - unsigned AddrSpace = 0;
  39. - if (getLangOpts().OpenCL)
  40. - AddrSpace = getContext().getTargetAddressSpace(LangAS::opencl_generic);
  41. -
  42. - llvm::Type *BlockLiteralTy =
  43. - llvm::PointerType::get(CGM.getGenericBlockLiteralType(), AddrSpace);
  44. -
  45. - // Bitcast the callee to a block literal.
  46. - BlockPtr =
  47. - Builder.CreatePointerCast(BlockPtr, BlockLiteralTy, "block.literal");
  48. -
  49. - // Get the function pointer from the literal.
  50. - llvm::Value *FuncPtr =
  51. - Builder.CreateStructGEP(CGM.getGenericBlockLiteralType(), BlockPtr,
  52. - CGM.getLangOpts().OpenCL ? 2 : 3);
  53. -
  54. - // Add the block literal.
  55. + llvm::Type *GenBlockTy = CGM.getGenericBlockLiteralType();
  56. + llvm::Value *Func = nullptr;
  57. + QualType FnType = BPT->getPointeeType();
  58. + ASTContext &Ctx = getContext();
  59. CallArgList Args;
  60. - QualType VoidPtrQualTy = getContext().VoidPtrTy;
  61. - llvm::Type *GenericVoidPtrTy = VoidPtrTy;
  62. if (getLangOpts().OpenCL) {
  63. - GenericVoidPtrTy = CGM.getOpenCLRuntime().getGenericVoidPointerType();
  64. - VoidPtrQualTy =
  65. - getContext().getPointerType(getContext().getAddrSpaceQualType(
  66. - getContext().VoidTy, LangAS::opencl_generic));
  67. - }
  68. -
  69. - BlockPtr = Builder.CreatePointerCast(BlockPtr, GenericVoidPtrTy);
  70. - Args.add(RValue::get(BlockPtr), VoidPtrQualTy);
  71. -
  72. - QualType FnType = BPT->getPointeeType();
  73. + // For OpenCL, BlockPtr has already been cast to a generic block literal.
  74. +
  75. + // First argument of a block call is the generic block literal cast to a
  76. + // generic void pointer, i.e. i8 addrspace(4)*
  77. + llvm::Value *BlockDescriptor = Builder.CreatePointerCast(
  78. + BlockPtr, CGM.getOpenCLRuntime().getGenericVoidPointerType());
  79. + QualType VoidPtrQualTy = Ctx.getPointerType(
  80. + Ctx.getAddrSpaceQualType(Ctx.VoidTy, LangAS::opencl_generic));
  81. + Args.add(RValue::get(BlockDescriptor), VoidPtrQualTy);
  82. + // And the rest of the arguments.
  83. + EmitCallArgs(Args, FnType->getAs<FunctionProtoType>(), E->arguments());
  84. +
  85. + // We *can* call the block directly unless it is a function argument.
  86. + if (!isa<ParmVarDecl>(E->getCalleeDecl()))
  87. + Func = CGM.getOpenCLRuntime().getInvokeFunction(E->getCallee());
  88. + else {
  89. + llvm::Value *FuncPtr = Builder.CreateStructGEP(GenBlockTy, BlockPtr, 2);
  90. + Func = Builder.CreateAlignedLoad(FuncPtr, getPointerAlign());
  91. + }
  92. + } else {
  93. + // Bitcast the block literal to a generic block literal.
  94. + BlockPtr = Builder.CreatePointerCast(
  95. + BlockPtr, llvm::PointerType::get(GenBlockTy, 0), "block.literal");
  96. + // Get pointer to the block invoke function
  97. + llvm::Value *FuncPtr = Builder.CreateStructGEP(GenBlockTy, BlockPtr, 3);
  98. - // And the rest of the arguments.
  99. - EmitCallArgs(Args, FnType->getAs<FunctionProtoType>(), E->arguments());
  100. + // First argument is a block literal cast to a void pointer
  101. + BlockPtr = Builder.CreatePointerCast(BlockPtr, VoidPtrTy);
  102. + Args.add(RValue::get(BlockPtr), Ctx.VoidPtrTy);
  103. + // And the rest of the arguments.
  104. + EmitCallArgs(Args, FnType->getAs<FunctionProtoType>(), E->arguments());
  105. - // Load the function.
  106. - llvm::Value *Func = Builder.CreateAlignedLoad(FuncPtr, getPointerAlign());
  107. + // Load the function.
  108. + Func = Builder.CreateAlignedLoad(FuncPtr, getPointerAlign());
  109. + }
  110. const FunctionType *FuncTy = FnType->castAs<FunctionType>();
  111. const CGFunctionInfo &FnInfo =
  112. diff --git a/lib/CodeGen/CGOpenCLRuntime.cpp b/lib/CodeGen/CGOpenCLRuntime.cpp
  113. index 7f6f595dd5..75003e569f 100644
  114. --- a/lib/CodeGen/CGOpenCLRuntime.cpp
  115. +++ b/lib/CodeGen/CGOpenCLRuntime.cpp
  116. @@ -123,6 +123,23 @@ llvm::PointerType *CGOpenCLRuntime::getGenericVoidPointerType() {
  117. CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic));
  118. }
  119. +// Walk from an expression that refers to a block back to its BlockExpr by
  120. +// stripping casts and following variable initializers. OpenCL v2.0 s6.12.5:
  121. +// block variables are implicitly const, so they must be initialized at
  122. +// declaration time and may not be reassigned. NOTE(review): presumably a decl
  123. +// without an initializer yields a null init here -- confirm callers avoid it.
  124. +static const BlockExpr *getBlockExpr(const Expr *E) {
  125. + const Expr *Prev = nullptr; // Stops the loop once E no longer changes.
  126. + while(!isa<BlockExpr>(E) && E != Prev) {
  127. + Prev = E;
  128. + E = E->IgnoreCasts();
  129. + if (auto DR = dyn_cast<DeclRefExpr>(E)) {
  130. + E = cast<VarDecl>(DR->getDecl())->getInit();
  131. + }
  132. + }
  133. + return cast<BlockExpr>(E);
  134. +}
  135. +
  136. /// Record emitted llvm invoke function and llvm block literal for the
  137. /// corresponding block expression.
  138. void CGOpenCLRuntime::recordBlockInfo(const BlockExpr *E,
  139. @@ -137,20 +154,17 @@ void CGOpenCLRuntime::recordBlockInfo(const BlockExpr *E,
  140. EnqueuedBlockMap[E].Kernel = nullptr;
  141. }
  142. +llvm::Function *CGOpenCLRuntime::getInvokeFunction(const Expr *E) {
  143. + return EnqueuedBlockMap[getBlockExpr(E)].InvokeFunc;
  144. +}
  145. +
  146. CGOpenCLRuntime::EnqueuedBlockInfo
  147. CGOpenCLRuntime::emitOpenCLEnqueuedBlock(CodeGenFunction &CGF, const Expr *E) {
  148. CGF.EmitScalarExpr(E);
  149. // The block literal may be assigned to a const variable. Chasing down
  150. // to get the block literal.
  151. - if (auto DR = dyn_cast<DeclRefExpr>(E)) {
  152. - E = cast<VarDecl>(DR->getDecl())->getInit();
  153. - }
  154. - E = E->IgnoreImplicit();
  155. - if (auto Cast = dyn_cast<CastExpr>(E)) {
  156. - E = Cast->getSubExpr();
  157. - }
  158. - auto *Block = cast<BlockExpr>(E);
  159. + const BlockExpr *Block = getBlockExpr(E);
  160. assert(EnqueuedBlockMap.find(Block) != EnqueuedBlockMap.end() &&
  161. "Block expression not emitted");
  162. diff --git a/lib/CodeGen/CGOpenCLRuntime.h b/lib/CodeGen/CGOpenCLRuntime.h
  163. index 750721f1b8..4effc7eaa8 100644
  164. --- a/lib/CodeGen/CGOpenCLRuntime.h
  165. +++ b/lib/CodeGen/CGOpenCLRuntime.h
  166. @@ -92,6 +92,10 @@ public:
  167. /// \param Block block literal emitted for the block expression.
  168. void recordBlockInfo(const BlockExpr *E, llvm::Function *InvokeF,
  169. llvm::Value *Block);
  170. +
  171. + /// \return LLVM block invoke function emitted for an expression derived from
  172. + /// the block expression.
  173. + llvm::Function *getInvokeFunction(const Expr *E);
  174. };
  175. }
  176. diff --git a/test/CodeGenOpenCL/blocks.cl b/test/CodeGenOpenCL/blocks.cl
  177. index 19aacc3f0d..ab5a2c643c 100644
  178. --- a/test/CodeGenOpenCL/blocks.cl
  179. +++ b/test/CodeGenOpenCL/blocks.cl
  180. @@ -39,11 +39,8 @@ void foo(){
  181. // SPIR: %[[blk_gen_ptr:.*]] = addrspacecast %struct.__opencl_block_literal_generic* %[[blk_ptr]] to %struct.__opencl_block_literal_generic addrspace(4)*
  182. // SPIR: store %struct.__opencl_block_literal_generic addrspace(4)* %[[blk_gen_ptr]], %struct.__opencl_block_literal_generic addrspace(4)** %[[block_B:.*]],
  183. // SPIR: %[[block_literal:.*]] = load %struct.__opencl_block_literal_generic addrspace(4)*, %struct.__opencl_block_literal_generic addrspace(4)** %[[block_B]]
  184. - // SPIR: %[[invoke_addr:.*]] = getelementptr inbounds %struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic addrspace(4)* %[[block_literal]], i32 0, i32 2
  185. // SPIR: %[[blk_gen_ptr:.*]] = bitcast %struct.__opencl_block_literal_generic addrspace(4)* %[[block_literal]] to i8 addrspace(4)*
  186. - // SPIR: %[[invoke_func_ptr:.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %[[invoke_addr]]
  187. - // SPIR: %[[invoke_func:.*]] = addrspacecast i8 addrspace(4)* %[[invoke_func_ptr]] to i32 (i8 addrspace(4)*)*
  188. - // SPIR: call {{.*}}i32 %[[invoke_func]](i8 addrspace(4)* %[[blk_gen_ptr]])
  189. + // SPIR: call {{.*}}i32 @__foo_block_invoke(i8 addrspace(4)* %[[blk_gen_ptr]])
  190. // AMDGCN: %[[block_invoke:.*]] = getelementptr inbounds <{ i32, i32, i8*, i32 }>, <{ i32, i32, i8*, i32 }> addrspace(5)* %[[block:.*]], i32 0, i32 2
  191. // AMDGCN: store i8* bitcast (i32 (i8*)* @__foo_block_invoke to i8*), i8* addrspace(5)* %[[block_invoke]]
  192. // AMDGCN: %[[block_captured:.*]] = getelementptr inbounds <{ i32, i32, i8*, i32 }>, <{ i32, i32, i8*, i32 }> addrspace(5)* %[[block]], i32 0, i32 3
  193. @@ -53,11 +50,8 @@ void foo(){
  194. // AMDGCN: %[[blk_gen_ptr:.*]] = addrspacecast %struct.__opencl_block_literal_generic addrspace(5)* %[[blk_ptr]] to %struct.__opencl_block_literal_generic*
  195. // AMDGCN: store %struct.__opencl_block_literal_generic* %[[blk_gen_ptr]], %struct.__opencl_block_literal_generic* addrspace(5)* %[[block_B:.*]],
  196. // AMDGCN: %[[block_literal:.*]] = load %struct.__opencl_block_literal_generic*, %struct.__opencl_block_literal_generic* addrspace(5)* %[[block_B]]
  197. - // AMDGCN: %[[invoke_addr:.*]] = getelementptr inbounds %struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic* %[[block_literal]], i32 0, i32 2
  198. // AMDGCN: %[[blk_gen_ptr:.*]] = bitcast %struct.__opencl_block_literal_generic* %[[block_literal]] to i8*
  199. - // AMDGCN: %[[invoke_func_ptr:.*]] = load i8*, i8** %[[invoke_addr]]
  200. - // AMDGCN: %[[invoke_func:.*]] = bitcast i8* %[[invoke_func_ptr]] to i32 (i8*)*
  201. - // AMDGCN: call {{.*}}i32 %[[invoke_func]](i8* %[[blk_gen_ptr]])
  202. + // AMDGCN: call {{.*}}i32 @__foo_block_invoke(i8* %[[blk_gen_ptr]])
  203. int (^ block_B)(void) = ^{
  204. return i;
  205. diff --git a/test/CodeGenOpenCL/cl20-device-side-enqueue.cl b/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
  206. index 84450162da..1566912ded 100644
  207. --- a/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
  208. +++ b/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
  209. @@ -312,9 +312,7 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
  210. };
  211. // Uses global block literal [[BLG8]] and invoke function [[INVG8]].
  212. - // COMMON: [[r1:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* getelementptr inbounds (%struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), i32 0, i32 2)
  213. - // COMMON: [[r2:%.*]] = addrspacecast i8 addrspace(4)* [[r1]] to void (i8 addrspace(4)*)*
  214. - // COMMON: call spir_func void [[r2]](i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*))
  215. + // COMMON: call spir_func void @__device_side_enqueue_block_invoke_11(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*))
  216. block_A();
  217. // Emits global block literal [[BLG8]] and block kernel [[INVGK8]]. [[INVGK8]] calls [[INVG8]].
  218. @@ -333,15 +331,35 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
  219. unsigned size = get_kernel_work_group_size(block_A);
  220. // Uses global block literal [[BLG8]] and invoke function [[INVG8]]. Make sure no redundant block literal and invoke functions are emitted.
  221. - // COMMON: [[r1:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* getelementptr inbounds (%struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), i32 0, i32 2)
  222. - // COMMON: [[r2:%.*]] = addrspacecast i8 addrspace(4)* [[r1]] to void (i8 addrspace(4)*)*
  223. - // COMMON: call spir_func void [[r2]](i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*))
  224. + // COMMON: call spir_func void @__device_side_enqueue_block_invoke_11(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*))
  225. block_A();
  226. + // Make sure that the block invoke function is resolved correctly after a sequence of assignments.
  227. + // COMMON: store %struct.__opencl_block_literal_generic addrspace(4)*
  228. + // COMMON-SAME: addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)*
  229. + // COMMON-SAME: bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to %struct.__opencl_block_literal_generic addrspace(1)*)
  230. + // COMMON-SAME: to %struct.__opencl_block_literal_generic addrspace(4)*),
  231. + // COMMON-SAME: %struct.__opencl_block_literal_generic addrspace(4)** %b1,
  232. + bl_t b1 = block_G;
  233. + // COMMON: store %struct.__opencl_block_literal_generic addrspace(4)*
  234. + // COMMON-SAME: addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)*
  235. + // COMMON-SAME: bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to %struct.__opencl_block_literal_generic addrspace(1)*)
  236. + // COMMON-SAME: to %struct.__opencl_block_literal_generic addrspace(4)*),
  237. + // COMMON-SAME: %struct.__opencl_block_literal_generic addrspace(4)** %b2,
  238. + bl_t b2 = b1;
  239. + // COMMON: call spir_func void @block_G_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)*
  240. + // COMMON-SAME: bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to i8 addrspace(1)*)
  241. + // COMMON-SAME: to i8 addrspace(4)*), i8 addrspace(3)* null)
  242. + b2(0);
  243. + // Uses global block literal [[BL_GLOBAL]] and block kernel [[INV_G_K]]. [[INV_G_K]] calls [[INV_G]].
  244. + // COMMON: call i32 @__get_kernel_preferred_work_group_size_multiple_impl(
  245. + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INV_G_K:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*),
  246. + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to i8 addrspace(1)*) to i8 addrspace(4)*))
  247. + size = get_kernel_preferred_work_group_size_multiple(b2);
  248. +
  249. void (^block_C)(void) = ^{
  250. callee(i, a);
  251. };
  252. -
  253. // Emits block literal on stack and block kernel [[INVLK3]].
  254. // COMMON: store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVL3:@__device_side_enqueue_block_invoke[^ ]*]] to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke
  255. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
  256. @@ -404,8 +422,8 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
  257. // COMMON: define internal spir_func void [[INVG8]](i8 addrspace(4)*{{.*}})
  258. // COMMON: define internal spir_func void [[INVG9]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)* %{{.*}})
  259. // COMMON: define internal spir_kernel void [[INVGK8]](i8 addrspace(4)*{{.*}})
  260. +// COMMON: define internal spir_kernel void [[INV_G_K]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}})
  261. // COMMON: define internal spir_kernel void [[INVLK3]](i8 addrspace(4)*{{.*}})
  262. // COMMON: define internal spir_kernel void [[INVGK9]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}})
  263. -// COMMON: define internal spir_kernel void [[INV_G_K]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}})
  264. // COMMON: define internal spir_kernel void [[INVGK10]](i8 addrspace(4)*{{.*}})
  265. // COMMON: define internal spir_kernel void [[INVGK11]](i8 addrspace(4)*{{.*}})
  266. --
  267. 2.21.0