opencv-4.8.0-arm64-fp16.patch (12 KB)
  1. https://github.com/opencv/opencv/pull/24203
  2. From 689fa6f372975d58e9f50fd17a0abd105b1815f1 Mon Sep 17 00:00:00 2001
  3. From: Sam James <sam@gentoo.org>
  4. Date: Mon, 28 Aug 2023 04:20:58 +0100
  5. Subject: [PATCH] Fix compilation on arm64 with FP16 when disabled
  6. If building with -mcpu=native or any other setting which implies the current
  7. CPU has FP16 but with intrinsics disabled, we mistakenly try to use it even
  8. though convolution.hpp conditionally defines it correctly based on whether
  9. we should *use it*. convolution.cpp on the other hand was mismatched and
  10. trying to use it if the CPU supported it, even if not enabled in the build
  11. system.
  12. Make the guards match.
  13. Bug: https://bugs.gentoo.org/913031
  14. Signed-off-by: Sam James <sam@gentoo.org>
  15. --- a/modules/dnn/src/layers/cpu_kernels/convolution.cpp
  16. +++ b/modules/dnn/src/layers/cpu_kernels/convolution.cpp
  17. @@ -118,7 +118,7 @@ Ptr<FastConv> initFastConv(
  18. const size_t wstep = weightsMat.step1();
  19. conv->useFP16 = false;
  20. -#ifdef CONV_ARM_FP16
  21. +#if defined(CONV_ARM_FP16) && CV_FP16
  22. // TODO: add FP16 support for Winograd.
  23. if (_useFP16 && (conv->conv_type == CONV_TYPE_GENERIC || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN))
  24. conv->useFP16 = true;
  25. @@ -137,7 +137,7 @@ Ptr<FastConv> initFastConv(
  26. int padded_ksize = ((ksize + VEC_ALIGN-1) / VEC_ALIGN) * VEC_ALIGN;
  27. int nweights = C * padded_ksize;
  28. -#ifdef CONV_ARM_FP16
  29. +#if defined(CONV_ARM_FP16) && CV_FP16
  30. if (conv->useFP16)
  31. {
  32. conv->weightsBuf_FP16.resize(nweights + VEC_ALIGN);
  33. @@ -190,7 +190,7 @@ Ptr<FastConv> initFastConv(
  34. #endif
  35. const int CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32; // for AVX2, it is 8, otherwise, it's 16.
  36. -#ifdef CONV_ARM_FP16
  37. +#if defined(CONV_ARM_FP16) && CV_FP16
  38. // FP 16
  39. const int CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * 2;
  40. const int CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16;
  41. @@ -208,7 +208,7 @@ Ptr<FastConv> initFastConv(
  42. size_t nweights = ngroups*Kg_nblocks*Cg*CONV_WINO_KBLOCK*CONV_WINO_AREA;
  43. float* wptrWino = nullptr;
  44. -#ifdef CONV_ARM_FP16
  45. +#if defined(CONV_ARM_FP16) && CV_FP16
  46. float16_t* wptrWino_FP16 = nullptr;
  47. if (conv->useFP16)
  48. {
  49. @@ -264,7 +264,7 @@ Ptr<FastConv> initFastConv(
  50. }
  51. // repack the data.
  52. -#ifdef CONV_ARM_FP16
  53. +#if defined(CONV_ARM_FP16) && CV_FP16
  54. if (conv->useFP16)
  55. {
  56. float16_t* wptr = wptrWino_FP16 + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA +
  57. @@ -308,7 +308,7 @@ Ptr<FastConv> initFastConv(
  58. float* weightsBufPtr = nullptr;
  59. -#ifdef CONV_ARM_FP16
  60. +#if defined(CONV_ARM_FP16) && CV_FP16
  61. int numStripsMR_FP16 = (Kg + CONV_MR_FP16 - 1) / CONV_MR_FP16;
  62. int Kg_aligned_FP16 = numStripsMR_FP16 * CONV_MR_FP16;
  63. size_t nweights_FP16 = ngroups * Kg_aligned_FP16 * DkHkWkCg;
  64. @@ -331,7 +331,7 @@ Ptr<FastConv> initFastConv(
  65. }
  66. // Pack the weight.
  67. -#ifdef CONV_ARM_FP16
  68. +#if defined(CONV_ARM_FP16) && CV_FP16
  69. if (conv->useFP16)
  70. {
  71. parallel_for_(Range(0, ngroups * numStripsMR_FP16), [&](const Range& r0){
  72. @@ -415,7 +415,7 @@ static inline void packData8(char*& inpbuf, float*& inptrIn, int& in_w, int& x0,
  73. char * inpbufC = inpbuf + s0 * esz;
  74. float* inptrInC = (float* )inptrIn;
  75. -#ifdef CONV_ARM_FP16
  76. +#if defined(CONV_ARM_FP16) && CV_FP16
  77. float16_t* inpbufC_FP16 = (float16_t *)inpbufC;
  78. if (esz == sizeof(float16_t))
  79. {
  80. @@ -521,7 +521,7 @@ static inline void packData2(char *& inpbuf, float*& inptrIn, int& in_w, int& x0
  81. char* inpbufC = inpbuf + s0 * esz;
  82. float* inptrInC = inptrIn;
  83. -#ifdef CONV_ARM_FP16
  84. +#if defined(CONV_ARM_FP16) && CV_FP16
  85. float16_t* inpbufC_FP16 = (float16_t *)inpbufC;
  86. if (esz == sizeof(float16_t))
  87. {
  88. @@ -553,7 +553,7 @@ static inline void packData2(char *& inpbuf, float*& inptrIn, int& in_w, int& x0
  89. in_w += stride_w;
  90. }
  91. -#ifdef CONV_ARM_FP16
  92. +#if defined(CONV_ARM_FP16) && CV_FP16
  93. // Fast convert float 32 to float16
  94. static inline void _cvt32f16f( const float* src, float16_t* dst, int len)
  95. {
  96. @@ -623,7 +623,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
  97. {
  98. // Make special branch where memcpy() is called with a constant buffer size.
  99. // Compilers will likely unroll this loop properly.
  100. -#ifdef CONV_ARM_FP16
  101. +#if defined(CONV_ARM_FP16) && CV_FP16
  102. if (useFP16)
  103. {
  104. for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
  105. @@ -636,7 +636,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
  106. }
  107. else
  108. {
  109. -#ifdef CONV_ARM_FP16
  110. +#if defined(CONV_ARM_FP16) && CV_FP16
  111. if (useFP16)
  112. {
  113. for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
  114. @@ -700,7 +700,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
  115. int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w);
  116. int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w);
  117. const float* inptrInC = inptrIn;
  118. -#ifdef CONV_ARM_FP16
  119. +#if defined(CONV_ARM_FP16) && CV_FP16
  120. if (useFP16)
  121. {
  122. float16_t* inpbufC = (float16_t *)inpbuf + s0;
  123. @@ -761,7 +761,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
  124. int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w);
  125. const float* inptrInC = inptrIn;
  126. -#ifdef CONV_ARM_FP16
  127. +#if defined(CONV_ARM_FP16) && CV_FP16
  128. if (useFP16)
  129. {
  130. float16_t* inpbufC = (float16_t *)inpbuf + s0;
  131. @@ -834,7 +834,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
  132. int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w);
  133. int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w);
  134. const float* inptrInC = inptrIn;
  135. -#ifdef CONV_ARM_FP16
  136. +#if defined(CONV_ARM_FP16) && CV_FP16
  137. if (useFP16)
  138. {
  139. float16_t* inpbufC = (float16_t* )inpbuf + s0;
  140. @@ -887,7 +887,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
  141. for (; i < CONV_NR;)
  142. {
  143. float* inpbuf_ki = (float* )inpbuf + k * CONV_NR * Cg + i;
  144. -#ifdef CONV_ARM_FP16
  145. +#if defined(CONV_ARM_FP16) && CV_FP16
  146. float16_t * inpbuf_ki_FP16 = (float16_t *)inpbuf + k * CONV_NR * Cg + i;
  147. #endif
  148. @@ -903,7 +903,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
  149. {
  150. if (stride_w == 1)
  151. {
  152. -#ifdef CONV_ARM_FP16
  153. +#if defined(CONV_ARM_FP16) && CV_FP16
  154. if (useFP16)
  155. {
  156. for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
  157. @@ -934,7 +934,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
  158. }
  159. else if (stride_w == 2)
  160. {
  161. -#ifdef CONV_ARM_FP16
  162. +#if defined(CONV_ARM_FP16) && CV_FP16
  163. if (useFP16)
  164. {
  165. for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
  166. @@ -967,7 +967,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
  167. }
  168. else
  169. {
  170. -#ifdef CONV_ARM_FP16
  171. +#if defined(CONV_ARM_FP16) && CV_FP16
  172. if (useFP16)
  173. {
  174. for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
  175. @@ -1006,7 +1006,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
  176. {
  177. if (stride_w == 1)
  178. {
  179. -#ifdef CONV_ARM_FP16
  180. +#if defined(CONV_ARM_FP16) && CV_FP16
  181. if (useFP16)
  182. {
  183. for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
  184. @@ -1029,7 +1029,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
  185. }
  186. else
  187. {
  188. -#ifdef CONV_ARM_FP16
  189. +#if defined(CONV_ARM_FP16) && CV_FP16
  190. if (useFP16)
  191. {
  192. for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
  193. @@ -1057,7 +1057,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
  194. }
  195. else
  196. {
  197. -#ifdef CONV_ARM_FP16
  198. +#if defined(CONV_ARM_FP16) && CV_FP16
  199. if (useFP16)
  200. {
  201. for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
  202. @@ -1073,7 +1073,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
  203. }
  204. else
  205. {
  206. -#ifdef CONV_ARM_FP16
  207. +#if defined(CONV_ARM_FP16) && CV_FP16
  208. if (useFP16)
  209. {
  210. for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR)
  211. @@ -1260,7 +1260,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
  212. int CONV_MR = CONV_MR_FP32;
  213. int esz = sizeof(float );
  214. -#ifdef CONV_ARM_FP16
  215. +#if defined(CONV_ARM_FP16) && CV_FP16
  216. if (useFP16)
  217. {
  218. // works at FP 16.
  219. @@ -1433,7 +1433,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
  220. }
  221. char *weights = nullptr;
  222. -#ifdef CONV_ARM_FP16
  223. +#if defined(CONV_ARM_FP16) && CV_FP16
  224. if (useFP16)
  225. {
  226. CV_Assert(!conv->weightsBuf_FP16.empty());
  227. @@ -1474,7 +1474,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
  228. #if CV_NEON && CV_NEON_AARCH64
  229. if (conv->useNEON)
  230. {
  231. -#ifdef CONV_ARM_FP16
  232. +#if defined(CONV_ARM_FP16) && CV_FP16
  233. if (useFP16)
  234. {
  235. opt_NEON::convBlockMR1_FP16(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
  236. @@ -1537,7 +1537,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
  237. #if CV_NEON
  238. if (conv->useNEON)
  239. {
  240. -#ifdef CONV_ARM_FP16
  241. +#if defined(CONV_ARM_FP16) && CV_FP16
  242. if (useFP16)
  243. {
  244. opt_NEON::convBlock_FP16(c1 - c0, wptr, inptr, (char *)cptr_f16, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
  245. @@ -1567,7 +1567,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
  246. float biasval = biasptr[k];
  247. int j = 0;
  248. -#ifdef CONV_ARM_FP16
  249. +#if defined(CONV_ARM_FP16) && CV_FP16
  250. if (useFP16)
  251. {
  252. float32x4_t vbias = vdupq_n_f32(biasval);