/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"
#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// 32-bit ARM NEON (armv7) only; AArch64 has its own implementations.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)
  19. static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
  20. 2, 6, 10, 14, 3, 7, 11, 15};
  21. void TransposeWx8_NEON(const uint8_t* src,
  22. int src_stride,
  23. uint8_t* dst,
  24. int dst_stride,
  25. int width) {
  26. const uint8_t* src_temp;
  27. asm volatile(
  28. // loops are on blocks of 8. loop will stop when
  29. // counter gets to or below 0. starting the counter
  30. // at w-8 allow for this
  31. "sub %5, #8 \n"
  32. // handle 8x8 blocks. this should be the majority of the plane
  33. "1: \n"
  34. "mov %0, %1 \n"
  35. "vld1.8 {d0}, [%0], %2 \n"
  36. "vld1.8 {d1}, [%0], %2 \n"
  37. "vld1.8 {d2}, [%0], %2 \n"
  38. "vld1.8 {d3}, [%0], %2 \n"
  39. "vld1.8 {d4}, [%0], %2 \n"
  40. "vld1.8 {d5}, [%0], %2 \n"
  41. "vld1.8 {d6}, [%0], %2 \n"
  42. "vld1.8 {d7}, [%0] \n"
  43. "vtrn.8 d1, d0 \n"
  44. "vtrn.8 d3, d2 \n"
  45. "vtrn.8 d5, d4 \n"
  46. "vtrn.8 d7, d6 \n"
  47. "vtrn.16 d1, d3 \n"
  48. "vtrn.16 d0, d2 \n"
  49. "vtrn.16 d5, d7 \n"
  50. "vtrn.16 d4, d6 \n"
  51. "vtrn.32 d1, d5 \n"
  52. "vtrn.32 d0, d4 \n"
  53. "vtrn.32 d3, d7 \n"
  54. "vtrn.32 d2, d6 \n"
  55. "vrev16.8 q0, q0 \n"
  56. "vrev16.8 q1, q1 \n"
  57. "vrev16.8 q2, q2 \n"
  58. "vrev16.8 q3, q3 \n"
  59. "mov %0, %3 \n"
  60. "vst1.8 {d1}, [%0], %4 \n"
  61. "vst1.8 {d0}, [%0], %4 \n"
  62. "vst1.8 {d3}, [%0], %4 \n"
  63. "vst1.8 {d2}, [%0], %4 \n"
  64. "vst1.8 {d5}, [%0], %4 \n"
  65. "vst1.8 {d4}, [%0], %4 \n"
  66. "vst1.8 {d7}, [%0], %4 \n"
  67. "vst1.8 {d6}, [%0] \n"
  68. "add %1, #8 \n" // src += 8
  69. "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
  70. "subs %5, #8 \n" // w -= 8
  71. "bge 1b \n"
  72. // add 8 back to counter. if the result is 0 there are
  73. // no residuals.
  74. "adds %5, #8 \n"
  75. "beq 4f \n"
  76. // some residual, so between 1 and 7 lines left to transpose
  77. "cmp %5, #2 \n"
  78. "blt 3f \n"
  79. "cmp %5, #4 \n"
  80. "blt 2f \n"
  81. // 4x8 block
  82. "mov %0, %1 \n"
  83. "vld1.32 {d0[0]}, [%0], %2 \n"
  84. "vld1.32 {d0[1]}, [%0], %2 \n"
  85. "vld1.32 {d1[0]}, [%0], %2 \n"
  86. "vld1.32 {d1[1]}, [%0], %2 \n"
  87. "vld1.32 {d2[0]}, [%0], %2 \n"
  88. "vld1.32 {d2[1]}, [%0], %2 \n"
  89. "vld1.32 {d3[0]}, [%0], %2 \n"
  90. "vld1.32 {d3[1]}, [%0] \n"
  91. "mov %0, %3 \n"
  92. "vld1.8 {q3}, [%6] \n"
  93. "vtbl.8 d4, {d0, d1}, d6 \n"
  94. "vtbl.8 d5, {d0, d1}, d7 \n"
  95. "vtbl.8 d0, {d2, d3}, d6 \n"
  96. "vtbl.8 d1, {d2, d3}, d7 \n"
  97. // TODO(frkoenig): Rework shuffle above to
  98. // write out with 4 instead of 8 writes.
  99. "vst1.32 {d4[0]}, [%0], %4 \n"
  100. "vst1.32 {d4[1]}, [%0], %4 \n"
  101. "vst1.32 {d5[0]}, [%0], %4 \n"
  102. "vst1.32 {d5[1]}, [%0] \n"
  103. "add %0, %3, #4 \n"
  104. "vst1.32 {d0[0]}, [%0], %4 \n"
  105. "vst1.32 {d0[1]}, [%0], %4 \n"
  106. "vst1.32 {d1[0]}, [%0], %4 \n"
  107. "vst1.32 {d1[1]}, [%0] \n"
  108. "add %1, #4 \n" // src += 4
  109. "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
  110. "subs %5, #4 \n" // w -= 4
  111. "beq 4f \n"
  112. // some residual, check to see if it includes a 2x8 block,
  113. // or less
  114. "cmp %5, #2 \n"
  115. "blt 3f \n"
  116. // 2x8 block
  117. "2: \n"
  118. "mov %0, %1 \n"
  119. "vld1.16 {d0[0]}, [%0], %2 \n"
  120. "vld1.16 {d1[0]}, [%0], %2 \n"
  121. "vld1.16 {d0[1]}, [%0], %2 \n"
  122. "vld1.16 {d1[1]}, [%0], %2 \n"
  123. "vld1.16 {d0[2]}, [%0], %2 \n"
  124. "vld1.16 {d1[2]}, [%0], %2 \n"
  125. "vld1.16 {d0[3]}, [%0], %2 \n"
  126. "vld1.16 {d1[3]}, [%0] \n"
  127. "vtrn.8 d0, d1 \n"
  128. "mov %0, %3 \n"
  129. "vst1.64 {d0}, [%0], %4 \n"
  130. "vst1.64 {d1}, [%0] \n"
  131. "add %1, #2 \n" // src += 2
  132. "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
  133. "subs %5, #2 \n" // w -= 2
  134. "beq 4f \n"
  135. // 1x8 block
  136. "3: \n"
  137. "vld1.8 {d0[0]}, [%1], %2 \n"
  138. "vld1.8 {d0[1]}, [%1], %2 \n"
  139. "vld1.8 {d0[2]}, [%1], %2 \n"
  140. "vld1.8 {d0[3]}, [%1], %2 \n"
  141. "vld1.8 {d0[4]}, [%1], %2 \n"
  142. "vld1.8 {d0[5]}, [%1], %2 \n"
  143. "vld1.8 {d0[6]}, [%1], %2 \n"
  144. "vld1.8 {d0[7]}, [%1] \n"
  145. "vst1.64 {d0}, [%3] \n"
  146. "4: \n"
  147. : "=&r"(src_temp), // %0
  148. "+r"(src), // %1
  149. "+r"(src_stride), // %2
  150. "+r"(dst), // %3
  151. "+r"(dst_stride), // %4
  152. "+r"(width) // %5
  153. : "r"(&kVTbl4x4Transpose) // %6
  154. : "memory", "cc", "q0", "q1", "q2", "q3");
  155. }
  156. static const uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11,
  157. 4, 12, 5, 13, 6, 14, 7, 15};
  158. void TransposeUVWx8_NEON(const uint8_t* src,
  159. int src_stride,
  160. uint8_t* dst_a,
  161. int dst_stride_a,
  162. uint8_t* dst_b,
  163. int dst_stride_b,
  164. int width) {
  165. const uint8_t* src_temp;
  166. asm volatile(
  167. // loops are on blocks of 8. loop will stop when
  168. // counter gets to or below 0. starting the counter
  169. // at w-8 allow for this
  170. "sub %7, #8 \n"
  171. // handle 8x8 blocks. this should be the majority of the plane
  172. "1: \n"
  173. "mov %0, %1 \n"
  174. "vld2.8 {d0, d1}, [%0], %2 \n"
  175. "vld2.8 {d2, d3}, [%0], %2 \n"
  176. "vld2.8 {d4, d5}, [%0], %2 \n"
  177. "vld2.8 {d6, d7}, [%0], %2 \n"
  178. "vld2.8 {d16, d17}, [%0], %2 \n"
  179. "vld2.8 {d18, d19}, [%0], %2 \n"
  180. "vld2.8 {d20, d21}, [%0], %2 \n"
  181. "vld2.8 {d22, d23}, [%0] \n"
  182. "vtrn.8 q1, q0 \n"
  183. "vtrn.8 q3, q2 \n"
  184. "vtrn.8 q9, q8 \n"
  185. "vtrn.8 q11, q10 \n"
  186. "vtrn.16 q1, q3 \n"
  187. "vtrn.16 q0, q2 \n"
  188. "vtrn.16 q9, q11 \n"
  189. "vtrn.16 q8, q10 \n"
  190. "vtrn.32 q1, q9 \n"
  191. "vtrn.32 q0, q8 \n"
  192. "vtrn.32 q3, q11 \n"
  193. "vtrn.32 q2, q10 \n"
  194. "vrev16.8 q0, q0 \n"
  195. "vrev16.8 q1, q1 \n"
  196. "vrev16.8 q2, q2 \n"
  197. "vrev16.8 q3, q3 \n"
  198. "vrev16.8 q8, q8 \n"
  199. "vrev16.8 q9, q9 \n"
  200. "vrev16.8 q10, q10 \n"
  201. "vrev16.8 q11, q11 \n"
  202. "mov %0, %3 \n"
  203. "vst1.8 {d2}, [%0], %4 \n"
  204. "vst1.8 {d0}, [%0], %4 \n"
  205. "vst1.8 {d6}, [%0], %4 \n"
  206. "vst1.8 {d4}, [%0], %4 \n"
  207. "vst1.8 {d18}, [%0], %4 \n"
  208. "vst1.8 {d16}, [%0], %4 \n"
  209. "vst1.8 {d22}, [%0], %4 \n"
  210. "vst1.8 {d20}, [%0] \n"
  211. "mov %0, %5 \n"
  212. "vst1.8 {d3}, [%0], %6 \n"
  213. "vst1.8 {d1}, [%0], %6 \n"
  214. "vst1.8 {d7}, [%0], %6 \n"
  215. "vst1.8 {d5}, [%0], %6 \n"
  216. "vst1.8 {d19}, [%0], %6 \n"
  217. "vst1.8 {d17}, [%0], %6 \n"
  218. "vst1.8 {d23}, [%0], %6 \n"
  219. "vst1.8 {d21}, [%0] \n"
  220. "add %1, #8*2 \n" // src += 8*2
  221. "add %3, %3, %4, lsl #3 \n" // dst_a += 8 *
  222. // dst_stride_a
  223. "add %5, %5, %6, lsl #3 \n" // dst_b += 8 *
  224. // dst_stride_b
  225. "subs %7, #8 \n" // w -= 8
  226. "bge 1b \n"
  227. // add 8 back to counter. if the result is 0 there are
  228. // no residuals.
  229. "adds %7, #8 \n"
  230. "beq 4f \n"
  231. // some residual, so between 1 and 7 lines left to transpose
  232. "cmp %7, #2 \n"
  233. "blt 3f \n"
  234. "cmp %7, #4 \n"
  235. "blt 2f \n"
  236. // TODO(frkoenig): Clean this up
  237. // 4x8 block
  238. "mov %0, %1 \n"
  239. "vld1.64 {d0}, [%0], %2 \n"
  240. "vld1.64 {d1}, [%0], %2 \n"
  241. "vld1.64 {d2}, [%0], %2 \n"
  242. "vld1.64 {d3}, [%0], %2 \n"
  243. "vld1.64 {d4}, [%0], %2 \n"
  244. "vld1.64 {d5}, [%0], %2 \n"
  245. "vld1.64 {d6}, [%0], %2 \n"
  246. "vld1.64 {d7}, [%0] \n"
  247. "vld1.8 {q15}, [%8] \n"
  248. "vtrn.8 q0, q1 \n"
  249. "vtrn.8 q2, q3 \n"
  250. "vtbl.8 d16, {d0, d1}, d30 \n"
  251. "vtbl.8 d17, {d0, d1}, d31 \n"
  252. "vtbl.8 d18, {d2, d3}, d30 \n"
  253. "vtbl.8 d19, {d2, d3}, d31 \n"
  254. "vtbl.8 d20, {d4, d5}, d30 \n"
  255. "vtbl.8 d21, {d4, d5}, d31 \n"
  256. "vtbl.8 d22, {d6, d7}, d30 \n"
  257. "vtbl.8 d23, {d6, d7}, d31 \n"
  258. "mov %0, %3 \n"
  259. "vst1.32 {d16[0]}, [%0], %4 \n"
  260. "vst1.32 {d16[1]}, [%0], %4 \n"
  261. "vst1.32 {d17[0]}, [%0], %4 \n"
  262. "vst1.32 {d17[1]}, [%0], %4 \n"
  263. "add %0, %3, #4 \n"
  264. "vst1.32 {d20[0]}, [%0], %4 \n"
  265. "vst1.32 {d20[1]}, [%0], %4 \n"
  266. "vst1.32 {d21[0]}, [%0], %4 \n"
  267. "vst1.32 {d21[1]}, [%0] \n"
  268. "mov %0, %5 \n"
  269. "vst1.32 {d18[0]}, [%0], %6 \n"
  270. "vst1.32 {d18[1]}, [%0], %6 \n"
  271. "vst1.32 {d19[0]}, [%0], %6 \n"
  272. "vst1.32 {d19[1]}, [%0], %6 \n"
  273. "add %0, %5, #4 \n"
  274. "vst1.32 {d22[0]}, [%0], %6 \n"
  275. "vst1.32 {d22[1]}, [%0], %6 \n"
  276. "vst1.32 {d23[0]}, [%0], %6 \n"
  277. "vst1.32 {d23[1]}, [%0] \n"
  278. "add %1, #4*2 \n" // src += 4 * 2
  279. "add %3, %3, %4, lsl #2 \n" // dst_a += 4 *
  280. // dst_stride_a
  281. "add %5, %5, %6, lsl #2 \n" // dst_b += 4 *
  282. // dst_stride_b
  283. "subs %7, #4 \n" // w -= 4
  284. "beq 4f \n"
  285. // some residual, check to see if it includes a 2x8 block,
  286. // or less
  287. "cmp %7, #2 \n"
  288. "blt 3f \n"
  289. // 2x8 block
  290. "2: \n"
  291. "mov %0, %1 \n"
  292. "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
  293. "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
  294. "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
  295. "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
  296. "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
  297. "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
  298. "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
  299. "vld2.16 {d1[3], d3[3]}, [%0] \n"
  300. "vtrn.8 d0, d1 \n"
  301. "vtrn.8 d2, d3 \n"
  302. "mov %0, %3 \n"
  303. "vst1.64 {d0}, [%0], %4 \n"
  304. "vst1.64 {d2}, [%0] \n"
  305. "mov %0, %5 \n"
  306. "vst1.64 {d1}, [%0], %6 \n"
  307. "vst1.64 {d3}, [%0] \n"
  308. "add %1, #2*2 \n" // src += 2 * 2
  309. "add %3, %3, %4, lsl #1 \n" // dst_a += 2 *
  310. // dst_stride_a
  311. "add %5, %5, %6, lsl #1 \n" // dst_b += 2 *
  312. // dst_stride_b
  313. "subs %7, #2 \n" // w -= 2
  314. "beq 4f \n"
  315. // 1x8 block
  316. "3: \n"
  317. "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
  318. "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
  319. "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
  320. "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
  321. "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
  322. "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
  323. "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
  324. "vld2.8 {d0[7], d1[7]}, [%1] \n"
  325. "vst1.64 {d0}, [%3] \n"
  326. "vst1.64 {d1}, [%5] \n"
  327. "4: \n"
  328. : "=&r"(src_temp), // %0
  329. "+r"(src), // %1
  330. "+r"(src_stride), // %2
  331. "+r"(dst_a), // %3
  332. "+r"(dst_stride_a), // %4
  333. "+r"(dst_b), // %5
  334. "+r"(dst_stride_b), // %6
  335. "+r"(width) // %7
  336. : "r"(&kVTbl4x4TransposeDi) // %8
  337. : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
  338. }
#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif