vpx_subpixel_bilinear_ssse3.asm 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423
  1. ;
  2. ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  ;-----------------------------------------------------------------------
  ; GET_PARAM_4
  ; Loads the arguments and constants used by the 4-pixel-wide kernels.
  ; Out:
  ;   rsi  = arg(0) src_ptr
  ;   rdi  = arg(2) output_ptr
  ;   rax  = arg(1) pixels_per_line (sign-extended from 32 bits)
  ;   rdx  = arg(3) out_pitch       (sign-extended from 32 bits)
  ;   rcx  = arg(4) output_height   (sign-extended from 32 bits)
  ;   xmm3 = taps 3 and 4 of the table at arg(5), saturated to signed
  ;          bytes and repeated in each of the low four words
  ;   xmm2 = 0x0040 in every word (rounding term for the later psraw 7)
  ; The table at arg(5) is read with movdqa, so it must be 16-byte aligned.
  ;-----------------------------------------------------------------------
  %macro GET_PARAM_4 0
      mov         rdx, arg(5)                 ;filter ptr
      mov         rsi, arg(0)                 ;src_ptr
      mov         rdi, arg(2)                 ;output_ptr

      ; Only dword 0 of the rounding constant is broadcast below, so a
      ; 32-bit GPR and movd are sufficient. This yields the same xmm2 as
      ; the 64-bit mov/movq pair while not depending on a 64-bit
      ; GPR-to-XMM move, which has no 32-bit-mode encoding.
      mov         ecx, 0x00400040             ;two words of 64 (1 << 6)

      movdqa      xmm3, [rdx]                 ;load filters
      psrldq      xmm3, 6                     ;shift out words 0-2
      packsswb    xmm3, xmm3                  ;16-bit taps -> signed bytes
      pshuflw     xmm3, xmm3, 0b              ;k3_k4

      movd        xmm2, ecx                   ;rounding
      pshufd      xmm2, xmm2, 0               ;copy dword 0 to all lanes

      movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
      movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
      movsxd      rcx, DWORD PTR arg(4)       ;output_height
  %endm
  ;-----------------------------------------------------------------------
  ; APPLY_FILTER_4 avg
  ; Filters one row of 4 pixels, stores it and steps to the next row.
  ; In:
  ;   xmm0 = pixels for the first tap, xmm1 = pixels for the second tap
  ;   xmm3 = packed signed tap pairs, xmm2 = rounding words
  ;   rsi / rdi = source / destination row, rax / rdx = their strides
  ;   rcx = rows remaining
  ; Parameter 1: non-zero averages the result with the 4 bytes already
  ;              present at [rdi] before storing.
  ; Out:
  ;   4 bytes written to [rdi]; rsi += rax; rdi += rdx; rcx -= 1.
  ;   ZF is left as set by "dec rcx" for the caller's jnz.
  ; Clobbers: xmm0 (and xmm1 when parameter 1 is non-zero).
  ;-----------------------------------------------------------------------
  %macro APPLY_FILTER_4 1
      punpcklbw   xmm0, xmm1                  ;pair each pixel with its neighbour
      pmaddubsw   xmm0, xmm3                  ;unsigned pixel * signed tap, pairs summed

      paddsw      xmm0, xmm2                  ;rounding
      psraw       xmm0, 7                     ;shift
      packuswb    xmm0, xmm0                  ;pack to byte (unsigned saturation)

  %if %1
      movd        xmm1, [rdi]                 ;current destination pixels
      pavgb       xmm0, xmm1                  ;rounded average with them
  %endif
      movd        [rdi], xmm0                 ;store 4 result bytes

      ; lea leaves the flags alone, so the pointer bumps may precede dec.
      lea         rdi, [rdi + rdx]            ;next destination row
      lea         rsi, [rsi + rax]            ;next source row
      dec         rcx                         ;one row done; sets ZF at zero
  %endm
  ;-----------------------------------------------------------------------
  ; GET_PARAM
  ; Loads the arguments and constants used by the 8- and 16-pixel-wide
  ; kernels.
  ; Out:
  ;   rsi  = arg(0) src_ptr
  ;   rdi  = arg(2) output_ptr
  ;   rax  = arg(1) pixels_per_line (sign-extended from 32 bits)
  ;   rdx  = arg(3) out_pitch       (sign-extended from 32 bits)
  ;   rcx  = arg(4) output_height   (sign-extended from 32 bits)
  ;   xmm7 = taps 3 and 4 of the table at arg(5), saturated to signed
  ;          bytes and repeated in all eight words
  ;   xmm6 = 0x0040 in every word (rounding term for the later psraw 7)
  ; The table at arg(5) is read with movdqa, so it must be 16-byte aligned.
  ;-----------------------------------------------------------------------
  %macro GET_PARAM 0
      mov         rdx, arg(5)                 ;filter ptr
      mov         rsi, arg(0)                 ;src_ptr
      mov         rdi, arg(2)                 ;output_ptr

      ; Only dword 0 of the rounding constant is broadcast below, so a
      ; 32-bit GPR and movd are sufficient. This yields the same xmm6 as
      ; the 64-bit mov/movq pair while not depending on a 64-bit
      ; GPR-to-XMM move, which has no 32-bit-mode encoding.
      mov         ecx, 0x00400040             ;two words of 64 (1 << 6)

      movdqa      xmm7, [rdx]                 ;load filters
      psrldq      xmm7, 6                     ;shift out words 0-2
      packsswb    xmm7, xmm7                  ;16-bit taps -> signed bytes
      pshuflw     xmm7, xmm7, 0b              ;k3_k4
      punpcklwd   xmm7, xmm7                  ;spread the pair to all 8 words

      movd        xmm6, ecx                   ;rounding
      pshufd      xmm6, xmm6, 0               ;copy dword 0 to all lanes

      movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
      movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
      movsxd      rcx, DWORD PTR arg(4)       ;output_height
  %endm
  ;-----------------------------------------------------------------------
  ; APPLY_FILTER_8 avg
  ; Filters one row of 8 pixels, stores it and steps to the next row.
  ; In:
  ;   xmm0 = pixels for the first tap, xmm1 = pixels for the second tap
  ;   xmm7 = packed signed tap pairs, xmm6 = rounding words
  ;   rsi / rdi = source / destination row, rax / rdx = their strides
  ;   rcx = rows remaining
  ; Parameter 1: non-zero averages the result with the 8 bytes already
  ;              present at [rdi] before storing.
  ; Out:
  ;   8 bytes written to [rdi]; rsi += rax; rdi += rdx; rcx -= 1.
  ;   ZF is left as set by "dec rcx" for the caller's jnz.
  ; Clobbers: xmm0 (and xmm1 when parameter 1 is non-zero).
  ;-----------------------------------------------------------------------
  %macro APPLY_FILTER_8 1
      punpcklbw   xmm0, xmm1                  ;pair each pixel with its neighbour
      pmaddubsw   xmm0, xmm7                  ;unsigned pixel * signed tap, pairs summed

      paddsw      xmm0, xmm6                  ;rounding
      psraw       xmm0, 7                     ;shift
      packuswb    xmm0, xmm0                  ;pack back to byte

  %if %1
      movq        xmm1, [rdi]                 ;current destination pixels
      pavgb       xmm0, xmm1                  ;rounded average with them
  %endif
      movq        [rdi], xmm0                 ;store the result

      ; lea leaves the flags alone, so the pointer bumps may precede dec.
      lea         rdi, [rdi + rdx]            ;next destination row
      lea         rsi, [rsi + rax]            ;next source row
      dec         rcx                         ;one row done; sets ZF at zero
  %endm
  ;-----------------------------------------------------------------------
  ; APPLY_FILTER_16 avg
  ; Filters one row of 16 pixels, stores it and steps to the next row.
  ; In:
  ;   xmm0 = pixels for the first tap, xmm2 = a copy of xmm0
  ;   xmm1 = pixels for the second tap
  ;   xmm7 = packed signed tap pairs, xmm6 = rounding words
  ;   rsi / rdi = source / destination row, rax / rdx = their strides
  ;   rcx = rows remaining
  ; Parameter 1: non-zero averages the result with the 16 bytes already
  ;              present at [rdi] before storing.
  ; Out:
  ;   16 bytes written to [rdi] (unaligned store); rsi += rax;
  ;   rdi += rdx; rcx -= 1. ZF is left as set by "dec rcx".
  ; Clobbers: xmm0, xmm2 (and xmm1 when parameter 1 is non-zero).
  ;-----------------------------------------------------------------------
  %macro APPLY_FILTER_16 1
      ; low eight pixels
      punpcklbw   xmm0, xmm1                  ;pair low pixels with neighbours
      pmaddubsw   xmm0, xmm7                  ;unsigned pixel * signed tap, pairs summed
      paddsw      xmm0, xmm6                  ;rounding
      psraw       xmm0, 7                     ;shift

      ; high eight pixels (independent of the chain above)
      punpckhbw   xmm2, xmm1                  ;pair high pixels with neighbours
      pmaddubsw   xmm2, xmm7
      paddsw      xmm2, xmm6                  ;rounding
      psraw       xmm2, 7                     ;shift

      packuswb    xmm0, xmm2                  ;pack back to byte

  %if %1
      movdqu      xmm1, [rdi]                 ;current destination pixels
      pavgb       xmm0, xmm1                  ;rounded average with them
  %endif
      movdqu      [rdi], xmm0                 ;store the result

      ; lea leaves the flags alone, so the pointer bumps may precede dec.
      lea         rdi, [rdi + rdx]            ;next destination row
      lea         rsi, [rsi + rax]            ;next source row
      dec         rcx                         ;one row done; sets ZF at zero
  %endm
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d4_v2_ssse3
  ; Two-tap filter across vertically adjacent rows, 4 pixels per row.
  ; Arguments, as read by GET_PARAM_4:
  ;   arg(0) src_ptr          arg(1) pixels_per_line
  ;   arg(2) output_ptr       arg(3) out_pitch
  ;   arg(4) output_height    arg(5) filter ptr
  ; The row count is tested after the body, so at least one row is always
  ; processed; output_height is expected to be >= 1.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE
  sym(vpx_filter_block1d4_v2_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM_4
  .next_row:
      movd        xmm1, [rsi + rax]           ;4 pixels from the row below
      movd        xmm0, [rsi]                 ;4 pixels from the current row

      ; filter, store, advance both pointers, decrement rcx
      APPLY_FILTER_4 0
      jnz         .next_row                   ;ZF comes from the macro's dec rcx

      ; begin epilog
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d8_v2_ssse3
  ; Two-tap filter across vertically adjacent rows, 8 pixels per row.
  ; Arguments, as read by GET_PARAM:
  ;   arg(0) src_ptr          arg(1) pixels_per_line
  ;   arg(2) output_ptr       arg(3) out_pitch
  ;   arg(4) output_height    arg(5) filter ptr
  ; xmm6 and xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair.
  ; The row count is tested after the body, so at least one row is always
  ; processed; output_height is expected to be >= 1.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d8_v2_ssse3) PRIVATE
  sym(vpx_filter_block1d8_v2_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM
  .next_row:
      movq        xmm1, [rsi + rax]           ;8 pixels from the row below
      movq        xmm0, [rsi]                 ;8 pixels from the current row

      ; filter, store, advance both pointers, decrement rcx
      APPLY_FILTER_8 0
      jnz         .next_row                   ;ZF comes from the macro's dec rcx

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d16_v2_ssse3
  ; Two-tap filter across vertically adjacent rows, 16 pixels per row.
  ; Arguments, as read by GET_PARAM:
  ;   arg(0) src_ptr          arg(1) pixels_per_line
  ;   arg(2) output_ptr       arg(3) out_pitch
  ;   arg(4) output_height    arg(5) filter ptr
  ; xmm6 and xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair.
  ; The row count is tested after the body, so at least one row is always
  ; processed; output_height is expected to be >= 1.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d16_v2_ssse3) PRIVATE
  sym(vpx_filter_block1d16_v2_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM
  .next_row:
      movdqu      xmm0, [rsi]                 ;16 pixels from the current row
      movdqa      xmm2, xmm0                  ;copy for the high-half unpack
      movdqu      xmm1, [rsi + rax]           ;16 pixels from the row below

      ; filter, store, advance both pointers, decrement rcx
      APPLY_FILTER_16 0
      jnz         .next_row                   ;ZF comes from the macro's dec rcx

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d4_v2_avg_ssse3
  ; Two-tap filter across vertically adjacent rows, 4 pixels per row;
  ; each result is averaged (pavgb) with the bytes already in the output.
  ; Arguments, as read by GET_PARAM_4:
  ;   arg(0) src_ptr          arg(1) pixels_per_line
  ;   arg(2) output_ptr       arg(3) out_pitch
  ;   arg(4) output_height    arg(5) filter ptr
  ; The row count is tested after the body, so at least one row is always
  ; processed; output_height is expected to be >= 1.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d4_v2_avg_ssse3) PRIVATE
  sym(vpx_filter_block1d4_v2_avg_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM_4
  .next_row:
      movd        xmm1, [rsi + rax]           ;4 pixels from the row below
      movd        xmm0, [rsi]                 ;4 pixels from the current row

      ; filter, average with destination, store, advance, decrement rcx
      APPLY_FILTER_4 1
      jnz         .next_row                   ;ZF comes from the macro's dec rcx

      ; begin epilog
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d8_v2_avg_ssse3
  ; Two-tap filter across vertically adjacent rows, 8 pixels per row;
  ; each result is averaged (pavgb) with the bytes already in the output.
  ; Arguments, as read by GET_PARAM:
  ;   arg(0) src_ptr          arg(1) pixels_per_line
  ;   arg(2) output_ptr       arg(3) out_pitch
  ;   arg(4) output_height    arg(5) filter ptr
  ; xmm6 and xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair.
  ; The row count is tested after the body, so at least one row is always
  ; processed; output_height is expected to be >= 1.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d8_v2_avg_ssse3) PRIVATE
  sym(vpx_filter_block1d8_v2_avg_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM
  .next_row:
      movq        xmm1, [rsi + rax]           ;8 pixels from the row below
      movq        xmm0, [rsi]                 ;8 pixels from the current row

      ; filter, average with destination, store, advance, decrement rcx
      APPLY_FILTER_8 1
      jnz         .next_row                   ;ZF comes from the macro's dec rcx

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d16_v2_avg_ssse3
  ; Two-tap filter across vertically adjacent rows, 16 pixels per row;
  ; each result is averaged (pavgb) with the bytes already in the output.
  ; Arguments, as read by GET_PARAM:
  ;   arg(0) src_ptr          arg(1) pixels_per_line
  ;   arg(2) output_ptr       arg(3) out_pitch
  ;   arg(4) output_height    arg(5) filter ptr
  ; xmm6 and xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair.
  ; The row count is tested after the body, so at least one row is always
  ; processed; output_height is expected to be >= 1.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d16_v2_avg_ssse3) PRIVATE
  sym(vpx_filter_block1d16_v2_avg_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM
  .next_row:
      movdqu      xmm0, [rsi]                 ;16 pixels from the current row
      movdqa      xmm2, xmm0                  ;copy for the high-half unpack
      movdqu      xmm1, [rsi + rax]           ;16 pixels from the row below

      ; filter, average with destination, store, advance, decrement rcx
      APPLY_FILTER_16 1
      jnz         .next_row                   ;ZF comes from the macro's dec rcx

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d4_h2_ssse3
  ; Two-tap filter across horizontally adjacent pixels, 4 pixels per row.
  ; Arguments, as read by GET_PARAM_4:
  ;   arg(0) src_ptr          arg(1) pixels_per_line
  ;   arg(2) output_ptr       arg(3) out_pitch
  ;   arg(4) output_height    arg(5) filter ptr
  ; The row count is tested after the body, so at least one row is always
  ; processed; output_height is expected to be >= 1.
  ; NOTE(review): each row is fetched with a 16-byte load although only
  ; the first 5 bytes contribute to the output - confirm callers keep
  ; that many bytes readable past src_ptr.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d4_h2_ssse3) PRIVATE
  sym(vpx_filter_block1d4_h2_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM_4
  .next_row:
      movdqu      xmm0, [rsi]                 ;load src
      movdqa      xmm1, xmm0
      psrldq      xmm1, 1                     ;byte i now holds pixel i + 1

      ; filter, store, advance both pointers, decrement rcx
      APPLY_FILTER_4 0
      jnz         .next_row                   ;ZF comes from the macro's dec rcx

      ; begin epilog
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d8_h2_ssse3
  ; Two-tap filter across horizontally adjacent pixels, 8 pixels per row.
  ; Arguments, as read by GET_PARAM:
  ;   arg(0) src_ptr          arg(1) pixels_per_line
  ;   arg(2) output_ptr       arg(3) out_pitch
  ;   arg(4) output_height    arg(5) filter ptr
  ; xmm6 and xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair.
  ; The row count is tested after the body, so at least one row is always
  ; processed; output_height is expected to be >= 1.
  ; NOTE(review): each row is fetched with a 16-byte load although only
  ; the first 9 bytes contribute to the output - confirm callers keep
  ; that many bytes readable past src_ptr.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d8_h2_ssse3) PRIVATE
  sym(vpx_filter_block1d8_h2_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM
  .next_row:
      movdqu      xmm0, [rsi]                 ;load src
      movdqa      xmm1, xmm0
      psrldq      xmm1, 1                     ;byte i now holds pixel i + 1

      ; filter, store, advance both pointers, decrement rcx
      APPLY_FILTER_8 0
      jnz         .next_row                   ;ZF comes from the macro's dec rcx

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d16_h2_ssse3
  ; Two-tap filter across horizontally adjacent pixels, 16 pixels per row.
  ; Arguments, as read by GET_PARAM:
  ;   arg(0) src_ptr          arg(1) pixels_per_line
  ;   arg(2) output_ptr       arg(3) out_pitch
  ;   arg(4) output_height    arg(5) filter ptr
  ; xmm6 and xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair.
  ; Each row reads 17 source bytes: 16 at [rsi] and 16 at [rsi + 1].
  ; The row count is tested after the body, so at least one row is always
  ; processed; output_height is expected to be >= 1.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d16_h2_ssse3) PRIVATE
  sym(vpx_filter_block1d16_h2_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM
  .next_row:
      movdqu      xmm0, [rsi]                 ;load src
      movdqa      xmm2, xmm0                  ;copy for the high-half unpack
      movdqu      xmm1, [rsi + 1]             ;same row, one pixel to the right

      ; filter, store, advance both pointers, decrement rcx
      APPLY_FILTER_16 0
      jnz         .next_row                   ;ZF comes from the macro's dec rcx

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d4_h2_avg_ssse3
  ; Two-tap filter across horizontally adjacent pixels, 4 pixels per row;
  ; each result is averaged (pavgb) with the bytes already in the output.
  ; Arguments, as read by GET_PARAM_4:
  ;   arg(0) src_ptr          arg(1) pixels_per_line
  ;   arg(2) output_ptr       arg(3) out_pitch
  ;   arg(4) output_height    arg(5) filter ptr
  ; The row count is tested after the body, so at least one row is always
  ; processed; output_height is expected to be >= 1.
  ; NOTE(review): each row is fetched with a 16-byte load although only
  ; the first 5 bytes contribute to the output - confirm callers keep
  ; that many bytes readable past src_ptr.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d4_h2_avg_ssse3) PRIVATE
  sym(vpx_filter_block1d4_h2_avg_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM_4
  .next_row:
      movdqu      xmm0, [rsi]                 ;load src
      movdqa      xmm1, xmm0
      psrldq      xmm1, 1                     ;byte i now holds pixel i + 1

      ; filter, average with destination, store, advance, decrement rcx
      APPLY_FILTER_4 1
      jnz         .next_row                   ;ZF comes from the macro's dec rcx

      ; begin epilog
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d8_h2_avg_ssse3
  ; Two-tap filter across horizontally adjacent pixels, 8 pixels per row;
  ; each result is averaged (pavgb) with the bytes already in the output.
  ; Arguments, as read by GET_PARAM:
  ;   arg(0) src_ptr          arg(1) pixels_per_line
  ;   arg(2) output_ptr       arg(3) out_pitch
  ;   arg(4) output_height    arg(5) filter ptr
  ; xmm6 and xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair.
  ; The row count is tested after the body, so at least one row is always
  ; processed; output_height is expected to be >= 1.
  ; NOTE(review): each row is fetched with a 16-byte load although only
  ; the first 9 bytes contribute to the output - confirm callers keep
  ; that many bytes readable past src_ptr.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d8_h2_avg_ssse3) PRIVATE
  sym(vpx_filter_block1d8_h2_avg_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM
  .next_row:
      movdqu      xmm0, [rsi]                 ;load src
      movdqa      xmm1, xmm0
      psrldq      xmm1, 1                     ;byte i now holds pixel i + 1

      ; filter, average with destination, store, advance, decrement rcx
      APPLY_FILTER_8 1
      jnz         .next_row                   ;ZF comes from the macro's dec rcx

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; vpx_filter_block1d16_h2_avg_ssse3
  ; Two-tap filter across horizontally adjacent pixels, 16 pixels per row;
  ; each result is averaged (pavgb) with the bytes already in the output.
  ; Arguments, as read by GET_PARAM:
  ;   arg(0) src_ptr          arg(1) pixels_per_line
  ;   arg(2) output_ptr       arg(3) out_pitch
  ;   arg(4) output_height    arg(5) filter ptr
  ; xmm6 and xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair.
  ; Each row reads 17 source bytes: 16 at [rsi] and 16 at [rsi + 1].
  ; The row count is tested after the body, so at least one row is always
  ; processed; output_height is expected to be >= 1.
  ;-----------------------------------------------------------------------
  global sym(vpx_filter_block1d16_h2_avg_ssse3) PRIVATE
  sym(vpx_filter_block1d16_h2_avg_ssse3):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      GET_PARAM
  .next_row:
      movdqu      xmm0, [rsi]                 ;load src
      movdqa      xmm2, xmm0                  ;copy for the high-half unpack
      movdqu      xmm1, [rsi + 1]             ;same row, one pixel to the right

      ; filter, average with destination, store, advance, decrement rcx
      APPLY_FILTER_16 1
      jnz         .next_row                   ;ZF comes from the macro's dec rcx

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret