; vpx_subpixel_bilinear_sse2.asm
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
  11. %macro GET_PARAM_4 0
  12. mov rdx, arg(5) ;filter ptr
  13. mov rsi, arg(0) ;src_ptr
  14. mov rdi, arg(2) ;output_ptr
  15. mov rcx, 0x0400040
  16. movdqa xmm3, [rdx] ;load filters
  17. pshuflw xmm4, xmm3, 11111111b ;k3
  18. psrldq xmm3, 8
  19. pshuflw xmm3, xmm3, 0b ;k4
  20. punpcklqdq xmm4, xmm3 ;k3k4
  21. movq xmm3, rcx ;rounding
  22. pshufd xmm3, xmm3, 0
  23. pxor xmm2, xmm2
  24. movsxd rax, DWORD PTR arg(1) ;pixels_per_line
  25. movsxd rdx, DWORD PTR arg(3) ;out_pitch
  26. movsxd rcx, DWORD PTR arg(4) ;output_height
  27. %endm
  28. %macro APPLY_FILTER_4 1
  29. punpckldq xmm0, xmm1 ;two row in one register
  30. punpcklbw xmm0, xmm2 ;unpack to word
  31. pmullw xmm0, xmm4 ;multiply the filter factors
  32. movdqa xmm1, xmm0
  33. psrldq xmm1, 8
  34. paddsw xmm0, xmm1
  35. paddsw xmm0, xmm3 ;rounding
  36. psraw xmm0, 7 ;shift
  37. packuswb xmm0, xmm0 ;pack to byte
  38. %if %1
  39. movd xmm1, [rdi]
  40. pavgb xmm0, xmm1
  41. %endif
  42. movd [rdi], xmm0
  43. lea rsi, [rsi + rax]
  44. lea rdi, [rdi + rdx]
  45. dec rcx
  46. %endm
  47. %macro GET_PARAM 0
  48. mov rdx, arg(5) ;filter ptr
  49. mov rsi, arg(0) ;src_ptr
  50. mov rdi, arg(2) ;output_ptr
  51. mov rcx, 0x0400040
  52. movdqa xmm7, [rdx] ;load filters
  53. pshuflw xmm6, xmm7, 11111111b ;k3
  54. pshufhw xmm7, xmm7, 0b ;k4
  55. punpcklwd xmm6, xmm6
  56. punpckhwd xmm7, xmm7
  57. movq xmm4, rcx ;rounding
  58. pshufd xmm4, xmm4, 0
  59. pxor xmm5, xmm5
  60. movsxd rax, DWORD PTR arg(1) ;pixels_per_line
  61. movsxd rdx, DWORD PTR arg(3) ;out_pitch
  62. movsxd rcx, DWORD PTR arg(4) ;output_height
  63. %endm
  64. %macro APPLY_FILTER_8 1
  65. punpcklbw xmm0, xmm5
  66. punpcklbw xmm1, xmm5
  67. pmullw xmm0, xmm6
  68. pmullw xmm1, xmm7
  69. paddsw xmm0, xmm1
  70. paddsw xmm0, xmm4 ;rounding
  71. psraw xmm0, 7 ;shift
  72. packuswb xmm0, xmm0 ;pack back to byte
  73. %if %1
  74. movq xmm1, [rdi]
  75. pavgb xmm0, xmm1
  76. %endif
  77. movq [rdi], xmm0 ;store the result
  78. lea rsi, [rsi + rax]
  79. lea rdi, [rdi + rdx]
  80. dec rcx
  81. %endm
  82. %macro APPLY_FILTER_16 1
  83. punpcklbw xmm0, xmm5
  84. punpcklbw xmm1, xmm5
  85. punpckhbw xmm2, xmm5
  86. punpckhbw xmm3, xmm5
  87. pmullw xmm0, xmm6
  88. pmullw xmm1, xmm7
  89. pmullw xmm2, xmm6
  90. pmullw xmm3, xmm7
  91. paddsw xmm0, xmm1
  92. paddsw xmm2, xmm3
  93. paddsw xmm0, xmm4 ;rounding
  94. paddsw xmm2, xmm4
  95. psraw xmm0, 7 ;shift
  96. psraw xmm2, 7
  97. packuswb xmm0, xmm2 ;pack back to byte
  98. %if %1
  99. movdqu xmm1, [rdi]
  100. pavgb xmm0, xmm1
  101. %endif
  102. movdqu [rdi], xmm0 ;store the result
  103. lea rsi, [rsi + rax]
  104. lea rdi, [rdi + rdx]
  105. dec rcx
  106. %endm
  107. global sym(vpx_filter_block1d4_v2_sse2) PRIVATE
  108. sym(vpx_filter_block1d4_v2_sse2):
  109. push rbp
  110. mov rbp, rsp
  111. SHADOW_ARGS_TO_STACK 6
  112. push rsi
  113. push rdi
  114. ; end prolog
  115. GET_PARAM_4
  116. .loop:
  117. movd xmm0, [rsi] ;load src
  118. movd xmm1, [rsi + rax]
  119. APPLY_FILTER_4 0
  120. jnz .loop
  121. ; begin epilog
  122. pop rdi
  123. pop rsi
  124. UNSHADOW_ARGS
  125. pop rbp
  126. ret
  127. global sym(vpx_filter_block1d8_v2_sse2) PRIVATE
  128. sym(vpx_filter_block1d8_v2_sse2):
  129. push rbp
  130. mov rbp, rsp
  131. SHADOW_ARGS_TO_STACK 6
  132. SAVE_XMM 7
  133. push rsi
  134. push rdi
  135. ; end prolog
  136. GET_PARAM
  137. .loop:
  138. movq xmm0, [rsi] ;0
  139. movq xmm1, [rsi + rax] ;1
  140. APPLY_FILTER_8 0
  141. jnz .loop
  142. ; begin epilog
  143. pop rdi
  144. pop rsi
  145. RESTORE_XMM
  146. UNSHADOW_ARGS
  147. pop rbp
  148. ret
  149. global sym(vpx_filter_block1d16_v2_sse2) PRIVATE
  150. sym(vpx_filter_block1d16_v2_sse2):
  151. push rbp
  152. mov rbp, rsp
  153. SHADOW_ARGS_TO_STACK 6
  154. SAVE_XMM 7
  155. push rsi
  156. push rdi
  157. ; end prolog
  158. GET_PARAM
  159. .loop:
  160. movdqu xmm0, [rsi] ;0
  161. movdqu xmm1, [rsi + rax] ;1
  162. movdqa xmm2, xmm0
  163. movdqa xmm3, xmm1
  164. APPLY_FILTER_16 0
  165. jnz .loop
  166. ; begin epilog
  167. pop rdi
  168. pop rsi
  169. RESTORE_XMM
  170. UNSHADOW_ARGS
  171. pop rbp
  172. ret
  173. global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE
  174. sym(vpx_filter_block1d4_v2_avg_sse2):
  175. push rbp
  176. mov rbp, rsp
  177. SHADOW_ARGS_TO_STACK 6
  178. push rsi
  179. push rdi
  180. ; end prolog
  181. GET_PARAM_4
  182. .loop:
  183. movd xmm0, [rsi] ;load src
  184. movd xmm1, [rsi + rax]
  185. APPLY_FILTER_4 1
  186. jnz .loop
  187. ; begin epilog
  188. pop rdi
  189. pop rsi
  190. UNSHADOW_ARGS
  191. pop rbp
  192. ret
  193. global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE
  194. sym(vpx_filter_block1d8_v2_avg_sse2):
  195. push rbp
  196. mov rbp, rsp
  197. SHADOW_ARGS_TO_STACK 6
  198. SAVE_XMM 7
  199. push rsi
  200. push rdi
  201. ; end prolog
  202. GET_PARAM
  203. .loop:
  204. movq xmm0, [rsi] ;0
  205. movq xmm1, [rsi + rax] ;1
  206. APPLY_FILTER_8 1
  207. jnz .loop
  208. ; begin epilog
  209. pop rdi
  210. pop rsi
  211. RESTORE_XMM
  212. UNSHADOW_ARGS
  213. pop rbp
  214. ret
  215. global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE
  216. sym(vpx_filter_block1d16_v2_avg_sse2):
  217. push rbp
  218. mov rbp, rsp
  219. SHADOW_ARGS_TO_STACK 6
  220. SAVE_XMM 7
  221. push rsi
  222. push rdi
  223. ; end prolog
  224. GET_PARAM
  225. .loop:
  226. movdqu xmm0, [rsi] ;0
  227. movdqu xmm1, [rsi + rax] ;1
  228. movdqa xmm2, xmm0
  229. movdqa xmm3, xmm1
  230. APPLY_FILTER_16 1
  231. jnz .loop
  232. ; begin epilog
  233. pop rdi
  234. pop rsi
  235. RESTORE_XMM
  236. UNSHADOW_ARGS
  237. pop rbp
  238. ret
  239. global sym(vpx_filter_block1d4_h2_sse2) PRIVATE
  240. sym(vpx_filter_block1d4_h2_sse2):
  241. push rbp
  242. mov rbp, rsp
  243. SHADOW_ARGS_TO_STACK 6
  244. push rsi
  245. push rdi
  246. ; end prolog
  247. GET_PARAM_4
  248. .loop:
  249. movdqu xmm0, [rsi] ;load src
  250. movdqa xmm1, xmm0
  251. psrldq xmm1, 1
  252. APPLY_FILTER_4 0
  253. jnz .loop
  254. ; begin epilog
  255. pop rdi
  256. pop rsi
  257. UNSHADOW_ARGS
  258. pop rbp
  259. ret
  260. global sym(vpx_filter_block1d8_h2_sse2) PRIVATE
  261. sym(vpx_filter_block1d8_h2_sse2):
  262. push rbp
  263. mov rbp, rsp
  264. SHADOW_ARGS_TO_STACK 6
  265. SAVE_XMM 7
  266. push rsi
  267. push rdi
  268. ; end prolog
  269. GET_PARAM
  270. .loop:
  271. movdqu xmm0, [rsi] ;load src
  272. movdqa xmm1, xmm0
  273. psrldq xmm1, 1
  274. APPLY_FILTER_8 0
  275. jnz .loop
  276. ; begin epilog
  277. pop rdi
  278. pop rsi
  279. RESTORE_XMM
  280. UNSHADOW_ARGS
  281. pop rbp
  282. ret
  283. global sym(vpx_filter_block1d16_h2_sse2) PRIVATE
  284. sym(vpx_filter_block1d16_h2_sse2):
  285. push rbp
  286. mov rbp, rsp
  287. SHADOW_ARGS_TO_STACK 6
  288. SAVE_XMM 7
  289. push rsi
  290. push rdi
  291. ; end prolog
  292. GET_PARAM
  293. .loop:
  294. movdqu xmm0, [rsi] ;load src
  295. movdqu xmm1, [rsi + 1]
  296. movdqa xmm2, xmm0
  297. movdqa xmm3, xmm1
  298. APPLY_FILTER_16 0
  299. jnz .loop
  300. ; begin epilog
  301. pop rdi
  302. pop rsi
  303. RESTORE_XMM
  304. UNSHADOW_ARGS
  305. pop rbp
  306. ret
  307. global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE
  308. sym(vpx_filter_block1d4_h2_avg_sse2):
  309. push rbp
  310. mov rbp, rsp
  311. SHADOW_ARGS_TO_STACK 6
  312. push rsi
  313. push rdi
  314. ; end prolog
  315. GET_PARAM_4
  316. .loop:
  317. movdqu xmm0, [rsi] ;load src
  318. movdqa xmm1, xmm0
  319. psrldq xmm1, 1
  320. APPLY_FILTER_4 1
  321. jnz .loop
  322. ; begin epilog
  323. pop rdi
  324. pop rsi
  325. UNSHADOW_ARGS
  326. pop rbp
  327. ret
  328. global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE
  329. sym(vpx_filter_block1d8_h2_avg_sse2):
  330. push rbp
  331. mov rbp, rsp
  332. SHADOW_ARGS_TO_STACK 6
  333. SAVE_XMM 7
  334. push rsi
  335. push rdi
  336. ; end prolog
  337. GET_PARAM
  338. .loop:
  339. movdqu xmm0, [rsi] ;load src
  340. movdqa xmm1, xmm0
  341. psrldq xmm1, 1
  342. APPLY_FILTER_8 1
  343. jnz .loop
  344. ; begin epilog
  345. pop rdi
  346. pop rsi
  347. RESTORE_XMM
  348. UNSHADOW_ARGS
  349. pop rbp
  350. ret
  351. global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE
  352. sym(vpx_filter_block1d16_h2_avg_sse2):
  353. push rbp
  354. mov rbp, rsp
  355. SHADOW_ARGS_TO_STACK 6
  356. SAVE_XMM 7
  357. push rsi
  358. push rdi
  359. ; end prolog
  360. GET_PARAM
  361. .loop:
  362. movdqu xmm0, [rsi] ;load src
  363. movdqu xmm1, [rsi + 1]
  364. movdqa xmm2, xmm0
  365. movdqa xmm3, xmm1
  366. APPLY_FILTER_16 1
  367. jnz .loop
  368. ; begin epilog
  369. pop rdi
  370. pop rsi
  371. RESTORE_XMM
  372. UNSHADOW_ARGS
  373. pop rbp
  374. ret