vpx_subpixel_8t_sse2.asm

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;Note: tap3 and tap4 have to be applied and added after other taps to avoid
;overflow.
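
;A rough sketch of the arithmetic behind that note (illustrative tap values,
;not taken from this file): the eight taps are signed words that sum to 128,
;with the two centre taps k3/k4 carrying most of the positive weight.  With
;8-bit pixels the full sum 255 * 128 = 32640 still fits in a signed word, but
;a partial sum that starts with the two big centre taps, e.g.
;255 * (114 + 35) = 37995, would already exceed 32767.  Adding the small
;(partly negative) outer-tap products first keeps every intermediate paddsw
;result in range, and paddsw saturates rather than wraps in the worst case.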

%macro GET_FILTERS_4 0
    mov         rdx, arg(5)             ;filter ptr
    mov         rcx, 0x0400040

    movdqa      xmm7, [rdx]             ;load filters
    pshuflw     xmm0, xmm7, 0b          ;k0
    pshuflw     xmm1, xmm7, 01010101b   ;k1
    pshuflw     xmm2, xmm7, 10101010b   ;k2
    pshuflw     xmm3, xmm7, 11111111b   ;k3
    psrldq      xmm7, 8
    pshuflw     xmm4, xmm7, 0b          ;k4
    pshuflw     xmm5, xmm7, 01010101b   ;k5
    pshuflw     xmm6, xmm7, 10101010b   ;k6
    pshuflw     xmm7, xmm7, 11111111b   ;k7

    punpcklqdq  xmm0, xmm1
    punpcklqdq  xmm2, xmm3
    punpcklqdq  xmm5, xmm4
    punpcklqdq  xmm6, xmm7

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm2
    movdqa      k5k4, xmm5
    movdqa      k6k7, xmm6

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0
    movdqa      krd, xmm6

    pxor        xmm7, xmm7
    movdqa      zero, xmm7
%endm
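
;GET_FILTERS_4 leaves the tap pairs in the k0k1/k2k3/k5k4/k6k7 stack slots
;(one coefficient per word lane), broadcasts the rounding constant 64 (the
;bias for the final >> 7) into krd, and stores an all-zero block in zero for
;the byte-to-word unpacks below.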

%macro APPLY_FILTER_4 1
    punpckldq   xmm0, xmm1              ;two rows in one register
    punpckldq   xmm6, xmm7
    punpckldq   xmm2, xmm3
    punpckldq   xmm5, xmm4

    punpcklbw   xmm0, zero              ;unpack to word
    punpcklbw   xmm6, zero
    punpcklbw   xmm2, zero
    punpcklbw   xmm5, zero

    pmullw      xmm0, k0k1              ;multiply the filter factors
    pmullw      xmm6, k6k7
    pmullw      xmm2, k2k3
    pmullw      xmm5, k5k4

    paddsw      xmm0, xmm6              ;sum
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm2
    psrldq      xmm2, 8
    paddsw      xmm0, xmm5
    psrldq      xmm5, 8
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm5

    paddsw      xmm0, krd               ;rounding
    psraw       xmm0, 7                 ;shift
    packuswb    xmm0, xmm0              ;pack to byte

%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
%endm
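
;The single parameter of APPLY_FILTER_4 selects a plain store (%1 == 0) or a
;pavgb average with the bytes already at [rdi] (%1 == 1); the *_avg_* entry
;points further down differ from their plain counterparts only in passing 1
;here and to APPLY_FILTER_8.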

%macro GET_FILTERS 0
    mov         rdx, arg(5)             ;filter ptr
    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr
    mov         rcx, 0x0400040

    movdqa      xmm7, [rdx]             ;load filters
    pshuflw     xmm0, xmm7, 0b          ;k0
    pshuflw     xmm1, xmm7, 01010101b   ;k1
    pshuflw     xmm2, xmm7, 10101010b   ;k2
    pshuflw     xmm3, xmm7, 11111111b   ;k3
    pshufhw     xmm4, xmm7, 0b          ;k4
    pshufhw     xmm5, xmm7, 01010101b   ;k5
    pshufhw     xmm6, xmm7, 10101010b   ;k6
    pshufhw     xmm7, xmm7, 11111111b   ;k7

    punpcklwd   xmm0, xmm0
    punpcklwd   xmm1, xmm1
    punpcklwd   xmm2, xmm2
    punpcklwd   xmm3, xmm3
    punpckhwd   xmm4, xmm4
    punpckhwd   xmm5, xmm5
    punpckhwd   xmm6, xmm6
    punpckhwd   xmm7, xmm7

    movdqa      k0, xmm0                ;store filter factors on stack
    movdqa      k1, xmm1
    movdqa      k2, xmm2
    movdqa      k3, xmm3
    movdqa      k4, xmm4
    movdqa      k5, xmm5
    movdqa      k6, xmm6
    movdqa      k7, xmm7

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0
    movdqa      krd, xmm6               ;rounding

    pxor        xmm7, xmm7
    movdqa      zero, xmm7
%endm

%macro LOAD_VERT_8 1
    movq        xmm0, [rsi + %1]            ;0
    movq        xmm1, [rsi + rax + %1]      ;1
    movq        xmm6, [rsi + rdx * 2 + %1]  ;6
    lea         rsi,  [rsi + rax]
    movq        xmm7, [rsi + rdx * 2 + %1]  ;7
    movq        xmm2, [rsi + rax + %1]      ;2
    movq        xmm3, [rsi + rax * 2 + %1]  ;3
    movq        xmm4, [rsi + rdx + %1]      ;4
    movq        xmm5, [rsi + rax * 4 + %1]  ;5
%endm
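
;LOAD_VERT_8 gathers the eight vertically adjacent source rows for one output
;row: with rax = src_pitch and rdx = 3 * rax it loads row n into xmmN
;(n = 0..7, as the trailing comments indicate), offset horizontally by %1
;bytes.  It also advances rsi by one pitch as a side effect, which is what
;steps the vertical loops; the 16-wide functions undo that with
;"sub rsi, rax" before loading the second 8-byte half.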

%macro APPLY_FILTER_8 2
    punpcklbw   xmm0, zero
    punpcklbw   xmm1, zero
    punpcklbw   xmm6, zero
    punpcklbw   xmm7, zero
    punpcklbw   xmm2, zero
    punpcklbw   xmm5, zero
    punpcklbw   xmm3, zero
    punpcklbw   xmm4, zero

    pmullw      xmm0, k0
    pmullw      xmm1, k1
    pmullw      xmm6, k6
    pmullw      xmm7, k7
    pmullw      xmm2, k2
    pmullw      xmm5, k5
    pmullw      xmm3, k3
    pmullw      xmm4, k4

    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm5
    paddsw      xmm0, xmm3
    paddsw      xmm0, xmm4

    paddsw      xmm0, krd               ;rounding
    psraw       xmm0, 7                 ;shift
    packuswb    xmm0, xmm0              ;pack back to byte

%if %1
    movq        xmm1, [rdi + %2]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi + %2], xmm0
%endm

;void vpx_filter_block1d4_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
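
;A hedged usage sketch (hypothetical C caller, not part of this file): filter
;points at eight int16_t taps that sum to 128, and src_ptr at the topmost of
;the eight rows that contribute to an output row, i.e. three rows above the
;destination position:
;
;    int16_t kernel[8] = { -1, 3, -10, 35, 114, -13, 5, -5 };  /* illustrative */
;    /* filter a 4-pixel-wide, h-row-tall block */
;    vpx_filter_block1d4_v8_sse2(src - 3 * src_pitch, src_pitch,
;                                dst, dst_pitch, h, kernel);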
global sym(vpx_filter_block1d4_v8_sse2) PRIVATE
sym(vpx_filter_block1d4_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr

    movsxd      rax, DWORD PTR arg(1)   ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)   ;out_pitch
    lea         rdx, [rax + rax * 2]
    movsxd      rcx, DWORD PTR arg(4)   ;output_height

.loop:
    movd        xmm0, [rsi]             ;load src: row 0
    movd        xmm1, [rsi + rax]       ;1
    movd        xmm6, [rsi + rdx * 2]   ;6
    lea         rsi,  [rsi + rax]
    movd        xmm7, [rsi + rdx * 2]   ;7
    movd        xmm2, [rsi + rax]       ;2
    movd        xmm3, [rsi + rax * 2]   ;3
    movd        xmm4, [rsi + rdx]       ;4
    movd        xmm5, [rsi + rax * 4]   ;5

    APPLY_FILTER_4 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 6
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_filter_block1d8_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
global sym(vpx_filter_block1d8_v8_sse2) PRIVATE
sym(vpx_filter_block1d8_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)   ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)   ;out_pitch
    lea         rdx, [rax + rax * 2]
    movsxd      rcx, DWORD PTR arg(4)   ;output_height

.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 0, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_filter_block1d16_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
global sym(vpx_filter_block1d16_v8_sse2) PRIVATE
sym(vpx_filter_block1d16_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)   ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)   ;out_pitch
    lea         rdx, [rax + rax * 2]
    movsxd      rcx, DWORD PTR arg(4)   ;output_height

.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 0, 0
    sub         rsi, rax

    LOAD_VERT_8 8
    APPLY_FILTER_8 0, 8

    add         rdi, rbx
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE
sym(vpx_filter_block1d4_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr

    movsxd      rax, DWORD PTR arg(1)   ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)   ;out_pitch
    lea         rdx, [rax + rax * 2]
    movsxd      rcx, DWORD PTR arg(4)   ;output_height

.loop:
    movd        xmm0, [rsi]             ;load src: row 0
    movd        xmm1, [rsi + rax]       ;1
    movd        xmm6, [rsi + rdx * 2]   ;6
    lea         rsi,  [rsi + rax]
    movd        xmm7, [rsi + rdx * 2]   ;7
    movd        xmm2, [rsi + rax]       ;2
    movd        xmm3, [rsi + rax * 2]   ;3
    movd        xmm4, [rsi + rdx]       ;4
    movd        xmm5, [rsi + rax * 4]   ;5

    APPLY_FILTER_4 1

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 6
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE
sym(vpx_filter_block1d8_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)   ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)   ;out_pitch
    lea         rdx, [rax + rax * 2]
    movsxd      rcx, DWORD PTR arg(4)   ;output_height

.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 1, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE
sym(vpx_filter_block1d16_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)   ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)   ;out_pitch
    lea         rdx, [rax + rax * 2]
    movsxd      rcx, DWORD PTR arg(4)   ;output_height

.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 1, 0
    sub         rsi, rax

    LOAD_VERT_8 8
    APPLY_FILTER_8 1, 8

    add         rdi, rbx
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_filter_block1d4_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int   output_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
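
;In the horizontal kernels below, src_ptr points at the pixel being filtered:
;each iteration performs an unaligned 16-byte load from [rsi - 3] and then
;uses psrldq by 1..7 to line the pixels at offsets -3..+4 up with taps
;k0..k7, so the centre tap k3 multiplies the pixel at src_ptr itself.  A
;hedged sketch of the matching call (hypothetical C caller, kernel as in the
;earlier sketch):
;
;    vpx_filter_block1d4_h8_sse2(src, src_pixels_per_line,
;                                dst, output_pitch, h, kernel);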
global sym(vpx_filter_block1d4_h8_sse2) PRIVATE
sym(vpx_filter_block1d4_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr

    movsxd      rax, DWORD PTR arg(1)   ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)   ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)   ;output_height

.loop:
    movdqu      xmm0, [rsi - 3]         ;load src
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm3, 3
    psrldq      xmm5, 5
    psrldq      xmm4, 4

    APPLY_FILTER_4 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 6
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_filter_block1d8_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int   output_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
global sym(vpx_filter_block1d8_h8_sse2) PRIVATE
sym(vpx_filter_block1d8_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)   ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)   ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)   ;output_height

.loop:
    movdqu      xmm0, [rsi - 3]         ;load src
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 0, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_filter_block1d16_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int   output_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
global sym(vpx_filter_block1d16_h8_sse2) PRIVATE
sym(vpx_filter_block1d16_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)   ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)   ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)   ;output_height

.loop:
    movdqu      xmm0, [rsi - 3]         ;load src
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 0, 0

    movdqu      xmm0, [rsi + 5]         ;load src
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 0, 8

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE
sym(vpx_filter_block1d4_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr

    movsxd      rax, DWORD PTR arg(1)   ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)   ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)   ;output_height

.loop:
    movdqu      xmm0, [rsi - 3]         ;load src
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm3, 3
    psrldq      xmm5, 5
    psrldq      xmm4, 4

    APPLY_FILTER_4 1

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 6
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE
sym(vpx_filter_block1d8_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)   ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)   ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)   ;output_height

.loop:
    movdqu      xmm0, [rsi - 3]         ;load src
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 1, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE
sym(vpx_filter_block1d16_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)   ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)   ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)   ;output_height

.loop:
    movdqu      xmm0, [rsi - 3]         ;load src
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 1, 0

    movdqu      xmm0, [rsi + 5]         ;load src
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 1, 8

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret