123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630 |
- ;
- ; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- ;
- ; Use of this source code is governed by a BSD-style license
- ; that can be found in the LICENSE file in the root of the source
- ; tree. An additional intellectual property rights grant can be found
- ; in the file PATENTS. All contributing project authors may
- ; be found in the AUTHORS file in the root of the source tree.
- ;
- %include "third_party/x86inc/x86inc.asm"
- SECTION_RODATA ; section macro not defined in this file (presumably supplied by the included x86inc.asm)
- pw_64: times 8 dw 64 ; eight 16-bit words of 64; loaded into krd and added before each "psraw ..., 7"
- ; %define USE_PMULHRSW
- ; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
- ; when using this instruction.
- ;
- ; The add order below (based on ffvp9) must be followed to prevent out-of-range (overflowed) intermediate sums.
- ; x = k0k1 + k4k5
- ; y = k2k3 + k6k7
- ; z = signed SAT(x + y)
- SECTION .text
- %if ARCH_X86_64
- %define LOCAL_VARS_SIZE 16*4 ; four 16-byte stack slots: k0k1, k2k3, k4k5, k6k7
- %else
- %define LOCAL_VARS_SIZE 16*6 ; same four slots plus two more used for tmp and krd on x86-32
- %endif
- %macro SETUP_LOCAL_VARS 0
- ; Expects the filter taps in m4 (every caller does "mova m4, [filterq]" first).
- ; Packs the taps to signed bytes, broadcasts each adjacent tap pair across a
- ; whole register and spills the four pairs to the stack slots k0k1..k6k7.
- ; Also defines krd (the pw_64 rounding term) and the tmp scratch location.
- ; Clobbers m0-m4, plus m6 on x86-32.
- ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
- ; pmaddubsw has a higher latency on some platforms, this might be eased by
- ; interleaving the instructions.
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- packsswb m4, m4 ;16-bit taps -> signed bytes (saturating)
- ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
- ; some platforms.
- ; taps 0/1: replicate word 0 over the register and spill it
- pshuflw m0, m4, 00000000b ;k0_k1
- punpcklqdq m0, m0
- mova k0k1, m0
- ; taps 2/3: word 1
- pshuflw m1, m4, 01010101b ;k2_k3
- punpcklqdq m1, m1
- mova k2k3, m1
- ; taps 4/5: word 2
- pshuflw m2, m4, 10101010b ;k4_k5
- punpcklqdq m2, m2
- mova k4k5, m2
- ; taps 6/7: word 3
- pshuflw m3, m4, 11111111b ;k6_k7
- punpcklqdq m3, m3
- mova k6k7, m3
- %if ARCH_X86_64
- ; x86-64: the rounding term and the scratch value live in registers
- %define krd m12
- %define tmp m13
- mova krd, [GLOBAL(pw_64)]
- %else
- ; x86-32: both live in the two extra stack slots
- %define tmp [rsp + 16*4]
- %define krd [rsp + 16*5]
- %if CONFIG_PIC=0
- mova m6, [GLOBAL(pw_64)]
- %else
- ; build constants without accessing global memory
- pcmpeqb m6, m6 ;every bit set
- psrlw m6, 15 ;each word = 1
- psllw m6, 6 ;each word = 64, aka pw_64
- %endif
- mova krd, m6
- %endif
- %endm
- %macro HORIZx4_ROW 2
- ; Filters one row of 4 pixels horizontally.
- ; %1: register holding 16 consecutive source bytes (overwritten)
- ; %2: receives the packed result bytes
- ; Uses k0k1k4k5, k2k3k6k7 and krd as defined by the invoking macro.
- ; Clobbers m3, m4 and m5, so neither argument may be one of them.
- mova %2, %1
- punpcklbw %1, %1 ;source bytes 0..7, each doubled
- punpckhbw %2, %2 ;source bytes 8..15, each doubled
- mova m3, %2
- ; x = k0k1 + k4k5
- palignr %2, %1, 1 ;byte pairs (s0,s1),(s1,s2),...
- pmaddubsw %2, k0k1k4k5
- mova m4, %2 ;k0k1
- psrldq %2, 8 ;k4k5
- paddsw %2, m4
- ; y = k2k3 + k6k7
- palignr m3, %1, 5 ;byte pairs (s2,s3),(s3,s4),...
- pmaddubsw m3, k2k3k6k7
- mova m5, m3 ;k2k3
- psrldq m3, 8 ;k6k7
- paddsw m5, m3
- ; z = signed SAT(x + y), then round, shift and pack
- paddsw %2, m5
- paddsw %2, krd
- psraw %2, 7
- packuswb %2, %2
- %endm
- ;-------------------------------------------------------------------------------
- %macro SUBPIX_HFILTER4 1
- ; Emits filter_block1d4_%1(src, sstride, dst, dstride, height, filter):
- ; an 8-tap horizontal filter that writes 4 output bytes per row.
- ; %1 = h8 stores the result; %1 = h8_avg first averages it (pavgb) with the
- ; bytes already in dst.
- ; Rows are handled two at a time; a trailing odd row goes through
- ; HORIZx4_ROW. Per pixel the sum is (k0k1 + k4k5) + (k2k3 + k6k7) + 64,
- ; arithmetically shifted right by 7 and packed with unsigned saturation.
- ; 8 GPRs are requested on x86-64 because orig_height lives in r7d.
- cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- packsswb m4, m4 ;16-bit taps -> signed bytes
- %if ARCH_X86_64
- %define k0k1k4k5 m8
- %define k2k3k6k7 m9
- %define krd m10
- %define orig_height r7d
- mova krd, [GLOBAL(pw_64)]
- pshuflw k0k1k4k5, m4, 0b ;k0_k1
- pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
- pshuflw k2k3k6k7, m4, 01010101b ;k2_k3
- pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
- %else
- %define k0k1k4k5 [rsp + 16*0]
- %define k2k3k6k7 [rsp + 16*1]
- %define krd [rsp + 16*2]
- %define orig_height [rsp + 16*3]
- pshuflw m6, m4, 0b ;k0_k1
- pshufhw m6, m6, 10101010b ;k0_k1_k4_k5
- pshuflw m7, m4, 01010101b ;k2_k3
- pshufhw m7, m7, 11111111b ;k2_k3_k6_k7
- %if CONFIG_PIC=0
- mova m1, [GLOBAL(pw_64)]
- %else
- ; build constants without accessing global memory
- pcmpeqb m1, m1 ;all ones
- psrlw m1, 15
- psllw m1, 6 ;aka pw_64
- %endif
- mova k0k1k4k5, m6
- mova k2k3k6k7, m7
- mova krd, m1
- %endif
- mov orig_height, heightd ;keep the full height for the odd-row test
- shr heightd, 1 ;heightd = number of row pairs
- ; With height < 2 there is no row pair. Entering the loop anyway would
- ; decrement the zero counter and wrap, so skip straight to the tail.
- jz .last_row
- .loop:
- ;Do two rows at once
- movh m0, [srcq - 3]
- movh m1, [srcq + 5]
- punpcklqdq m0, m1 ;row 0: 16 source bytes starting at src - 3
- mova m1, m0
- movh m2, [srcq + sstrideq - 3]
- movh m3, [srcq + sstrideq + 5]
- punpcklqdq m2, m3 ;row 1: 16 source bytes starting at src - 3
- mova m3, m2
- punpcklbw m0, m0
- punpckhbw m1, m1
- punpcklbw m2, m2
- punpckhbw m3, m3
- mova m4, m1
- palignr m4, m0, 1
- pmaddubsw m4, k0k1k4k5
- palignr m1, m0, 5
- pmaddubsw m1, k2k3k6k7
- mova m7, m3
- palignr m7, m2, 1
- pmaddubsw m7, k0k1k4k5
- palignr m3, m2, 5
- pmaddubsw m3, k2k3k6k7
- mova m0, m4 ;k0k1
- mova m5, m1 ;k2k3
- mova m2, m7 ;k0k1 upper
- psrldq m4, 8 ;k4k5
- psrldq m1, 8 ;k6k7
- paddsw m4, m0
- paddsw m5, m1
- mova m1, m3 ;k2k3 upper
- psrldq m7, 8 ;k4k5 upper
- psrldq m3, 8 ;k6k7 upper
- paddsw m7, m2
- paddsw m4, m5
- paddsw m1, m3
- paddsw m7, m1
- paddsw m4, krd
- psraw m4, 7
- packuswb m4, m4
- paddsw m7, krd
- psraw m7, 7
- packuswb m7, m7
- %ifidn %1, h8_avg
- movd m0, [dstq]
- pavgb m4, m0
- movd m2, [dstq + dstrideq]
- pavgb m7, m2
- %endif
- movd [dstq], m4
- movd [dstq + dstrideq], m7
- lea srcq, [srcq + sstrideq ]
- prefetcht0 [srcq + 4 * sstrideq - 3]
- lea srcq, [srcq + sstrideq ]
- lea dstq, [dstq + 2 * dstrideq ]
- prefetcht0 [srcq + 2 * sstrideq - 3]
- dec heightd
- jnz .loop
- .last_row:
- ; Do last row if output_height is odd
- mov heightd, orig_height
- and heightd, 1
- je .done
- movh m0, [srcq - 3] ; load src
- movh m1, [srcq + 5]
- punpcklqdq m0, m1
- HORIZx4_ROW m0, m1
- %ifidn %1, h8_avg
- movd m0, [dstq]
- pavgb m1, m0
- %endif
- movd [dstq], m1
- .done:
- RET
- %endm
- %macro HORIZx8_ROW 5
- ; Filters one row of 8 pixels horizontally.
- ; %1: register holding 16 consecutive source bytes on entry; after the
- ;     final SWAP this name refers to the packed result
- ; %2-%5: scratch registers
- ; Uses k0k1..k6k7 and krd as defined by SETUP_LOCAL_VARS.
- mova %2, %1
- punpcklbw %1, %1 ;source bytes 0..7, each doubled
- punpckhbw %2, %2 ;source bytes 8..15, each doubled
- mova %3, %2
- mova %4, %2
- mova %5, %2
- ; x = k0k1 + k4k5
- palignr %2, %1, 1
- pmaddubsw %2, k0k1
- palignr %4, %1, 9
- pmaddubsw %4, k4k5
- paddsw %2, %4
- ; y = k2k3 + k6k7
- palignr %3, %1, 5
- pmaddubsw %3, k2k3
- palignr %5, %1, 13
- pmaddubsw %5, k6k7
- paddsw %5, %3
- ; z = signed SAT(x + y), then round, shift and pack
- paddsw %2, %5
- paddsw %2, krd
- psraw %2, 7
- packuswb %2, %2
- SWAP %1, %2 ;hand the result back under the first argument's name
- %endm
- ;-------------------------------------------------------------------------------
- %macro SUBPIX_HFILTER8 1
- ; Emits filter_block1d8_%1(src, sstride, dst, dstride, height, filter):
- ; an 8-tap horizontal filter that writes 8 output bytes per row.
- ; %1 = h8 stores the result; %1 = h8_avg first averages it (pavgb) with the
- ; bytes already in dst.
- ; Rows are handled two at a time with the two rows' instructions
- ; interleaved; a trailing odd row goes through HORIZx8_ROW.
- ; 8 GPRs are requested on x86-64 because orig_height lives in r7d; with
- ; only 7 declared, r7 would not be covered by the prologue's register
- ; saving (it is a callee-saved register under the WIN64 mapping).
- cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*2), 14, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- SETUP_LOCAL_VARS
- %if ARCH_X86_64
- %define orig_height r7d
- %else
- %define orig_height heightmp
- %endif
- mov orig_height, heightd ;keep the full height for the odd-row test
- shr heightd, 1 ;heightd = number of row pairs
- ; With height < 2 there is no row pair. Entering the loop anyway would
- ; decrement the zero counter and wrap, so skip straight to the tail.
- jz .last_row
- .loop:
- movh m0, [srcq - 3]
- movh m3, [srcq + 5]
- movh m4, [srcq + sstrideq - 3]
- movh m7, [srcq + sstrideq + 5]
- punpcklqdq m0, m3 ;row 0: 16 source bytes starting at src - 3
- mova m1, m0
- punpcklbw m0, m0
- punpckhbw m1, m1
- mova m5, m1
- palignr m5, m0, 13
- pmaddubsw m5, k6k7
- mova m2, m1
- mova m3, m1
- palignr m1, m0, 1
- pmaddubsw m1, k0k1
- punpcklqdq m4, m7 ;row 1: 16 source bytes starting at src - 3
- mova m6, m4
- punpcklbw m4, m4
- palignr m2, m0, 5
- punpckhbw m6, m6
- palignr m3, m0, 9
- mova m7, m6
- pmaddubsw m2, k2k3
- pmaddubsw m3, k4k5
- palignr m7, m4, 13
- mova m0, m6
- palignr m0, m4, 5
- pmaddubsw m7, k6k7
- paddsw m1, m3 ;row 0: k0k1 + k4k5
- paddsw m2, m5 ;row 0: k2k3 + k6k7
- paddsw m1, m2
- mova m5, m6
- palignr m6, m4, 1
- pmaddubsw m0, k2k3
- pmaddubsw m6, k0k1
- palignr m5, m4, 9
- paddsw m1, krd
- pmaddubsw m5, k4k5
- psraw m1, 7
- paddsw m0, m7 ;row 1: k2k3 + k6k7
- %ifidn %1, h8_avg
- movh m7, [dstq]
- movh m2, [dstq + dstrideq]
- %endif
- packuswb m1, m1
- paddsw m6, m5 ;row 1: k0k1 + k4k5
- paddsw m6, m0
- paddsw m6, krd
- psraw m6, 7
- packuswb m6, m6
- %ifidn %1, h8_avg
- pavgb m1, m7
- pavgb m6, m2
- %endif
- movh [dstq], m1
- movh [dstq + dstrideq], m6
- lea srcq, [srcq + sstrideq ]
- prefetcht0 [srcq + 4 * sstrideq - 3]
- lea srcq, [srcq + sstrideq ]
- lea dstq, [dstq + 2 * dstrideq ]
- prefetcht0 [srcq + 2 * sstrideq - 3]
- dec heightd
- jnz .loop
- .last_row:
- ;Do last row if output_height is odd
- mov heightd, orig_height
- and heightd, 1
- je .done
- movh m0, [srcq - 3]
- movh m3, [srcq + 5]
- punpcklqdq m0, m3
- HORIZx8_ROW m0, m1, m2, m3, m4
- %ifidn %1, h8_avg
- movh m1, [dstq]
- pavgb m0, m1
- %endif
- movh [dstq], m0
- .done:
- RET
- %endm
- ;-------------------------------------------------------------------------------
- %macro SUBPIX_HFILTER16 1 ; emits filter_block1d16_%1: 8-tap horizontal filter, 16 output bytes per row, one row per iteration; %1 is h8 (store) or h8_avg (pavgb with dst, then store)
- cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- SETUP_LOCAL_VARS
- .loop:
- prefetcht0 [srcq + 2 * sstrideq -3] ;prefetch the source row two rows ahead
- movh m0, [srcq - 3] ;source bytes -3..4
- movh m4, [srcq + 5] ;source bytes 5..12
- movh m6, [srcq + 13] ;source bytes 13..20
- punpcklqdq m0, m4 ;m0 = bytes -3..12, input for output pixels 0..7
- mova m7, m0
- punpckhbw m0, m0 ;upper 8 bytes, each doubled
- mova m1, m0
- punpcklqdq m4, m6 ;m4 = bytes 5..20, input for output pixels 8..15
- mova m3, m0
- punpcklbw m7, m7 ;lower 8 bytes, each doubled
- palignr m3, m7, 13
- mova m2, m0
- pmaddubsw m3, k6k7 ;k6k7 terms, pixels 0..7
- palignr m0, m7, 1
- pmaddubsw m0, k0k1 ;k0k1 terms, pixels 0..7
- palignr m1, m7, 5
- pmaddubsw m1, k2k3 ;k2k3 terms, pixels 0..7
- palignr m2, m7, 9
- pmaddubsw m2, k4k5 ;k4k5 terms, pixels 0..7
- paddsw m1, m3 ;y = k2k3 + k6k7
- mova m3, m4
- punpckhbw m4, m4 ;second half: upper 8 bytes doubled
- mova m5, m4
- punpcklbw m3, m3 ;second half: lower 8 bytes doubled
- mova m7, m4
- palignr m5, m3, 5
- mova m6, m4
- palignr m4, m3, 1
- pmaddubsw m4, k0k1 ;k0k1 terms, pixels 8..15
- pmaddubsw m5, k2k3 ;k2k3 terms, pixels 8..15
- palignr m6, m3, 9
- pmaddubsw m6, k4k5 ;k4k5 terms, pixels 8..15
- palignr m7, m3, 13
- pmaddubsw m7, k6k7 ;k6k7 terms, pixels 8..15
- paddsw m0, m2 ;x = k0k1 + k4k5
- paddsw m0, m1 ;x + y for pixels 0..7
- %ifidn %1, h8_avg
- mova m1, [dstq] ;existing dst row for averaging; NOTE(review): mova presumably needs dst 16-byte aligned - confirm
- %endif
- paddsw m4, m6 ;x = k0k1 + k4k5
- paddsw m5, m7 ;y = k2k3 + k6k7
- paddsw m4, m5 ;x + y for pixels 8..15
- paddsw m0, krd ;add the rounding term
- paddsw m4, krd
- psraw m0, 7
- psraw m4, 7
- packuswb m0, m4 ;16 result bytes, unsigned saturation
- %ifidn %1, h8_avg
- pavgb m0, m1
- %endif
- lea srcq, [srcq + sstrideq] ;advance to the next source row
- mova [dstq], m0
- lea dstq, [dstq + dstrideq] ;advance to the next destination row
- dec heightd
- jnz .loop ;counter is tested after the body, so the body always runs at least once
- RET
- %endm
- INIT_XMM ssse3 ; the expansions below are declared for the ssse3 instruction set
- SUBPIX_HFILTER16 h8 ; expands cglobal filter_block1d16_h8
- SUBPIX_HFILTER16 h8_avg ; expands cglobal filter_block1d16_h8_avg
- SUBPIX_HFILTER8 h8 ; expands cglobal filter_block1d8_h8
- SUBPIX_HFILTER8 h8_avg ; expands cglobal filter_block1d8_h8_avg
- SUBPIX_HFILTER4 h8 ; expands cglobal filter_block1d4_h8
- SUBPIX_HFILTER4 h8_avg ; expands cglobal filter_block1d4_h8_avg
- ;-------------------------------------------------------------------------------
- %macro SUBPIX_VFILTER 2
- ; Emits filter_block1d%2_%1(src, sstride, dst, dstride, height, filter):
- ; an 8-tap vertical filter over a column block %2 bytes wide.
- ; %1 = v8 stores the result; %1 = v8_avg first averages it (pavgb) with the
- ; bytes already in dst.
- ; %2 = 8 moves rows with movh, any other value (4 here) with movd.
- ; Source rows are named A..I, A being the row at srcq. One output row
- ; uses A..H; the loop produces two output rows per pass (A..H and B..I)
- ; and a trailing odd row is handled after the loop.
- cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- SETUP_LOCAL_VARS
- %if ARCH_X86_64
- %define src1q r7
- %define sstride6q r8
- %define dst_stride dstrideq
- %else
- ; x86-32: filterq is free once the taps are loaded, and dstrideq is
- ; reused for sstride * 6, so the dst stride comes from dstridemp instead.
- %define src1q filterq
- %define sstride6q dstrideq
- %define dst_stride dstridemp
- %endif
- mov src1q, srcq
- add src1q, sstrideq ;src1q = src + pitch (row B)
- lea sstride6q, [sstrideq + sstrideq * 4]
- add sstride6q, sstrideq ;pitch * 6
- %ifidn %2, 8
- %define movx movh
- %else
- %define movx movd
- %endif
- ; The loop always writes two rows. With height < 2, go straight to the
- ; remainder handling so that no rows beyond the requested height are written.
- cmp heightd, 2
- jl .remainder
- .loop:
- movx m0, [srcq ] ;A
- movx m1, [srcq + sstrideq ] ;B
- punpcklbw m0, m1 ;A B
- movx m2, [srcq + sstrideq * 2 ] ;C
- pmaddubsw m0, k0k1
- mova m6, m2 ;keep C for the second output row
- movx m3, [src1q + sstrideq * 2] ;D
- punpcklbw m2, m3 ;C D
- pmaddubsw m2, k2k3
- movx m4, [srcq + sstrideq * 4 ] ;E
- mova m7, m4 ;keep E for the second output row
- movx m5, [src1q + sstrideq * 4] ;F
- punpcklbw m4, m5 ;E F
- pmaddubsw m4, k4k5
- punpcklbw m1, m6 ;B C (k0k1 pair of the second row)
- movx m6, [srcq + sstride6q ] ;G
- punpcklbw m5, m6 ;F G (k4k5 pair of the second row)
- punpcklbw m3, m7 ;D E (k2k3 pair of the second row)
- pmaddubsw m5, k4k5
- movx m7, [src1q + sstride6q ] ;H
- punpcklbw m6, m7 ;G H
- pmaddubsw m6, k6k7
- pmaddubsw m3, k2k3
- pmaddubsw m1, k0k1
- paddsw m0, m4 ;first row: k0k1 + k4k5
- paddsw m2, m6 ;first row: k2k3 + k6k7
- movx m6, [srcq + sstrideq * 8 ] ;I
- punpcklbw m7, m6 ;H I (k6k7 pair of the second row)
- pmaddubsw m7, k6k7
- paddsw m0, m2
- paddsw m0, krd
- psraw m0, 7
- paddsw m1, m5 ;second row: k0k1 + k4k5
- packuswb m0, m0
- paddsw m3, m7 ;second row: k2k3 + k6k7
- paddsw m1, m3
- paddsw m1, krd
- psraw m1, 7
- lea srcq, [srcq + sstrideq * 2 ]
- lea src1q, [src1q + sstrideq * 2]
- packuswb m1, m1
- %ifidn %1, v8_avg
- movx m2, [dstq]
- pavgb m0, m2
- %endif
- movx [dstq], m0
- add dstq, dst_stride
- %ifidn %1, v8_avg
- movx m3, [dstq]
- pavgb m1, m3
- %endif
- movx [dstq], m1
- add dstq, dst_stride
- sub heightd, 2
- cmp heightd, 1
- jg .loop
- .remainder:
- ; heightd is 1 when a single row is left; anything lower means done
- cmp heightd, 0
- jle .done
- movx m0, [srcq ] ;A
- movx m1, [srcq + sstrideq ] ;B
- movx m6, [srcq + sstride6q ] ;G
- punpcklbw m0, m1 ;A B
- movx m7, [src1q + sstride6q ] ;H
- pmaddubsw m0, k0k1
- movx m2, [srcq + sstrideq * 2 ] ;C
- punpcklbw m6, m7 ;G H
- movx m3, [src1q + sstrideq * 2] ;D
- pmaddubsw m6, k6k7
- movx m4, [srcq + sstrideq * 4 ] ;E
- punpcklbw m2, m3 ;C D
- movx m5, [src1q + sstrideq * 4] ;F
- punpcklbw m4, m5 ;E F
- pmaddubsw m2, k2k3
- pmaddubsw m4, k4k5
- paddsw m2, m6 ;k2k3 + k6k7
- paddsw m0, m4 ;k0k1 + k4k5
- paddsw m0, m2
- paddsw m0, krd
- psraw m0, 7
- packuswb m0, m0
- %ifidn %1, v8_avg
- movx m1, [dstq]
- pavgb m0, m1
- %endif
- movx [dstq], m0
- .done:
- RET
- %endm
- ;-------------------------------------------------------------------------------
- %macro SUBPIX_VFILTER16 1
- ; Emits filter_block1d16_%1(src, sstride, dst, dstride, height, filter):
- ; an 8-tap vertical filter writing 16 output bytes per row, one row per
- ; loop iteration.
- ; %1 = v8 stores the result; %1 = v8_avg first averages it (pavgb) with the
- ; 16 bytes already in dst.
- ; Source rows are named A..H, A being the row at srcq. The left 8 columns
- ; are accumulated in m0 and the right 8 columns (offset + 8) in m3; both
- ; halves are packed together and written with a single 16-byte store.
- cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- SETUP_LOCAL_VARS
- %if ARCH_X86_64
- %define src1q r7
- %define sstride6q r8
- %define dst_stride dstrideq
- %else
- ; x86-32: filterq is free once the taps are loaded, and dstrideq is
- ; reused for sstride * 6, so the dst stride comes from dstridemp instead.
- %define src1q filterq
- %define sstride6q dstrideq
- %define dst_stride dstridemp
- %endif
- mov src1q, srcq
- add src1q, sstrideq ;src1q = src + pitch (row B)
- lea sstride6q, [sstrideq + sstrideq * 4]
- add sstride6q, sstrideq ;pitch * 6
- .loop:
- movh m0, [srcq ] ;A
- movh m1, [srcq + sstrideq ] ;B
- movh m2, [srcq + sstrideq * 2 ] ;C
- movh m3, [src1q + sstrideq * 2] ;D
- movh m4, [srcq + sstrideq * 4 ] ;E
- movh m5, [src1q + sstrideq * 4] ;F
- punpcklbw m0, m1 ;A B
- movh m6, [srcq + sstride6q] ;G
- punpcklbw m2, m3 ;C D
- movh m7, [src1q + sstride6q] ;H
- punpcklbw m4, m5 ;E F
- pmaddubsw m0, k0k1
- movh m3, [srcq + 8] ;A
- pmaddubsw m2, k2k3
- punpcklbw m6, m7 ;G H
- movh m5, [srcq + sstrideq + 8] ;B
- pmaddubsw m4, k4k5
- punpcklbw m3, m5 ;A B
- movh m7, [srcq + sstrideq * 2 + 8] ;C
- pmaddubsw m6, k6k7
- movh m5, [src1q + sstrideq * 2 + 8] ;D
- punpcklbw m7, m5 ;C D
- paddsw m2, m6 ;left half: k2k3 + k6k7
- pmaddubsw m3, k0k1
- movh m1, [srcq + sstrideq * 4 + 8] ;E
- paddsw m0, m4 ;left half: k0k1 + k4k5
- pmaddubsw m7, k2k3
- movh m6, [src1q + sstrideq * 4 + 8] ;F
- punpcklbw m1, m6 ;E F
- paddsw m0, m2
- paddsw m0, krd
- movh m2, [srcq + sstride6q + 8] ;G
- pmaddubsw m1, k4k5
- movh m5, [src1q + sstride6q + 8] ;H
- psraw m0, 7
- punpcklbw m2, m5 ;G H
- pmaddubsw m2, k6k7
- %ifidn %1, v8_avg
- mova m4, [dstq] ;existing dst row for averaging
- %endif
- ; No store of m0 here: it still holds unpacked words, and the full
- ; 16-byte row is written once both halves are packed below.
- paddsw m7, m2 ;right half: k2k3 + k6k7
- paddsw m3, m1 ;right half: k0k1 + k4k5
- paddsw m3, m7
- paddsw m3, krd
- psraw m3, 7
- packuswb m0, m3 ;left half in the low 8 bytes, right half in the high 8
- add srcq, sstrideq
- add src1q, sstrideq
- %ifidn %1, v8_avg
- pavgb m0, m4
- %endif
- mova [dstq], m0
- add dstq, dst_stride
- dec heightd
- jnz .loop
- RET
- %endm
- INIT_XMM ssse3 ; the expansions below are declared for the ssse3 instruction set
- SUBPIX_VFILTER16 v8 ; expands cglobal filter_block1d16_v8
- SUBPIX_VFILTER16 v8_avg ; expands cglobal filter_block1d16_v8_avg
- SUBPIX_VFILTER v8, 8 ; expands cglobal filter_block1d8_v8 (rows moved with movh)
- SUBPIX_VFILTER v8_avg, 8 ; expands cglobal filter_block1d8_v8_avg
- SUBPIX_VFILTER v8, 4 ; expands cglobal filter_block1d4_v8 (rows moved with movd)
- SUBPIX_VFILTER v8_avg, 4 ; expands cglobal filter_block1d4_v8_avg
|