; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION_RODATA
; dav1d_obmc_masks[] << 9
obmc_masks: dw 0, 0, 9728, 0, 12800, 7168, 2560, 0
dw 14336, 11264, 8192, 5632, 3584, 1536, 0, 0
dw 15360, 13824, 12288, 10752, 9216, 7680, 6144, 5120
dw 4096, 3072, 2048, 1536, 0, 0, 0, 0
dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240
dw 9728, 8704, 8192, 7168, 6656, 6144, 5632, 4608
dw 4096, 3584, 3072, 2560, 2048, 2048, 1536, 1024
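; The 6-bit masks are stored pre-shifted by 9 so that a single
; pmulhrsw(diff, mask<<9) evaluates to (diff * mask + 32) >> 6,
; i.e. the OBMC blend rounding, in one instruction.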
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
spel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
spel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
rescale_mul: dd 0, 1, 2, 3
resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
bdct_lb_q: times 8 db 0
times 8 db 4
times 8 db 8
times 8 db 12
pw_2: times 8 dw 2
pw_16: times 4 dw 16
prep_mul: times 4 dw 16
times 8 dw 4
pw_64: times 8 dw 64
pw_256: times 8 dw 256
pw_2048: times 4 dw 2048
bidir_mul: times 4 dw 2048
pw_8192: times 8 dw 8192
pw_27615: times 8 dw 27615
pw_32766: times 8 dw 32766
pw_m512: times 8 dw -512
pd_63: times 4 dd 63
pd_64: times 4 dd 64
pd_512: times 4 dd 512
pd_2560: times 2 dd 2560
pd_8704: times 2 dd 8704
pd_m524256: times 4 dd -524256 ; (-8192 << 6) + 32
pd_0x3ff: times 4 dd 0x3ff
pd_0x4000: times 4 dd 0x4000
pq_0x400000: times 2 dq 0x400000
pq_0x40000000: times 2 dq 0x40000000
pd_65538: times 2 dd 65538
put_bilin_h_rnd: times 4 dw 8
times 4 dw 10
s_8tap_h_rnd: times 2 dd 2
times 2 dd 8
put_s_8tap_v_rnd: times 2 dd 512
times 2 dd 128
s_8tap_h_sh: dd 2, 4
put_s_8tap_v_sh: dd 10, 8
bidir_rnd: times 4 dw -16400
times 4 dw -16388
put_8tap_h_rnd: dd 34, 34, 40, 40
prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4)
prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5)
warp8x8_shift: dd 11, 13
warp8x8_rnd1: dd 1024, 1024, 4096, 4096
warp8x8_rnd2: times 4 dw 4096
times 4 dw 16384
warp8x8t_rnd: times 2 dd 16384 - (8192 << 15)
%macro BIDIR_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - 2*%3)
%xdefine %%base %1_%2_table
%xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
%rep %0 - 2
dd %%prefix %+ .w%3 - %%base
%rotate 1
%endrep
%endmacro
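; Each table holds dd offsets of the .w%d entry points relative to the
; table itself; the -2*%3 base bias cancels the scaled index so callers
; can load the entry directly with wq = tzcnt(w) (this works because the
; smallest width %3 is always 2 or 4).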
BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask , ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 32, 64, 128
%macro BASE_JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - %3)
%xdefine %%base %1_%2
%%table:
%rep %0 - 2
dw %%base %+ _w%3 - %%base
%rotate 1
%endrep
%endmacro
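; Same scheme with word-sized entries: the caller loads the offset with
; wq = tzcnt(w) and adds the mangled base back in, as in .put below.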
%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put)
%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep)
BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128
%macro SCALED_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - %3)
%xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
%rep %0 - 2
dw %%base %+ .w%3 - %%base
%rotate 1
%endrep
%rotate 2
%%dy_1024:
%xdefine %1_%2_dy1_table (%%dy_1024 - %3)
%rep %0 - 2
dw %%base %+ .dy1_w%3 - %%base
%rotate 1
%endrep
%rotate 2
%%dy_2048:
%xdefine %1_%2_dy2_table (%%dy_2048 - %3)
%rep %0 - 2
dw %%base %+ .dy2_w%3 - %%base
%rotate 1
%endrep
%endmacro
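; Three consecutive sub-tables per function: the generic scaled path plus
; specialized paths for the fixed vertical steps dy == 1024 (.dy1_*) and
; dy == 2048 (.dy2_*).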
SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128
cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
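; the -8 converts the 1-based subpel position to a 0-based row index
; (8 bytes per filter): with mx already including the filter-set offset,
; [subpel_filters+mxq*8] addresses filters[set][mx-1]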
cextern mc_warp_filter
cextern resize_filter
SECTION .text
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif
INIT_XMM ssse3
cglobal put_bilin_16bpc, 4, 7, 0, dst, ds , src, ss , w, h, mxy
%define base t0-put_ssse3
mov mxyd, r6m ; mx
LEA t0, put_ssse3
movifnidn wd, wm
test mxyd, mxyd
jnz .h
mov mxyd, r7m ; my
test mxyd, mxyd
jnz .v
.put:
tzcnt wd, wd
movzx wd, word [base+put_ssse3_table+wq*2]
add wq, t0
movifnidn hd, hm
jmp wq
.put_w2:
mov r4d, [srcq+ssq*0]
mov r6d, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mov [dstq+dsq*0], r4d
mov [dstq+dsq*1], r6d
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w2
RET
.put_w4:
movq m0, [srcq+ssq*0]
movq m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movq [dstq+dsq*0], m0
movq [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w4
RET
.put_w8:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w8
RET
.put_w16:
movu m0, [srcq+ssq*0+16*0]
movu m1, [srcq+ssq*0+16*1]
movu m2, [srcq+ssq*1+16*0]
movu m3, [srcq+ssq*1+16*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0+16*0], m0
mova [dstq+dsq*0+16*1], m1
mova [dstq+dsq*1+16*0], m2
mova [dstq+dsq*1+16*1], m3
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w16
RET
.put_w32:
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
movu m3, [srcq+16*3]
add srcq, ssq
mova [dstq+16*0], m0
mova [dstq+16*1], m1
mova [dstq+16*2], m2
mova [dstq+16*3], m3
add dstq, dsq
dec hd
jg .put_w32
RET
.put_w64:
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
movu m3, [srcq+16*3]
mova [dstq+16*0], m0
mova [dstq+16*1], m1
mova [dstq+16*2], m2
mova [dstq+16*3], m3
movu m0, [srcq+16*4]
movu m1, [srcq+16*5]
movu m2, [srcq+16*6]
movu m3, [srcq+16*7]
add srcq, ssq
mova [dstq+16*4], m0
mova [dstq+16*5], m1
mova [dstq+16*6], m2
mova [dstq+16*7], m3
add dstq, dsq
dec hd
jg .put_w64
RET
.put_w128:
add srcq, 16*8
add dstq, 16*8
.put_w128_loop:
movu m0, [srcq-16*8]
movu m1, [srcq-16*7]
movu m2, [srcq-16*6]
movu m3, [srcq-16*5]
mova [dstq-16*8], m0
mova [dstq-16*7], m1
mova [dstq-16*6], m2
mova [dstq-16*5], m3
movu m0, [srcq-16*4]
movu m1, [srcq-16*3]
movu m2, [srcq-16*2]
movu m3, [srcq-16*1]
mova [dstq-16*4], m0
mova [dstq-16*3], m1
mova [dstq-16*2], m2
mova [dstq-16*1], m3
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
movu m3, [srcq+16*3]
mova [dstq+16*0], m0
mova [dstq+16*1], m1
mova [dstq+16*2], m2
mova [dstq+16*3], m3
movu m0, [srcq+16*4]
movu m1, [srcq+16*5]
movu m2, [srcq+16*6]
movu m3, [srcq+16*7]
add srcq, ssq
mova [dstq+16*4], m0
mova [dstq+16*5], m1
mova [dstq+16*6], m2
mova [dstq+16*7], m3
add dstq, dsq
dec hd
jg .put_w128_loop
RET
.h:
movd m5, mxyd
mov mxyd, r7m ; my
mova m4, [base+pw_16]
pshufb m5, [base+pw_256]
psubw m4, m5
test mxyd, mxyd
jnz .hv
; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
mov r6d, r8m ; bitdepth_max
shr r6d, 11
movddup m3, [base+put_bilin_h_rnd+r6*8]
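; r6 = 0 for 10bpc (max 0x3ff) or 1 for 12bpc (max 0xfff); the filter
; below computes ((16-mx)*px[x] + mx*px[x+1] + rnd) >> 4 with m4 = 16-mx,
; m5 = mx and rnd = 8 or 10 from the table above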
movifnidn hd, hm
sub wd, 8
jg .h_w16
je .h_w8
cmp wd, -4
je .h_w4
.h_w2:
movq m1, [srcq+ssq*0]
movhps m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pmullw m0, m4, m1
psrlq m1, 16
pmullw m1, m5
paddw m0, m3
paddw m0, m1
psrlw m0, 4
movd [dstq+dsq*0], m0
punpckhqdq m0, m0
movd [dstq+dsq*1], m0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w2
RET
.h_w4:
movq m0, [srcq+ssq*0]
movhps m0, [srcq+ssq*1]
movq m1, [srcq+ssq*0+2]
movhps m1, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
pmullw m0, m4
pmullw m1, m5
paddw m0, m3
paddw m0, m1
psrlw m0, 4
movq [dstq+dsq*0], m0
movhps [dstq+dsq*1], m0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w4
RET
.h_w8:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*0+2]
pmullw m0, m4
pmullw m1, m5
paddw m0, m3
paddw m0, m1
movu m1, [srcq+ssq*1]
movu m2, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
pmullw m1, m4
pmullw m2, m5
paddw m1, m3
paddw m1, m2
psrlw m0, 4
psrlw m1, 4
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8
RET
.h_w16:
lea srcq, [srcq+wq*2]
lea dstq, [dstq+wq*2]
neg wq
.h_w16_loop0:
mov r6, wq
.h_w16_loop:
movu m0, [srcq+r6*2+ 0]
movu m1, [srcq+r6*2+ 2]
pmullw m0, m4
pmullw m1, m5
paddw m0, m3
paddw m0, m1
movu m1, [srcq+r6*2+16]
movu m2, [srcq+r6*2+18]
pmullw m1, m4
pmullw m2, m5
paddw m1, m3
paddw m1, m2
psrlw m0, 4
psrlw m1, 4
mova [dstq+r6*2+16*0], m0
mova [dstq+r6*2+16*1], m1
add r6, 16
jl .h_w16_loop
add srcq, ssq
add dstq, dsq
dec hd
jg .h_w16_loop0
RET
.v:
shl mxyd, 11
movd m5, mxyd
pshufb m5, [base+pw_256]
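; my is pre-scaled by 1<<11 so that pmulhrsw(b - a, my<<11) computes
; ((b - a) * my + 8) >> 4; adding a back gives the vertical blend with
; correct rounding in two instructions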
movifnidn hd, hm
cmp wd, 4
jg .v_w8
je .v_w4
.v_w2:
movd m0, [srcq+ssq*0]
.v_w2_loop:
movd m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklqdq m2, m0, m1
movd m0, [srcq+ssq*0]
punpcklqdq m1, m0
psubw m1, m2
pmulhrsw m1, m5
paddw m1, m2
movd [dstq+dsq*0], m1
punpckhqdq m1, m1
movd [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w2_loop
RET
.v_w4:
movq m0, [srcq+ssq*0]
.v_w4_loop:
movq m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklqdq m2, m0, m1
movq m0, [srcq+ssq*0]
punpcklqdq m1, m0
psubw m1, m2
pmulhrsw m1, m5
paddw m1, m2
movq [dstq+dsq*0], m1
movhps [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
RET
.v_w8:
%if ARCH_X86_64
%if WIN64
push r7
%endif
shl wd, 5
mov r7, srcq
lea r6d, [wq+hq-256]
mov r4, dstq
%else
mov r6, srcq
%endif
.v_w8_loop0:
movu m0, [srcq+ssq*0]
.v_w8_loop:
movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
psubw m1, m3, m0
pmulhrsw m1, m5
paddw m1, m0
movu m0, [srcq+ssq*0]
psubw m2, m0, m3
pmulhrsw m2, m5
paddw m2, m3
mova [dstq+dsq*0], m1
mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
%if ARCH_X86_64
add r7, 16
add r4, 16
movzx hd, r6b
mov srcq, r7
mov dstq, r4
sub r6d, 1<<8
%else
mov dstq, dstmp
add r6, 16
mov hd, hm
add dstq, 16
mov srcq, r6
mov dstmp, dstq
sub wd, 8
%endif
jg .v_w8_loop0
%if WIN64
pop r7
%endif
RET
.hv:
WIN64_SPILL_XMM 8
shl mxyd, 11
mova m3, [base+pw_2]
movd m6, mxyd
mova m7, [base+pw_8192]
pshufb m6, [base+pw_256]
test dword r8m, 0x800
jnz .hv_12bpc
psllw m4, 2
psllw m5, 2
mova m7, [base+pw_2048]
.hv_12bpc:
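; Two-pass filter: the horizontal pass keeps 2 extra fractional bits
; (weights pre-scaled by 4 for 10bpc), the vertical pass interpolates
; rows via pmulhw with my<<11, and the final pmulhrsw against m7
; (pw_2048 for 10bpc, pw_8192 for 12bpc) rounds back to pixel range.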
movifnidn hd, hm
cmp wd, 4
jg .hv_w8
je .hv_w4
.hv_w2:
movddup m0, [srcq+ssq*0]
pshufhw m1, m0, q0321
pmullw m0, m4
pmullw m1, m5
paddw m0, m3
paddw m0, m1
psrlw m0, 2
.hv_w2_loop:
movq m2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movhps m2, [srcq+ssq*0]
pmullw m1, m4, m2
psrlq m2, 16
pmullw m2, m5
paddw m1, m3
paddw m1, m2
psrlw m1, 2 ; 1 _ 2 _
shufpd m2, m0, m1, 0x01 ; 0 _ 1 _
mova m0, m1
psubw m1, m2
paddw m1, m1
pmulhw m1, m6
paddw m1, m2
pmulhrsw m1, m7
movd [dstq+dsq*0], m1
punpckhqdq m1, m1
movd [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w2_loop
RET
.hv_w4:
movddup m0, [srcq+ssq*0]
movddup m1, [srcq+ssq*0+2]
pmullw m0, m4
pmullw m1, m5
paddw m0, m3
paddw m0, m1
psrlw m0, 2
.hv_w4_loop:
movq m1, [srcq+ssq*1]
movq m2, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
movhps m1, [srcq+ssq*0]
movhps m2, [srcq+ssq*0+2]
pmullw m1, m4
pmullw m2, m5
paddw m1, m3
paddw m1, m2
psrlw m1, 2 ; 1 2
shufpd m2, m0, m1, 0x01 ; 0 1
mova m0, m1
psubw m1, m2
paddw m1, m1
pmulhw m1, m6
paddw m1, m2
pmulhrsw m1, m7
movq [dstq+dsq*0], m1
movhps [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w4_loop
RET
.hv_w8:
%if ARCH_X86_64
%if WIN64
push r7
%endif
shl wd, 5
lea r6d, [wq+hq-256]
mov r4, srcq
mov r7, dstq
%else
mov r6, srcq
%endif
.hv_w8_loop0:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*0+2]
pmullw m0, m4
pmullw m1, m5
paddw m0, m3
paddw m0, m1
psrlw m0, 2
.hv_w8_loop:
movu m1, [srcq+ssq*1]
movu m2, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
pmullw m1, m4
pmullw m2, m5
paddw m1, m3
paddw m1, m2
psrlw m1, 2
psubw m2, m1, m0
paddw m2, m2
pmulhw m2, m6
paddw m2, m0
pmulhrsw m2, m7
mova [dstq+dsq*0], m2
movu m0, [srcq+ssq*0]
movu m2, [srcq+ssq*0+2]
pmullw m0, m4
pmullw m2, m5
paddw m0, m3
paddw m0, m2
psrlw m0, 2
psubw m2, m0, m1
paddw m2, m2
pmulhw m2, m6
paddw m2, m1
pmulhrsw m2, m7
mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
%if ARCH_X86_64
add r4, 16
add r7, 16
movzx hd, r6b
mov srcq, r4
mov dstq, r7
sub r6d, 1<<8
%else
mov dstq, dstmp
add r6, 16
mov hd, hm
add dstq, 16
mov srcq, r6
mov dstmp, dstq
sub wd, 8
%endif
jg .hv_w8_loop0
%if WIN64
pop r7
%endif
RET
cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3
%define base r6-prep_ssse3
movifnidn mxyd, r5m ; mx
LEA r6, prep_ssse3
movifnidn hd, hm
test mxyd, mxyd
jnz .h
mov mxyd, r6m ; my
test mxyd, mxyd
jnz .v
.prep:
tzcnt wd, wd
movzx wd, word [base+prep_ssse3_table+wq*2]
mov r5d, r7m ; bitdepth_max
mova m5, [base+pw_8192]
add wq, r6
shr r5d, 11
movddup m4, [base+prep_mul+r5*8]
lea stride3q, [strideq*3]
jmp wq
.prep_w4:
movq m0, [srcq+strideq*0]
movhps m0, [srcq+strideq*1]
movq m1, [srcq+strideq*2]
movhps m1, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
pmullw m0, m4
pmullw m1, m4
psubw m0, m5
psubw m1, m5
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
add tmpq, 16*2
sub hd, 4
jg .prep_w4
RET
.prep_w8:
movu m0, [srcq+strideq*0]
movu m1, [srcq+strideq*1]
movu m2, [srcq+strideq*2]
movu m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
mova [tmpq+16*2], m2
mova [tmpq+16*3], m3
add tmpq, 16*4
sub hd, 4
jg .prep_w8
RET
.prep_w16:
movu m0, [srcq+strideq*0+16*0]
movu m1, [srcq+strideq*0+16*1]
movu m2, [srcq+strideq*1+16*0]
movu m3, [srcq+strideq*1+16*1]
lea srcq, [srcq+strideq*2]
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
mova [tmpq+16*2], m2
mova [tmpq+16*3], m3
add tmpq, 16*4
sub hd, 2
jg .prep_w16
RET
.prep_w32:
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
movu m3, [srcq+16*3]
add srcq, strideq
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
mova [tmpq+16*2], m2
mova [tmpq+16*3], m3
add tmpq, 16*4
dec hd
jg .prep_w32
RET
.prep_w64:
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
movu m3, [srcq+16*3]
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
mova [tmpq+16*2], m2
mova [tmpq+16*3], m3
movu m0, [srcq+16*4]
movu m1, [srcq+16*5]
movu m2, [srcq+16*6]
movu m3, [srcq+16*7]
add srcq, strideq
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+16*4], m0
mova [tmpq+16*5], m1
mova [tmpq+16*6], m2
mova [tmpq+16*7], m3
add tmpq, 16*8
dec hd
jg .prep_w64
RET
.prep_w128:
movu m0, [srcq+16* 0]
movu m1, [srcq+16* 1]
movu m2, [srcq+16* 2]
movu m3, [srcq+16* 3]
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
mova [tmpq+16*2], m2
mova [tmpq+16*3], m3
movu m0, [srcq+16* 4]
movu m1, [srcq+16* 5]
movu m2, [srcq+16* 6]
movu m3, [srcq+16* 7]
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+16*4], m0
mova [tmpq+16*5], m1
mova [tmpq+16*6], m2
mova [tmpq+16*7], m3
movu m0, [srcq+16* 8]
movu m1, [srcq+16* 9]
movu m2, [srcq+16*10]
movu m3, [srcq+16*11]
add tmpq, 16*16
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq-16*8], m0
mova [tmpq-16*7], m1
mova [tmpq-16*6], m2
mova [tmpq-16*5], m3
movu m0, [srcq+16*12]
movu m1, [srcq+16*13]
movu m2, [srcq+16*14]
movu m3, [srcq+16*15]
add srcq, strideq
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq-16*4], m0
mova [tmpq-16*3], m1
mova [tmpq-16*2], m2
mova [tmpq-16*1], m3
dec hd
jg .prep_w128
RET
.h:
movd m4, mxyd
mov mxyd, r6m ; my
mova m3, [base+pw_16]
pshufb m4, [base+pw_256]
mova m5, [base+pw_32766]
psubw m3, m4
test dword r7m, 0x800
jnz .h_12bpc
psllw m3, 2
psllw m4, 2
.h_12bpc:
test mxyd, mxyd
jnz .hv
sub wd, 8
je .h_w8
jg .h_w16
.h_w4:
movq m0, [srcq+strideq*0]
movhps m0, [srcq+strideq*1]
movq m1, [srcq+strideq*0+2]
movhps m1, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
pmullw m0, m3
pmullw m1, m4
psubw m0, m5
paddw m0, m1
psraw m0, 2
mova [tmpq], m0
add tmpq, 16
sub hd, 2
jg .h_w4
RET
.h_w8:
movu m0, [srcq+strideq*0]
movu m1, [srcq+strideq*0+2]
pmullw m0, m3
pmullw m1, m4
psubw m0, m5
paddw m0, m1
movu m1, [srcq+strideq*1]
movu m2, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
pmullw m1, m3
pmullw m2, m4
psubw m1, m5
paddw m1, m2
psraw m0, 2
psraw m1, 2
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
add tmpq, 16*2
sub hd, 2
jg .h_w8
RET
.h_w16:
lea srcq, [srcq+wq*2]
neg wq
.h_w16_loop0:
mov r6, wq
.h_w16_loop:
movu m0, [srcq+r6*2+ 0]
movu m1, [srcq+r6*2+ 2]
pmullw m0, m3
pmullw m1, m4
psubw m0, m5
paddw m0, m1
movu m1, [srcq+r6*2+16]
movu m2, [srcq+r6*2+18]
pmullw m1, m3
pmullw m2, m4
psubw m1, m5
paddw m1, m2
psraw m0, 2
psraw m1, 2
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
add tmpq, 16*2
add r6, 16
jl .h_w16_loop
add srcq, strideq
dec hd
jg .h_w16_loop0
RET
.v:
movd m4, mxyd
mova m3, [base+pw_16]
pshufb m4, [base+pw_256]
mova m5, [base+pw_32766]
psubw m3, m4
test dword r7m, 0x800
jnz .v_12bpc
psllw m3, 2
psllw m4, 2
.v_12bpc:
cmp wd, 8
je .v_w8
jg .v_w16
.v_w4:
movq m0, [srcq+strideq*0]
.v_w4_loop:
movq m2, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
punpcklqdq m1, m0, m2 ; 0 1
movq m0, [srcq+strideq*0]
punpcklqdq m2, m0 ; 1 2
pmullw m1, m3
pmullw m2, m4
psubw m1, m5
paddw m1, m2
psraw m1, 2
mova [tmpq], m1
add tmpq, 16
sub hd, 2
jg .v_w4_loop
RET
.v_w8:
movu m0, [srcq+strideq*0]
.v_w8_loop:
movu m2, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
pmullw m0, m3
pmullw m1, m4, m2
psubw m0, m5
paddw m1, m0
movu m0, [srcq+strideq*0]
psraw m1, 2
pmullw m2, m3
mova [tmpq+16*0], m1
pmullw m1, m4, m0
psubw m2, m5
paddw m1, m2
psraw m1, 2
mova [tmpq+16*1], m1
add tmpq, 16*2
sub hd, 2
jg .v_w8_loop
RET
.v_w16:
%if WIN64
push r7
%endif
mov r5, srcq
%if ARCH_X86_64
lea r6d, [wq*4-32]
mov wd, wd
lea r6d, [hq+r6*8]
mov r7, tmpq
%else
mov r6d, wd
%endif
.v_w16_loop0:
movu m0, [srcq+strideq*0]
.v_w16_loop:
movu m2, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
pmullw m0, m3
pmullw m1, m4, m2
psubw m0, m5
paddw m1, m0
movu m0, [srcq+strideq*0]
psraw m1, 2
pmullw m2, m3
mova [tmpq+wq*0], m1
pmullw m1, m4, m0
psubw m2, m5
paddw m1, m2
psraw m1, 2
mova [tmpq+wq*2], m1
lea tmpq, [tmpq+wq*4]
sub hd, 2
jg .v_w16_loop
%if ARCH_X86_64
add r5, 16
add r7, 16
movzx hd, r6b
mov srcq, r5
mov tmpq, r7
sub r6d, 1<<8
%else
mov tmpq, tmpmp
add r5, 16
mov hd, hm
add tmpq, 16
mov srcq, r5
mov tmpmp, tmpq
sub r6d, 8
%endif
jg .v_w16_loop0
%if WIN64
pop r7
%endif
RET
.hv:
WIN64_SPILL_XMM 7
shl mxyd, 11
movd m6, mxyd
pshufb m6, [base+pw_256]
cmp wd, 8
je .hv_w8
jg .hv_w16
.hv_w4:
movddup m0, [srcq+strideq*0]
movddup m1, [srcq+strideq*0+2]
pmullw m0, m3
pmullw m1, m4
psubw m0, m5
paddw m0, m1
psraw m0, 2
.hv_w4_loop:
movq m1, [srcq+strideq*1]
movq m2, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
movhps m1, [srcq+strideq*0]
movhps m2, [srcq+strideq*0+2]
pmullw m1, m3
pmullw m2, m4
psubw m1, m5
paddw m1, m2
psraw m1, 2 ; 1 2
shufpd m2, m0, m1, 0x01 ; 0 1
mova m0, m1
psubw m1, m2
pmulhrsw m1, m6
paddw m1, m2
mova [tmpq], m1
add tmpq, 16
sub hd, 2
jg .hv_w4_loop
RET
.hv_w8:
movu m0, [srcq+strideq*0]
movu m1, [srcq+strideq*0+2]
pmullw m0, m3
pmullw m1, m4
psubw m0, m5
paddw m0, m1
psraw m0, 2
.hv_w8_loop:
movu m1, [srcq+strideq*1]
movu m2, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
pmullw m1, m3
pmullw m2, m4
psubw m1, m5
paddw m1, m2
psraw m1, 2
psubw m2, m1, m0
pmulhrsw m2, m6
paddw m2, m0
mova [tmpq+16*0], m2
movu m0, [srcq+strideq*0]
movu m2, [srcq+strideq*0+2]
pmullw m0, m3
pmullw m2, m4
psubw m0, m5
paddw m0, m2
psraw m0, 2
psubw m2, m0, m1
pmulhrsw m2, m6
paddw m2, m1
mova [tmpq+16*1], m2
add tmpq, 16*2
sub hd, 2
jg .hv_w8_loop
RET
.hv_w16:
%if WIN64
push r7
%endif
mov r5, srcq
%if ARCH_X86_64
lea r6d, [wq*4-32]
mov wd, wd
lea r6d, [hq+r6*8]
mov r7, tmpq
%else
mov r6d, wd
%endif
.hv_w16_loop0:
movu m0, [srcq+strideq*0]
movu m1, [srcq+strideq*0+2]
pmullw m0, m3
pmullw m1, m4
psubw m0, m5
paddw m0, m1
psraw m0, 2
.hv_w16_loop:
movu m1, [srcq+strideq*1]
movu m2, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
pmullw m1, m3
pmullw m2, m4
psubw m1, m5
paddw m1, m2
psraw m1, 2
psubw m2, m1, m0
pmulhrsw m2, m6
paddw m2, m0
mova [tmpq+wq*0], m2
movu m0, [srcq+strideq*0]
movu m2, [srcq+strideq*0+2]
pmullw m0, m3
pmullw m2, m4
psubw m0, m5
paddw m0, m2
psraw m0, 2
psubw m2, m0, m1
pmulhrsw m2, m6
paddw m2, m1
mova [tmpq+wq*2], m2
lea tmpq, [tmpq+wq*4]
sub hd, 2
jg .hv_w16_loop
%if ARCH_X86_64
add r5, 16
add r7, 16
movzx hd, r6b
mov srcq, r5
mov tmpq, r7
sub r6d, 1<<8
%else
mov tmpq, tmpmp
add r5, 16
mov hd, hm
add tmpq, 16
mov srcq, r5
mov tmpmp, tmpq
sub r6d, 8
%endif
jg .hv_w16_loop0
%if WIN64
pop r7
%endif
RET
; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15
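; Packed so that "imul mxd, mxm, 0x010101" replicates mx into bytes 0-2
; and adding t0d then yields the 4-tap filter row index in the low byte
; (read via "movzx mxd, mxb") and the 6/8-tap row index in the high word
; (read via "shr mxd, 16") of the same register.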
%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
cglobal %1_%2_16bpc
mov t0d, FILTER_%3
%ifidn %3, %4
mov t1d, t0d
%else
mov t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro
%if ARCH_X86_32
DECLARE_REG_TMP 1, 2, 6
%elif WIN64
DECLARE_REG_TMP 4, 5, 8
%else
DECLARE_REG_TMP 7, 8, 8
%endif
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_16bpc
PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_16bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_16bpc
PUT_8TAP_FN regular, REGULAR, REGULAR
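; REGULAR and SMOOTH filters have zero outermost taps, so all non-SHARP
; combinations can share the cheaper 6-tap entry point; the last FN
; invocation omits the jump target and falls through below.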
cglobal put_6tap_16bpc, 0, 9, 0, dst, ds , src, ss , w, h, mx, my
%define base t2-put_ssse3
%if ARCH_X86_32
%define mxb r0b
%define mxd r0
%define mxq r0
%define myb r1b
%define myd r1
%define myq r1
%endif
imul mxd, mxm, 0x010101
add mxd, t0d ; 6tap_h, mx, 4tap_h
imul myd, mym, 0x010101
add myd, t1d ; 6tap_v, my, 4tap_v
LEA t2, put_ssse3
movifnidn wd, wm
movifnidn srcq, srcmp
movifnidn ssq, ssmp
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
jnz .v
.put:
tzcnt wd, wd
movzx wd, word [base+put_ssse3_table+wq*2]
movifnidn dstq, dstmp
movifnidn dsq, dsmp
add wq, t2
%if WIN64
pop r8
pop r7
%endif
jmp wq
.h_w2:
mova m2, [base+spel_h_shuf2]
pshufd m3, m3, q2121
.h_w2_loop:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pshufb m0, m2
pshufb m1, m2
pmaddwd m0, m3
pmaddwd m1, m3
phaddd m0, m1
paddd m0, m4
psrad m0, 6
packssdw m0, m0
pxor m1, m1
pminsw m0, m5
pmaxsw m0, m1
movd [dstq+dsq*0], m0
pshuflw m0, m0, q3232
movd [dstq+dsq*1], m0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w2_loop
RET
.h_w4:
movzx mxd, mxb
lea srcq, [srcq-2]
movq m3, [base+subpel_filters+mxq*8]
movifnidn dstq, dstmp
punpcklbw m3, m3
psraw m3, 8 ; sign-extend
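; flags are still set by "sub wd, 4" in .h: negative means w == 2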
jl .h_w2
WIN64_SPILL_XMM 9
mova m7, [base+spel_h_shufA]
%if ARCH_X86_32
%define m8 [base+spel_h_shufB]
%else
mova m8, [base+spel_h_shufB]
%endif
pshufd m2, m3, q1111
pshufd m3, m3, q2222
.h_w4_loop:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pshufb m6, m0, m7 ; 0 1 1 2 2 3 3 4
pmaddwd m6, m2
pshufb m0, m8 ; 2 3 3 4 4 5 5 6
pmaddwd m0, m3
paddd m0, m6
pshufb m6, m1, m7
pmaddwd m6, m2
pshufb m1, m8
pmaddwd m1, m3
paddd m0, m4
paddd m6, m4
paddd m1, m6
psrad m0, 6
psrad m1, 6
packssdw m0, m1
pxor m1, m1
pminsw m0, m5
pmaxsw m0, m1
movq [dstq+dsq*0], m0
movhps [dstq+dsq*1], m0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w4_loop
RET
.h:
RESET_STACK_STATE
test myd, 0xf00
jnz .hv
mov myd, r8m
movd m5, r8m
shr myd, 11
movddup m4, [base+put_8tap_h_rnd+myq*8]
movifnidn dsq, dsmp
pshufb m5, [base+pw_256]
sub wd, 4
jle .h_w4
WIN64_SPILL_XMM 11
shr mxd, 16
movq m2, [base+subpel_filters+1+mxq*8]
movifnidn dstq, dstmp
mova m6, [base+spel_h_shufA]
mova m7, [base+spel_h_shufB]
lea srcq, [srcq+wq*2]
punpcklbw m2, m2
lea dstq, [dstq+wq*2]
psraw m2, 8
neg wq
%if ARCH_X86_32
ALLOC_STACK -16*3
%define m8 [rsp+16*0]
%define m9 [rsp+16*1]
%define m10 [rsp+16*2]
pshufd m0, m2, q0000
pshufd m1, m2, q1111
pshufd m2, m2, q2222
mova m8, m0
mova m9, m1
mova m10, m2
%else
pshufd m8, m2, q0000
pshufd m9, m2, q1111
pshufd m10, m2, q2222
%endif
.h_w8_loop0:
mov r6, wq
.h_w8_loop:
movu m3, [srcq+r6*2-4]
movu m2, [srcq+r6*2+8]
pshufb m0, m3, m6 ; 01 12 23 34
pmaddwd m0, m8 ; abcd0
pshufb m3, m7 ; 23 34 45 56
pmaddwd m1, m9, m3 ; abcd1
paddd m0, m1
pshufb m1, m2, m6 ; 67 78 89 9a
shufpd m3, m1, 0x01 ; 45 56 67 78
pmaddwd m1, m9 ; efgh1
pshufb m2, m7 ; 89 9a ab bc
pmaddwd m2, m10 ; efgh2
paddd m1, m2
pmaddwd m2, m10, m3 ; abcd2
pmaddwd m3, m8 ; efgh0
paddd m0, m4
paddd m1, m4
paddd m0, m2
paddd m1, m3
psrad m0, 6
psrad m1, 6
packssdw m0, m1
pxor m1, m1
pminsw m0, m5
pmaxsw m0, m1
mova [dstq+r6*2], m0
add r6, 8
jl .h_w8_loop
add srcq, ssq
add dstq, dsq
dec hd
jg .h_w8_loop0
RET
.v:
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovb myd, mxd
movq m2, [base+subpel_filters+1+myq*8]
WIN64_SPILL_XMM 11, 16
movd m5, r8m
movifnidn dstq, dstmp
movifnidn dsq, dsmp
punpcklbw m2, m2
pshufb m5, [base+pw_256]
psraw m2, 8 ; sign-extend
%if ARCH_X86_32
ALLOC_STACK -16*4
pshufd m0, m2, q0000
mov r6, ssq
pshufd m1, m2, q1111
neg r6
pshufd m2, m2, q2222
mova m8, m0
mova m9, m1
mova m10, m2
cmp wd, 2
jne .v_w4
%else
mov r6, ssq
pshufd m8, m2, q0000
neg r6
cmp wd, 4
jg .v_w8
pshufd m9, m2, q1111
pshufd m10, m2, q2222
je .v_w4
%endif
.v_w2:
movd m1, [srcq+r6 *2]
movd m3, [srcq+r6 *1]
movd m2, [srcq+ssq*0]
movd m4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movd m0, [srcq+ssq*0]
punpckldq m1, m3 ; 0 1
punpckldq m3, m2 ; 1 2
punpckldq m2, m4 ; 2 3
punpckldq m4, m0 ; 3 4
punpcklwd m1, m3 ; 01 12
punpcklwd m2, m4 ; 23 34
pxor m6, m6
.v_w2_loop:
movd m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pmaddwd m4, m8, m1 ; a0 b0
mova m1, m2
pmaddwd m2, m9 ; a1 b1
paddd m4, m2
punpckldq m2, m0, m3 ; 4 5
movd m0, [srcq+ssq*0]
punpckldq m3, m0 ; 5 6
punpcklwd m2, m3 ; 67 78
pmaddwd m3, m10, m2 ; a2 b2
paddd m4, m3
psrad m4, 5
packssdw m4, m4
pmaxsw m4, m6
pavgw m4, m6
pminsw m4, m5
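; psrad 5 + pavgw 0 emulate a rounded >> 6; the preceding pmaxsw keeps
; negative overshoot from wrapping in the unsigned average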
movd [dstq+dsq*0], m4
pshuflw m4, m4, q3232
movd [dstq+dsq*1], m4
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w2_loop
RET
.v_w4:
%if ARCH_X86_32
shl wd, 14
lea srcq, [srcq+r6*2]
lea wd, [wq+hq-(1<<16)]
%if STACK_ALIGNMENT < 16
%define dstmp [esp +16*3]
%endif
.v_w4_loop0:
mov dstmp, dstq
movq m1, [srcq+ssq*0]
movq m2, [srcq+ssq*1]
lea r6, [srcq+ssq*2]
movq m3, [r6 +ssq*0]
movq m4, [r6 +ssq*1]
lea r6, [r6 +ssq*2]
%else
movq m1, [srcq+r6 *2]
movq m2, [srcq+r6 *1]
lea r6, [srcq+ssq*2]
movq m3, [srcq+ssq*0]
movq m4, [srcq+ssq*1]
%endif
movq m0, [r6 +ssq*0]
punpcklwd m1, m2 ; 01
punpcklwd m2, m3 ; 12
punpcklwd m3, m4 ; 23
punpcklwd m4, m0 ; 34
.v_w4_loop:
pmaddwd m6, m8, m1 ; a0
pmaddwd m7, m8, m2 ; b0
mova m1, m3
pmaddwd m3, m9 ; a1
mova m2, m4
pmaddwd m4, m9 ; b1
paddd m6, m3
movq m3, [r6+ssq*0]
paddd m7, m4
movq m4, [r6+ssq*1]
lea r6, [r6+ssq*2]
movq m0, [r6+ssq*0]
punpcklwd m3, m4 ; 45
punpcklwd m4, m0 ; 56
pmaddwd m0, m10, m3 ; a2
paddd m6, m0
pmaddwd m0, m10, m4 ; b2
paddd m7, m0
psrad m6, 5
psrad m7, 5
packssdw m6, m7
pxor m7, m7
pmaxsw m6, m7
pavgw m6, m7
pminsw m6, m5
movq [dstq+dsq*0], m6
movhps [dstq+dsq*1], m6
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
%if ARCH_X86_32
mov dstq, dstmp
add srcq, 8
movzx hd, ww
add dstq, 8
sub wd, 1<<16
jg .v_w4_loop0
RET
%else
RET
.v_w8:
mova r6m, m8
shl wd, 5
pshufd m6, m2, q1111
lea wd, [wq+hq-(1<<8)]
pshufd m7, m2, q2222
WIN64_PUSH_XMM 16
.v_w8_loop0:
movu m9, [srcq+ r6*2]
movu m11, [srcq+ r6*1]
lea r7, [srcq+ssq*2]
movu m13, [srcq+ssq*0]
movu m15, [srcq+ssq*1]
mov r8, dstq
movu m4, [r7 +ssq*0]
punpcklwd m8, m9, m11 ; 01
punpckhwd m9, m11
punpcklwd m10, m11, m13 ; 12
punpckhwd m11, m13
punpcklwd m12, m13, m15 ; 23
punpckhwd m13, m15
punpcklwd m14, m15, m4 ; 34
punpckhwd m15, m4
.v_w8_loop:
mova m3, r6m
pmaddwd m0, m8, m3 ; a0
pmaddwd m2, m9, m3 ; a0'
pmaddwd m1, m10, m3 ; b0
pmaddwd m3, m11 ; b0'
mova m8, m12
pmaddwd m12, m6 ; a1
mova m9, m13
pmaddwd m13, m6 ; a1'
mova m10, m14
pmaddwd m14, m6 ; b1
mova m11, m15
pmaddwd m15, m6 ; b1'
paddd m0, m12
paddd m2, m13
movu m13, [r7+ssq*0]
paddd m1, m14
paddd m3, m15
movu m15, [r7+ssq*1]
lea r7, [r7+ssq*2]
movu m4, [r7+ssq*0]
punpcklwd m12, m13, m15 ; 45
punpckhwd m13, m15
punpcklwd m14, m15, m4 ; 56
punpckhwd m15, m4
pmaddwd m4, m7, m12 ; a2
paddd m0, m4
pmaddwd m4, m7, m13 ; a2'
paddd m2, m4
pmaddwd m4, m7, m14 ; b2
paddd m1, m4
pmaddwd m4, m7, m15 ; b2'
paddd m3, m4
REPX {psrad x, 5}, m0, m2, m1, m3
packssdw m0, m2
packssdw m1, m3
pxor m2, m2
pmaxsw m0, m2
pmaxsw m1, m2
pavgw m0, m2
pavgw m1, m2
pminsw m0, m5
pminsw m1, m5
mova [r8+dsq*0], m0
mova [r8+dsq*1], m1
lea r8, [r8+dsq*2]
sub hd, 2
jg .v_w8_loop
add srcq, 16
add dstq, 16
movzx hd, wb
sub wd, 1<<8
jg .v_w8_loop0
RET
%endif
.hv:
cmp wd, 4
jg .hv_w8
WIN64_SPILL_XMM 12, 16
%if ARCH_X86_32
movd m3, r8m
pshufb m3, [base+pw_256]
%else
movd m11, r8m
pshufb m11, [base+pw_256]
%endif
movzx mxd, mxb
movq m0, [base+subpel_filters+mxq*8]
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovb myd, mxd
movq m2, [base+subpel_filters+1+myq*8]
movddup m7, [base+pd_8704]
sub srcq, 2
pshuflw m0, m0, q2121
pxor m6, m6
punpcklbw m6, m0
punpcklbw m2, m2
psraw m2, 8 ; sign-extend
test dword r8m, 0x800
jz .hv_w2_10bpc
movddup m7, [base+pd_2560]
psraw m6, 2
psllw m2, 2
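; 2 bits are moved from the h to the v coefficients so the >> 10
; horizontal shift keeps the packed intermediates within int16 range at
; 12bpc while the vertical pass stays identical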
.hv_w2_10bpc:
%if ARCH_X86_32
%assign regs_used 2
ALLOC_STACK -16*7
%assign regs_used 7
mov dstq, r0mp
mov dsq, r1mp
%define m11 [esp +16*4]
pshufd m0, m2, q0000
pshufd m1, m2, q1111
pshufd m2, m2, q2222
mova m8, m0
mova m9, m1
mova m10, m2
mova m11, m3
neg ssq
movu m3, [srcq+ssq*2]
movu m4, [srcq+ssq*1]
neg ssq
%else
pshufd m8, m2, q0000
mov r6, ssq
pshufd m9, m2, q1111
neg r6
pshufd m10, m2, q2222
movu m3, [srcq+r6 *2]
movu m4, [srcq+r6 *1]
%endif
movu m1, [srcq+ssq*0]
movu m0, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movu m2, [srcq+ssq*0]
cmp wd, 4
je .hv_w4
mova m5, [base+spel_h_shuf2]
REPX {pshufb x, m5}, m3, m4, m0, m1, m2
REPX {pmaddwd x, m6}, m3, m0, m4, m1, m2
phaddd m3, m0 ; 0 3
phaddd m4, m1 ; 1 2
phaddd m0, m2 ; 3 4
REPX {paddd x, m7}, m3, m4, m0
REPX {psrad x, 10}, m3, m4, m0
packssdw m3, m4 ; 0 3 1 2
packssdw m4, m0 ; 1 2 3 4
pshufd m2, m3, q1320 ; 0 1 2 3
punpcklwd m1, m2, m4 ; 01 12
punpckhwd m2, m4 ; 23 34
.hv_w2_loop:
movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movu m4, [srcq+ssq*0]
pshufb m3, m5
pshufb m4, m5
pmaddwd m3, m6
pmaddwd m4, m6
phaddd m3, m4
pmaddwd m4, m8, m1 ; a0 b0
mova m1, m2
pmaddwd m2, m9 ; a1 b1
paddd m4, m2
paddd m3, m7
psrad m3, 10 ; 5 6
packssdw m0, m3
pshufd m2, m0, q2103
punpckhwd m2, m0 ; 45 56
mova m0, m3
pmaddwd m3, m10, m2 ; a2 b2
paddd m4, m3
psrad m4, 10
packssdw m4, m4
pxor m3, m3
pminsw m4, m11
pmaxsw m4, m3
movd [dstq+dsq*0], m4
pshuflw m4, m4, q1032
movd [dstq+dsq*1], m4
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w2_loop
RET
.hv_w4:
%if ARCH_X86_32
%define m12 [esp +16*5]
%define m13 [esp +16*6]
%define m14 [base+spel_h_shufA]
%define m15 [base+spel_h_shufB]
pshufd m5, m6, q0000
pshufd m6, m6, q1111
mova m12, m5
mova m13, m6
%else
WIN64_PUSH_XMM 16
mova m14, [base+spel_h_shufA]
mova m15, [base+spel_h_shufB]
pshufd m12, m6, q0000
pshufd m13, m6, q1111
%endif
%macro HV_H_W4_6TAP 3-4 m15 ; dst, src, tmp, shufB
pshufb %3, %2, m14
pmaddwd %3, m12
pshufb %2, %4
pmaddwd %2, m13
paddd %3, m7
paddd %1, %2, %3
%endmacro
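; one horizontal 6-tap pass for 4 pixels: two pshufb+pmaddwd pairs
; against the coefficient dwords in m12/m13, plus the rounding bias m7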
HV_H_W4_6TAP m3, m3, m5
HV_H_W4_6TAP m4, m4, m5
HV_H_W4_6TAP m5, m1, m5
HV_H_W4_6TAP m0, m0, m1
HV_H_W4_6TAP m2, m2, m1
REPX {psrad x, 10}, m3, m5, m4, m0, m2
packssdw m3, m5 ; 0 2
packssdw m4, m0 ; 1 3
packssdw m5, m2 ; 2 4
punpcklwd m1, m3, m4 ; 01
punpckhwd m3, m4 ; 23
punpcklwd m2, m4, m5 ; 12
punpckhwd m4, m5 ; 34
.hv_w4_loop:
movu m0, [srcq+ssq*1]
pmaddwd m5, m8, m1 ; a0
lea srcq, [srcq+ssq*2]
pmaddwd m6, m8, m2 ; b0
mova m1, m3
pmaddwd m3, m9 ; a1
mova m2, m4
pmaddwd m4, m9 ; b1
paddd m5, m3
movu m3, [srcq+ssq*0]
paddd m6, m4
HV_H_W4_6TAP m0, m0, m4
HV_H_W4_6TAP m3, m3, m4
psrad m4, m2, 16
psrad m0, 10
psrad m3, 10
packssdw m4, m0 ; 4 5
packssdw m0, m3 ; 5 6
punpcklwd m3, m4, m0 ; 45
punpckhwd m4, m0 ; 56
pmaddwd m0, m10, m3 ; a2
paddd m5, m0
pmaddwd m0, m10, m4 ; b2
paddd m6, m0
psrad m5, 10
psrad m6, 10
packssdw m5, m6
pxor m6, m6
pminsw m5, m11
pmaxsw m5, m6
movq [dstq+dsq*0], m5
movhps [dstq+dsq*1], m5
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w4_loop
RET
.hv_w8:
RESET_STACK_STATE
shr mxd, 16
movq m2, [base+subpel_filters+1+mxq*8]
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovb myd, mxd
movq m1, [base+subpel_filters+1+myq*8]
movd m3, r8m
movddup m4, [base+pd_8704]
pshufb m3, [base+pw_256]
pxor m0, m0
punpcklbw m0, m2
punpcklbw m1, m1
sub srcq, 4
psraw m1, 8 ; sign-extend
test dword r8m, 0x800
jz .hv_w8_10bpc
movddup m4, [base+pd_2560]
psraw m0, 2
psllw m1, 2
.hv_w8_10bpc:
%if ARCH_X86_32
%assign regs_used 2
ALLOC_STACK -16*9
%assign regs_used 7
mov dstq, r0mp
mov dsq, r1mp
mova [rsp+16*7], m4
%else
ALLOC_STACK 16*7, 16
%endif
mova [rsp+16*6], m3
pshufd m2, m0, q0000
mova [rsp+16*0], m2
pshufd m2, m0, q1111
mova [rsp+16*1], m2
pshufd m0, m0, q2222
mova [rsp+16*2], m0
pshufd m2, m1, q0000
mova [rsp+16*3], m2
pshufd m2, m1, q1111
mova [rsp+16*4], m2
pshufd m1, m1, q2222
mova [rsp+16*5], m1
mov r6, ssq
neg r6
%if ARCH_X86_32
shl wd, 14
lea r4d, [wq+hq-(1<<16)]
%if STACK_ALIGNMENT < 16
%define srcmp [esp +16*8+4*0]
%define dstmp [esp +16*8+4*1]
%endif
%macro HV_H_6TAP 3-6 [rsp+16*0], [rsp+16*1], [rsp+16*2] ; dst, src[1-2], mul[1-3]
punpcklwd %1, %2, %3 ; 01 12 23 34
punpckhwd %2, %3 ; 45 56 67 78
pmaddwd %3, %4, %1 ; a0
shufpd %1, %2, 0x01 ; 23 34 45 56
pmaddwd %2, %6 ; a2
pmaddwd %1, %5 ; a1
paddd %2, %3
paddd %1, %2
%endmacro
.hv_w8_loop0:
mov srcmp, srcq
mov dstmp, dstq
movu m5, [srcq+r6*2+0]
movu m6, [srcq+r6*2+2]
mova m7, [rsp+16*0]
mova m1, [rsp+16*1]
mova m0, [rsp+16*2]
HV_H_6TAP m2, m5, m6, m7, m1, m0
movu m5, [srcq+r6*1+0]
movu m6, [srcq+r6*1+2]
HV_H_6TAP m3, m5, m6, m7, m1, m0
movu m5, [srcq+ssq*0+0]
movu m6, [srcq+ssq*0+2]
HV_H_6TAP m4, m5, m6, m7, m1, m0
movu m5, [srcq+ssq*1+0]
movu m6, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
HV_H_6TAP m0, m5, m6, m7, m1
movu m5, [srcq+ssq*0+0]
movu m6, [srcq+ssq*0+2]
HV_H_6TAP m1, m5, m6, m7
mova m5, [rsp+16*7]
REPX {paddd x, m5}, m2, m3, m4, m0, m1
REPX {psrad x, 10}, m2, m4, m3, m0, m1
packssdw m2, m4 ; 0 2
packssdw m3, m0 ; 1 3
packssdw m4, m1 ; 2 4
punpcklwd m0, m2, m3 ; 01
punpckhwd m2, m3 ; 23
punpcklwd m1, m3, m4 ; 12
punpckhwd m3, m4 ; 34
.hv_w8_loop:
mova m5, [rsp+16*3]
mova m6, [rsp+16*4]
pmaddwd m4, m0, m5 ; a0
pmaddwd m5, m1 ; b0
mova m0, m2
pmaddwd m2, m6 ; a1
mova m1, m3
pmaddwd m3, m6 ; b1
paddd m4, m2
movu m2, [srcq+ssq*1+0]
paddd m5, m3
movu m3, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
HV_H_6TAP m6, m2, m3
movu m2, [srcq+ssq*0+0]
movu m3, [srcq+ssq*0+2]
HV_H_6TAP m7, m2, m3
mova m2, [rsp+16*7]
psrad m3, m1, 16
paddd m6, m2
paddd m7, m2
psrad m6, 10
psrad m7, 10
packssdw m3, m6 ; 4 5
packssdw m6, m7 ; 5 6
mova m7, [rsp+16*5]
punpcklwd m2, m3, m6 ; 45
punpckhwd m3, m6 ; 56
pmaddwd m6, m2, m7 ; a2
pmaddwd m7, m3 ; b2
paddd m4, m6
paddd m5, m7
psrad m4, 10
psrad m5, 10
packssdw m4, m5
pxor m5, m5
pminsw m4, [rsp+16*6]
pmaxsw m4, m5
movq [dstq+dsq*0], m4
movhps [dstq+dsq*1], m4
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
mov srcq, srcmp
mov dstq, dstmp
movzx hd, r4w
add srcq, 8
add dstq, 8
sub r4d, 1<<16
%else
shl wd, 5
lea r8d, [wq+hq-256]
%macro HV_H_6TAP 5-9 [spel_h_shufA], [rsp+16*0], [rsp+16*1], [rsp+16*2] ; dst, src[1-3], shift, shuf, mul[1-3]
%ifid %6
REPX {pshufb x, %6}, %2, %3, %4
%else
mova %1, %6
pshufb %2, %1 ; 01 12 23 34
pshufb %3, %1 ; 45 56 67 78
pshufb %4, %1 ; 89 9a ab bc
%endif
pmaddwd %1, %7, %2
shufpd %2, %3, 0x01 ; 23 34 45 56
pmaddwd %2, %8
paddd %1, %2
pmaddwd %2, %9, %3
paddd %1, %2
pmaddwd %2, %7, %3
shufpd %3, %4, 0x01 ; 67 78 89 9a
pmaddwd %4, %9
pmaddwd %3, %8
paddd %1, m4
paddd %2, m4
paddd %3, %4
paddd %2, %3
psrad %1, %5
psrad %2, %5
packssdw %1, %2
%endmacro
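; full-width horizontal pass: three overlapping source vectors are
; shuffled into pairwise form, multiplied by the three coefficient
; dwords, then rounded (bias hardcoded in m4) and shifted down to a
; packed 16-bit row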
.hv_w8_loop0:
mova m5, [spel_h_shufA]
movu m0, [srcq+r6*2+ 0]
mova m6, [rsp+16*0]
movu m1, [srcq+r6*2+ 8]
mova m7, [rsp+16*1]
movu m2, [srcq+r6*2+16]
mova m8, [rsp+16*2]
HV_H_6TAP m9, m0, m1, m2, 10, m5, m6, m7, m8
movu m0, [srcq+r6*1+ 0]
movu m1, [srcq+r6*1+ 8]
movu m2, [srcq+r6*1+16]
lea r4, [srcq+ssq*2]
HV_H_6TAP m11, m0, m1, m2, 10, m5, m6, m7, m8
movu m0, [srcq+ssq*0+ 0]
movu m1, [srcq+ssq*0+ 8]
movu m2, [srcq+ssq*0+16]
mov r7, dstq
HV_H_6TAP m13, m0, m1, m2, 10, m5, m6, m7, m8
movu m0, [srcq+ssq*1+ 0]
movu m1, [srcq+ssq*1+ 8]
movu m2, [srcq+ssq*1+16]
HV_H_6TAP m15, m0, m1, m2, 10, m5, m6, m7, m8
movu m0, [r4+ssq*0+ 0]
movu m1, [r4+ssq*0+ 8]
movu m2, [r4+ssq*0+16]
HV_H_6TAP m5, m0, m1, m2, 10, m5, m6, m7, m8
punpcklwd m8, m9, m11 ; 01
punpckhwd m9, m11
punpcklwd m10, m11, m13 ; 12
punpckhwd m11, m13
punpcklwd m12, m13, m15 ; 23
punpckhwd m13, m15
punpcklwd m14, m15, m5 ; 34
punpckhwd m15, m5
.hv_w8_loop:
mova m3, [rsp+16*3]
mova m7, [rsp+16*4]
pmaddwd m0, m8, m3 ; a0
mova m8, m12