; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018-2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 32
; dav1d_obmc_masks[] with 64-x interleaved
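; i.e. every mask value x is stored as the byte pair (64-x, x), so a single
; pmaddubsw over interleaved pixel pairs yields (64-x)*p0 + x*p1 directly
; (descriptive note; see the pairs below, e.g. 45+19 == 64)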
obmc_masks: db 0, 0, 0, 0
; 2
db 45, 19, 64, 0
; 4
db 39, 25, 50, 14, 59, 5, 64, 0
; 8
db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
; 16
db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
; 32
db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
pb_8x0_8x8: db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8
bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
wm_420_sign: dd 0x01020102, 0x01010101
wm_422_sign: dd 0x80808080, 0x7f7f7f7f
pb_64: times 4 db 64
pw_m256: times 2 dw -256
pw_15: times 2 dw 15
pw_32: times 2 dw 32
pw_34: times 2 dw 34
pw_258: times 2 dw 258
pw_512: times 2 dw 512
pw_1024: times 2 dw 1024
pw_2048: times 2 dw 2048
pw_6903: times 2 dw 6903
pw_8192: times 2 dw 8192
pd_32: dd 32
pd_63: dd 63
pd_512: dd 512
pd_32768: dd 32768
pd_0x3ff: dd 0x3ff
pd_0x4000: dd 0x4000
pq_0x40000000: dq 0x40000000
cextern mc_subpel_filters
cextern mc_warp_filter2
cextern resize_filter
cextern z_filter_s
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
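; mc_subpel_filters is int8_t[5][15][8] and the mx/my indices used below run
; from 1..15, so the -8 bias (one 8-byte row) lets [base+idx*8] address row
; idx-1 without an extra subtraction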
%macro BASE_JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - %3)
%xdefine %%base %1_%2
%%table:
%rep %0 - 2
dw %%base %+ _w%3 - %%base
%rotate 1
%endrep
%endmacro
%macro HV_JMP_TABLE 5-*
%xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
%xdefine %%base %1_%3
%assign %%types %4
%if %%types & 1
%xdefine %1_%2_h_%3_table (%%h - %5)
%%h:
%rep %0 - 4
dw %%prefix %+ .h_w%5 - %%base
%rotate 1
%endrep
%rotate 4
%endif
%if %%types & 2
%xdefine %1_%2_v_%3_table (%%v - %5)
%%v:
%rep %0 - 4
dw %%prefix %+ .v_w%5 - %%base
%rotate 1
%endrep
%rotate 4
%endif
%if %%types & 4
%xdefine %1_%2_hv_%3_table (%%hv - %5)
%%hv:
%rep %0 - 4
dw %%prefix %+ .hv_w%5 - %%base
%rotate 1
%endrep
%endif
%endmacro
%macro BIDIR_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - 2*%3)
%xdefine %%base %1_%2_table
%xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 2
dd %%prefix %+ .w%3 - %%base
%rotate 1
%endrep
%endmacro
%macro SCALED_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - %3)
%xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 2
dw %%base %+ .w%3 - %%base
%rotate 1
%endrep
%rotate 2
%%dy_1024:
%xdefine %1_%2_dy1_table (%%dy_1024 - %3)
%rep %0 - 2
dw %%base %+ .dy1_w%3 - %%base
%rotate 1
%endrep
%rotate 2
%%dy_2048:
%xdefine %1_%2_dy2_table (%%dy_2048 - %3)
%rep %0 - 2
dw %%base %+ .dy2_w%3 - %%base
%rotate 1
%endrep
%endmacro
%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_8bpc_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_8bpc_avx2.prep)
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 6tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 6tap, avx2, 1, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 32, 32
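; the tables above hold label offsets relative to a base symbol; the function
; prologues recover the width-specific handler with a pattern like the one
; used at .put below:
;   movzx wd, word [r7+wq*2+table_offset(put,)]
;   add wq, r7
;   jmp wq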
SECTION .text
INIT_XMM avx2
cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
movifnidn mxyd, r6m ; mx
lea r7, [put_avx2]
tzcnt wd, wm
movifnidn hd, hm
test mxyd, mxyd
jnz .h
mov mxyd, r7m ; my
test mxyd, mxyd
jnz .v
.put:
movzx wd, word [r7+wq*2+table_offset(put,)]
add wq, r7
jmp wq
.put_w2:
movzx r6d, word [srcq+ssq*0]
movzx r7d, word [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mov [dstq+dsq*0], r6w
mov [dstq+dsq*1], r7w
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w2
RET
.put_w4:
mov r6d, [srcq+ssq*0]
mov r7d, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mov [dstq+dsq*0], r6d
mov [dstq+dsq*1], r7d
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w4
RET
.put_w8:
mov r6, [srcq+ssq*0]
mov r7, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mov [dstq+dsq*0], r6
mov [dstq+dsq*1], r7
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w8
RET
.put_w16:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w16
RET
INIT_YMM avx2
.put_w32:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w32
RET
.put_w64:
movu m0, [srcq+ssq*0+32*0]
movu m1, [srcq+ssq*0+32*1]
movu m2, [srcq+ssq*1+32*0]
movu m3, [srcq+ssq*1+32*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0+32*0], m0
mova [dstq+dsq*0+32*1], m1
mova [dstq+dsq*1+32*0], m2
mova [dstq+dsq*1+32*1], m3
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w64
RET
.put_w128:
movu m0, [srcq+32*0]
movu m1, [srcq+32*1]
movu m2, [srcq+32*2]
movu m3, [srcq+32*3]
add srcq, ssq
mova [dstq+32*0], m0
mova [dstq+32*1], m1
mova [dstq+32*2], m2
mova [dstq+32*3], m3
add dstq, dsq
dec hd
jg .put_w128
RET
.h:
; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
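; mxyd*255+16 == (mx<<8) | (16-mx), i.e. both weights packed as a byte pair,
; so pmaddubsw on (src[x], src[x+1]) pairs yields the 16x-scaled result
; directly; pmulhrsw with pw_2048 then performs the (x+8)>>4 rounding, since
; (x*2048+16384)>>15 == (x+8)>>4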
imul mxyd, 255
vbroadcasti128 m4, [z_filter_s+2]
add mxyd, 16
movd xm5, mxyd
mov mxyd, r7m ; my
vpbroadcastw m5, xm5
test mxyd, mxyd
jnz .hv
movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)]
vpbroadcastd m3, [pw_2048]
add wq, r7
jmp wq
.h_w2:
movd xm0, [srcq+ssq*0]
pinsrd xm0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
pshufb xm0, xm4
pmaddubsw xm0, xm5
pmulhrsw xm0, xm3
packuswb xm0, xm0
pextrw [dstq+dsq*0], xm0, 0
pextrw [dstq+dsq*1], xm0, 2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w2
RET
.h_w4:
mova xm4, [bilin_h_shuf4]
.h_w4_loop:
movq xm0, [srcq+ssq*0]
movhps xm0, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pshufb xm0, xm4
pmaddubsw xm0, xm5
pmulhrsw xm0, xm3
packuswb xm0, xm0
movd [dstq+dsq*0], xm0
pextrd [dstq+dsq*1], xm0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w4_loop
RET
.h_w8:
movu xm0, [srcq+ssq*0]
movu xm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pshufb xm0, xm4
pshufb xm1, xm4
pmaddubsw xm0, xm5
pmaddubsw xm1, xm5
pmulhrsw xm0, xm3
pmulhrsw xm1, xm3
packuswb xm0, xm1
movq [dstq+dsq*0], xm0
movhps [dstq+dsq*1], xm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8
RET
.h_w16:
movu xm0, [srcq+ssq*0+8*0]
vinserti128 m0, [srcq+ssq*1+8*0], 1
movu xm1, [srcq+ssq*0+8*1]
vinserti128 m1, [srcq+ssq*1+8*1], 1
lea srcq, [srcq+ssq*2]
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
mova [dstq+dsq*0], xm0
vextracti128 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w16
RET
.h_w32:
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
add srcq, ssq
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
mova [dstq], m0
add dstq, dsq
dec hd
jg .h_w32
RET
.h_w64:
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
movu m1, [srcq+8*4]
movu m2, [srcq+8*5]
add srcq, ssq
pshufb m1, m4
pshufb m2, m4
pmaddubsw m1, m5
pmaddubsw m2, m5
pmulhrsw m1, m3
pmulhrsw m2, m3
packuswb m1, m2
mova [dstq+32*0], m0
mova [dstq+32*1], m1
add dstq, dsq
dec hd
jg .h_w64
RET
.h_w128:
mov r6, -32*3
.h_w128_loop:
movu m0, [srcq+r6+32*3+8*0]
movu m1, [srcq+r6+32*3+8*1]
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
mova [dstq+r6+32*3], m0
add r6, 32
jle .h_w128_loop
add srcq, ssq
add dstq, dsq
dec hd
jg .h_w128
RET
.v:
movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
imul mxyd, 255
vpbroadcastd m5, [pw_2048]
add mxyd, 16
add wq, r7
movd xm4, mxyd
vpbroadcastw m4, xm4
jmp wq
.v_w2:
movd xm0, [srcq+ssq*0]
.v_w2_loop:
pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1
lea srcq, [srcq+ssq*2]
pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1
pshuflw xm1, xm1, q2301 ; 1 0
punpcklbw xm1, xm0
pmaddubsw xm1, xm4
pmulhrsw xm1, xm5
packuswb xm1, xm1
pextrw [dstq+dsq*0], xm1, 1
pextrw [dstq+dsq*1], xm1, 0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w2_loop
RET
.v_w4:
movd xm0, [srcq+ssq*0]
.v_w4_loop:
vpbroadcastd xm2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vpblendd xm1, xm2, xm0, 0x01 ; 0 1
vpbroadcastd xm0, [srcq+ssq*0]
vpblendd xm2, xm0, 0x02 ; 1 2
punpcklbw xm1, xm2
pmaddubsw xm1, xm4
pmulhrsw xm1, xm5
packuswb xm1, xm1
movd [dstq+dsq*0], xm1
pextrd [dstq+dsq*1], xm1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
RET
.v_w8:
movq xm0, [srcq+ssq*0]
.v_w8_loop:
movq xm2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklbw xm1, xm0, xm2
movq xm0, [srcq+ssq*0]
punpcklbw xm2, xm0
pmaddubsw xm1, xm4
pmaddubsw xm2, xm4
pmulhrsw xm1, xm5
pmulhrsw xm2, xm5
packuswb xm1, xm2
movq [dstq+dsq*0], xm1
movhps [dstq+dsq*1], xm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
RET
.v_w16:
movu xm0, [srcq+ssq*0]
.v_w16_loop:
vbroadcasti128 m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vpblendd m2, m3, m0, 0x0f ; 0 1
vbroadcasti128 m0, [srcq+ssq*0]
vpblendd m3, m0, 0xf0 ; 1 2
punpcklbw m1, m2, m3
punpckhbw m2, m3
pmaddubsw m1, m4
pmaddubsw m2, m4
pmulhrsw m1, m5
pmulhrsw m2, m5
packuswb m1, m2
mova [dstq+dsq*0], xm1
vextracti128 [dstq+dsq*1], m1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w16_loop
RET
.v_w32:
%macro PUT_BILIN_V_W32 0
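; one full-height pass over a 32-byte column; used directly for w32 and once
; per column by .v_w128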
movu m0, [srcq+ssq*0]
%%loop:
movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklbw m1, m0, m3
punpckhbw m2, m0, m3
movu m0, [srcq+ssq*0]
pmaddubsw m1, m4
pmaddubsw m2, m4
pmulhrsw m1, m5
pmulhrsw m2, m5
packuswb m1, m2
punpcklbw m2, m3, m0
punpckhbw m3, m0
pmaddubsw m2, m4
pmaddubsw m3, m4
pmulhrsw m2, m5
pmulhrsw m3, m5
packuswb m2, m3
mova [dstq+dsq*0], m1
mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg %%loop
%endmacro
PUT_BILIN_V_W32
RET
.v_w64:
movu m0, [srcq+32*0]
movu m1, [srcq+32*1]
.v_w64_loop:
add srcq, ssq
movu m3, [srcq+32*0]
punpcklbw m2, m0, m3
punpckhbw m0, m3
pmaddubsw m2, m4
pmaddubsw m0, m4
pmulhrsw m2, m5
pmulhrsw m0, m5
packuswb m2, m0
mova m0, m3
movu m3, [srcq+32*1]
mova [dstq+32*0], m2
punpcklbw m2, m1, m3
punpckhbw m1, m3
pmaddubsw m2, m4
pmaddubsw m1, m4
pmulhrsw m2, m5
pmulhrsw m1, m5
packuswb m2, m1
mova m1, m3
mova [dstq+32*1], m2
add dstq, dsq
dec hd
jg .v_w64_loop
RET
.v_w128:
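; r6d packs two counters: bits 8+ hold the number of remaining 32-byte
; columns (3 after the first), the low byte reloads h for every column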
lea r6d, [hq+(3<<8)]
mov r4, srcq
mov r7, dstq
.v_w128_loop:
PUT_BILIN_V_W32
add r4, 32
add r7, 32
movzx hd, r6b
mov srcq, r4
mov dstq, r7
sub r6d, 1<<8
jg .v_w128_loop
RET
.hv:
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
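; implemented below by doubling the horizontal weights (paddb m5, m5) so the
; intermediates are 2*A with A = (16-mx)*src[x] + mx*src[x+1]; my is
; pre-scaled by 2048 (shl 11), pmulhw on the doubled row difference gives
; (my*(A1-A0))>>4, pavgw with pw_15 turns 2*A0 into A0+8, and the final
; psrlw by 4 completes (A0 + ((my*(A1-A0))>>4) + 8) >> 4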
movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
WIN64_SPILL_XMM 8
shl mxyd, 11 ; can't shift by 12 due to signed overflow
vpbroadcastd m7, [pw_15]
movd xm6, mxyd
add wq, r7
paddb m5, m5
vpbroadcastw m6, xm6
jmp wq
.hv_w2:
vpbroadcastd xm0, [srcq+ssq*0]
pshufb xm0, xm4
pmaddubsw xm0, xm5
.hv_w2_loop:
movd xm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pinsrd xm1, [srcq+ssq*0], 1
pshufb xm1, xm4
pmaddubsw xm1, xm5 ; 1 _ 2 _
shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _
mova xm0, xm1
psubw xm1, xm2
pmulhw xm1, xm6
pavgw xm2, xm7
paddw xm1, xm2
psrlw xm1, 4
packuswb xm1, xm1
pextrw [dstq+dsq*0], xm1, 0
pextrw [dstq+dsq*1], xm1, 2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w2_loop
RET
.hv_w4:
mova xm4, [bilin_h_shuf4]
movddup xm0, [srcq+ssq*0]
pshufb xm0, xm4
pmaddubsw xm0, xm5
.hv_w4_loop:
movq xm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movhps xm1, [srcq+ssq*0]
pshufb xm1, xm4
pmaddubsw xm1, xm5 ; 1 2
shufps xm2, xm0, xm1, q1032 ; 0 1
mova xm0, xm1
psubw xm1, xm2
pmulhw xm1, xm6
pavgw xm2, xm7
paddw xm1, xm2
psrlw xm1, 4
packuswb xm1, xm1
movd [dstq+dsq*0], xm1
pextrd [dstq+dsq*1], xm1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w4_loop
RET
.hv_w8:
vbroadcasti128 m0, [srcq+ssq*0]
pshufb m0, m4
pmaddubsw m0, m5
.hv_w8_loop:
movu xm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti128 m1, [srcq+ssq*0], 1
pshufb m1, m4
pmaddubsw m1, m5 ; 1 2
vperm2i128 m2, m0, m1, 0x21 ; 0 1
mova m0, m1
psubw m1, m2
pmulhw m1, m6
pavgw m2, m7
paddw m1, m2
psrlw m1, 4
vextracti128 xm2, m1, 1
packuswb xm1, xm2
movq [dstq+dsq*0], xm1
movhps [dstq+dsq*1], xm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
RET
.hv_w16:
movu m0, [srcq+ssq*0+8*0]
vinserti128 m0, [srcq+ssq*0+8*1], 1
pshufb m0, m4
pmaddubsw m0, m5
.hv_w16_loop:
movu xm2, [srcq+ssq*1+8*0]
vinserti128 m2, [srcq+ssq*1+8*1], 1
lea srcq, [srcq+ssq*2]
movu xm3, [srcq+ssq*0+8*0]
vinserti128 m3, [srcq+ssq*0+8*1], 1
pshufb m2, m4
pshufb m3, m4
pmaddubsw m2, m5
psubw m1, m2, m0
pmulhw m1, m6
pavgw m0, m7
paddw m1, m0
pmaddubsw m0, m3, m5
psubw m3, m0, m2
pmulhw m3, m6
pavgw m2, m7
paddw m3, m2
psrlw m1, 4
psrlw m3, 4
packuswb m1, m3
vpermq m1, m1, q3120
mova [dstq+dsq*0], xm1
vextracti128 [dstq+dsq*1], m1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w16_loop
RET
.hv_w128:
lea r6d, [hq+(3<<16)]
jmp .hv_w32_start
.hv_w64:
lea r6d, [hq+(1<<16)]
.hv_w32_start:
mov r4, srcq
mov r7, dstq
.hv_w32:
%if WIN64
movaps r4m, xmm8
%endif
.hv_w32_loop0:
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
.hv_w32_loop:
add srcq, ssq
movu m2, [srcq+8*0]
movu m3, [srcq+8*1]
pshufb m2, m4
pshufb m3, m4
pmaddubsw m2, m5
pmaddubsw m3, m5
psubw m8, m2, m0
pmulhw m8, m6
pavgw m0, m7
paddw m8, m0
mova m0, m2
psubw m2, m3, m1
pmulhw m2, m6
pavgw m1, m7
paddw m2, m1
mova m1, m3
psrlw m8, 4
psrlw m2, 4
packuswb m8, m2
mova [dstq], m8
add dstq, dsq
dec hd
jg .hv_w32_loop
add r4, 32
add r7, 32
movzx hd, r6b
mov srcq, r4
mov dstq, r7
sub r6d, 1<<16
jg .hv_w32_loop0
%if WIN64
movaps xmm8, r4m
%endif
RET
cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
movifnidn mxyd, r5m ; mx
lea r6, [prep%+SUFFIX]
tzcnt wd, wm
movifnidn hd, hm
test mxyd, mxyd
jnz .h
mov mxyd, r6m ; my
test mxyd, mxyd
jnz .v
.prep:
movzx wd, word [r6+wq*2+table_offset(prep,)]
add wq, r6
lea stride3q, [strideq*3]
jmp wq
.prep_w4:
movd xm0, [srcq+strideq*0]
pinsrd xm0, [srcq+strideq*1], 1
pinsrd xm0, [srcq+strideq*2], 2
pinsrd xm0, [srcq+stride3q ], 3
lea srcq, [srcq+strideq*4]
pmovzxbw m0, xm0
psllw m0, 4
mova [tmpq], m0
add tmpq, 32
sub hd, 4
jg .prep_w4
RET
.prep_w8:
movq xm0, [srcq+strideq*0]
movhps xm0, [srcq+strideq*1]
movq xm1, [srcq+strideq*2]
movhps xm1, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
pmovzxbw m0, xm0
pmovzxbw m1, xm1
psllw m0, 4
psllw m1, 4
mova [tmpq+32*0], m0
mova [tmpq+32*1], m1
add tmpq, 32*2
sub hd, 4
jg .prep_w8
RET
.prep_w16:
pmovzxbw m0, [srcq+strideq*0]
pmovzxbw m1, [srcq+strideq*1]
pmovzxbw m2, [srcq+strideq*2]
pmovzxbw m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
psllw m0, 4
psllw m1, 4
psllw m2, 4
psllw m3, 4
mova [tmpq+32*0], m0
mova [tmpq+32*1], m1
mova [tmpq+32*2], m2
mova [tmpq+32*3], m3
add tmpq, 32*4
sub hd, 4
jg .prep_w16
RET
.prep_w32:
pmovzxbw m0, [srcq+strideq*0+16*0]
pmovzxbw m1, [srcq+strideq*0+16*1]
pmovzxbw m2, [srcq+strideq*1+16*0]
pmovzxbw m3, [srcq+strideq*1+16*1]
lea srcq, [srcq+strideq*2]
psllw m0, 4
psllw m1, 4
psllw m2, 4
psllw m3, 4
mova [tmpq+32*0], m0
mova [tmpq+32*1], m1
mova [tmpq+32*2], m2
mova [tmpq+32*3], m3
add tmpq, 32*4
sub hd, 2
jg .prep_w32
RET
.prep_w64:
pmovzxbw m0, [srcq+16*0]
pmovzxbw m1, [srcq+16*1]
pmovzxbw m2, [srcq+16*2]
pmovzxbw m3, [srcq+16*3]
add srcq, strideq
psllw m0, 4
psllw m1, 4
psllw m2, 4
psllw m3, 4
mova [tmpq+32*0], m0
mova [tmpq+32*1], m1
mova [tmpq+32*2], m2
mova [tmpq+32*3], m3
add tmpq, 32*4
dec hd
jg .prep_w64
RET
.prep_w128:
pmovzxbw m0, [srcq+16*0]
pmovzxbw m1, [srcq+16*1]
pmovzxbw m2, [srcq+16*2]
pmovzxbw m3, [srcq+16*3]
psllw m0, 4
psllw m1, 4
psllw m2, 4
psllw m3, 4
mova [tmpq+32*0], m0
mova [tmpq+32*1], m1
mova [tmpq+32*2], m2
mova [tmpq+32*3], m3
pmovzxbw m0, [srcq+16*4]
pmovzxbw m1, [srcq+16*5]
pmovzxbw m2, [srcq+16*6]
pmovzxbw m3, [srcq+16*7]
add tmpq, 32*8
add srcq, strideq
psllw m0, 4
psllw m1, 4
psllw m2, 4
psllw m3, 4
mova [tmpq-32*4], m0
mova [tmpq-32*3], m1
mova [tmpq-32*2], m2
mova [tmpq-32*1], m3
dec hd
jg .prep_w128
RET
.h:
; 16 * src[x] + (mx * (src[x + 1] - src[x]))
; = (16 - mx) * src[x] + mx * src[x + 1]
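; same weight packing as in put_bilin.h above, but the 16x-scaled pmaddubsw
; result is stored as-is (no rounding), keeping 4 extra fractional bits for
; the compound functions (avg/w_avg/mask) that consume the prep buffer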
imul mxyd, 255
vbroadcasti128 m4, [z_filter_s+2]
add mxyd, 16
movd xm5, mxyd
mov mxyd, r6m ; my
vpbroadcastw m5, xm5
test mxyd, mxyd
jnz .hv
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
add wq, r6
lea stride3q, [strideq*3]
jmp wq
.h_w4:
vbroadcasti128 m4, [bilin_h_shuf4]
.h_w4_loop:
movq xm0, [srcq+strideq*0]
movhps xm0, [srcq+strideq*1]
movq xm1, [srcq+strideq*2]
movhps xm1, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
vinserti128 m0, xm1, 1
pshufb m0, m4
pmaddubsw m0, m5
mova [tmpq], m0
add tmpq, 32
sub hd, 4
jg .h_w4_loop
RET
.h_w8:
.h_w8_loop:
movu xm0, [srcq+strideq*0]
vinserti128 m0, [srcq+strideq*1], 1
movu xm1, [srcq+strideq*2]
vinserti128 m1, [srcq+stride3q ], 1
lea srcq, [srcq+strideq*4]
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
mova [tmpq+32*0], m0
mova [tmpq+32*1], m1
add tmpq, 32*2
sub hd, 4
jg .h_w8_loop
RET
.h_w16:
.h_w16_loop:
movu xm0, [srcq+strideq*0+8*0]
vinserti128 m0, [srcq+strideq*0+8*1], 1
movu xm1, [srcq+strideq*1+8*0]
vinserti128 m1, [srcq+strideq*1+8*1], 1
movu xm2, [srcq+strideq*2+8*0]
vinserti128 m2, [srcq+strideq*2+8*1], 1
movu xm3, [srcq+stride3q +8*0]
vinserti128 m3, [srcq+stride3q +8*1], 1
lea srcq, [srcq+strideq*4]
pshufb m0, m4
pshufb m1, m4
pshufb m2, m4
pshufb m3, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmaddubsw m2, m5
pmaddubsw m3, m5
mova [tmpq+32*0], m0
mova [tmpq+32*1], m1
mova [tmpq+32*2], m2
mova [tmpq+32*3], m3
add tmpq, 32*4
sub hd, 4
jg .h_w16_loop
RET
.h_w32:
.h_w32_loop:
movu xm0, [srcq+strideq*0+8*0]
vinserti128 m0, [srcq+strideq*0+8*1], 1
movu xm1, [srcq+strideq*0+8*2]
vinserti128 m1, [srcq+strideq*0+8*3], 1
movu xm2, [srcq+strideq*1+8*0]
vinserti128 m2, [srcq+strideq*1+8*1], 1
movu xm3, [srcq+strideq*1+8*2]
vinserti128 m3, [srcq+strideq*1+8*3], 1
lea srcq, [srcq+strideq*2]
pshufb m0, m4
pshufb m1, m4
pshufb m2, m4
pshufb m3, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmaddubsw m2, m5
pmaddubsw m3, m5
mova [tmpq+32*0], m0
mova [tmpq+32*1], m1
mova [tmpq+32*2], m2
mova [tmpq+32*3], m3
add tmpq, 32*4
sub hd, 2
jg .h_w32_loop
RET
.h_w64:
movu xm0, [srcq+8*0]
vinserti128 m0, [srcq+8*1], 1
movu xm1, [srcq+8*2]
vinserti128 m1, [srcq+8*3], 1
movu xm2, [srcq+8*4]
vinserti128 m2, [srcq+8*5], 1
movu xm3, [srcq+8*6]
vinserti128 m3, [srcq+8*7], 1
add srcq, strideq
pshufb m0, m4
pshufb m1, m4
pshufb m2, m4
pshufb m3, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmaddubsw m2, m5
pmaddubsw m3, m5
mova [tmpq+32*0], m0
mova [tmpq+32*1], m1
mova [tmpq+32*2], m2
mova [tmpq+32*3], m3
add tmpq, 32*4
dec hd
jg .h_w64
RET
.h_w128:
movu xm0, [srcq+8*0]
vinserti128 m0, [srcq+8*1], 1
movu xm1, [srcq+8*2]
vinserti128 m1, [srcq+8*3], 1
movu xm2, [srcq+8*4]
vinserti128 m2, [srcq+8*5], 1
movu xm3, [srcq+8*6]
vinserti128 m3, [srcq+8*7], 1
pshufb m0, m4
pshufb m1, m4
pshufb m2, m4
pshufb m3, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmaddubsw m2, m5
pmaddubsw m3, m5
mova [tmpq+32*0], m0
mova [tmpq+32*1], m1
mova [tmpq+32*2], m2
mova [tmpq+32*3], m3
movu xm0, [srcq+8* 8]
vinserti128 m0, [srcq+8* 9], 1
movu xm1, [srcq+8*10]
vinserti128 m1, [srcq+8*11], 1
movu xm2, [srcq+8*12]
vinserti128 m2, [srcq+8*13], 1
movu xm3, [srcq+8*14]
vinserti128 m3, [srcq+8*15], 1
add tmpq, 32*8
add srcq, strideq
pshufb m0, m4
pshufb m1, m4
pshufb m2, m4
pshufb m3, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmaddubsw m2, m5
pmaddubsw m3, m5
mova [tmpq-32*4], m0
mova [tmpq-32*3], m1
mova [tmpq-32*2], m2
mova [tmpq-32*1], m3
dec hd
jg .h_w128
RET
.v:
WIN64_SPILL_XMM 7
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
imul mxyd, 255
add mxyd, 16
add wq, r6
lea stride3q, [strideq*3]
movd xm6, mxyd
vpbroadcastw m6, xm6
jmp wq
.v_w4:
movd xm0, [srcq+strideq*0]
.v_w4_loop:
vpbroadcastd m1, [srcq+strideq*2]
vpbroadcastd xm2, [srcq+strideq*1]
vpbroadcastd m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
vpblendd m1, m0, 0x05 ; 0 2 2 2
vpbroadcastd m0, [srcq+strideq*0]
vpblendd m3, m2, 0x0f ; 1 1 3 3
vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4
vpblendd m1, m3, 0xaa ; 0 1 2 3
vpblendd m2, m3, 0x55 ; 1 2 3 4
punpcklbw m1, m2
pmaddubsw m1, m6
mova [tmpq], m1
add tmpq, 32
sub hd, 4
jg .v_w4_loop
RET
.v_w8:
movq xm0, [srcq+strideq*0]
.v_w8_loop:
vpbroadcastq m1, [srcq+strideq*2]
vpbroadcastq m2, [srcq+strideq*1]
vpbroadcastq m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
vpblendd m1, m0, 0x03 ; 0 2 2 2
vpbroadcastq m0, [srcq+strideq*0]
vpblendd m2, m3, 0xcc ; 1 3 1 3
vpblendd m3, m2, m1, 0xf0 ; 1 3 2 2
vpblendd m2, m1, 0x0f ; 0 2 1 3
vpblendd m3, m0, 0xc0 ; 1 3 2 4
punpcklbw m1, m2, m3
punpckhbw m2, m3
pmaddubsw m1, m6
pmaddubsw m2, m6
mova [tmpq+32*0], m1
mova [tmpq+32*1], m2
add tmpq, 32*2
sub hd, 4
jg .v_w8_loop
RET
.v_w16:
vbroadcasti128 m0, [srcq+strideq*0]
.v_w16_loop:
vbroadcasti128 m1, [srcq+strideq*1]
vbroadcasti128 m2, [srcq+strideq*2]
vbroadcasti128 m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
shufpd m4, m0, m2, 0x0c ; 0 2
vbroadcasti128 m0, [srcq+strideq*0]
shufpd m1, m3, 0x0c ; 1 3
shufpd m2, m0, 0x0c ; 2 4
punpcklbw m3, m4, m1
punpcklbw m5, m1, m2
punpckhbw m4, m1
punpckhbw m1, m2
pmaddubsw m3, m6
pmaddubsw m5, m6
pmaddubsw m4, m6
pmaddubsw m1, m6
mova [tmpq+32*0], m3
mova [tmpq+32*1], m5
mova [tmpq+32*2], m4
mova [tmpq+32*3], m1
add tmpq, 32*4
sub hd, 4
jg .v_w16_loop
RET
.v_w32:
vpermq m0, [srcq+strideq*0], q3120
.v_w32_loop:
vpermq m1, [srcq+strideq*1], q3120
vpermq m2, [srcq+strideq*2], q3120
vpermq m3, [srcq+stride3q ], q3120
lea srcq, [srcq+strideq*4]
punpcklbw m4, m0, m1
punpckhbw m5, m0, m1
vpermq m0, [srcq+strideq*0], q3120
pmaddubsw m4, m6
pmaddubsw m5, m6
mova [tmpq+32*0], m4
mova [tmpq+32*1], m5
punpcklbw m4, m1, m2
punpckhbw m1, m2
pmaddubsw m4, m6
pmaddubsw m1, m6
punpcklbw m5, m2, m3
punpckhbw m2, m3
pmaddubsw m5, m6
pmaddubsw m2, m6
mova [tmpq+32*2], m4
mova [tmpq+32*3], m1
add tmpq, 32*8
punpcklbw m1, m3, m0
punpckhbw m3, m0
pmaddubsw m1, m6
pmaddubsw m3, m6
mova [tmpq-32*4], m5
mova [tmpq-32*3], m2
mova [tmpq-32*2], m1
mova [tmpq-32*1], m3
sub hd, 4
jg .v_w32_loop
RET
.v_w64:
vpermq m0, [srcq+strideq*0+32*0], q3120
vpermq m1, [srcq+strideq*0+32*1], q3120
.v_w64_loop:
vpermq m2, [srcq+strideq*1+32*0], q3120
vpermq m3, [srcq+strideq*1+32*1], q3120
lea srcq, [srcq+strideq*2]
punpcklbw m4, m0, m2
punpckhbw m0, m2
pmaddubsw m4, m6
pmaddubsw m0, m6
mova [tmpq+32*0], m4
mova [tmpq+32*1], m0
punpcklbw m4, m1, m3
punpckhbw m5, m1, m3
vpermq m0, [srcq+strideq*0+32*0], q3120
vpermq m1, [srcq+strideq*0+32*1], q3120
pmaddubsw m4, m6
pmaddubsw m5, m6
mova [tmpq+32*2], m4
mova [tmpq+32*3], m5
add tmpq, 32*8
punpcklbw m4, m2, m0
punpckhbw m2, m0
punpcklbw m5, m3, m1
punpckhbw m3, m1
pmaddubsw m4, m6
pmaddubsw m2, m6
pmaddubsw m5, m6
pmaddubsw m3, m6
mova [tmpq-32*4], m4
mova [tmpq-32*3], m2
mova [tmpq-32*2], m5
mova [tmpq-32*1], m3
sub hd, 2
jg .v_w64_loop
RET
.v_w128:
lea r6d, [hq+(3<<8)]
mov r3, srcq
mov r5, tmpq
.v_w128_loop0:
vpermq m0, [srcq+strideq*0], q3120
.v_w128_loop:
vpermq m1, [srcq+strideq*1], q3120
lea srcq, [srcq+strideq*2]
punpcklbw m2, m0, m1
punpckhbw m3, m0, m1
vpermq m0, [srcq+strideq*0], q3120
pmaddubsw m2, m6
pmaddubsw m3, m6
punpcklbw m4, m1, m0
punpckhbw m1, m0
pmaddubsw m4, m6
pmaddubsw m1, m6
mova [tmpq+32*0], m2
mova [tmpq+32*1], m3
mova [tmpq+32*8], m4
mova [tmpq+32*9], m1
add tmpq, 32*16
sub hd, 2
jg .v_w128_loop
add r3, 32
add r5, 64
movzx hd, r6b
mov srcq, r3
mov tmpq, r5
sub r6d, 1<<8
jg .v_w128_loop0
RET
.hv:
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
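; with my pre-scaled by 2048 (shl mxyd, 11 below), a single pmulhrsw against
; the row difference computes ((my*diff)+8)>>4 in one step, since
; (x*2048+16384)>>15 == (x+8)>>4; adding back the previous row's horizontal
; intermediate gives the stored value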
WIN64_SPILL_XMM 7
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
shl mxyd, 11
movd xm6, mxyd
vpbroadcastw m6, xm6
add wq, r6
lea stride3q, [strideq*3]
jmp wq
.hv_w4:
vbroadcasti128 m4, [bilin_h_shuf4]
vpbroadcastq m0, [srcq+strideq*0]
pshufb m0, m4
pmaddubsw m0, m5
.hv_w4_loop:
movq xm1, [srcq+strideq*1]
movhps xm1, [srcq+strideq*2]
movq xm2, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
movhps xm2, [srcq+strideq*0]
vinserti128 m1, xm2, 1
pshufb m1, m4
pmaddubsw m1, m5 ; 1 2 3 4
vpblendd m2, m1, m0, 0xc0
vpermq m2, m2, q2103 ; 0 1 2 3
mova m0, m1
psubw m1, m2
pmulhrsw m1, m6
paddw m1, m2
mova [tmpq], m1
add tmpq, 32
sub hd, 4
jg .hv_w4_loop
RET
.hv_w8:
vbroadcasti128 m0, [srcq+strideq*0]
pshufb m0, m4
pmaddubsw m0, m5
.hv_w8_loop:
movu xm1, [srcq+strideq*1]
vinserti128 m1, [srcq+strideq*2], 1
movu xm2, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
vinserti128 m2, [srcq+strideq*0], 1
pshufb m1, m4
pshufb m2, m4
pmaddubsw m1, m5 ; 1 2
vperm2i128 m3, m0, m1, 0x21 ; 0 1
pmaddubsw m0, m2, m5 ; 3 4
vperm2i128 m2, m1, m0, 0x21 ; 2 3
psubw m1, m3
pmulhrsw m1, m6
paddw m1, m3
psubw m3, m0, m2
pmulhrsw m3, m6
paddw m3, m2
mova [tmpq+32*0], m1
mova [tmpq+32*1], m3
add tmpq, 32*2
sub hd, 4
jg .hv_w8_loop
RET
.hv_w16:
movu xm0, [srcq+strideq*0+8*0]
vinserti128 m0, [srcq+strideq*0+8*1], 1
pshufb m0, m4
pmaddubsw m0, m5
.hv_w16_loop:
movu xm1, [srcq+strideq*1+8*0]
vinserti128 m1, [srcq+strideq*1+8*1], 1
lea srcq, [srcq+strideq*2]
movu xm2, [srcq+strideq*0+8*0]
vinserti128 m2, [srcq+strideq*0+8*1], 1
pshufb m1, m4
pshufb m2, m4
pmaddubsw m1, m5
psubw m3, m1, m0
pmulhrsw m3, m6
paddw m3, m0
pmaddubsw m0, m2, m5
psubw m2, m0, m1
pmulhrsw m2, m6
paddw m2, m1
mova [tmpq+32*0], m3
mova [tmpq+32*1], m2
add tmpq, 32*2
sub hd, 2
jg .hv_w16_loop
RET
.hv_w32:
movu xm0, [srcq+8*0]
vinserti128 m0, [srcq+8*1], 1
movu xm1, [srcq+8*2]
vinserti128 m1, [srcq+8*3], 1
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
.hv_w32_loop:
add srcq, strideq
movu xm2, [srcq+8*0]
vinserti128 m2, [srcq+8*1], 1
pshufb m2, m4
pmaddubsw m2, m5
psubw m3, m2, m0
pmulhrsw m3, m6
paddw m3, m0
mova m0, m2
movu xm2, [srcq+8*2]
vinserti128 m2, [srcq+8*3], 1
pshufb m2, m4
pmaddubsw m2, m5
mova [tmpq+32*0], m3
psubw m3, m2, m1
pmulhrsw m3, m6
paddw m3, m1
mova m1, m2
mova [tmpq+32*1], m3
add tmpq, 32*2
dec hd
jg .hv_w32_loop
RET
.hv_w128:
lea r3d, [hq+(7<<8)]
mov r6d, 256
jmp .hv_w64_start
.hv_w64:
lea r3d, [hq+(3<<8)]
mov r6d, 128
.hv_w64_start:
%if WIN64
PUSH r7
%endif
mov r5, srcq
mov r7, tmpq
.hv_w64_loop0:
movu xm0, [srcq+strideq*0+8*0]
vinserti128 m0, [srcq+strideq*0+8*1], 1
pshufb m0, m4
pmaddubsw m0, m5
.hv_w64_loop:
movu xm1, [srcq+strideq*1+8*0]
vinserti128 m1, [srcq+strideq*1+8*1], 1
lea srcq, [srcq+strideq*2]
movu xm2, [srcq+strideq*0+8*0]
vinserti128 m2, [srcq+strideq*0+8*1], 1
pshufb m1, m4
pshufb m2, m4
pmaddubsw m1, m5
psubw m3, m1, m0
pmulhrsw m3, m6
paddw m3, m0
pmaddubsw m0, m2, m5
psubw m2, m0, m1
pmulhrsw m2, m6
paddw m2, m1
mova [tmpq+r6*0], m3
mova [tmpq+r6*1], m2
lea tmpq, [tmpq+r6*2]
sub hd, 2
jg .hv_w64_loop
add r5, 16
add r7, 32
movzx hd, r3b
mov srcq, r5
mov tmpq, r7
sub r3d, 1<<8
jg .hv_w64_loop0
%if WIN64
POP r7
%endif
RET
; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15
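; each constant packs two row offsets into subpel_filters: the high half
; selects the full-width filter set (0=regular, 1=smooth, 2=sharp), the low
; half the 4-tap set used for small blocks (3=regular, 4=smooth; sharp has no
; 4-tap variant and reuses the regular one); the imul by 0x010101 in the
; functions below spreads mx/my across the three low bytes so one add merges
; these offsets with the subpel position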
%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to
cglobal %1_%2_8bpc
mov t0d, FILTER_%3
%ifidn %3, %4
mov t1d, t0d
%else
mov t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro
%if WIN64
DECLARE_REG_TMP 4, 5
%else
DECLARE_REG_TMP 7, 8
%endif
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_8bpc
PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_8bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_8bpc
PUT_8TAP_FN regular, REGULAR, REGULAR
cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
imul mxd, mxm, 0x010101
add mxd, t0d ; 6tap_h, mx, 4tap_h
imul myd, mym, 0x010101
add myd, t1d ; 6tap_v, my, 4tap_v
lea r8, [put_avx2]
mov wd, wm
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
jnz .v
.put:
tzcnt wd, wd
movzx wd, word [r8+wq*2+table_offset(put,)]
add wq, r8
lea r6, [ssq*3]
lea r7, [dsq*3]
%if WIN64
pop r8
%endif
jmp wq
.h_w2:
movzx mxd, mxb
lea srcq, [srcq-1]
vpbroadcastd xm4, [r8+mxq*8+subpel_filters-put_avx2+2]
je .h_w4
mova xm3, [subpel_h_shuf4]
.h_w2_loop:
movq xm0, [srcq+ssq*0]
movhps xm0, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pshufb xm0, xm3
pmaddubsw xm0, xm4
phaddw xm0, xm0
paddw xm0, xm5
psraw xm0, 6
packuswb xm0, xm0
pextrw [dstq+dsq*0], xm0, 0
pextrw [dstq+dsq*1], xm0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w2_loop
RET
.h_w4:
mova xm3, [subpel_h_shufA]
.h_w4_loop:
movq xm0, [srcq+ssq*0]
movq xm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pshufb xm0, xm3
pshufb xm1, xm3
pmaddubsw xm0, xm4
pmaddubsw xm1, xm4
phaddw xm0, xm1
paddw xm0, xm5
psraw xm0, 6
packuswb xm0, xm0
movd [dstq+dsq*0], xm0
pextrd [dstq+dsq*1], xm0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w4_loop
RET
.h:
test myd, 0xf00
jnz .hv
vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
cmp wd, 4
jle .h_w2
WIN64_SPILL_XMM 11
tzcnt wd, wd
vbroadcasti128 m4, [z_filter_s+ 2] ; 01
shr mxd, 16
vbroadcasti128 m6, [z_filter_s+ 6] ; 23
sub srcq, 2
vbroadcasti128 m7, [z_filter_s+10] ; 45
lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2]
movzx wd, word [r8+wq*2+table_offset(put, _6tap_h)]
vpbroadcastw m8, [mxq+0]
vpbroadcastw m9, [mxq+2]
add wq, r8
vpbroadcastw m10, [mxq+4]
jmp wq
.h_w8:
%macro PUT_6TAP_H 3 ; dst/src, tmp[1-2]
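; applies the three horizontal tap pairs (m8/m9/m10) to the byte pairs
; selected by the m4/m6/m7 shuffles, adds the rounding bias kept in m5
; (pw_34) and shifts the word sum down by 6; callers packuswb afterwards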
pshufb m%2, m%1, m4
pmaddubsw m%2, m8
pshufb m%3, m%1, m6
pmaddubsw m%3, m9
pshufb m%1, m7
pmaddubsw m%1, m10
paddw m%2, m5
paddw m%1, m%3
paddw m%1, m%2
psraw m%1, 6
%endmacro
movu xm0, [srcq+ssq*0]
vinserti128 m0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
PUT_6TAP_H 0, 1, 2
vextracti128 xm1, m0, 1
packuswb xm0, xm1
movq [dstq+dsq*0], xm0
movhps [dstq+dsq*1], xm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8
RET
.h_w16:
movu xm0, [srcq+ssq*0+8*0]
vinserti128 m0, [srcq+ssq*1+8*0], 1
movu xm1, [srcq+ssq*0+8*1]
vinserti128 m1, [srcq+ssq*1+8*1], 1
PUT_6TAP_H 0, 2, 3
lea srcq, [srcq+ssq*2]
PUT_6TAP_H 1, 2, 3
packuswb m0, m1
mova [dstq+dsq*0], xm0
vextracti128 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w16
RET
.h_w32:
xor r6d, r6d
jmp .h_start
.h_w64:
mov r6, -32*1
jmp .h_start
.h_w128:
mov r6, -32*3
.h_start:
sub srcq, r6
sub dstq, r6
mov r4, r6
.h_loop:
movu m0, [srcq+r6+8*0]
movu m1, [srcq+r6+8*1]
PUT_6TAP_H 0, 2, 3
PUT_6TAP_H 1, 2, 3
packuswb m0, m1
mova [dstq+r6], m0
add r6, 32
jle .h_loop
add srcq, ssq
add dstq, dsq
mov r6, r4
dec hd
jg .h_loop
RET
.v:
WIN64_SPILL_XMM 9, 12
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
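; blocks with h < 6 use the 4-tap filter index kept in the low byte (myb),
; taller blocks the full filter set from the high half of myd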
tzcnt r6d, wd
movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)]
vpbroadcastd m8, [pw_512]
lea myq, [r8+myq*8+subpel_filters+1-put_avx2]
vpbroadcastw m5, [myq+0]
vpbroadcastw m6, [myq+2]
vpbroadcastw m7, [myq+4]
add r6, r8
mov nsq, ssq
neg nsq
jmp r6
.v_w2:
movd xm2, [srcq+nsq*2]
pinsrw xm2, [srcq+nsq*1], 2
pinsrw xm2, [srcq+ssq*0], 4
pinsrw xm2, [srcq+ssq*1], 6 ; 0 1 2 3
lea srcq, [srcq+ssq*2]
vpbroadcastd xm0, [srcq+ssq*0]
palignr xm3, xm0, xm2, 4 ; 1 2 3 4
punpcklbw xm1, xm2, xm3 ; 01 12
punpckhbw xm2, xm3 ; 23 34
.v_w2_loop:
vpbroadcastd xm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pmaddubsw xm3, xm1, xm5 ; a0 b0
mova xm1, xm2
pmaddubsw xm2, xm6 ; a1 b1
paddw xm3, xm2
vpblendd xm2, xm0, xm4, 0x02 ; 4 5
vpbroadcastd xm0, [srcq+ssq*0]
vpblendd xm4, xm0, 0x02 ; 5 6
punpcklbw xm2, xm4 ; 45 56
pmaddubsw xm4, xm2, xm7 ; a2 b2
paddw xm3, xm4
pmulhrsw xm3, xm8
packuswb xm3, xm3
pextrw [dstq+dsq*0], xm3, 0
pextrw [dstq+dsq*1], xm3, 2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w2_loop
RET
.v_w4:
movd xm2, [srcq+nsq*2]
pinsrd xm2, [srcq+nsq*1], 1
pinsrd xm2, [srcq+ssq*0], 2
pinsrd xm2, [srcq+ssq*1], 3 ; 0 1 2 3
lea srcq, [srcq+ssq*2]
vpbroadcastd xm0, [srcq+ssq*0]
palignr xm3, xm0, xm2, 4 ; 1 2 3 4
punpcklbw xm1, xm2, xm3 ; 01 12
punpckhbw xm2, xm3 ; 23 34
.v_w4_loop:
vpbroadcastd xm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pmaddubsw xm3, xm1, xm5 ; a0 b0
mova xm1, xm2
pmaddubsw xm2, xm6 ; a1 b1
paddw xm3, xm2
vpblendd xm2, xm0, xm4, 0x02 ; 4 5
vpbroadcastd xm0, [srcq+ssq*0]
vpblendd xm4, xm0, 0x02 ; 5 6
punpcklbw xm2, xm4 ; 45 56
pmaddubsw xm4, xm2, xm7 ; a2 b2
paddw xm3, xm4
pmulhrsw xm3, xm8
packuswb xm3, xm3
movd [dstq+dsq*0], xm3
pextrd [dstq+dsq*1], xm3, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
RET
.v_w8:
movq xm1, [srcq+nsq*2]
vpbroadcastq m3, [srcq+nsq*1]
vpbroadcastq m2, [srcq+ssq*0]
vpbroadcastq m4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vpbroadcastq m0, [srcq+ssq*0]
vpblendd m1, m3, 0x30
vpblendd m3, m2, 0x30
punpcklbw m1, m3 ; 01 12
vpblendd m2, m4, 0x30
vpblendd m4, m0, 0x30
punpcklbw m2, m4 ; 23 34
.v_w8_loop:
vpbroadcastq m4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pmaddubsw m3, m1, m5 ; a0 b0
mova m1, m2
pmaddubsw m2, m6 ; a1 b1
paddw m3, m2
vpblendd m2, m0, m4, 0x30
vpbroadcastq m0, [srcq+ssq*0]
vpblendd m4, m0, 0x30
punpcklbw m2, m4 ; 45 56
pmaddubsw m4, m2, m7 ; a2 b2
paddw m3, m4
pmulhrsw m3, m8
vextracti128 xm4, m3, 1
packuswb xm3, xm4
movq [dstq+dsq*0], xm3
movhps [dstq+dsq*1], xm3
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
RET
.v_w16:
.v_w32:
.v_w64:
.v_w128:
lea r6d, [wq*8-128]
WIN64_PUSH_XMM 12
lea r6d, [hq+r6*2]
.v_w16_loop0:
vbroadcasti128 m3, [srcq+nsq*2]
vbroadcasti128 m4, [srcq+nsq*1]
lea r4, [srcq+ssq*2]
vbroadcasti128 m0, [srcq+ssq*0]
vbroadcasti128 m1, [srcq+ssq*1]
mov r7, dstq
vbroadcasti128 m2, [r4+ssq*0]
shufpd m3, m0, 0x0c
shufpd m4, m1, 0x0c
punpcklbw m1, m3, m4 ; 01
punpckhbw m3, m4 ; 23
shufpd m0, m2, 0x0c
punpcklbw m2, m4, m0 ; 12
punpckhbw m4, m0 ; 34
.v_w16_loop:
vbroadcasti128 m9, [r4+ssq*1]
pmaddubsw m10, m1, m5 ; a0
lea r4, [r4+ssq*2]
pmaddubsw m11, m2, m5 ; b0
mova m1, m3
pmaddubsw m3, m6 ; a1
mova m2, m4
pmaddubsw m4, m6 ; b1
paddw m10, m3
vbroadcasti128 m3, [r4+ssq*0]
paddw m11, m4
shufpd m4, m0, m9, 0x0d
shufpd m0, m9, m3, 0x0c
punpcklbw m3, m4, m0 ; 45
punpckhbw m4, m0 ; 56
pmaddubsw m9, m3, m7 ; a2
paddw m10, m9
pmaddubsw m9, m4, m7 ; b2
paddw m11, m9
pmulhrsw m10, m8
pmulhrsw m11, m8
packuswb m10, m11
vpermq m10, m10, q3120
mova [r7+dsq*0], xm10
vextracti128 [r7+dsq*1], m10, 1
lea r7, [r7+dsq*2]
sub hd, 2
jg .v_w16_loop
add srcq, 16
add dstq, 16
movzx hd, r6b
sub r6d, 1<<8
jg .v_w16_loop0
RET
.hv:
WIN64_SPILL_XMM 12, 16
cmp wd, 4
jg .hv_w8
movzx mxd, mxb
dec srcq
vpbroadcastd m6, [r8+mxq*8+subpel_filters-put_avx2+2]
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2]
vpbroadcastd m7, [pw_8192]
punpcklbw m0, m0
vpbroadcastd m8, [pd_512]
psraw m0, 8 ; sign-extend
mov nsq, ssq
pshufd m9, m0, q0000
neg nsq
pshufd m10, m0, q1111
pshufd m11, m0, q2222
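; m9-m11 now hold the three vertical tap pairs as sign-extended words, so the
; vertical pass can pmaddwd them against the 16-bit horizontal intermediates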
cmp wd, 4
je .hv_w4
vbroadcasti128 m5, [subpel_h_shuf4]
movq xm2, [srcq+nsq*2]
movhps xm2, [srcq+nsq*1]
movq xm0, [srcq+ssq*0]
movhps xm0, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vpbroadcastq m1, [srcq+ssq*0]
vpblendd m2, m1, 0x30
pshufb m2, m5
pshufb xm0, xm5
pmaddubsw m2, m6
pmaddubsw xm0, xm6
phaddw m2, m0
pmulhrsw m2, m7
vextracti128 xm0, m2, 1
palignr xm0, xm2, 4
punpcklwd xm1, xm2, xm0 ; 01 12
punpckhwd xm2, xm0 ; 23 34
.hv_w2_loop:
movq xm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movhps xm4, [srcq+ssq*0]
pshufb xm4, xm5
pmaddubsw xm4, xm6
pmaddwd xm3, xm9, xm1 ; a0 b0
mova xm1, xm2
pmaddwd xm2, xm10 ; a1 b1
phaddw xm4, xm4
paddd xm3, xm2
pmulhrsw xm4, xm7
palignr xm2, xm4, xm0, 12
mova xm0, xm4
punpcklwd xm2, xm4 ; 45 56
pmaddwd xm4, xm11, xm2 ; a2 b2
paddd xm3, xm8
paddd xm3, xm4
psrad xm3, 10
packssdw xm3, xm3
packuswb xm3, xm3
pextrw [dstq+dsq*0], xm3, 0
pextrw [dstq+dsq*1], xm3, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w2_loop
RET
.hv_w4:
mova m5, [subpel_h_shuf4]
vpbroadcastq m2, [srcq+nsq*2]
vpbroadcastq m4, [srcq+nsq*1]
vpbroadcastq m1, [srcq+ssq*0]
vpbroadcastq m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vpbroadcastq m0, [srcq+ssq*0]
vpblendd m2, m4, 0xcc ; 0 1
vpblendd m1, m3, 0xcc ; 2 3
pshufb m2, m5
pshufb m1, m5
pshufb m0, m5
pmaddubsw m2, m6
pmaddubsw m1, m6
pmaddubsw m0, m6
phaddw m2, m1
phaddw m0, m0
pmulhrsw m2, m7
pmulhrsw m0, m7
palignr m3, m0, m2, 4
punpcklwd m1, m2, m3 ; 01 12
punpckhwd m2, m3 ; 23 34
.hv_w4_loop:
vpbroadcastq m4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pmaddwd m3, m9, m1 ; a0 b0
mova m1, m2
pmaddwd m2, m10 ; a1 b1
paddd m3, m2
vpbroadcastq m2, [srcq+ssq*0]
vpblendd m4, m2, 0xcc ; 5 6
pshufb m4, m5
pmaddubsw m4, m6
phaddw m4, m4
pmulhrsw m4, m7
palignr m2, m4, m0, 12
mova m0, m4
punpcklwd m2, m4 ; 45 56
pmaddwd m4, m11, m2 ; a2 b2
paddd m3, m8
paddd m3, m4
psrad m3, 10
vextracti128 xm4, m3, 1
packssdw xm3, xm4
packuswb xm3, xm3
pshuflw xm3, xm3, q3120
movd [dstq+dsq*0], xm3
pextrd [dstq+dsq*1], xm3, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w4_loop
RET
.hv_w8:
shr mxd, 16
sub srcq, 2
lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2]
WIN64_PUSH_XMM 16
vpbroadcastw m10, [mxq+0]
vpbroadcastw m11, [mxq+2]
vpbroadcastw m12, [mxq+4]
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2]
lea r6d, [wq*8-64]
vbroadcasti128 m8, [z_filter_s+ 6]
punpcklbw m0, m0
vbroadcasti128 m9, [z_filter_s+10]
psraw m0, 8 ; sign-extend
mov nsq, ssq
pshufd m13, m0, q0000
neg nsq
pshufd m14, m0, q1111
lea r6d, [hq+r6*4]
pshufd m15, m0, q2222
.hv_w8_loop0:
vbroadcasti128 m7, [z_filter_s+2]
movu xm3, [srcq+nsq*2]
lea r4, [srcq+ssq*2]
movu xm4, [srcq+nsq*1]
vbroadcasti128 m0, [srcq+ssq*0]
mov r7, dstq
vinserti128 m4, [srcq+ssq*1], 1 ; 1 3
vpblendd m3, m0, 0xf0 ; 0 2
vinserti128 m0, [r4+ssq*0], 1 ; 2 4
vpbroadcastd m5, [pw_8192]