; Copyright © 2020, VideoLAN and dav1d authors
; Copyright © 2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 64
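; Byte-permute tables for the shuffle instructions below. Indices 0-63
; select bytes within a single zmm source (vpermb); tables with indices
; above 63 are for the two-source byte permutes (vpermi2b/vpermt2b),
; where 64-127 selects from the second source register.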
spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41
spel_h_shufC: db 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17
db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49
db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25
db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57
spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45
spel_h_shufD: db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21
db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53
db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29
db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61
spel_v_shuf8: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
spel_v_shuf16: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
prep_endA: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
prep_endB: db 1, 2, 5, 6, 9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46
db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62
db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110
db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126
prep_endC: db 1, 2, 5, 6, 9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78
db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94
db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110
db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126
spel_shuf4a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46
db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
db 49, 50, 65, 66, 53, 54, 69, 70, 57, 58, 73, 74, 61, 62, 77, 78
spel_shuf4b: db 50, 51, 65, 66, 54, 55, 69, 70, 58, 59, 73, 74, 62, 63, 77, 78
db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
db 81, 82, 97, 98, 85, 86,101,102, 89, 90,105,106, 93, 94,109,110
db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf8a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78
db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110
spel_shuf8b: db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78
db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110
db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf16: db 1, 2, 33, 34, 5, 6, 37, 38, 9, 10, 41, 42, 13, 14, 45, 46
db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62
db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110
db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126
spel_shuf32: db 1, 2, 65, 66, 5, 6, 69, 70, 9, 10, 73, 74, 13, 14, 77, 78
db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94
db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110
db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126
spel_h_shuf2b: db 1, 2, 17, 18, 5, 6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38
db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50, 9, 10, 53, 54, 13, 14
db 9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46
spel_shuf2: db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30
spel_h_shuf2a: db 0, 1, 2, 3, 2, 3, 4, 5, 16, 17, 18, 19, 18, 19, 20, 21
db 4, 5, 6, 7, 6, 7, 8, 9, 20, 21, 22, 23, 22, 23, 24, 25
w_mask_end42x: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
w_mask_end444: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
w_mask_shuf4: db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94
db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126
w_mask_shuf8: db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94
db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126
w_mask_shuf16: db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110
db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126
warp8x8_permA: db 0, 1, 2, 3, 32, 33, 34, 35, 2, 3, 4, 5, 34, 35, 36, 37
db 4, 5, 6, 7, 36, 37, 38, 39, 6, 7, 8, 9, 38, 39, 40, 41
db 8, 9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45
db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53
db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57
db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61
warp8x8_end: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
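; deint_q_shuf has no storage of its own: vpermq only reads the low 3 bits
; of each qword selector, and the low bits of the 64 bytes that follow
; (pd_0to7 plus the padding dwords between the pw_* constants) evaluate to
; the commented-out 0, 2, 4, 6, 1, 3, 5, 7 pattern.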
deint_q_shuf: ;dq 0, 2, 4, 6, 1, 3, 5, 7
pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7
dd 1
pw_2048: times 2 dw 2048
dd 3
pw_8192: times 2 dw 8192
avg_shift: dw 5, 5, 3, 3
pw_27615: times 2 dw 27615
pw_32766: times 2 dw 32766
warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
resize_permA: dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29
resize_permB: dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
resize_permC: dq 0, 1, 4, 5, 8, 9, 12, 13
resize_permD: dq 2, 3, 6, 7, 10, 11, 14, 15
resize_permE: dq 0, 2, 4, 6
resize_shufA: db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, -1, 12, -1, 13
resize_shufB: db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15
rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
prep_hv_shift: dq 6, 4
put_bilin_h_rnd: dw 8, 8, 10, 10
prep_mul: dw 16, 16, 4, 4
put_8tap_h_rnd: dd 34, 40
prep_8tap_rnd: dd 128 - (8192 << 8)
warp_8x8_rnd_h: dd 512, 2048
warp_8x8_rnd_v: dd 262144, 65536
warp_8x8t_rnd_v: dd 16384 - (8192 << 15)
avg_round: dw -16400, -16400, -16388, -16388
w_avg_round: dd 128 + (8192 << 4), 32 + (8192 << 4)
mask_round: dd 512 + (8192 << 6), 128 + (8192 << 6)
w_mask_round: dd 128, 64
bidir_shift: dw 6, 6, 4, 4
pb_64: times 4 db 64
pw_m512: times 2 dw -512
pw_2: times 2 dw 2
pw_64: times 2 dw 64
pd_32: dd 32
pd_63: dd 63
pd_128: dd 128
pd_640: dd 640
pd_2176: dd 2176
pd_16384: dd 16384
pd_0_4: dd 0, 4
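; alias constants that overlap the start of existing tables:
; pw_16 = the leading (16, 16) words of prep_mul,
; pd_512 = the leading dword of warp_8x8_rnd_h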
%define pw_16 prep_mul
%define pd_512 warp_8x8_rnd_h
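; BASE_JMP_TABLE fn, suffix, w, ...: emits word-sized offsets of the .w*
; entry points relative to the fn_suffix base label; the -%3 bias on the
; table pointer lets tzcnt(w)*2 index the table directly.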
%macro BASE_JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - %3)
%xdefine %%base %1_%2
%%table:
%rep %0 - 2
dw %%base %+ _w%3 - %%base
%rotate 1
%endrep
%endmacro
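; HV_JMP_TABLE fn, filter, suffix, types, w, ...: same idea for the
; .h_w*/.v_w*/.hv_w* entry points; bits 0/1/2 of 'types' select which of
; the h/v/hv tables are emitted.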
%macro HV_JMP_TABLE 5-*
%xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
%xdefine %%base %1_%3
%assign %%types %4
%if %%types & 1
%xdefine %1_%2_h_%3_table (%%h - %5)
%%h:
%rep %0 - 4
dw %%prefix %+ .h_w%5 - %%base
%rotate 1
%endrep
%rotate 4
%endif
%if %%types & 2
%xdefine %1_%2_v_%3_table (%%v - %5)
%%v:
%rep %0 - 4
dw %%prefix %+ .v_w%5 - %%base
%rotate 1
%endrep
%rotate 4
%endif
%if %%types & 4
%xdefine %1_%2_hv_%3_table (%%hv - %5)
%%hv:
%rep %0 - 4
dw %%prefix %+ .hv_w%5 - %%base
%rotate 1
%endrep
%endif
%endmacro
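; BIDIR_JMP_TABLE fn, suffix, w, ...: dword offsets for the bidir
; functions, biased by 2*%3 so that tzcnt(w)*4 indexes the table directly.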
%macro BIDIR_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - 2*%3)
%xdefine %%base %1_%2_table
%xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
%rep %0 - 2
dd %%prefix %+ .w%3 - %%base
%rotate 1
%endrep
%endmacro
%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep)
BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask , avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 6tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 6tap, avx512icl, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx512icl, 2, 4, 8, 16, 32, 64, 128
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
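; the -8 compensates for the 1-based subpel index: mx/my are in 1..15 and
; are scaled by the 8-byte filter size before indexing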
cextern mc_warp_filter
cextern obmc_masks_avx2
cextern resize_filter
SECTION .text
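; t0 is a scratch register chosen to avoid the registers that carry
; incoming arguments, which differ between the Win64 and SysV ABIs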
%if WIN64
DECLARE_REG_TMP 4
%else
DECLARE_REG_TMP 8
%endif
INIT_ZMM avx512icl
cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy
mov mxyd, r6m ; mx
lea r7, [put_avx512icl]
tzcnt t0d, wm
movifnidn hd, hm
test mxyd, mxyd
jnz .h
mov mxyd, r7m ; my
test mxyd, mxyd
jnz .v
.put:
movzx t0d, word [r7+t0*2+table_offset(put,)]
add t0, r7
jmp t0
.put_w2:
mov r6d, [srcq+ssq*0]
mov r7d, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mov [dstq+dsq*0], r6d
mov [dstq+dsq*1], r7d
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w2
RET
.put_w4:
mov r6, [srcq+ssq*0]
mov r7, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mov [dstq+dsq*0], r6
mov [dstq+dsq*1], r7
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w4
RET
.put_w8:
movu xmm0, [srcq+ssq*0]
movu xmm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], xmm0
mova [dstq+dsq*1], xmm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w8
RET
.put_w16:
movu ym0, [srcq+ssq*0]
movu ym1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], ym0
mova [dstq+dsq*1], ym1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w16
RET
.put_w32:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w32
RET
.put_w64:
movu m0, [srcq+ssq*0+64*0]
movu m1, [srcq+ssq*0+64*1]
movu m2, [srcq+ssq*1+64*0]
movu m3, [srcq+ssq*1+64*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0+64*0], m0
mova [dstq+dsq*0+64*1], m1
mova [dstq+dsq*1+64*0], m2
mova [dstq+dsq*1+64*1], m3
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w64
RET
.put_w128:
movu m0, [srcq+64*0]
movu m1, [srcq+64*1]
movu m2, [srcq+64*2]
movu m3, [srcq+64*3]
add srcq, ssq
mova [dstq+64*0], m0
mova [dstq+64*1], m1
mova [dstq+64*2], m2
mova [dstq+64*3], m3
add dstq, dsq
dec hd
jg .put_w128
RET
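; horizontal bilin: dst = ((16-mx)*px0 + mx*px1 + rnd) >> 4. rnd is 8 for
; 10-bit; for 12-bit it is 10, which merges the spec's two rounding steps
; (((x + 2) >> 2) + 2) >> 2 into a single (x + 10) >> 4.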
.h:
vpbroadcastw m5, mxyd
mov mxyd, r7m ; my
vpbroadcastd m4, [pw_16]
psubw m4, m5
test mxyd, mxyd
jnz .hv
; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
movzx t0d, word [r7+t0*2+table_offset(put, _bilin_h)]
mov r6d, r8m ; bitdepth_max
add t0, r7
shr r6d, 11
vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4]
jmp t0
.h_w2:
movq xmm1, [srcq+ssq*0]
movhps xmm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pmullw xmm0, xmm1, xm4
psrlq xmm1, 16
pmullw xmm1, xm5
paddw xmm0, xm6
paddw xmm0, xmm1
psrlw xmm0, 4
movd [dstq+dsq*0], xmm0
pextrd [dstq+dsq*1], xmm0, 2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w2
RET
.h_w4:
movq xmm0, [srcq+ssq*0+0]
movhps xmm0, [srcq+ssq*1+0]
movq xmm1, [srcq+ssq*0+2]
movhps xmm1, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
pmullw xmm0, xm4
pmullw xmm1, xm5
paddw xmm0, xm6
paddw xmm0, xmm1
psrlw xmm0, 4
movq [dstq+dsq*0], xmm0
movhps [dstq+dsq*1], xmm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w4
RET
.h_w8:
movu xm0, [srcq+ssq*0+0]
vinserti32x4 ym0, [srcq+ssq*1+0], 1
movu xm1, [srcq+ssq*0+2]
vinserti32x4 ym1, [srcq+ssq*1+2], 1
lea srcq, [srcq+ssq*2]
pmullw ym0, ym4
pmullw ym1, ym5
paddw ym0, ym6
paddw ym0, ym1
psrlw ym0, 4
mova [dstq+dsq*0], xm0
vextracti32x4 [dstq+dsq*1], ym0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8
RET
.h_w16:
movu ym0, [srcq+ssq*0+0]
vinserti32x8 m0, [srcq+ssq*1+0], 1
movu ym1, [srcq+ssq*0+2]
vinserti32x8 m1, [srcq+ssq*1+2], 1
lea srcq, [srcq+ssq*2]
pmullw m0, m4
pmullw m1, m5
paddw m0, m6
paddw m0, m1
psrlw m0, 4
mova [dstq+dsq*0], ym0
vextracti32x8 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w16
RET
.h_w32:
pmullw m0, m4, [srcq+ssq*0+0]
pmullw m2, m5, [srcq+ssq*0+2]
pmullw m1, m4, [srcq+ssq*1+0]
pmullw m3, m5, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
paddw m0, m6
paddw m1, m6
paddw m0, m2
paddw m1, m3
psrlw m0, 4
psrlw m1, 4
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w32
RET
.h_w64:
pmullw m0, m4, [srcq+64*0+0]
pmullw m2, m5, [srcq+64*0+2]
pmullw m1, m4, [srcq+64*1+0]
pmullw m3, m5, [srcq+64*1+2]
add srcq, ssq
paddw m0, m6
paddw m1, m6
paddw m0, m2
paddw m1, m3
psrlw m0, 4
psrlw m1, 4
mova [dstq+64*0], m0
mova [dstq+64*1], m1
add dstq, dsq
dec hd
jg .h_w64
RET
.h_w128:
pmullw m0, m4, [srcq+64*0+0]
pmullw m7, m5, [srcq+64*0+2]
pmullw m1, m4, [srcq+64*1+0]
pmullw m8, m5, [srcq+64*1+2]
pmullw m2, m4, [srcq+64*2+0]
pmullw m9, m5, [srcq+64*2+2]
pmullw m3, m4, [srcq+64*3+0]
pmullw m10, m5, [srcq+64*3+2]
add srcq, ssq
REPX {paddw x, m6}, m0, m1, m2, m3
paddw m0, m7
paddw m1, m8
paddw m2, m9
paddw m3, m10
REPX {psrlw x, 4}, m0, m1, m2, m3
mova [dstq+64*0], m0
mova [dstq+64*1], m1
mova [dstq+64*2], m2
mova [dstq+64*3], m3
add dstq, dsq
dec hd
jg .h_w128
RET
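; vertical bilin via pmulhrsw: with m8 = my << 11,
; dst = p0 + (((p1 - p0) * my + 8) >> 4)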
.v:
movzx t0d, word [r7+t0*2+table_offset(put, _bilin_v)]
shl mxyd, 11
vpbroadcastw m8, mxyd
add t0, r7
jmp t0
.v_w2:
movd xmm0, [srcq+ssq*0]
.v_w2_loop:
movd xmm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpckldq xmm2, xmm0, xmm1
movd xmm0, [srcq+ssq*0]
punpckldq xmm1, xmm0
psubw xmm1, xmm2
pmulhrsw xmm1, xm8
paddw xmm1, xmm2
movd [dstq+dsq*0], xmm1
pextrd [dstq+dsq*1], xmm1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w2_loop
RET
.v_w4:
movq xmm0, [srcq+ssq*0]
.v_w4_loop:
movq xmm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklqdq xmm2, xmm0, xmm1
movq xmm0, [srcq+ssq*0]
punpcklqdq xmm1, xmm0
psubw xmm1, xmm2
pmulhrsw xmm1, xm8
paddw xmm1, xmm2
movq [dstq+dsq*0], xmm1
movhps [dstq+dsq*1], xmm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
RET
.v_w8:
movu xmm0, [srcq+ssq*0]
.v_w8_loop:
vbroadcasti128 ymm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vpblendd ymm2, ymm0, ymm1, 0xf0
vbroadcasti128 ymm0, [srcq+ssq*0]
vpblendd ymm1, ymm0, 0xf0
psubw ymm1, ymm2
pmulhrsw ymm1, ym8
paddw ymm1, ymm2
mova [dstq+dsq*0], xmm1
vextracti128 [dstq+dsq*1], ymm1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
vzeroupper
RET
.v_w16:
movu ym0, [srcq+ssq*0]
.v_w16_loop:
movu ym3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
psubw ym1, ym3, ym0
pmulhrsw ym1, ym8
paddw ym1, ym0
movu ym0, [srcq+ssq*0]
psubw ym2, ym0, ym3
pmulhrsw ym2, ym8
paddw ym2, ym3
mova [dstq+dsq*0], ym1
mova [dstq+dsq*1], ym2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w16_loop
RET
.v_w32:
movu m0, [srcq+ssq*0]
.v_w32_loop:
movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
psubw m1, m3, m0
pmulhrsw m1, m8
paddw m1, m0
movu m0, [srcq+ssq*0]
psubw m2, m0, m3
pmulhrsw m2, m8
paddw m2, m3
mova [dstq+dsq*0], m1
mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w32_loop
RET
.v_w64:
movu m0, [srcq+ssq*0+64*0]
movu m1, [srcq+ssq*0+64*1]
.v_w64_loop:
movu m2, [srcq+ssq*1+64*0]
movu m3, [srcq+ssq*1+64*1]
lea srcq, [srcq+ssq*2]
psubw m4, m2, m0
pmulhrsw m4, m8
paddw m4, m0
movu m0, [srcq+ssq*0+64*0]
psubw m5, m3, m1
pmulhrsw m5, m8
paddw m5, m1
movu m1, [srcq+ssq*0+64*1]
psubw m6, m0, m2
pmulhrsw m6, m8
psubw m7, m1, m3
pmulhrsw m7, m8
mova [dstq+dsq*0+64*0], m4
mova [dstq+dsq*0+64*1], m5
paddw m6, m2
paddw m7, m3
mova [dstq+dsq*1+64*0], m6
mova [dstq+dsq*1+64*1], m7
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w64_loop
RET
.v_w128:
movu m0, [srcq+ssq*0+64*0]
movu m1, [srcq+ssq*0+64*1]
movu m2, [srcq+ssq*0+64*2]
movu m3, [srcq+ssq*0+64*3]
.v_w128_loop:
movu m4, [srcq+ssq*1+64*0]
movu m5, [srcq+ssq*1+64*1]
movu m6, [srcq+ssq*1+64*2]
movu m7, [srcq+ssq*1+64*3]
lea srcq, [srcq+ssq*2]
psubw m9, m4, m0
pmulhrsw m9, m8
paddw m9, m0
movu m0, [srcq+ssq*0+64*0]
psubw m10, m5, m1
pmulhrsw m10, m8
paddw m10, m1
movu m1, [srcq+ssq*0+64*1]
psubw m11, m6, m2
pmulhrsw m11, m8
paddw m11, m2
movu m2, [srcq+ssq*0+64*2]
psubw m12, m7, m3
pmulhrsw m12, m8
paddw m12, m3
movu m3, [srcq+ssq*0+64*3]
mova [dstq+dsq*0+64*0], m9
psubw m9, m0, m4
pmulhrsw m9, m8
mova [dstq+dsq*0+64*1], m10
psubw m10, m1, m5
pmulhrsw m10, m8
mova [dstq+dsq*0+64*2], m11
psubw m11, m2, m6
pmulhrsw m11, m8
mova [dstq+dsq*0+64*3], m12
psubw m12, m3, m7
pmulhrsw m12, m8
paddw m9, m4
paddw m10, m5
mova [dstq+dsq*1+64*0], m9
mova [dstq+dsq*1+64*1], m10
paddw m11, m6
paddw m12, m7
mova [dstq+dsq*1+64*2], m11
mova [dstq+dsq*1+64*3], m12
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w128_loop
RET
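; hv: the horizontal pass keeps 2 extra fractional bits (>> 2 instead of
; >> 4; for 10-bit the coefficients are also scaled up by 4), the vertical
; blend uses pmulhw, and the final pmulhrsw drops the excess precision
; (2048 = rounded >> 4 for 10-bit, 8192 = rounded >> 2 for 12-bit)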
.hv:
movzx t0d, word [r7+t0*2+table_offset(put, _bilin_hv)]
shl mxyd, 11
vpbroadcastd m6, [pw_2]
vpbroadcastw m7, mxyd
vpbroadcastd m8, [pw_8192]
add t0, r7
test dword r8m, 0x800
jnz .hv_12bpc
psllw m4, 2
psllw m5, 2
vpbroadcastd m8, [pw_2048]
.hv_12bpc:
jmp t0
.hv_w2:
vpbroadcastq xmm1, [srcq+ssq*0]
pmullw xmm0, xmm1, xm4
psrlq xmm1, 16
pmullw xmm1, xm5
paddw xmm0, xm6
paddw xmm0, xmm1
psrlw xmm0, 2
.hv_w2_loop:
movq xmm2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movhps xmm2, [srcq+ssq*0]
pmullw xmm1, xmm2, xm4
psrlq xmm2, 16
pmullw xmm2, xm5
paddw xmm1, xm6
paddw xmm1, xmm2
psrlw xmm1, 2 ; 1 _ 2 _
shufpd xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _
mova xmm0, xmm1
psubw xmm1, xmm2
paddw xmm1, xmm1
pmulhw xmm1, xm7
paddw xmm1, xmm2
pmulhrsw xmm1, xm8
movd [dstq+dsq*0], xmm1
pextrd [dstq+dsq*1], xmm1, 2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w2_loop
RET
.hv_w4:
pmullw xmm0, xm4, [srcq+ssq*0-8]
pmullw xmm1, xm5, [srcq+ssq*0-6]
paddw xmm0, xm6
paddw xmm0, xmm1
psrlw xmm0, 2
.hv_w4_loop:
movq xmm1, [srcq+ssq*1+0]
movq xmm2, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
movhps xmm1, [srcq+ssq*0+0]
movhps xmm2, [srcq+ssq*0+2]
pmullw xmm1, xm4
pmullw xmm2, xm5
paddw xmm1, xm6
paddw xmm1, xmm2
psrlw xmm1, 2 ; 1 2
shufpd xmm2, xmm0, xmm1, 0x01 ; 0 1
mova xmm0, xmm1
psubw xmm1, xmm2
paddw xmm1, xmm1
pmulhw xmm1, xm7
paddw xmm1, xmm2
pmulhrsw xmm1, xm8
movq [dstq+dsq*0], xmm1
movhps [dstq+dsq*1], xmm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w4_loop
RET
.hv_w8:
pmullw xmm0, xm4, [srcq+ssq*0+0]
pmullw xmm1, xm5, [srcq+ssq*0+2]
paddw xmm0, xm6
paddw xmm0, xmm1
psrlw xmm0, 2
vinserti32x4 ym0, xmm0, 1
.hv_w8_loop:
movu xm1, [srcq+ssq*1+0]
movu xm2, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
vinserti32x4 ym1, [srcq+ssq*0+0], 1
vinserti32x4 ym2, [srcq+ssq*0+2], 1
pmullw ym1, ym4
pmullw ym2, ym5
paddw ym1, ym6
paddw ym1, ym2
psrlw ym1, 2 ; 1 2
vshufi32x4 ym2, ym0, ym1, 0x01 ; 0 1
mova ym0, ym1
psubw ym1, ym2
paddw ym1, ym1
pmulhw ym1, ym7
paddw ym1, ym2
pmulhrsw ym1, ym8
mova [dstq+dsq*0], xm1
vextracti32x4 [dstq+dsq*1], ym1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
RET
.hv_w16:
pmullw ym0, ym4, [srcq+ssq*0+0]
pmullw ym1, ym5, [srcq+ssq*0+2]
paddw ym0, ym6
paddw ym0, ym1
psrlw ym0, 2
vinserti32x8 m0, ym0, 1
.hv_w16_loop:
movu ym1, [srcq+ssq*1+0]
movu ym2, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
vinserti32x8 m1, [srcq+ssq*0+0], 1
vinserti32x8 m2, [srcq+ssq*0+2], 1
pmullw m1, m4
pmullw m2, m5
paddw m1, m6
paddw m1, m2
psrlw m1, 2 ; 1 2
vshufi32x4 m2, m0, m1, q1032 ; 0 1
mova m0, m1
psubw m1, m2
paddw m1, m1
pmulhw m1, m7
paddw m1, m2
pmulhrsw m1, m8
mova [dstq+dsq*0], ym1
vextracti32x8 [dstq+dsq*1], m1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w16_loop
RET
.hv_w32:
.hv_w64:
.hv_w128:
movifnidn wd, wm
lea r6d, [hq+wq*8-256]
mov r4, srcq
mov r7, dstq
.hv_w32_loop0:
pmullw m0, m4, [srcq+ssq*0+0]
pmullw m1, m5, [srcq+ssq*0+2]
paddw m0, m6
paddw m0, m1
psrlw m0, 2
.hv_w32_loop:
pmullw m3, m4, [srcq+ssq*1+0]
pmullw m1, m5, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
paddw m3, m6
paddw m3, m1
psrlw m3, 2
psubw m1, m3, m0
paddw m1, m1
pmulhw m1, m7
paddw m1, m0
pmullw m0, m4, [srcq+ssq*0+0]
pmullw m2, m5, [srcq+ssq*0+2]
paddw m0, m6
paddw m0, m2
psrlw m0, 2
psubw m2, m0, m3
paddw m2, m2
pmulhw m2, m7
paddw m2, m3
pmulhrsw m1, m8
pmulhrsw m2, m8
mova [dstq+dsq*0], m1
mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w32_loop
add r4, 64
add r7, 64
movzx hd, r6b
mov srcq, r4
mov dstq, r7
sub r6d, 1<<8
jg .hv_w32_loop0
RET
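; prep stores bidir intermediates as signed words:
; tmp = (src << intermediate_bits) - 8192, with intermediate_bits = 4 for
; 10-bit and 2 for 12-bit (prep_mul = 16/4, pw_8192 = PREP_BIAS)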
cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3
movifnidn mxyd, r5m ; mx
lea r6, [prep_avx512icl]
tzcnt wd, wm
movifnidn hd, hm
test mxyd, mxyd
jnz .h
mov mxyd, r6m ; my
test mxyd, mxyd
jnz .v
.prep:
movzx wd, word [r6+wq*2+table_offset(prep,)]
mov r5d, r7m ; bitdepth_max
vpbroadcastd m5, [r6-prep_avx512icl+pw_8192]
add wq, r6
shr r5d, 11
vpbroadcastd m4, [r6-prep_avx512icl+prep_mul+r5*4]
lea stride3q, [strideq*3]
jmp wq
.prep_w4:
mov r3d, 0x0c
kmovb k1, r3d
.prep_w4_loop:
movq xm0, [srcq+strideq*0]
movhps xm0, [srcq+strideq*1]
vpbroadcastq ym1, [srcq+strideq*2]
vpunpcklqdq ym0{k1}, ym1, [srcq+stride3q] {1to4}
lea srcq, [srcq+strideq*4]
pmullw ym0, ym4
psubw ym0, ym5
mova [tmpq], ym0
add tmpq, 32
sub hd, 4
jg .prep_w4_loop
RET
.prep_w8:
movu xm0, [srcq+strideq*0]
vinserti32x4 ym0, [srcq+strideq*1], 1
vinserti32x4 m0, [srcq+strideq*2], 2
vinserti32x4 m0, [srcq+stride3q ], 3
lea srcq, [srcq+strideq*4]
pmullw m0, m4
psubw m0, m5
mova [tmpq], m0
add tmpq, 64
sub hd, 4
jg .prep_w8
RET
.prep_w16:
movu ym0, [srcq+strideq*0]
vinserti32x8 m0, [srcq+strideq*1], 1
movu ym1, [srcq+strideq*2]
vinserti32x8 m1, [srcq+stride3q ], 1
lea srcq, [srcq+strideq*4]
pmullw m0, m4
pmullw m1, m4
psubw m0, m5
psubw m1, m5
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
add tmpq, 64*2
sub hd, 4
jg .prep_w16
RET
.prep_w32:
pmullw m0, m4, [srcq+strideq*0]
pmullw m1, m4, [srcq+strideq*1]
pmullw m2, m4, [srcq+strideq*2]
pmullw m3, m4, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
sub hd, 4
jg .prep_w32
RET
.prep_w64:
pmullw m0, m4, [srcq+strideq*0+64*0]
pmullw m1, m4, [srcq+strideq*0+64*1]
pmullw m2, m4, [srcq+strideq*1+64*0]
pmullw m3, m4, [srcq+strideq*1+64*1]
lea srcq, [srcq+strideq*2]
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
sub hd, 2
jg .prep_w64
RET
.prep_w128:
pmullw m0, m4, [srcq+64*0]
pmullw m1, m4, [srcq+64*1]
pmullw m2, m4, [srcq+64*2]
pmullw m3, m4, [srcq+64*3]
add srcq, strideq
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
dec hd
jg .prep_w128
RET
.h:
vpbroadcastw m5, mxyd
mov mxyd, r6m ; my
vpbroadcastd m4, [pw_16]
vpbroadcastd m6, [pw_32766]
psubw m4, m5
test dword r7m, 0x800
jnz .h_12bpc
psllw m4, 2
psllw m5, 2
.h_12bpc:
test mxyd, mxyd
jnz .hv
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
add wq, r6
lea stride3q, [strideq*3]
jmp wq
.h_w4:
movu xm1, [srcq+strideq*0]
vinserti32x4 ym1, [srcq+strideq*2], 1
movu xm2, [srcq+strideq*1]
vinserti32x4 ym2, [srcq+stride3q ], 1
lea srcq, [srcq+strideq*4]
punpcklqdq ym0, ym1, ym2
psrldq ym1, 2
psrldq ym2, 2
pmullw ym0, ym4
punpcklqdq ym1, ym2
pmullw ym1, ym5
psubw ym0, ym6
paddw ym0, ym1
psraw ym0, 2
mova [tmpq], ym0
add tmpq, 32
sub hd, 4
jg .h_w4
RET
.h_w8:
movu xm0, [srcq+strideq*0+0]
movu xm1, [srcq+strideq*0+2]
vinserti32x4 ym0, [srcq+strideq*1+0], 1
vinserti32x4 ym1, [srcq+strideq*1+2], 1
vinserti32x4 m0, [srcq+strideq*2+0], 2
vinserti32x4 m1, [srcq+strideq*2+2], 2
vinserti32x4 m0, [srcq+stride3q +0], 3
vinserti32x4 m1, [srcq+stride3q +2], 3
lea srcq, [srcq+strideq*4]
pmullw m0, m4
pmullw m1, m5
psubw m0, m6
paddw m0, m1
psraw m0, 2
mova [tmpq], m0
add tmpq, 64
sub hd, 4
jg .h_w8
RET
.h_w16:
movu ym0, [srcq+strideq*0+0]
vinserti32x8 m0, [srcq+strideq*1+0], 1
movu ym1, [srcq+strideq*0+2]
vinserti32x8 m1, [srcq+strideq*1+2], 1
lea srcq, [srcq+strideq*2]
pmullw m0, m4
pmullw m1, m5
psubw m0, m6
paddw m0, m1
psraw m0, 2
mova [tmpq], m0
add tmpq, 64
sub hd, 2
jg .h_w16
RET
.h_w32:
pmullw m0, m4, [srcq+strideq*0+0]
pmullw m2, m5, [srcq+strideq*0+2]
pmullw m1, m4, [srcq+strideq*1+0]
pmullw m3, m5, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
psubw m0, m6
psubw m1, m6
paddw m0, m2
paddw m1, m3
psraw m0, 2
psraw m1, 2
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
add tmpq, 64*2
sub hd, 2
jg .h_w32
RET
.h_w64:
pmullw m0, m4, [srcq+ 0]
pmullw m2, m5, [srcq+ 2]
pmullw m1, m4, [srcq+64]
pmullw m3, m5, [srcq+66]
add srcq, strideq
psubw m0, m6
psubw m1, m6
paddw m0, m2
paddw m1, m3
psraw m0, 2
psraw m1, 2
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
add tmpq, 64*2
dec hd
jg .h_w64
RET
.h_w128:
pmullw m0, m4, [srcq+ 0]
pmullw m7, m5, [srcq+ 2]
pmullw m1, m4, [srcq+ 64]
pmullw m8, m5, [srcq+ 66]
pmullw m2, m4, [srcq+128]
pmullw m9, m5, [srcq+130]
pmullw m3, m4, [srcq+192]
pmullw m10, m5, [srcq+194]
add srcq, strideq
REPX {psubw x, m6}, m0, m1, m2, m3
paddw m0, m7
paddw m1, m8
paddw m2, m9
paddw m3, m10
REPX {psraw x, 2}, m0, m1, m2, m3
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
dec hd
jg .h_w128
RET
.v:
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
vpbroadcastw m9, mxyd
vpbroadcastd m8, [pw_16]
vpbroadcastd m10, [pw_32766]
add wq, r6
lea stride3q, [strideq*3]
psubw m8, m9
test dword r7m, 0x800
jnz .v_12bpc
psllw m8, 2
psllw m9, 2
.v_12bpc:
jmp wq
.v_w4:
movq xmm0, [srcq+strideq*0]
.v_w4_loop:
vpbroadcastq xmm2, [srcq+strideq*1]
vpbroadcastq ymm1, [srcq+strideq*2]
vpbroadcastq ymm3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
vpblendd ymm2, ymm1, 0x30
vpblendd ymm2, ymm3, 0xc0
vpblendd ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3
movq xmm0, [srcq+strideq*0]
valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4
pmullw ymm1, ym8
pmullw ymm2, ym9
psubw ymm1, ym10
paddw ymm1, ymm2
psraw ymm1, 2
mova [tmpq], ymm1
add tmpq, 32
sub hd, 4
jg .v_w4_loop
vzeroupper
RET
.v_w8:
movu xm0, [srcq+strideq*0]
.v_w8_loop:
vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
vinserti32x4 m1, [srcq+strideq*2], 2
vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3
lea srcq, [srcq+strideq*4]
movu xm0, [srcq+strideq*0]
valignq m2, m0, m1, 2 ; 1 2 3 4
pmullw m1, m8
pmullw m2, m9
psubw m1, m10
paddw m1, m2
psraw m1, 2
mova [tmpq], m1
add tmpq, 64
sub hd, 4
jg .v_w8_loop
RET
.v_w16:
movu ym0, [srcq+strideq*0]
.v_w16_loop:
vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1
movu ym3, [srcq+strideq*2]
vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3
lea srcq, [srcq+strideq*4]
movu ym0, [srcq+strideq*0]
vshufi32x4 m3, m1, m3, q1032 ; 1 2
vshufi32x4 m4, m2, m0, q1032 ; 3 4
pmullw m1, m8
pmullw m2, m8
pmullw m3, m9
pmullw m4, m9
psubw m1, m10
psubw m2, m10
paddw m1, m3
paddw m2, m4
psraw m1, 2
psraw m2, 2
mova [tmpq+64*0], m1
mova [tmpq+64*1], m2
add tmpq, 64*2
sub hd, 4
jg .v_w16_loop
RET
.v_w32:
movu m0, [srcq+strideq*0]
.v_w32_loop:
movu m3, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
pmullw m1, m8, m0
movu m0, [srcq+strideq*0]
pmullw m2, m8, m3
pmullw m3, m9
pmullw m4, m9, m0
psubw m1, m10
psubw m2, m10
paddw m1, m3
paddw m2, m4
psraw m1, 2
psraw m2, 2
mova [tmpq+64*0], m1
mova [tmpq+64*1], m2
add tmpq, 64*2
sub hd, 2
jg .v_w32_loop
RET
.v_w64:
movu m0, [srcq+64*0]
movu m1, [srcq+64*1]
.v_w64_loop:
add srcq, strideq
pmullw m2, m8, m0
movu m0, [srcq+64*0]
pmullw m3, m8, m1
movu m1, [srcq+64*1]
pmullw m4, m9, m0
pmullw m5, m9, m1
psubw m2, m10
psubw m3, m10
paddw m2, m4
paddw m3, m5
psraw m2, 2
psraw m3, 2
mova [tmpq+64*0], m2
mova [tmpq+64*1], m3
add tmpq, 64*2
dec hd
jg .v_w64_loop
RET
.v_w128:
movu m0, [srcq+64*0]
movu m1, [srcq+64*1]
movu m2, [srcq+64*2]
movu m3, [srcq+64*3]
.v_w128_loop:
add srcq, strideq
pmullw m4, m8, m0
movu m0, [srcq+64*0]
pmullw m5, m8, m1
movu m1, [srcq+64*1]
pmullw m6, m8, m2
movu m2, [srcq+64*2]
pmullw m7, m8, m3
movu m3, [srcq+64*3]
pmullw m11, m9, m0
pmullw m12, m9, m1
pmullw m13, m9, m2
pmullw m14, m9, m3
REPX {psubw x, m10}, m4, m5, m6, m7
paddw m4, m11
paddw m5, m12
paddw m6, m13
paddw m7, m14
REPX {psraw x, 2}, m4, m5, m6, m7
mova [tmpq+64*0], m4
mova [tmpq+64*1], m5
mova [tmpq+64*2], m6
mova [tmpq+64*3], m7
add tmpq, 64*4
dec hd
jg .v_w128_loop
RET
.hv:
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
shl mxyd, 11
vpbroadcastw m7, mxyd
add wq, r6
lea stride3q, [strideq*3]
jmp wq
.hv_w4:
movq xmm0, [srcq+strideq*0+0]
movq xmm1, [srcq+strideq*0+2]
pmullw xmm0, xm4
pmullw xmm1, xm5
psubw xmm0, xm6
paddw xmm0, xmm1
psraw xmm0, 2
vpbroadcastq ym0, xmm0
.hv_w4_loop:
movu xm1, [srcq+strideq*1]
vinserti128 ym1, [srcq+stride3q ], 1
movu xm2, [srcq+strideq*2]
lea srcq, [srcq+strideq*4]
vinserti128 ym2, [srcq+strideq*0], 1
punpcklqdq ym3, ym1, ym2
psrldq ym1, 2
psrldq ym2, 2
pmullw ym3, ym4
punpcklqdq ym1, ym2
pmullw ym1, ym5
psubw ym3, ym6
paddw ym1, ym3
psraw ym1, 2 ; 1 2 3 4
valignq ym2, ym1, ym0, 3 ; 0 1 2 3
mova ym0, ym1
psubw ym1, ym2
pmulhrsw ym1, ym7
paddw ym1, ym2
mova [tmpq], ym1
add tmpq, 32
sub hd, 4
jg .hv_w4_loop
RET
.hv_w8:
pmullw xm0, xm4, [srcq+strideq*0+0]
pmullw xm1, xm5, [srcq+strideq*0+2]
psubw xm0, xm6
paddw xm0, xm1
psraw xm0, 2
vinserti32x4 m0, xm0, 3
.hv_w8_loop:
movu xm1, [srcq+strideq*1+0]
movu xm2, [srcq+strideq*1+2]
vinserti32x4 ym1, [srcq+strideq*2+0], 1
vinserti32x4 ym2, [srcq+strideq*2+2], 1
vinserti32x4 m1, [srcq+stride3q +0], 2
vinserti32x4 m2, [srcq+stride3q +2], 2
lea srcq, [srcq+strideq*4]
vinserti32x4 m1, [srcq+strideq*0+0], 3
vinserti32x4 m2, [srcq+strideq*0+2], 3
pmullw m1, m4
pmullw m2, m5
psubw m1, m6
paddw m1, m2
psraw m1, 2 ; 1 2 3 4
valignq m2, m1, m0, 6 ; 0 1 2 3
mova m0, m1
psubw m1, m2
pmulhrsw m1, m7
paddw m1, m2
mova [tmpq], m1
add tmpq, 64
sub hd, 4
jg .hv_w8_loop
RET
.hv_w16:
pmullw ym0, ym4, [srcq+strideq*0+0]
pmullw ym1, ym5, [srcq+strideq*0+2]
psubw ym0, ym6
paddw ym0, ym1
psraw ym0, 2
vinserti32x8 m0, ym0, 1
.hv_w16_loop:
movu ym1, [srcq+strideq*1+0]
movu ym2, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
vinserti32x8 m1, [srcq+strideq*0+0], 1
vinserti32x8 m2, [srcq+strideq*0+2], 1
pmullw m1, m4
pmullw m2, m5
psubw m1, m6
paddw m1, m2
psraw m1, 2 ; 1 2
vshufi32x4 m2, m0, m1, q1032 ; 0 1
mova m0, m1
psubw m1, m2
pmulhrsw m1, m7
paddw m1, m2
mova [tmpq], m1
add tmpq, 64
sub hd, 2
jg .hv_w16_loop
RET
.hv_w32:
pmullw m0, m4, [srcq+strideq*0+0]
pmullw m1, m5, [srcq+strideq*0+2]
psubw m0, m6
paddw m0, m1
psraw m0, 2
.hv_w32_loop:
pmullw m3, m4, [srcq+strideq*1+0]
pmullw m1, m5, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
psubw m3, m6
paddw m3, m1
psraw m3, 2
psubw m1, m3, m0
pmulhrsw m1, m7
paddw m1, m0
pmullw m0, m4, [srcq+strideq*0+0]
pmullw m2, m5, [srcq+strideq*0+2]
psubw m0, m6
paddw m0, m2
psraw m0, 2
psubw m2, m0, m3
pmulhrsw m2, m7
paddw m2, m3
mova [tmpq+64*0], m1
mova [tmpq+64*1], m2
add tmpq, 64*2
sub hd, 2
jg .hv_w32_loop
RET
.hv_w64:
pmullw m0, m4, [srcq+ 0]
pmullw m2, m5, [srcq+ 2]
pmullw m1, m4, [srcq+64]
pmullw m3, m5, [srcq+66]
psubw m0, m6
psubw m1, m6
paddw m0, m2
paddw m1, m3
psraw m0, 2
psraw m1, 2
.hv_w64_loop:
add srcq, strideq
pmullw m2, m4, [srcq+ 0]
pmullw m8, m5, [srcq+ 2]
pmullw m3, m4, [srcq+64]
pmullw m9, m5, [srcq+66]
psubw m2, m6
psubw m3, m6
paddw m2, m8
paddw m3, m9
psraw m2, 2
psraw m3, 2
psubw m8, m2, m0
psubw m9, m3, m1
pmulhrsw m8, m7
pmulhrsw m9, m7
paddw m8, m0
mova m0, m2
paddw m9, m1
mova m1, m3
mova [tmpq+64*0], m8
mova [tmpq+64*1], m9
add tmpq, 64*2
dec hd
jg .hv_w64_loop
RET
.hv_w128:
pmullw m0, m4, [srcq+ 0]
pmullw m8, m5, [srcq+ 2]
pmullw m1, m4, [srcq+ 64]
pmullw m9, m5, [srcq+ 66]
pmullw m2, m4, [srcq+128]
pmullw m10, m5, [srcq+130]
pmullw m3, m4, [srcq+192]
pmullw m11, m5, [srcq+194]
REPX {psubw x, m6}, m0, m1, m2, m3
paddw m0, m8
paddw m1, m9
paddw m2, m10
paddw m3, m11
REPX {psraw x, 2}, m0, m1, m2, m3
.hv_w128_loop:
add srcq, strideq
pmullw m8, m4, [srcq+ 0]
pmullw m12, m5, [srcq+ 2]
pmullw m9, m4, [srcq+ 64]
pmullw m13, m5, [srcq+ 66]
pmullw m10, m4, [srcq+128]
pmullw m14, m5, [srcq+130]
pmullw m11, m4, [srcq+192]
pmullw m15, m5, [srcq+194]
REPX {psubw x, m6}, m8, m9, m10, m11
paddw m8, m12
paddw m9, m13
paddw m10, m14
paddw m11, m15
REPX {psraw x, 2}, m8, m9, m10, m11
psubw m12, m8, m0
psubw m13, m9, m1
psubw m14, m10, m2
psubw m15, m11, m3
REPX {pmulhrsw x, m7}, m12, m13, m14, m15
paddw m12, m0
mova m0, m8
paddw m13, m1
mova m1, m9
mova [tmpq+64*0], m12
mova [tmpq+64*1], m13
paddw m14, m2
mova m2, m10
paddw m15, m3
mova m3, m11
mova [tmpq+64*2], m14
mova [tmpq+64*3], m15
add tmpq, 64*4
dec hd
jg .hv_w128_loop
RET
; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15
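; each FILTER_* packs two row offsets into subpel_filters: the low word is
; the 4-tap set used for w <= 4, the high word the full-tap set. FN adds
; mx*0x010101 so byte 0 and the high word become ready-made row indices,
; while byte 1 keeps raw mx for the 'test mxd, 0xf00' subpel check.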
%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
cglobal %1_%2_16bpc
mov t0d, FILTER_%3
%ifidn %3, %4
mov t1d, t0d
%else
mov t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro
%if WIN64
DECLARE_REG_TMP 4, 5
%define buf rsp+stack_offset+8 ; shadow space
%else
DECLARE_REG_TMP 7, 8
%define buf rsp-40 ; red zone
%endif
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_16bpc
PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_16bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_16bpc
PUT_8TAP_FN regular, REGULAR, REGULAR
cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%define base r8-put_avx512icl
imul mxd, mxm, 0x010101
add mxd, t0d ; 6tap_h, mx, 4tap_h
imul myd, mym, 0x010101
add myd, t1d ; 6tap_v, my, 4tap_v
lea r8, [put_avx512icl]
movifnidn wd, wm
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
jnz .v
.put:
tzcnt wd, wd
movzx wd, word [r8+wq*2+table_offset(put,)]
add wq, r8
%if WIN64
pop r8
%endif
jmp wq
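; 6-tap horizontal: coefficients are loaded at +1, dropping the outermost
; taps of the 8-tap filter (zero for regular/smooth). The remaining six
; taps form three word pairs (m10-m12), so each output is three vpdpwssd
; dot products over source pairs gathered by spel_h_shufA/B/C, followed by
; psrad 6 + packusdw + pminsw to produce clamped pixels.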
.h_w8:
mova m4, [spel_h_shufA]
movu m5, [spel_h_shufB]
movu m6, [spel_h_shufC]
.h_w8_loop:
movu ym2, [srcq+ssq*0]
vinserti32x8 m2, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
mova m0, m8
vpermb m1, m4, m2
vpdpwssd m0, m10, m1
vpermb m1, m5, m2
vpdpwssd m0, m11, m1
vpermb m1, m6, m2
vpdpwssd m0, m12, m1
psrad m0, 6
vextracti32x8 ym1, m0, 1
packusdw ym0, ym1
pminsw ym0, ym15
mova [dstq+dsq*0], xm0
vextracti32x4 [dstq+dsq*1], ym0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8_loop
RET
.h:
vpbroadcastw m15, r8m
test myd, 0xf00
jnz .hv
mov r7d, r8m
shr r7d, 11
vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4]
cmp wd, 4
jle mangle(private_prefix %+ _put_8tap_16bpc_avx512icl).h_w4
shr mxd, 16
sub srcq, 4
pmovsxbw xmm0, [base+subpel_filters+1+mxq*8]
mova [buf], xmm0
vpbroadcastd m10, xmm0
vpbroadcastd m12, [buf+8]
vpbroadcastd m11, [buf+4]
sub wd, 16
jl .h_w8
vbroadcasti32x4 m6, [spel_h_shufA]
vbroadcasti32x4 m7, [spel_h_shufB]
jg .h_w32
.h_w16_loop:
movu ym2, [srcq+ssq*0+ 0]
vinserti32x8 m2, [srcq+ssq*1+ 0], 1
movu ym3, [srcq+ssq*0+12]
vinserti32x8 m3, [srcq+ssq*1+12], 1
lea srcq, [srcq+ssq*2]
mova m0, m8
mova m1, m8
pshufb m4, m2, m6
vpdpwssd m0, m10, m4 ; a0 b0
pshufb m4, m3, m7
vpdpwssd m1, m12, m4 ; a2' b2'
pshufb m2, m7
pshufb m3, m6
vpdpwssd m0, m11, m2 ; a1 b1
vpdpwssd m1, m11, m3 ; a1' b1'
shufpd m2, m3, 0x55
vpdpwssd m0, m12, m2 ; a2 b2
vpdpwssd m1, m10, m2 ; a0' b0'
psrad m0, 6
psrad m1, 6
packusdw m0, m1
pminsw m0, m15
mova [dstq+dsq*0], ym0
vextracti32x8 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w16_loop
RET
.h_w32:
lea srcq, [srcq+wq*2]
lea dstq, [dstq+wq*2]
neg wq
.h_w32_loop0:
mov r6, wq
.h_w32_loop:
movu m2, [srcq+r6*2+ 0]
movu m3, [srcq+r6*2+12]
mova m0, m8
mova m1, m8
pshufb m4, m2, m6
vpdpwssd m0, m10, m4 ; a0
pshufb m4, m3, m7
vpdpwssd m1, m12, m4 ; b2
pshufb m2, m7
pshufb m3, m6
vpdpwssd m0, m11, m2 ; a1
vpdpwssd m1, m11, m3 ; b1
shufpd m2, m3, 0x55
vpdpwssd m0, m12, m2 ; a2
vpdpwssd m1, m10, m2 ; b0
psrad m0, 6
psrad m1, 6
packusdw m0, m1
pminsw m0, m15
mova [dstq+r6*2], m0
add r6, 32
jl .h_w32_loop
add srcq, ssq
add dstq, dsq
dec hd
jg .h_w32_loop0
RET
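; 6-tap vertical: source rows are kept as interleaved word pairs
; (01 12 / 23 34 / 45 56) so each output row pair is three vpdpwssd
; accumulations per step, shifting in one new row pair; r6 = -ssq
; addresses the two rows above srcq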
.v:
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
vpbroadcastd m11, [pd_32]
pmovsxbw xmm0, [base+subpel_filters+1+myq*8]
tzcnt r7d, wd
vpbroadcastw m15, r8m
mov r6, ssq
movzx r7d, word [r8+r7*2+table_offset(put, _6tap_v)]
neg r6
mova [rsp+stack_offset+8], xmm0
vpbroadcastd m12, xmm0
add r7, r8
vpbroadcastd m13, [rsp+stack_offset+12]
vpbroadcastd m14, [rsp+stack_offset+16]
jmp r7
.v_w2:
movd xmm2, [srcq+r6 *2]
pinsrd xmm2, [srcq+r6 *1], 1
pinsrd xmm2, [srcq+ssq*0], 2
pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3
lea srcq, [srcq+ssq*2]
movd xmm0, [srcq+ssq*0]
palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4
punpcklwd xmm1, xmm2, xmm3 ; 01 12
punpckhwd xmm2, xmm3 ; 23 34
.v_w2_loop:
movd xmm3, [srcq+ssq*1]
mova xmm4, xm11
vpdpwssd xmm4, xmm1, xm12 ; a0 b0
lea srcq, [srcq+ssq*2]
mova xmm1, xmm2
vpdpwssd xmm4, xmm2, xm13 ; a1 b1
punpckldq xmm2, xmm0, xmm3 ; 4 5
movd xmm0, [srcq+ssq*0]
punpckldq xmm3, xmm0 ; 5 6
punpcklwd xmm2, xmm3 ; 45 56
vpdpwssd xmm4, xmm2, xm14 ; a2 b2
psrad xmm4, 6
packusdw xmm4, xmm4
pminsw xmm4, xm15
movd [dstq+dsq*0], xmm4
pextrd [dstq+dsq*1], xmm4, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w2_loop
RET
.v_w4:
movq xmm1, [srcq+r6 *2]
vpbroadcastq ymm3, [srcq+r6 *1]
vpbroadcastq ymm2, [srcq+ssq*0]
vpbroadcastq ymm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vpbroadcastq ymm0, [srcq+ssq*0]
vpblendd ymm1, ymm3, 0x30
vpblendd ymm3, ymm2, 0x30
punpcklwd ymm1, ymm3 ; 01 12
vpblendd ymm2, ymm4, 0x30
vpblendd ymm4, ymm0, 0x30
punpcklwd ymm2, ymm4 ; 23 34
.v_w4_loop:
vpbroadcastq ymm3, [srcq+ssq*1]
mova ymm4, ym11
vpdpwssd ymm4, ymm1, ym12 ; a0 b0
lea srcq, [srcq+ssq*2]
mova ymm1, ymm2
vpdpwssd ymm4, ymm2, ym13 ; a1 b1
vpblendd ymm2, ymm0, ymm3, 0x30
vpbroadcastq ymm0, [srcq+ssq*0]
vpblendd ymm3, ymm0, 0x30
punpcklwd ymm2, ymm3 ; 45 56
vpdpwssd ymm4, ymm2, ym14 ; a2 b2
psrad ymm4, 6
vextracti128 xmm3, ymm4, 1
packusdw xmm4, xmm3
pminsw xmm4, xm15
movq [dstq+dsq*0], xmm4
movhps [dstq+dsq*1], xmm4
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
vzeroupper
RET
.v_w8:
vbroadcasti32x4 m0, [srcq+ssq*0]
vinserti32x4 m1, m0, [srcq+r6 *2], 0
vinserti32x4 m1, [srcq+r6 *1], 1 ; 0 1 2
vinserti32x4 ym0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
mova m5, [spel_v_shuf8]
vinserti32x4 m0, [srcq+ssq*0], 2 ; 2 3 4
vpermb m1, m5, m1 ; 01 12
vpermb m2, m5, m0 ; 23 34
.v_w8_loop:
vinserti32x4 m0, [srcq+ssq*1], 3
lea srcq, [srcq+ssq*2]
movu xm3, [srcq+ssq*0]
mova m4, m11
vpdpwssd m4, m12, m1 ; a0 b0
vshufi32x4 m0, m3, q1032 ; 4 5 6
mova m1, m2
vpdpwssd m4, m13, m2 ; a1 b1
vpermb m2, m5, m0 ; 45 56
vpdpwssd m4, m14, m2 ; a2 b2
psrad m4, 6
vextracti32x8 ym3, m4, 1
packusdw ym4, ym3
pminsw ym4, ym15
mova [dstq+dsq*0], xm4
vextracti32x4 [dstq+dsq*1], ym4, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
RET
.v_w16:
vbroadcasti32x8 m0, [srcq+r6 *1]
vinserti32x8 m1, m0, [srcq+ssq*0], 1
vinserti32x8 m0, [srcq+r6*2], 0
mova m6, [spel_v_shuf16]
movu ym3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti32x8 m3, [srcq+ssq*0], 1
vpermb m1, m6, m1 ; 12
vpermb m0, m6, m0 ; 01
vpermb m3, m6, m3 ; 34
mova m7, [deint_q_shuf]
vpshrdd m2, m1, m3, 16 ; 23
.v_w16_loop:
mova m5, m11
vpdpwssd m5, m12, m1 ; b0
mova m4, m11
vpdpwssd m4, m12, m0 ; a0
mova m1, m3
vpdpwssd m5, m13, m3 ; b1
mova m0, m2
vpdpwssd m4, m13, m2 ; a1
movu ym3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti32x8 m3, [srcq+ssq*0], 1
vpermb m3, m6, m3 ; 56
vpshrdd m2, m1, m3, 16 ; 45
vpdpwssd m5, m14, m3 ; b2
vpdpwssd m4, m14, m2 ; a2
psrad m5, 6
psrad m4, 6