; Copyright © 2020, VideoLAN and dav1d authors
; Copyright © 2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 64
obmc_masks:
pw_512: times 2 dw 512
; 2
db 45, 19, 64, 0
; 4
db 39, 25, 50, 14, 59, 5, 64, 0
; 8
db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
; 16
db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
; 32
db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
warp_8x8_permA: db 4, 5, 6, 7, 16, 17, 18, 19, 5, 6, 7, 8, 17, 18, 19, 20
db 6, 7, 8, 9, 18, 19, 20, 21, 7, 8, 9, 10, 19, 20, 21, 22
db 8, 9, 10, 11, 20, 21, 22, 23, 9, 10, 11, 12, 21, 22, 23, 24
db 10, 11, 12, 13, 22, 23, 24, 25, 11, 12, 13, 14, 23, 24, 25, 26
warp_8x8_permB: db 0, 1, 2, 3, 20, 21, 22, 23, 1, 2, 3, 4, 21, 22, 23, 24
db 2, 3, 4, 5, 22, 23, 24, 25, 3, 4, 5, 6, 23, 24, 25, 26
db 4, 5, 6, 7, 24, 25, 26, 27, 5, 6, 7, 8, 25, 26, 27, 28
db 6, 7, 8, 9, 26, 27, 28, 29, 7, 8, 9, 10, 27, 28, 29, 30
warp_8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
warp_8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7
warp_8x8_hpack: db 3, 11, 3, 11, 35, 43, 35, 43
pd_16384: dd 16384
pd_262144: dd 262144
warp_8x8_end: db 0, 4, 16, 20, 32, 36, 48, 52, 2, 6, 18, 22, 34, 38, 50, 54
warp_8x8t_end: db 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59
db 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63
bidir_sctr_w4: dd 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
wm_420_perm4: db 1, 3, 9, 11, 5, 7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31
db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63
db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
wm_420_perm8: db 1, 3, 17, 19, 5, 7, 21, 23, 9, 11, 25, 27, 13, 15, 29, 31
db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63
db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
wm_420_perm16: db 1, 3, 33, 35, 5, 7, 37, 39, 9, 11, 41, 43, 13, 15, 45, 47
db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63
db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
wm_420_mask: db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127
db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
wm_422_mask: db 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62
db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126
db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
bilin_h_perm16: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40
db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48
bilin_h_perm32: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24
db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32
bilin_v_perm8: db 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87
db 80, 32, 81, 33, 82, 34, 83, 35, 84, 36, 85, 37, 86, 38, 87, 39
db 32, 64, 33, 65, 34, 66, 35, 67, 36, 68, 37, 69, 38, 70, 39, 71
bilin_v_perm16: db 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
db 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
db 16, 64, 17, 65, 18, 66, 19, 67, 20, 68, 21, 69, 22, 70, 23, 71
db 24, 72, 25, 73, 26, 74, 27, 75, 28, 76, 29, 77, 30, 78, 31, 79
bilin_v_perm32: db 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71
db 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79
db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87
db 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95
bilin_v_perm64: dd 0, 0, 4, 8, 1, 1, 5, 9, 2, 2, 6, 10, 3, 3, 7, 11
spel_h_perm16: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
spel_h_perm32: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
spel_v_perm8: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
db 8, 16, 9, 17, 10, 18, 11, 19, 12, 20, 13, 21, 14, 22, 15, 23
db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
db 24, 32, 25, 33, 26, 34, 27, 35, 28, 36, 29, 37, 30, 38, 31, 39
spel_v_perm16a: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7
db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
spel_v_perm16b: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7
db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
spel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39
db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
spel_hv_perm4a: db 8, 9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23
db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31
spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39
db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47
spel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55
db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63
spel_hv_perm4d: db 18, 19, 0, 1, 22, 23, 4, 5, 26, 27, 8, 9, 30, 31, 12, 13
db 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29
spel_hv_perm8a: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
spel_hv_perm8b: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13
db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29
db 0, 1, 32, 33, 4, 5, 36, 37, 8, 9, 40, 41, 12, 13, 44, 45
db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61
spel_hv_perm16a: db 0, 1, 2, 3, 32, 33, 34, 35, 1, 2, 3, 4, 33, 34, 35, 36
db 2, 3, 4, 5, 34, 35, 36, 37, 3, 4, 5, 6, 35, 36, 37, 38
db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44
db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46
spel_hv_perm16b: db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8
db 2, 3, 4, 5, 3, 4, 5, 6, 6, 7, 8, 9, 7, 8, 9, 10
db 8, 9, 10, 11, 9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16
db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18
spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55
db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63
spel_hv_end: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
bilin_v_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
resize_permA: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
resize_permB: dd 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
resize_permC: dd 0, 4, 8, 12
resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7
wm_420_perm64: dq 0xfedcba9876543210
wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040
pb_8x0_8x8: times 8 db 0
times 8 db 8
pb_4: times 4 db 4
pb_32: times 4 db 32
pb_127: times 4 db 127
pw_m128: times 2 dw -128
pw_m256: times 2 dw -256
pw_1024: times 2 dw 1024
pw_2048: times 2 dw 2048
pw_6903: times 2 dw 6903
pw_8192: times 2 dw 8192
pd_32: dd 32
pd_34: dd 34
pd_63: dd 63
pd_512: dd 512
%define pb_m64 (wm_sign+4)
%define pb_64 (wm_sign+8)
%define pd_2 (pd_0to7+8)
cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
cextern mc_warp_filter
cextern resize_filter
%macro BASE_JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - %3)
%xdefine %%base %1_%2
%%table:
%rep %0 - 2
dw %%base %+ _w%3 - %%base
%rotate 1
%endrep
%endmacro
%macro HV_JMP_TABLE 5-*
%xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
%xdefine %%base %1_%3
%assign %%types %4
%if %%types & 1
%xdefine %1_%2_h_%3_table (%%h - %5)
%%h:
%rep %0 - 4
dw %%prefix %+ .h_w%5 - %%base
%rotate 1
%endrep
%rotate 4
%endif
%if %%types & 2
%xdefine %1_%2_v_%3_table (%%v - %5)
%%v:
%rep %0 - 4
dw %%prefix %+ .v_w%5 - %%base
%rotate 1
%endrep
%rotate 4
%endif
%if %%types & 4
%xdefine %1_%2_hv_%3_table (%%hv - %5)
%%hv:
%rep %0 - 4
dw %%prefix %+ .hv_w%5 - %%base
%rotate 1
%endrep
%endif
%endmacro
%macro BIDIR_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - 2*%3)
%xdefine %%base %1_%2_table
%xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 2
dd %%prefix %+ .w%3 - %%base
%rotate 1
%endrep
%endmacro
%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_8bpc_avx512icl.put)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep)
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
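; The jump tables emitted below store label offsets relative to the
; .put/.prep entry points. Illustrative dispatch arithmetic for put
; with w == 8 (matching the code in put_bilin further down):
;   wd = tzcnt(8) == 3
;   wd = word [table + 3*2 - 2]   ; smallest-width bias folded into the
;                                 ; table address via (%%table - %3)
;   jmp put_avx512icl + wd        ; -> .put_w8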
BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 6tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, avx512icl, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 6tap, avx512icl, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx512icl, 3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128
SECTION .text
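; Execute a single statement with 256-bit (YMM) registers, then switch
; back to 512-bit.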
%macro WRAP_YMM 1+
INIT_YMM cpuname
%1
INIT_ZMM cpuname
%endmacro
INIT_ZMM avx512icl
cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
movifnidn mxyd, r6m ; mx
lea r7, [put_avx512icl]
tzcnt wd, wm
movifnidn hd, hm
test mxyd, mxyd
jnz .h
mov mxyd, r7m ; my
test mxyd, mxyd
jnz .v
.put:
movzx wd, word [r7+wq*2+table_offset(put,)]
add wq, r7
jmp wq
.put_w2:
movzx r6d, word [srcq+ssq*0]
movzx r7d, word [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mov [dstq+dsq*0], r6w
mov [dstq+dsq*1], r7w
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w2
RET
.put_w4:
mov r6d, [srcq+ssq*0]
mov r7d, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mov [dstq+dsq*0], r6d
mov [dstq+dsq*1], r7d
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w4
RET
.put_w8:
mov r6, [srcq+ssq*0]
mov r7, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mov [dstq+dsq*0], r6
mov [dstq+dsq*1], r7
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w8
RET
.put_w16:
movu xmm0, [srcq+ssq*0]
movu xmm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], xmm0
mova [dstq+dsq*1], xmm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w16
RET
.put_w32:
movu ym0, [srcq+ssq*0]
movu ym1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], ym0
mova [dstq+dsq*1], ym1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w32
RET
.put_w64:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w64
RET
.put_w128:
movu m0, [srcq+ssq*0+64*0]
movu m1, [srcq+ssq*0+64*1]
movu m2, [srcq+ssq*1+64*0]
movu m3, [srcq+ssq*1+64*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0+64*0], m0
mova [dstq+dsq*0+64*1], m1
mova [dstq+dsq*1+64*0], m2
mova [dstq+dsq*1+64*1], m3
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w128
RET
.h:
; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
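; the weight pair is packed into a single word for pmaddubsw:
;   mxy*255 + 16 == (mx << 8) | (16 - mx), i.e. byte pairs (16-mx, mx),
; and pmulhrsw with pw_2048 performs the final rounding:
;   (x*2048*2 + 0x8000) >> 16 == (x + 8) >> 4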
imul mxyd, 255
vbroadcasti128 m4, [bilin_h_perm16]
add mxyd, 16
vpbroadcastw m5, mxyd
mov mxyd, r7m ; my
test mxyd, mxyd
jnz .hv
movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)]
vpbroadcastd m3, [pw_2048]
add wq, r7
jmp wq
.h_w2:
movd xmm0, [srcq+ssq*0]
pinsrd xmm0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
pshufb xmm0, xm4
pmaddubsw xmm0, xm5
pmulhrsw xmm0, xm3
packuswb xmm0, xmm0
pextrw [dstq+dsq*0], xmm0, 0
pextrw [dstq+dsq*1], xmm0, 2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w2
RET
.h_w4:
mova xmm4, [bilin_h_shuf4]
.h_w4_loop:
movq xmm0, [srcq+ssq*0]
movhps xmm0, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pshufb xmm0, xmm4
pmaddubsw xmm0, xm5
pmulhrsw xmm0, xm3
packuswb xmm0, xmm0
movd [dstq+dsq*0], xmm0
pextrd [dstq+dsq*1], xmm0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w4_loop
RET
.h_w8:
movu xm0, [srcq+ssq*0]
vinserti32x4 ym0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
pshufb ym0, ym4
pmaddubsw ym0, ym5
pmulhrsw ym0, ym3
vpmovuswb xm0, ym0
movq [dstq+dsq*0], xm0
movhps [dstq+dsq*1], xm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8
RET
.h_w16:
mova m4, [bilin_h_perm16]
.h_w16_loop:
movu ym0, [srcq+ssq*0]
vinserti32x8 m0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
vpermb m0, m4, m0
pmaddubsw m0, m5
pmulhrsw m0, m3
vpmovuswb ym0, m0
mova [dstq+dsq*0], xm0
vextracti128 [dstq+dsq*1], ym0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w16_loop
RET
.h_w32:
movu ym0, [srcq+ssq*0+8*0]
vinserti32x8 m0, [srcq+ssq*1+8*0], 1
movu ym1, [srcq+ssq*0+8*1]
vinserti32x8 m1, [srcq+ssq*1+8*1], 1
lea srcq, [srcq+ssq*2]
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
mova [dstq+dsq*0], ym0
vextracti32x8 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w32
RET
.h_w64:
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
add srcq, ssq
mova [dstq], m0
add dstq, dsq
dec hd
jg .h_w64
RET
.h_w128:
movu m0, [srcq+8*0]
movu m2, [srcq+8*1]
movu m1, [srcq+8*8]
movu m6, [srcq+8*9]
add srcq, ssq
REPX {pshufb x, m4}, m0, m2, m1, m6
REPX {pmaddubsw x, m5}, m0, m2, m1, m6
REPX {pmulhrsw x, m3}, m0, m2, m1, m6
packuswb m0, m2
packuswb m1, m6
mova [dstq+64*0], m0
mova [dstq+64*1], m1
add dstq, dsq
dec hd
jg .h_w128
RET
.v:
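; (16 - my) * src[x] + my * src[x + src_stride], using the same
; coefficient packing and pw_2048 rounding as the .h path above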
movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
imul mxyd, 255
vpbroadcastd m5, [pw_2048]
add mxyd, 16
add wq, r7
vpbroadcastw m4, mxyd
jmp wq
.v_w2:
movd xmm0, [srcq+ssq*0]
.v_w2_loop:
pinsrw xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1
lea srcq, [srcq+ssq*2]
pinsrw xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1
pshuflw xmm1, xmm1, q2301 ; 1 0
punpcklbw xmm1, xmm0
pmaddubsw xmm1, xm4
pmulhrsw xmm1, xm5
packuswb xmm1, xmm1
pextrw [dstq+dsq*0], xmm1, 1
pextrw [dstq+dsq*1], xmm1, 0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w2_loop
RET
.v_w4:
movd xmm0, [srcq+ssq*0]
.v_w4_loop:
vpbroadcastd xmm2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vpblendd xmm1, xmm2, xmm0, 0x01 ; 0 1
vpbroadcastd xmm0, [srcq+ssq*0]
vpblendd xmm2, xmm0, 0x02 ; 1 2
punpcklbw xmm1, xmm2
pmaddubsw xmm1, xm4
pmulhrsw xmm1, xm5
packuswb xmm1, xmm1
movd [dstq+dsq*0], xmm1
pextrd [dstq+dsq*1], xmm1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
RET
.v_w8:
movq xmm0, [srcq+ssq*0]
.v_w8_loop:
movq xmm2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklbw xmm1, xmm0, xmm2
movq xmm0, [srcq+ssq*0]
punpcklbw xmm2, xmm0
pmaddubsw xmm1, xm4
pmaddubsw xmm2, xm4
pmulhrsw xmm1, xm5
pmulhrsw xmm2, xm5
packuswb xmm1, xmm2
movq [dstq+dsq*0], xmm1
movhps [dstq+dsq*1], xmm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
RET
.v_w16:
movu xmm0, [srcq+ssq*0]
.v_w16_loop:
vbroadcasti128 ymm3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vpblendd ymm2, ymm3, ymm0, 0x0f ; 0 1
vbroadcasti128 ymm0, [srcq+ssq*0]
vpblendd ymm3, ymm0, 0xf0 ; 1 2
punpcklbw ymm1, ymm2, ymm3
punpckhbw ymm2, ymm3
pmaddubsw ymm1, ym4
pmaddubsw ymm2, ym4
pmulhrsw ymm1, ym5
pmulhrsw ymm2, ym5
packuswb ymm1, ymm2
mova [dstq+dsq*0], xmm1
vextracti128 [dstq+dsq*1], ymm1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w16_loop
vzeroupper
RET
.v_w32:
movu ym0, [srcq+ssq*0]
kxnorb k1, k1, k1
.v_w32_loop:
vbroadcasti32x8 m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vpblendmd m2{k1}, m3, m0 ; 0 1
vbroadcasti32x8 m0, [srcq+ssq*0]
vpblendmd m3{k1}, m0, m3 ; 1 2
punpcklbw m1, m2, m3
punpckhbw m2, m3
pmaddubsw m1, m4
pmaddubsw m2, m4
pmulhrsw m1, m5
pmulhrsw m2, m5
packuswb m1, m2
mova [dstq+dsq*0], ym1
vextracti32x8 [dstq+dsq*1], m1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w32_loop
RET
.v_w64:
movu m0, [srcq+ssq*0]
.v_w64_loop:
movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklbw m1, m0, m3
punpckhbw m6, m0, m3
movu m0, [srcq+ssq*0]
pmaddubsw m1, m4
pmaddubsw m6, m4
punpcklbw m2, m3, m0
punpckhbw m3, m0
pmaddubsw m2, m4
pmaddubsw m3, m4
REPX {pmulhrsw x, m5}, m1, m6, m2, m3
packuswb m1, m6
packuswb m2, m3
mova [dstq+dsq*0], m1
mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w64_loop
RET
.v_w128:
movu m0, [srcq+64*0]
movu m1, [srcq+64*1]
.v_w128_loop:
add srcq, ssq
movu m2, [srcq+64*0]
movu m3, [srcq+64*1]
punpcklbw m6, m0, m2
pmaddubsw m6, m4
punpckhbw m0, m2
pmaddubsw m0, m4
punpcklbw m7, m1, m3
pmaddubsw m7, m4
punpckhbw m1, m3
pmaddubsw m1, m4
REPX {pmulhrsw x, m5}, m6, m0, m7, m1
packuswb m6, m0
mova m0, m2
packuswb m7, m1
mova m1, m3
mova [dstq+64*0], m6
mova [dstq+64*1], m7
add dstq, dsq
dec hd
jg .v_w128_loop
RET
.hv:
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
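; my is pre-scaled by 1 << 11 below; doubling the difference first
; (paddw x, x) then lets pmulhw compute the >> 4 term exactly:
;   (2*d * (my << 11)) >> 16 == (d * my) >> 4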
movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
WIN64_SPILL_XMM 8
shl mxyd, 11 ; can't shift by 12 due to signed overflow
vpbroadcastd m7, [pw_2048]
add wq, r7
vpbroadcastw m6, mxyd
jmp wq
.hv_w2:
vpbroadcastd xmm0, [srcq+ssq*0]
pshufb xmm0, xm4
pmaddubsw xmm0, xm5
.hv_w2_loop:
movd xmm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pinsrd xmm1, [srcq+ssq*0], 1
pshufb xmm1, xm4
pmaddubsw xmm1, xm5 ; 1 _ 2 _
shufps xmm2, xmm0, xmm1, q1032 ; 0 _ 1 _
mova xmm0, xmm1
psubw xmm1, xmm2
paddw xmm1, xmm1
pmulhw xmm1, xm6
paddw xmm1, xmm2
pmulhrsw xmm1, xm7
packuswb xmm1, xmm1
pextrw [dstq+dsq*0], xmm1, 0
pextrw [dstq+dsq*1], xmm1, 2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w2_loop
RET
.hv_w4:
mova xmm4, [bilin_h_shuf4]
movddup xmm0, [srcq+ssq*0]
pshufb xmm0, xmm4
pmaddubsw xmm0, xm5
.hv_w4_loop:
movq xmm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movhps xmm1, [srcq+ssq*0]
pshufb xmm1, xmm4
pmaddubsw xmm1, xm5 ; 1 2
shufps xmm2, xmm0, xmm1, q1032 ; 0 1
mova xmm0, xmm1
psubw xmm1, xmm2
paddw xmm1, xmm1
pmulhw xmm1, xm6
paddw xmm1, xmm2
pmulhrsw xmm1, xm7
packuswb xmm1, xmm1
movd [dstq+dsq*0], xmm1
pextrd [dstq+dsq*1], xmm1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w4_loop
RET
.hv_w8:
vbroadcasti128 ym0, [srcq+ssq*0]
pshufb ym0, ym4
pmaddubsw ym0, ym5
.hv_w8_loop:
movu xm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti128 ym1, [srcq+ssq*0], 1
pshufb ym1, ym4
pmaddubsw ym1, ym5 ; 1 2
valignq ym2, ym1, ym0, 2
mova ym0, ym1
psubw ym1, ym2
paddw ym1, ym1
pmulhw ym1, ym6
paddw ym1, ym2
pmulhrsw ym1, ym7
vpmovuswb xm1, ym1
movq [dstq+dsq*0], xm1
movhps [dstq+dsq*1], xm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
RET
.hv_w16:
vbroadcasti32x8 m0, [srcq+ssq*0]
mova m4, [bilin_h_perm16]
vpermb m0, m4, m0
pmaddubsw m0, m5
.hv_w16_loop:
movu ym1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti32x8 m1, [srcq+ssq*0], 1
vpermb m1, m4, m1
pmaddubsw m1, m5 ; 1 2
valignq m2, m1, m0, 4 ; 0 1
mova m0, m1
psubw m1, m2
paddw m1, m1
pmulhw m1, m6
paddw m1, m2
pmulhrsw m1, m7
vpmovuswb ym1, m1
mova [dstq+dsq*0], xm1
vextracti32x4 [dstq+dsq*1], ym1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w16_loop
RET
.hv_w32:
mova m4, [bilin_h_perm32]
vpermb m0, m4, [srcq+ssq*0]
pmovzxbq m8, [pb_02461357]
pmaddubsw m0, m5
.hv_w32_loop:
vpermb m2, m4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vpermb m3, m4, [srcq+ssq*0]
pmaddubsw m2, m5
psubw m1, m2, m0
paddw m1, m1
pmulhw m1, m6
paddw m1, m0
pmaddubsw m0, m3, m5
psubw m3, m0, m2
paddw m3, m3
pmulhw m3, m6
paddw m3, m2
pmulhrsw m1, m7
pmulhrsw m3, m7
packuswb m1, m3
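; packuswb packs within 128-bit lanes, so restore row order via the
; qword permute in m8 (0, 2, 4, 6, 1, 3, 5, 7)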
vpermq m1, m8, m1
mova [dstq+dsq*0], ym1
vextracti32x8 [dstq+dsq*1], m1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w32_loop
RET
.hv_w64:
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
.hv_w64_loop:
add srcq, ssq
movu m2, [srcq+8*0]
movu m3, [srcq+8*1]
pshufb m2, m4
pshufb m3, m4
pmaddubsw m2, m5
pmaddubsw m3, m5
psubw m8, m2, m0
psubw m9, m3, m1
paddw m8, m8
pmulhw m8, m6
paddw m9, m9
pmulhw m9, m6
paddw m8, m0
pmulhrsw m8, m7
paddw m9, m1
pmulhrsw m9, m7
mova m0, m2
mova m1, m3
packuswb m8, m9
mova [dstq], m8
add dstq, dsq
dec hd
jg .hv_w64_loop
RET
.hv_w128:
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
movu m2, [srcq+8*8]
movu m3, [srcq+8*9]
REPX {pshufb x, m4}, m0, m1, m2, m3
REPX {pmaddubsw x, m5}, m0, m1, m2, m3
.hv_w128_loop:
add srcq, ssq
movu m8, [srcq+8*0]
movu m9, [srcq+8*1]
movu m10, [srcq+8*8]
movu m11, [srcq+8*9]
REPX {pshufb x, m4}, m8, m9, m10, m11
REPX {pmaddubsw x, m5}, m8, m9, m10, m11
psubw m12, m8, m0
psubw m13, m9, m1
psubw m14, m10, m2
psubw m15, m11, m3
paddw m12, m12
pmulhw m12, m6
paddw m13, m13
pmulhw m13, m6
paddw m14, m14
pmulhw m14, m6
paddw m15, m15
pmulhw m15, m6
paddw m12, m0
pmulhrsw m12, m7
paddw m13, m1
pmulhrsw m13, m7
paddw m14, m2
pmulhrsw m14, m7
paddw m15, m3
pmulhrsw m15, m7
mova m0, m8
mova m1, m9
mova m2, m10
mova m3, m11
packuswb m12, m13
packuswb m14, m15
mova [dstq+64*0], m12
mova [dstq+64*1], m14
add dstq, dsq
dec hd
jg .hv_w128_loop
RET
DECLARE_REG_TMP 3, 5, 6
cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
movifnidn mxyd, r5m ; mx
lea t2, [prep_avx512icl]
tzcnt wd, wm
movifnidn hd, hm
test mxyd, mxyd
jnz .h
mov mxyd, r6m ; my
test mxyd, mxyd
jnz .v
.prep:
movzx wd, word [t2+wq*2+table_offset(prep,)]
add wq, t2
lea stride3q, [strideq*3]
jmp wq
.prep_w4:
movd xmm0, [srcq+strideq*0]
pinsrd xmm0, [srcq+strideq*1], 1
pinsrd xmm0, [srcq+strideq*2], 2
pinsrd xmm0, [srcq+stride3q ], 3
lea srcq, [srcq+strideq*4]
pmovzxbw ym0, xmm0
psllw ym0, 4
mova [tmpq], ym0
add tmpq, 32
sub hd, 4
jg .prep_w4
RET
.prep_w8:
movq xmm0, [srcq+strideq*0]
movq xmm1, [srcq+strideq*1]
vinserti128 ym0, ymm0, [srcq+strideq*2], 1
vinserti128 ym1, ymm1, [srcq+stride3q ], 1
lea srcq, [srcq+strideq*4]
punpcklqdq ym0, ym1
pmovzxbw m0, ym0
psllw m0, 4
mova [tmpq], m0
add tmpq, 32*2
sub hd, 4
jg .prep_w8
RET
.prep_w16:
movu xmm0, [srcq+strideq*0]
vinserti128 ym0, ymm0, [srcq+strideq*1], 1
movu xmm1, [srcq+strideq*2]
vinserti128 ym1, ymm1, [srcq+stride3q ], 1
lea srcq, [srcq+strideq*4]
pmovzxbw m0, ym0
pmovzxbw m1, ym1
psllw m0, 4
psllw m1, 4
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
add tmpq, 32*4
sub hd, 4
jg .prep_w16
RET
.prep_w32:
pmovzxbw m0, [srcq+strideq*0]
pmovzxbw m1, [srcq+strideq*1]
pmovzxbw m2, [srcq+strideq*2]
pmovzxbw m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
REPX {psllw x, 4}, m0, m1, m2, m3
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
sub hd, 4
jg .prep_w32
RET
.prep_w64:
pmovzxbw m0, [srcq+strideq*0+32*0]
pmovzxbw m1, [srcq+strideq*0+32*1]
pmovzxbw m2, [srcq+strideq*1+32*0]
pmovzxbw m3, [srcq+strideq*1+32*1]
lea srcq, [srcq+strideq*2]
REPX {psllw x, 4}, m0, m1, m2, m3
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
sub hd, 2
jg .prep_w64
RET
.prep_w128:
pmovzxbw m0, [srcq+32*0]
pmovzxbw m1, [srcq+32*1]
pmovzxbw m2, [srcq+32*2]
pmovzxbw m3, [srcq+32*3]
REPX {psllw x, 4}, m0, m1, m2, m3
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
add srcq, strideq
dec hd
jg .prep_w128
RET
.h:
; 16 * src[x] + (mx * (src[x + 1] - src[x]))
; = (16 - mx) * src[x] + mx * src[x + 1]
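; prep keeps 4 extra fractional bits in the intermediate buffer (cf. the
; psllw 4 in .prep), so the pmaddubsw result is stored without rounding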
imul mxyd, 255
add mxyd, 16
vpbroadcastw m5, mxyd
mov mxyd, r6m ; my
test mxyd, mxyd
jnz .hv
movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
add wq, t2
lea stride3q, [strideq*3]
jmp wq
.h_w4:
vbroadcasti32x4 ym4, [bilin_h_shuf4]
.h_w4_loop:
movq xmm0, [srcq+strideq*0]
movq xmm1, [srcq+strideq*1]
vinserti32x4 ym0, ymm0, [srcq+strideq*2], 1
vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1
lea srcq, [srcq+strideq*4]
punpcklqdq ym0, ym1
pshufb ym0, ym4
pmaddubsw ym0, ym5
mova [tmpq], ym0
add tmpq, 32
sub hd, 4
jg .h_w4_loop
RET
.h_w8:
vbroadcasti32x4 m4, [bilin_h_perm16]
.h_w8_loop:
movu xmm0, [srcq+strideq*0]
vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1
vinserti32x4 m0, [srcq+strideq*2], 2
vinserti32x4 m0, [srcq+stride3q ], 3
lea srcq, [srcq+strideq*4]
pshufb m0, m4
pmaddubsw m0, m5
mova [tmpq], m0
add tmpq, 64
sub hd, 4
jg .h_w8_loop
RET
.h_w16:
mova m4, [bilin_h_perm16]
.h_w16_loop:
movu ym0, [srcq+strideq*0]
vinserti32x8 m0, [srcq+strideq*1], 1
movu ym1, [srcq+strideq*2]
vinserti32x8 m1, [srcq+stride3q ], 1
lea srcq, [srcq+strideq*4]
vpermb m0, m4, m0
vpermb m1, m4, m1
pmaddubsw m0, m5
pmaddubsw m1, m5
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
add tmpq, 64*2
sub hd, 4
jg .h_w16_loop
RET
.h_w32:
mova m4, [bilin_h_perm32]
.h_w32_loop:
vpermb m0, m4, [srcq+strideq*0]
vpermb m1, m4, [srcq+strideq*1]
vpermb m2, m4, [srcq+strideq*2]
vpermb m3, m4, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
pmaddubsw m0, m5
pmaddubsw m1, m5
pmaddubsw m2, m5
pmaddubsw m3, m5
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
sub hd, 4
jg .h_w32_loop
RET
.h_w64:
mova m4, [bilin_h_perm32]
.h_w64_loop:
vpermb m0, m4, [srcq+strideq*0+32*0]
vpermb m1, m4, [srcq+strideq*0+32*1]
vpermb m2, m4, [srcq+strideq*1+32*0]
vpermb m3, m4, [srcq+strideq*1+32*1]
lea srcq, [srcq+strideq*2]
pmaddubsw m0, m5
pmaddubsw m1, m5
pmaddubsw m2, m5
pmaddubsw m3, m5
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
sub hd, 2
jg .h_w64_loop
RET
.h_w128:
mova m4, [bilin_h_perm32]
.h_w128_loop:
vpermb m0, m4, [srcq+32*0]
vpermb m1, m4, [srcq+32*1]
vpermb m2, m4, [srcq+32*2]
vpermb m3, m4, [srcq+32*3]
pmaddubsw m0, m5
pmaddubsw m1, m5
pmaddubsw m2, m5
pmaddubsw m3, m5
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
add srcq, strideq
dec hd
jg .h_w128_loop
RET
.v:
WIN64_SPILL_XMM 7
movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
imul mxyd, 255
add mxyd, 16
add wq, t2
lea stride3q, [strideq*3]
vpbroadcastw m6, mxyd
jmp wq
.v_w4:
vpbroadcastd xm0, [srcq+strideq*0]
mov r3d, 0x29
vbroadcasti32x4 ym3, [bilin_v_shuf4]
kmovb k1, r3d
.v_w4_loop:
vpblendmd xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____
vpbroadcastd ym2, [srcq+strideq*2]
vpbroadcastd ym2{k1}, [srcq+stride3q ] ; __2_ 23__
lea srcq, [srcq+strideq*4]
vpbroadcastd ym0, [srcq+strideq*0]
punpckhqdq ym2{k1}, ym1, ym0 ; 012_ 234_
pshufb ym2, ym3
pmaddubsw ym2, ym6
mova [tmpq], ym2
add tmpq, 32
sub hd, 4
jg .v_w4_loop
RET
.v_w8:
mova m5, [bilin_v_perm8]
vbroadcasti32x4 ym0, [srcq+strideq*0]
.v_w8_loop:
vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
vpbroadcastq ym0, [srcq+strideq*2]
vinserti32x4 m1, [srcq+stride3q ], 2
lea srcq, [srcq+strideq*4]
vinserti32x4 ym0, [srcq+strideq*0], 0
vpermt2b m1, m5, m0
pmaddubsw m1, m6
mova [tmpq], m1
add tmpq, 64
sub hd, 4
jg .v_w8_loop
RET
.v_w16:
mova m5, [bilin_v_perm16]
movu xm0, [srcq+strideq*0]
.v_w16_loop:
movu xm2, [srcq+strideq*2]
vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
vpermt2b m1, m5, m2
vinserti32x4 ym2, [srcq+stride3q ], 1
lea srcq, [srcq+strideq*4]
movu xm0, [srcq+strideq*0]
vpermt2b m2, m5, m0
pmaddubsw m1, m6
pmaddubsw m2, m6
mova [tmpq+64*0], m1
mova [tmpq+64*1], m2
add tmpq, 64*2
sub hd, 4
jg .v_w16_loop
RET
.v_w32:
mova m5, [bilin_v_perm32]
movu ym0, [srcq+strideq*0]
.v_w32_loop:
movu ym2, [srcq+strideq*1]
movu ym3, [srcq+strideq*2]
movu ym4, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
vpermt2b m0, m5, m2
vpermt2b m2, m5, m3
vpermt2b m3, m5, m4
pmaddubsw m1, m0, m6
movu ym0, [srcq+strideq*0]
vpermt2b m4, m5, m0
pmaddubsw m2, m6
pmaddubsw m3, m6
pmaddubsw m4, m6
mova [tmpq+64*0], m1
mova [tmpq+64*1], m2
mova [tmpq+64*2], m3
mova [tmpq+64*3], m4
add tmpq, 64*4
sub hd, 4
jg .v_w32_loop
RET
.v_w64:
mova m5, [bilin_v_perm64]
vpermq m0, m5, [srcq+strideq*0]
.v_w64_loop:
vpermq m1, m5, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
punpcklbw m4, m0, m1
punpckhbw m2, m0, m1
vpermq m0, m5, [srcq+strideq*0]
punpcklbw m3, m1, m0
punpckhbw m1, m0
pmaddubsw m4, m6
pmaddubsw m2, m6
pmaddubsw m3, m6
pmaddubsw m1, m6
mova [tmpq+64*0], m4
mova [tmpq+64*1], m2
mova [tmpq+64*2], m3
mova [tmpq+64*3], m1
add tmpq, 64*4
sub hd, 2
jg .v_w64_loop
RET
.v_w128:
mova m5, [bilin_v_perm64]
vpermq m0, m5, [srcq+strideq*0+ 0]
vpermq m1, m5, [srcq+strideq*0+64]
.v_w128_loop:
vpermq m2, m5, [srcq+strideq*1+ 0]
vpermq m3, m5, [srcq+strideq*1+64]
lea srcq, [srcq+strideq*2]
punpcklbw m4, m0, m2
punpckhbw m0, m2
pmaddubsw m4, m6
pmaddubsw m0, m6
mova [tmpq+64*0], m4
mova [tmpq+64*1], m0
punpcklbw m4, m1, m3
punpckhbw m1, m3
pmaddubsw m4, m6
pmaddubsw m1, m6
mova [tmpq+64*2], m4
mova [tmpq+64*3], m1
vpermq m0, m5, [srcq+strideq*0+ 0]
vpermq m1, m5, [srcq+strideq*0+64]
punpcklbw m4, m2, m0
punpckhbw m2, m0
pmaddubsw m4, m6
pmaddubsw m2, m6
mova [tmpq+64*4], m4
mova [tmpq+64*5], m2
punpcklbw m4, m3, m1
punpckhbw m3, m1
pmaddubsw m4, m6
pmaddubsw m3, m6
mova [tmpq+64*6], m4
mova [tmpq+64*7], m3
add tmpq, 64*8
sub hd, 2
jg .v_w128_loop
RET
.hv:
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
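; with my pre-scaled by 1 << 11, pmulhrsw alone provides the rounding:
;   (d * (my << 11) * 2 + 0x8000) >> 16 == (d * my + 8) >> 4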
WIN64_SPILL_XMM 7
movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
shl mxyd, 11
vpbroadcastw m6, mxyd
add wq, t2
lea stride3q, [strideq*3]
jmp wq
.hv_w4:
vbroadcasti32x4 ym4, [bilin_h_shuf4]
vpbroadcastq ym0, [srcq+strideq*0]
pshufb ym0, ym4
pmaddubsw ym0, ym5
.hv_w4_loop:
movq xmm1, [srcq+strideq*1]
movq xmm2, [srcq+strideq*2]
vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1
lea srcq, [srcq+strideq*4]
vinserti32x4 ym2, ymm2, [srcq+strideq*0], 1
punpcklqdq ym1, ym2
pshufb ym1, ym4
pmaddubsw ym1, ym5 ; 1 2 3 4
valignq ym2, ym1, ym0, 3 ; 0 1 2 3
mova ym0, ym1
psubw ym1, ym2
pmulhrsw ym1, ym6
paddw ym1, ym2
mova [tmpq], ym1
add tmpq, 32
sub hd, 4
jg .hv_w4_loop
RET
.hv_w8:
vbroadcasti32x4 m4, [bilin_h_perm16]
vbroadcasti32x4 m0, [srcq+strideq*0]
pshufb m0, m4
pmaddubsw m0, m5
.hv_w8_loop:
movu xmm1, [srcq+strideq*1]
vinserti128 ym1, ymm1, [srcq+strideq*2], 1
vinserti128 m1, [srcq+stride3q ], 2
lea srcq, [srcq+strideq*4]
vinserti128 m1, [srcq+strideq*0], 3
pshufb m1, m4
pmaddubsw m1, m5 ; 1 2 3 4
valignq m2, m1, m0, 6 ; 0 1 2 3
mova m0, m1
psubw m1, m2
pmulhrsw m1, m6
paddw m1, m2
mova [tmpq], m1
add tmpq, 64
sub hd, 4
jg .hv_w8_loop
RET
.hv_w16:
mova m4, [bilin_h_perm16]
vbroadcasti32x8 m0, [srcq+strideq*0]
vpermb m0, m4, m0
pmaddubsw m0, m5
.hv_w16_loop:
movu ym1, [srcq+strideq*1]
vinserti32x8 m1, [srcq+strideq*2], 1
movu ym2, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
vinserti32x8 m2, [srcq+strideq*0], 1
vpermb m1, m4, m1
vpermb m2, m4, m2
pmaddubsw m1, m5 ; 1 2
vshufi32x4 m3, m0, m1, q1032 ; 0 1
pmaddubsw m0, m2, m5 ; 3 4
vshufi32x4 m2, m1, m0, q1032 ; 2 3
psubw m1, m3
pmulhrsw m1, m6
paddw m1, m3
psubw m3, m0, m2
pmulhrsw m3, m6
paddw m3, m2
mova [tmpq+64*0], m1
mova [tmpq+64*1], m3
add tmpq, 64*2
sub hd, 4
jg .hv_w16_loop
RET
.hv_w32:
mova m4, [bilin_h_perm32]
vpermb m0, m4, [srcq+strideq*0]
pmaddubsw m0, m5
.hv_w32_loop:
vpermb m1, m4, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
vpermb m2, m4, [srcq+strideq*0]
pmaddubsw m1, m5
psubw m3, m1, m0
pmulhrsw m3, m6
paddw m3, m0
pmaddubsw m0, m2, m5
psubw m2, m0, m1
pmulhrsw m2, m6
paddw m2, m1
mova [tmpq+64*0], m3
mova [tmpq+64*1], m2
add tmpq, 64*2
sub hd, 2
jg .hv_w32_loop
RET
.hv_w64:
mova m4, [bilin_h_perm32]
vpermb m0, m4, [srcq+32*0]
vpermb m1, m4, [srcq+32*1]
pmaddubsw m0, m5
pmaddubsw m1, m5
.hv_w64_loop:
add srcq, strideq
vpermb m2, m4, [srcq+32*0]
vpermb m3, m4, [srcq+32*1]
pmaddubsw m2, m5
pmaddubsw m3, m5
psubw m7, m2, m0
psubw m8, m3, m1
pmulhrsw m7, m6
pmulhrsw m8, m6
paddw m7, m0
mova m0, m2
paddw m8, m1
mova m1, m3
mova [tmpq+64*0], m7
mova [tmpq+64*1], m8
add tmpq, 64*2
dec hd
jg .hv_w64_loop
RET
.hv_w128:
mova m4, [bilin_h_perm32]
vpermb m0, m4, [srcq+32*0]
vpermb m1, m4, [srcq+32*1]
vpermb m2, m4, [srcq+32*2]
vpermb m3, m4, [srcq+32*3]
REPX {pmaddubsw x, m5}, m0, m1, m2, m3
.hv_w128_loop:
add srcq, strideq
vpermb m7, m4, [srcq+32*0]
vpermb m8, m4, [srcq+32*1]
vpermb m9, m4, [srcq+32*2]
vpermb m10, m4, [srcq+32*3]
REPX {pmaddubsw x, m5}, m7, m8, m9, m10
psubw m11, m7, m0
psubw m12, m8, m1
psubw m13, m9, m2
psubw m14, m10, m3
REPX {pmulhrsw x, m6}, m11, m12, m13, m14
paddw m11, m0
mova m0, m7
paddw m12, m1
mova m1, m8
paddw m13, m2
mova m2, m9
paddw m14, m3
mova m3, m10
mova [tmpq+64*0], m11
mova [tmpq+64*1], m12
mova [tmpq+64*2], m13
mova [tmpq+64*3], m14
add tmpq, 64*4
dec hd
jg .hv_w128_loop
RET
; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15
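; Each FILTER_* id packs two row offsets into mc_subpel_filters:
; (8tap_set*15) << 16 | (4tap_set*15). Adding it to mx*0x010101 leaves
; the 8(6)-tap row index in byte 2, the raw fraction in byte 1 (checked
; by "test mxd, 0xf00") and the 4-tap row index in byte 0;
; e.g. REGULAR with mx=8: 0x080808 + 0x2d == 0x080835.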
%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to
cglobal %1_%2_8bpc
mov t0d, FILTER_%3
%ifidn %3, %4
mov t1d, t0d
%else
mov t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro
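; Horizontal 8-tap as two 4-tap dot products per output: overlapping
; pixel groups (shuffled via m6-m8) feed vpdpbusd with the coefficient
; dwords in m9/m10 (taps 0-3 and 4-7, loaded by the callers), starting
; from the rounding bias in m5; packusdw + psrlw 6 reduce to words.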
%macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb
%if %5
vpermb m%2, m6, m%1
vpermb m%3, m7, m%1
vpermb m%4, m8, m%1
%else
%if %2 < %4 ; reuse a previous value if possible
pshufb m%2, m%1, m6
%endif
pshufb m%3, m%1, m7
pshufb m%4, m%1, m8
%endif
mova m%1, m5
vpdpbusd m%1, m%2, m9
mova m%2, m5
vpdpbusd m%2, m%3, m9
vpdpbusd m%1, m%3, m10
vpdpbusd m%2, m%4, m10
packusdw m%1, m%2
psrlw m%1, 6
%endmacro
%if WIN64
DECLARE_REG_TMP 4, 5
%else
DECLARE_REG_TMP 7, 8
%endif
; The horizontal filter uses vpdpbusd, which computes a 4-element dot
; product per dword lane, so an 8-tap pass costs no more than a 6-tap
; one would; 6-tap is therefore only used for the vertical filter.
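; (The regular and smooth filters have zero outermost taps, so their
; vertical pass needs only 3 pmaddubsw coefficient pairs; the +1 in the
; subpel_filters address in .v below skips the leading zero tap.)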
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_6tap_8bpc
PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_6tap_8bpc
PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_8bpc
PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_8bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_8bpc
PUT_8TAP_FN regular, REGULAR, REGULAR
cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
%define base r8-put_avx512icl
imul mxd, mxm, 0x010101
add mxd, t0d ; 6tap_h, mx, 4tap_h
imul myd, mym, 0x010101
add myd, t1d ; 6tap_v, my, 4tap_v
lea r8, [put_avx512icl]
movsxd wq, wm
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
jnz .v
.put:
tzcnt wd, wd
movzx wd, word [r8+wq*2+table_offset(put,)]
add wq, r8
lea r6, [ssq*3]
lea r7, [dsq*3]
%if WIN64
pop r8
%endif
jmp wq
.v:
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
tzcnt r6d, wd
movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)]
vpbroadcastd m6, [pw_512]
lea myq, [base+subpel_filters+1+myq*8]
vpbroadcastw m7, [myq+0]
add r6, r8
vpbroadcastw m8, [myq+2]
mov nsq, ssq
vpbroadcastw m9, [myq+4]
neg nsq
jmp r6
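; The vertical loops keep a sliding window of interleaved row pairs,
; e.g. m1 = rows 01/12 and m2 = rows 23/34, producing two output rows
; (a/b) per iteration before rotating the pairs down.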
.v_w2:
movd xmm2, [srcq+nsq*2]
pinsrw xmm2, [srcq+nsq*1], 2
pinsrw xmm2, [srcq+ssq*0], 4
pinsrw xmm2, [srcq+ssq*1], 6 ; 0 1 2 3
lea srcq, [srcq+ssq*2]
vpbroadcastd xmm0, [srcq+ssq*0]
palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4
punpcklbw xmm1, xmm2, xmm3 ; 01 12
punpckhbw xmm2, xmm3 ; 23 34
.v_w2_loop:
vpbroadcastd xmm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pmaddubsw xmm3, xmm1, xm7 ; a0 b0
mova xmm1, xmm2
pmaddubsw xmm2, xm8 ; a1 b1
paddw xmm3, xmm2
vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5
vpbroadcastd xmm0, [srcq+ssq*0]
vpblendd xmm4, xmm0, 0x02 ; 5 6
punpcklbw xmm2, xmm4 ; 45 56
pmaddubsw xmm4, xmm2, xm9 ; a2 b2
paddw xmm3, xmm4
pmulhrsw xmm3, xm6
packuswb xmm3, xmm3
pextrw [dstq+dsq*0], xmm3, 0
pextrw [dstq+dsq*1], xmm3, 2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w2_loop
RET
.v_w4:
movd xmm2, [srcq+nsq*2]
pinsrd xmm2, [srcq+nsq*1], 1
pinsrd xmm2, [srcq+ssq*0], 2
pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3
lea srcq, [srcq+ssq*2]
vpbroadcastd xmm0, [srcq+ssq*0]
palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4
punpcklbw xmm1, xmm2, xmm3 ; 01 12
punpckhbw xmm2, xmm3 ; 23 34
.v_w4_loop:
vpbroadcastd xmm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pmaddubsw xmm3, xmm1, xm7 ; a0 b0
mova xmm1, xmm2
pmaddubsw xmm2, xm8 ; a1 b1
paddw xmm3, xmm2
vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5
vpbroadcastd xmm0, [srcq+ssq*0]
vpblendd xmm4, xmm0, 0x02 ; 5 6
punpcklbw xmm2, xmm4 ; 45 56
pmaddubsw xmm4, xmm2, xm9 ; a2 b2
paddw xmm3, xmm4
pmulhrsw xmm3, xm6
packuswb xmm3, xmm3
movd [dstq+dsq*0], xmm3
pextrd [dstq+dsq*1], xmm3, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
RET
.v_w8:
movq xmm1, [srcq+nsq*2]
vpbroadcastq ymm3, [srcq+nsq*1]
vpbroadcastq ymm2, [srcq+ssq*0]
vpbroadcastq ymm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vpbroadcastq ymm0, [srcq+ssq*0]
vpblendd ymm1, ymm3, 0x30
vpblendd ymm3, ymm2, 0x30
punpcklbw ymm1, ymm3 ; 01 12
vpblendd ymm2, ymm4, 0x30
vpblendd ymm4, ymm0, 0x30
punpcklbw ymm2, ymm4 ; 23 34
.v_w8_loop:
vpbroadcastq ymm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pmaddubsw ymm3, ymm1, ym7 ; a0 b0
mova ymm1, ymm2
pmaddubsw ymm2, ym8 ; a1 b1
paddw ymm3, ymm2
vpblendd ymm2, ymm0, ymm4, 0x30
vpbroadcastq ymm0, [srcq+ssq*0]
vpblendd ymm4, ymm0, 0x30
punpcklbw ymm2, ymm4 ; 45 56
pmaddubsw ymm4, ymm2, ym9 ; a2 b2
paddw ymm3, ymm4
pmulhrsw ymm3, ym6
vextracti128 xmm4, ymm3, 1
packuswb xmm3, xmm4
movq [dstq+dsq*0], xmm3
movhps [dstq+dsq*1], xmm3
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
vzeroupper
RET
.v_w16:
mova m5, [spel_v_perm16a]
vbroadcasti32x4 m1, [srcq+nsq*2]
vbroadcasti32x4 ym3, [srcq+nsq*1]
mov r6d, 0x0f
vbroadcasti32x4 m2, [srcq+ssq*0]
kmovb k1, r6d
vbroadcasti32x4 ym4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vbroadcasti32x4 m0, [srcq+ssq*0]
vshufpd m1{k1}, m3, m2, 0xcc
vshufpd m2{k1}, m4, m0, 0xcc
vpermb m1, m5, m1 ; 01 12
vpermb m2, m5, m2 ; 23 34
.v_w16_loop:
vbroadcasti32x4 ym4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pmaddubsw m3, m1, m7 ; a0 b0
mova m1, m2
pmaddubsw m2, m8 ; a1 b1
paddw m3, m2
mova m2, m0
vbroadcasti32x4 m0, [srcq+ssq*0]
vshufpd m2{k1}, m4, m0, 0xcc
vpermb m2, m5, m2 ; 45 56
pmaddubsw m4, m2, m9 ; a2 b2
paddw m3, m4
pmulhrsw m3, m6
vextracti32x8 ym4, m3, 1
packuswb ym3, ym4
mova [dstq+dsq*0], xm3
vextracti32x4 [dstq+dsq*1], ym3, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w16_loop
RET
.v_w32:
mova m10, [spel_v_perm32]
pmovzxbq m5, [pb_02461357]
vpshrdw m11, m10, m10, 8
movu ym0, [srcq+nsq*2]
vinserti32x8 m0, [srcq+nsq*1], 1
vpermb m1, m10, m0 ; 01
vinserti32x8 m0, [srcq+ssq*0], 0
vpermb m2, m11, m0 ; 12
vinserti32x8 m0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
vpermb m3, m10, m0 ; 23
vinserti32x8 m0, [srcq+ssq*0], 0
vpermb m4, m11, m0 ; 34
.v_w32_loop:
vinserti32x8 m0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
pmaddubsw m12, m1, m7
mova m1, m3
pmaddubsw m13, m2, m7
mova m2, m4
pmaddubsw m14, m3, m8
vpermb m3, m10, m0 ; 45
vinserti32x8 m0, [srcq+ssq*0], 0
pmaddubsw m15, m4, m8
vpermb m4, m11, m0 ; 56
paddw m12, m14
pmaddubsw m14, m3, m9
paddw m13, m15
pmaddubsw m15, m4, m9
paddw m12, m14
paddw m13, m15
pmulhrsw m12, m6
pmulhrsw m13, m6
packuswb m12, m13
vpermq m12, m5, m12
mova [dstq+dsq*0], ym12
vextracti32x8 [dstq+dsq*1], m12, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w32_loop
RET
.v_w64:
.v_w128:
lea r6d, [hq+wq*4-256]
.v_loop0:
movu m2, [srcq+nsq*2]
movu m4, [srcq+nsq*1]
lea r4, [srcq+ssq*2]
movu m11, [srcq+ssq*0]
movu m13, [srcq+ssq*1]
mov r7, dstq
movu m0, [r4 +ssq*0]
punpcklbw m1, m2, m4 ; 01l
punpckhbw m2, m4 ; 01h
punpcklbw m3, m4, m11 ; 12l
punpckhbw m4, m11 ; 12h
punpcklbw m10, m11, m13 ; 23l
punpckhbw m11, m13 ; 23h
punpcklbw m12, m13, m0 ; 34l
punpckhbw m13, m0 ; 34h
.v_loop:
movu m5, [r4+ssq*1]
pmaddubsw m14, m1, m7 ; a0l
mova m1, m10
pmaddubsw m10, m8 ; a1l
lea r4, [r4+ssq*2]
pmaddubsw m15, m2, m7 ; a0h
mova m2, m11
pmaddubsw m11, m8 ; a1h
paddw m14, m10
punpcklbw m10, m0, m5 ; 45l
paddw m15, m11
punpckhbw m11, m0, m5 ; 45h
pmaddubsw m0, m10, m9 ; a2l
paddw m14, m0
pmaddubsw m0, m11, m9 ; a2h
paddw m15, m0
movu m0, [r4+ssq*0]
pmulhrsw m14, m6
pmulhrsw m15, m6
packuswb m14, m15
pmaddubsw m15, m3, m7 ; b0l
mova m3, m12
pmaddubsw m12, m8 ; b1l
mova [r7+dsq*0], m14
pmaddubsw m14, m4, m7 ; b0h
mova m4, m13
pmaddubsw m13, m8 ; b1h
paddw m15, m12
punpcklbw m12, m5, m0 ; 56l
paddw m14, m13
punpckhbw m13, m5, m0 ; 56h
pmaddubsw m5, m12, m9 ; b2l
paddw m15, m5
pmaddubsw m5, m13, m9 ; b2h
paddw m14, m5
pmulhrsw m15, m6
pmulhrsw m14, m6
packuswb m15, m14
mova [r7+dsq*1], m15
lea r7, [r7+dsq*2]
sub hd, 2
jg .v_loop