; Copyright © 2020, VideoLAN and dav1d authors
; Copyright © 2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 64
%macro SMOOTH_WEIGHT_TABLE 1-*
%rep %0
db %1-128, 127-%1
%rotate 1
%endrep
%endmacro
smooth_weights: SMOOTH_WEIGHT_TABLE \
0, 0, 255, 128, 255, 149, 85, 64, \
255, 197, 146, 105, 73, 50, 37, 32, \
255, 225, 196, 170, 145, 123, 102, 84, \
68, 54, 43, 33, 26, 20, 17, 16, \
255, 240, 225, 210, 196, 182, 169, 157, \
145, 133, 122, 111, 101, 92, 83, 74, \
66, 59, 52, 45, 39, 34, 29, 25, \
21, 17, 14, 12, 10, 9, 8, 8, \
255, 248, 240, 233, 225, 218, 210, 203, \
196, 189, 182, 176, 169, 163, 156, 150, \
144, 138, 133, 127, 121, 116, 111, 106, \
101, 96, 91, 86, 82, 77, 73, 69, \
65, 61, 57, 54, 50, 47, 44, 41, \
38, 35, 32, 29, 27, 25, 22, 20, \
18, 16, 15, 13, 12, 10, 9, 8, \
7, 6, 6, 5, 5, 4, 4, 4
; dav1d_filter_intra_taps[], reordered for VNNI: p1 p2 p3 p4, p6 p5 p0 __
filter_taps: db 10, 0, 0, 0, 2, 10, 0, 0, 1, 1, 10, 0, 1, 1, 2, 10
db 6, 0, 0, 0, 2, 6, 0, 0, 2, 2, 6, 0, 1, 2, 2, 6
db 0, 12, -6, 0, 0, 9, -5, 0, 0, 7, -3, 0, 0, 5, -3, 0
db 12, 2, -4, 0, 9, 2, -3, 0, 7, 2, -3, 0, 5, 3, -3, 0
db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16
db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16
db 0, 10,-10, 0, 0, 6, -6, 0, 0, 4, -4, 0, 0, 2, -2, 0
db 10, 0,-10, 0, 6, 0, -6, 0, 4, 0, -4, 0, 2, 0, -2, 0
db 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8
db 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4
db 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0
db 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0
db 8, 0, 0, 0, 3, 8, 0, 0, 2, 3, 8, 0, 1, 2, 3, 8
db 4, 0, 0, 0, 3, 4, 0, 0, 2, 3, 4, 0, 2, 2, 3, 4
db 0, 10, -2, 0, 0, 6, -1, 0, 0, 4, -1, 0, 0, 2, 0, 0
db 10, 3, -1, 0, 6, 4, -1, 0, 4, 4, -1, 0, 3, 3, -1, 0
db 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14
db 12, 0, 0, 0, 1, 12, 0, 0, 0, 0, 12, 0, 0, 0, 1, 12
db 0, 14,-12, 0, 0, 12,-10, 0, 0, 11, -9, 0, 0, 10, -8, 0
db 14, 0,-10, 0, 12, 0, -9, 0, 11, 1, -8, 0, 9, 1, -7, 0
filter_perm: db 0, 1, 2, 3, 24, 25, 26, 27, 4, 5, 6, 7, 28, 29, 30, 31
db 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7,131
db 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23,147
db 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39,163
filter_end: dd 2, 3, 16, 17, -1, -1, 20, 21, 0, 6, 24, 30, 1, 7, 25, 31
smooth_shuf: db 7, 7, 7, 7, 0, 1, 0, 1, 3, 3, 3, 3, 8, 9, 8, 9
db 5, 5, 5, 5, 4, 5, 4, 5, 1, 1, 1, 1, 12, 13, 12, 13
db 6, 6, 6, 6, 2, 3, 2, 3, 2, 2, 2, 2, 10, 11, 10, 11
db 4, 4, 4, 4, 6, 7, 6, 7, 0, 0, 0, 0, 14, 15, 14, 15
smooth_endA: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79
db 17, 19, 21, 23, 25, 27, 29, 31, 81, 83, 85, 87, 89, 91, 93, 95
db 33, 35, 37, 39, 41, 43, 45, 47, 97, 99,101,103,105,107,109,111
db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127
ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4
db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
pal_unpack: db 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
pal_perm: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_63to0: db 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48
db 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32
db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
z_frac_table: db 64, 0, 62, 2, 60, 4, 58, 6, 56, 8, 54, 10, 52, 12, 50, 14
db 48, 16, 46, 18, 44, 20, 42, 22, 40, 24, 38, 26, 36, 28, 34, 30
db 32, 32, 30, 34, 28, 36, 26, 38, 24, 40, 22, 42, 20, 44, 18, 46
db 16, 48, 14, 50, 12, 52, 10, 54, 8, 56, 6, 58, 4, 60, 2, 62
z_filter_s1: db -1, -1, -1, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6
db 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22
db 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38
db 46, 47, 47, 48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54
z_filter_s5: db 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15, 17, 16
db 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31, 33, 32
db 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47, 49, 48
db 58, 57, 59, 58, 60, 59, 61, 60, 62, 61, 63, 62, 64, 63, 65, 64
z_filter_s3: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
z_filter_s2: db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
z_filter_s4: db 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8
z_xpos_bc: db 17, 17, 17, 17, 33, 33, 33, 33, 9, 9, 9, 9, 9, 9, 9, 9
z_filter4_s1: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
db 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
z_xpos_off1a: db 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 72
z_xpos_off1b: db 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 80
z_xpos_off2a: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24
db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40
db 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56
z_xpos_off2b: db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32
db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48
db 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64
z_xpos_mul: dw 4, 4, 4, 4, 8, 8, 4, 4, 12, 12, 8, 8, 16, 16, 8, 8
dw 20, 20, 12, 12, 24, 24, 12, 12, 28, 28, 16, 16, 32, 32, 16, 16
z_ypos_off1: db 64, 65, 64, 65, 64, 65, 64, 65, 65, 66, 65, 66, 66, 67, 66, 67
db 66, 67, 66, 67, 68, 69, 68, 69, 67, 68, 67, 68, 70, 71, 70, 71
db 68, 69, 68, 69, 72, 73, 72, 73, 69, 70, 69, 70, 74, 75, 74, 75
db 70, 71, 70, 71, 76, 77, 76, 77, 71, 72, 71, 72, 78, 79, 78, 79
z_ypos_off2: db 64, 65, 64, 65, 0, 0, 0, 0, 64, 65, 64, 65, 0, 0, 0, 0
db 65, 66, 65, 66, 1, 1, 1, 1, 65, 66, 65, 66, 1, 1, 1, 1
db 66, 67, 66, 67, 2, 2, 2, 2, 66, 67, 66, 67, 2, 2, 2, 2
db 67, 68, 67, 68, 3, 3, 3, 3, 67, 68, 67, 68, 3, 3, 3, 3
z_ypos_off3: db 1, 2, 1, 2, 1, 1, 1, 1, 3, 4, 3, 4, 1, 1, 1, 1
db 5, 6, 5, 6, 3, 3, 3, 3, 7, 8, 7, 8, 3, 3, 3, 3
db 9, 10, 9, 10, 5, 5, 5, 5, 11, 12, 11, 12, 5, 5, 5, 5
db 13, 14, 13, 14, 7, 7, 7, 7, 15, 16, 15, 16, 7, 7, 7, 7
z_ypos_mul1a: dw 1, 2, 3, 4, 5, 6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24
dw 33, 34, 35, 36, 37, 38, 39, 40, 49, 50, 51, 52, 53, 54, 55, 56
z_ypos_mul1b: dw 9, 10, 11, 12, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32
dw 41, 42, 43, 44, 45, 46, 47, 48, 57, 58, 59, 60, 61, 62, 63, 64
z_ypos_mul2a: dw 1*512, 2*512, 3*512, 4*512, 5*512, 6*512, 7*512, 8*512
dw 17*512, 18*512, 19*512, 20*512, 21*512, 22*512, 23*512, 24*512
dw 33*512, 34*512, 35*512, 36*512, 37*512, 38*512, 39*512, 40*512
dw 49*512, 50*512, 51*512, 52*512, 53*512, 54*512, 55*512, 56*512
z_ypos_mul2b: dw 9*512, 10*512, 11*512, 12*512, 13*512, 14*512, 15*512, 16*512
dw 25*512, 26*512, 27*512, 28*512, 29*512, 30*512, 31*512, 32*512
dw 41*512, 42*512, 43*512, 44*512, 45*512, 46*512, 47*512, 48*512
dw 57*512, 58*512, 59*512, 60*512, 61*512, 62*512, 63*512, 64*512
z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
z3_upsample: db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
db 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8
z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
db 39, 39, 47, 47, 47, 79, 79, 79
z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16
db 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16, 0
db 0, 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16
pb_8_56_0_0: db 8, 56, 0, 0
pb_m4_36: times 2 db -4, 36
pb_127_m127: times 2 db 127, -127
pb_8: times 4 db 8
pb_15: times 4 db 15
pb_16: times 4 db 16
pb_31: times 4 db 31
pb_63: times 4 db 63
pb_90: times 4 db 90
pb_128: times 4 db 128
pw_128: times 2 dw 128
pw_255: times 2 dw 255
pw_512: times 2 dw 512
%define pb_1 (ipred_h_shuf+24)
%define pb_2 (ipred_h_shuf+20)
%define pb_3 (ipred_h_shuf+16)
%define pb_4 (smooth_shuf +48)
%define pb_7 (ipred_h_shuf+ 0)
%define pb_9 (z_xpos_bc + 8)
%define pb_17 (z_xpos_bc + 0)
%define pb_33 (z_xpos_bc + 4)
%define pd_8 (filter_taps+128)
%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
%xdefine %%base mangle(private_prefix %+ _%1_%2)
%%table:
%rep %0 - 2
dd %%base %+ .%3 - (%%table - 2*4)
%rotate 1
%endrep
%endmacro
%define ipred_dc_splat_8bpc_avx512icl_table (ipred_dc_8bpc_avx512icl_table + 10*4)
JMP_TABLE ipred_h_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64
cextern dr_intra_derivative
cextern pb_0to63
SECTION .text
INIT_ZMM avx512icl
cglobal ipred_dc_top_8bpc, 3, 7, 5, dst, stride, tl, w, h
lea r5, [ipred_dc_left_8bpc_avx512icl_table]
movd xm0, wm
tzcnt wd, wm
inc tlq
movifnidn hd, hm
movu ym1, [tlq]
movd xmm3, wd
movsxd r6, [r5+wq*4]
vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1]
psrld xm0, 1
vpdpbusd ym0, ym1, ym2
add r6, r5
add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table
movsxd wq, [r5+wq*4]
add wq, r5
jmp r6
cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
lea r5, [ipred_dc_left_8bpc_avx512icl_table]
mov hd, hm
tzcnt r6d, hd
sub tlq, hq
tzcnt wd, wm
movd xm0, hm
movu ym1, [tlq]
movd xmm3, r6d
movsxd r6, [r5+r6*4]
vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1]
psrld xm0, 1
vpdpbusd ym0, ym1, ym2
add r6, r5
add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table
movsxd wq, [r5+wq*4]
add wq, r5
jmp r6
.h64:
movu ym1, [tlq+32] ; unaligned when jumping here from dc_top
vpdpbusd ym0, ym1, ym2
.h32:
vextracti32x4 xm1, ym0, 1
paddd xm0, xm1
.h16:
punpckhqdq xm1, xm0, xm0
paddd xm0, xm1
.h8:
psrlq xm1, xm0, 32
paddd xm0, xm1
.h4:
vpsrlvd xm0, xmm3
lea stride3q, [strideq*3]
vpbroadcastb m0, xm0
jmp wq
cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
movifnidn hd, hm
movifnidn wd, wm
tzcnt r6d, hd
lea r5d, [wq+hq]
movd xm0, r5d
tzcnt r5d, r5d
movd xmm4, r5d
lea r5, [ipred_dc_8bpc_avx512icl_table]
tzcnt wd, wd
movsxd r6, [r5+r6*4]
movsxd wq, [r5+wq*4+5*4]
vpbroadcastd ym3, [r5-ipred_dc_8bpc_avx512icl_table+pb_1]
psrld xm0, 1
add r6, r5
add wq, r5
lea stride3q, [strideq*3]
jmp r6
.h4:
movd xmm1, [tlq-4]
vpdpbusd xm0, xmm1, xm3
jmp wq
.w4:
movd xmm1, [tlq+1]
vpdpbusd xm0, xmm1, xm3
cmp hd, 4
jg .w4_mul
psrlw xmm0, xm0, 3
jmp .w4_end
.w4_mul:
punpckhqdq xmm1, xm0, xm0
lea r2d, [hq*2]
mov r6d, 0x55563334
paddd xmm1, xm0
shrx r6d, r6d, r2d
psrlq xmm0, xmm1, 32
paddd xmm0, xmm1
movd xmm1, r6d
psrld xmm0, 2
pmulhuw xmm0, xmm1
.w4_end:
vpbroadcastb xm0, xmm0
.s4:
movd [dstq+strideq*0], xm0
movd [dstq+strideq*1], xm0
movd [dstq+strideq*2], xm0
movd [dstq+stride3q ], xm0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s4
RET
.h8:
movq xmm1, [tlq-8]
vpdpbusd xm0, xmm1, xm3
jmp wq
.w8:
movq xmm1, [tlq+1]
vextracti32x4 xm2, ym0, 1
vpdpbusd xm0, xmm1, xm3
paddd xmm2, xm2, xm0
punpckhqdq xmm0, xmm2, xmm2
paddd xmm0, xmm2
psrlq xmm1, xmm0, 32
paddd xmm0, xmm1
vpsrlvd xmm0, xmm4
cmp hd, 8
je .w8_end
mov r6d, 0x5556
mov r2d, 0x3334
cmp hd, 32
cmove r6d, r2d
movd xmm1, r6d
pmulhuw xmm0, xmm1
.w8_end:
vpbroadcastb xm0, xmm0
.s8:
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm0
movq [dstq+strideq*2], xm0
movq [dstq+stride3q ], xm0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s8
RET
.h16:
mova xmm1, [tlq-16]
vpdpbusd xm0, xmm1, xm3
jmp wq
.w16:
movu xmm1, [tlq+1]
vextracti32x4 xm2, ym0, 1
vpdpbusd xm0, xmm1, xm3
paddd xmm2, xm2, xm0
punpckhqdq xmm0, xmm2, xmm2
paddd xmm0, xmm2
psrlq xmm1, xmm0, 32
paddd xmm0, xmm1
vpsrlvd xmm0, xmm4
cmp hd, 16
je .w16_end
mov r6d, 0x5556
mov r2d, 0x3334
test hb, 8|32
cmovz r6d, r2d
movd xmm1, r6d
pmulhuw xmm0, xmm1
.w16_end:
vpbroadcastb xm0, xmm0
.s16:
mova [dstq+strideq*0], xm0
mova [dstq+strideq*1], xm0
mova [dstq+strideq*2], xm0
mova [dstq+stride3q ], xm0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s16
RET
.h32:
mova ym1, [tlq-32]
vpdpbusd ym0, ym1, ym3
jmp wq
.w32:
movu ym1, [tlq+1]
vpdpbusd ym0, ym1, ym3
vextracti32x4 xm1, ym0, 1
paddd xmm1, xm1, xm0
punpckhqdq xmm0, xmm1, xmm1
paddd xmm0, xmm1
psrlq xmm1, xmm0, 32
paddd xmm0, xmm1
vpsrlvd xmm0, xmm4
cmp hd, 32
je .w32_end
lea r2d, [hq*2]
mov r6d, 0x33345556
shrx r6d, r6d, r2d
movd xmm1, r6d
pmulhuw xmm0, xmm1
.w32_end:
vpbroadcastb ym0, xmm0
.s32:
mova [dstq+strideq*0], ym0
mova [dstq+strideq*1], ym0
mova [dstq+strideq*2], ym0
mova [dstq+stride3q ], ym0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s32
RET
.h64:
mova ym1, [tlq-64]
mova ym2, [tlq-32]
vpdpbusd ym0, ym1, ym3
vpdpbusd ym0, ym2, ym3
jmp wq
.w64:
movu ym1, [tlq+ 1]
movu ym2, [tlq+33]
vpdpbusd ym0, ym1, ym3
vpdpbusd ym0, ym2, ym3
vextracti32x4 xm1, ym0, 1
paddd xmm1, xm1, xm0
punpckhqdq xmm0, xmm1, xmm1
paddd xmm0, xmm1
psrlq xmm1, xmm0, 32
paddd xmm0, xmm1
vpsrlvd xmm0, xmm4
cmp hd, 64
je .w64_end
mov r6d, 0x33345556
shrx r6d, r6d, hd
movd xmm1, r6d
pmulhuw xmm0, xmm1
.w64_end:
vpbroadcastb m0, xmm0
.s64:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s64
RET
cglobal ipred_dc_128_8bpc, 2, 7, 5, dst, stride, tl, w, h, stride3
lea r5, [ipred_dc_splat_8bpc_avx512icl_table]
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, [r5+wq*4]
vpbroadcastd m0, [r5-ipred_dc_splat_8bpc_avx512icl_table+pb_128]
add wq, r5
lea stride3q, [strideq*3]
jmp wq
cglobal ipred_v_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
lea r5, [ipred_dc_splat_8bpc_avx512icl_table]
tzcnt wd, wm
movu m0, [tlq+1]
movifnidn hd, hm
movsxd wq, [r5+wq*4]
add wq, r5
lea stride3q, [strideq*3]
jmp wq
cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, stride3
%define base r6-ipred_h_8bpc_avx512icl_table
lea r6, [ipred_h_8bpc_avx512icl_table]
tzcnt wd, wm
mov hd, hm
movsxd wq, [r6+wq*4]
lea stride3q, [strideq*3]
sub tlq, hq
add wq, r6
jmp wq
.w4:
mova xmm1, [base+ipred_h_shuf+16]
.w4_loop:
movd xmm0, [tlq+hq-4]
pshufb xmm0, xmm1
movd [dstq+strideq*0], xmm0
pextrd [dstq+strideq*1], xmm0, 1
pextrd [dstq+strideq*2], xmm0, 2
pextrd [dstq+stride3q ], xmm0, 3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4_loop
RET
.w8:
movsldup xmm2, [base+ipred_h_shuf+16]
movshdup xmm3, [base+ipred_h_shuf+16]
.w8_loop:
movd xmm1, [tlq+hq-4]
pshufb xmm0, xmm1, xmm2
pshufb xmm1, xmm3
movq [dstq+strideq*0], xmm0
movq [dstq+strideq*1], xmm1
movhps [dstq+strideq*2], xmm0
movhps [dstq+stride3q ], xmm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_loop
RET
.w16:
movsldup m1, [base+smooth_shuf]
.w16_loop:
vpbroadcastd m0, [tlq+hq-4]
pshufb m0, m1
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], m0, 2
vextracti32x4 [dstq+strideq*2], ym0, 1
vextracti32x4 [dstq+stride3q ], m0, 3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w16
RET
.w32:
vpbroadcastd ym3, [base+pb_1]
vpord m2, m3, [base+pb_2] {1to16}
.w32_loop:
vpbroadcastd m1, [tlq+hq-4]
pshufb m0, m1, m2
pshufb m1, m3
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], ym1
vextracti32x8 [dstq+stride3q ], m1, 1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w32_loop
RET
.w64:
vpbroadcastd m4, [base+pb_3]
vpbroadcastd m5, [base+pb_2]
vpbroadcastd m6, [base+pb_1]
pxor m7, m7
.w64_loop:
vpbroadcastd m3, [tlq+hq-4]
pshufb m0, m3, m4
pshufb m1, m3, m5
pshufb m2, m3, m6
pshufb m3, m7
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [dstq+strideq*2], m2
mova [dstq+stride3q ], m3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w64_loop
RET
%macro PAETH 0
psubusb m1, m5, m4
psubusb m0, m4, m5
por m1, m0 ; tdiff
pavgb m2, m6, m4
vpcmpub k1, m1, m7, 1 ; tdiff < ldiff
vpblendmb m0{k1}, m4, m6
vpternlogd m4, m6, m8, 0x28 ; (m4 ^ m6) & m8
psubusb m3, m5, m2
psubb m2, m4
psubusb m2, m5
por m2, m3
pminub m1, m7
paddusb m2, m2
por m2, m4 ; min(tldiff, 255)
vpcmpub k1, m2, m1, 1 ; tldiff < ldiff && tldiff < tdiff
vmovdqu8 m0{k1}, m5
%endmacro
cglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w, h, top, stride3
lea r6, [ipred_paeth_8bpc_avx512icl_table]
tzcnt wd, wm
vpbroadcastb m5, [tlq] ; topleft
mov hd, hm
movsxd wq, [r6+wq*4]
vpbroadcastd m8, [r6-ipred_paeth_8bpc_avx512icl_table+pb_1]
lea topq, [tlq+1]
sub tlq, hq
add wq, r6
lea stride3q, [strideq*3]
jmp wq
INIT_YMM avx512icl
.w4:
vpbroadcastd m6, [topq]
mova m9, [ipred_h_shuf]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0 ; ldiff
.w4_loop:
vpbroadcastq m4, [tlq+hq-8]
pshufb m4, m9 ; left
PAETH
movd [dstq+strideq*0], xm0
pextrd [dstq+strideq*1], xm0, 1
pextrd [dstq+strideq*2], xm0, 2
pextrd [dstq+stride3q ], xm0, 3
sub hd, 8
jl .w4_ret
vextracti32x4 xm0, m0, 1
lea dstq, [dstq+strideq*4]
movd [dstq+strideq*0], xm0
pextrd [dstq+strideq*1], xm0, 1
pextrd [dstq+strideq*2], xm0, 2
pextrd [dstq+stride3q ], xm0, 3
lea dstq, [dstq+strideq*4]
jg .w4_loop
.w4_ret:
RET
INIT_ZMM avx512icl
.w8:
vpbroadcastq m6, [topq]
movsldup m9, [smooth_shuf]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
.w8_loop:
vpbroadcastq m4, [tlq+hq-8]
pshufb m4, m9
PAETH
vextracti32x4 xm1, m0, 2
vextracti32x4 xm2, ym0, 1
vextracti32x4 xm3, m0, 3
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm1
movq [dstq+strideq*2], xm2
movq [dstq+stride3q ], xm3
sub hd, 8
jl .w8_ret
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
jg .w8_loop
.w8_ret:
RET
.w16:
vbroadcasti32x4 m6, [topq]
movsldup m9, [smooth_shuf]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
.w16_loop:
vpbroadcastd m4, [tlq+hq-4]
pshufb m4, m9
PAETH
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], m0, 2
vextracti32x4 [dstq+strideq*2], ym0, 1
vextracti32x4 [dstq+stride3q ], m0, 3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w16_loop
RET
.w32:
vbroadcasti32x8 m6, [topq]
mova ym9, ym8
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
.w32_loop:
vpbroadcastd m4, [tlq+hq-2]
pshufb m4, m9
PAETH
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w32_loop
RET
.w64:
movu m6, [topq]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
.w64_loop:
vpbroadcastb m4, [tlq+hq-1]
PAETH
mova [dstq], m0
add dstq, strideq
dec hd
jg .w64_loop
RET
cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
%define base r6-ipred_smooth_v_8bpc_avx512icl_table
lea r6, [ipred_smooth_v_8bpc_avx512icl_table]
tzcnt wd, wm
mov hd, hm
movsxd wq, [r6+wq*4]
vpbroadcastd m0, [base+pb_127_m127]
vpbroadcastd m1, [base+pw_128]
lea weightsq, [base+smooth_weights+hq*4]
neg hq
vpbroadcastb m4, [tlq+hq] ; bottom
add wq, r6
lea stride3q, [strideq*3]
jmp wq
.w4:
vpbroadcastd m2, [tlq+1]
movshdup m5, [smooth_shuf]
mova ym6, [smooth_endA]
punpcklbw m2, m4 ; top, bottom
pmaddubsw m3, m2, m0
paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok
paddw m3, m1 ; 128 * top + 129 * bottom + 128
.w4_loop:
vbroadcasti32x4 m0, [weightsq+hq*2]
pshufb m0, m5
pmaddubsw m0, m2, m0
paddw m0, m3
vpermb m0, m6, m0
vextracti32x4 xm1, ym0, 1
movd [dstq+strideq*0], xm0
movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
pextrd [dstq+stride3q ], xm1, 2
add hq, 8
jg .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
pextrd [dstq+stride3q ], xm1, 3
lea dstq, [dstq+strideq*4]
jl .w4_loop
.ret :
RET
.w8:
vpbroadcastq m2, [tlq+1]
movshdup m5, [smooth_shuf]
mova ym6, [smooth_endA]
punpcklbw m2, m4
pmaddubsw m3, m2, m0
paddw m1, m2
paddw m3, m1
.w8_loop:
vpbroadcastq m0, [weightsq+hq*2]
pshufb m0, m5
pmaddubsw m0, m2, m0
paddw m0, m3
vpermb m0, m6, m0
vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
add hq, 4
jl .w8_loop
RET
.w16:
vbroadcasti32x4 m3, [tlq+1]
movshdup m6, [smooth_shuf]
mova m7, [smooth_endB]
punpcklbw m2, m3, m4
punpckhbw m3, m4
pmaddubsw m4, m2, m0
pmaddubsw m5, m3, m0
paddw m0, m1, m2
paddw m1, m3
paddw m4, m0
paddw m5, m1
.w16_loop:
vpbroadcastq m1, [weightsq+hq*2]
pshufb m1, m6
pmaddubsw m0, m2, m1
pmaddubsw m1, m3, m1
paddw m0, m4
paddw m1, m5
vpermt2b m0, m7, m1
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], m0, 2
vextracti32x4 [dstq+strideq*2], ym0, 1
vextracti32x4 [dstq+stride3q ], m0, 3
lea dstq, [dstq+strideq*4]
add hq, 4
jl .w16_loop
RET
.w32:
vbroadcasti32x8 m3, [tlq+1]
movshdup m6, [smooth_shuf]
mova m7, [smooth_endB]
punpcklbw m2, m3, m4
punpckhbw m3, m4
pmaddubsw m4, m2, m0
pmaddubsw m5, m3, m0
paddw m0, m1, m2
paddw m1, m3
paddw m4, m0
paddw m5, m1
.w32_loop:
vpbroadcastd m1, [weightsq+hq*2]
pshufb m1, m6
pmaddubsw m0, m2, m1
pmaddubsw m1, m3, m1
paddw m0, m4
paddw m1, m5
vpermt2b m0, m7, m1
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
add hq, 2
jl .w32_loop
RET
.w64:
movu m3, [tlq+1]
mova m6, [smooth_endB]
punpcklbw m2, m3, m4
punpckhbw m3, m4
pmaddubsw m4, m2, m0
pmaddubsw m5, m3, m0
paddw m0, m1, m2
paddw m1, m3
paddw m4, m0
paddw m5, m1
.w64_loop:
vpbroadcastw m1, [weightsq+hq*2]
pmaddubsw m0, m2, m1
pmaddubsw m1, m3, m1
paddw m0, m4
paddw m1, m5
vpermt2b m0, m6, m1
mova [dstq], m0
add dstq, strideq
inc hq
jl .w64_loop
RET
cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3
%define base r5-ipred_smooth_h_8bpc_avx512icl_table
lea r5, [ipred_smooth_h_8bpc_avx512icl_table]
mov r6d, wd
tzcnt wd, wd
vpbroadcastb m4, [tlq+r6] ; right
mov hd, hm
movsxd wq, [r5+wq*4]
vpbroadcastd m5, [base+pb_127_m127]
vpbroadcastd m6, [base+pw_128]
sub tlq, hq
add wq, r5
vpmovb2m k1, m6
lea stride3q, [strideq*3]
jmp wq
.w4:
movsldup m3, [smooth_shuf]
vpbroadcastq m7, [smooth_weights+4*2]
mova ym8, [smooth_endA]
.w4_loop:
vpbroadcastq m0, [tlq+hq-8]
mova m2, m4
vpshufb m2{k1}, m0, m3 ; left, right
pmaddubsw m0, m2, m5
pmaddubsw m1, m2, m7
paddw m2, m6
paddw m0, m2
paddw m0, m1
vpermb m0, m8, m0
vextracti32x4 xm1, ym0, 1
movd [dstq+strideq*0], xm0
movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
pextrd [dstq+stride3q ], xm1, 2
sub hd, 8
jl .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
pextrd [dstq+stride3q ], xm1, 3
lea dstq, [dstq+strideq*4]
jg .w4_loop
.ret :
RET
.w8:
movsldup m3, [smooth_shuf]
vbroadcasti32x4 m7, [smooth_weights+8*2]
mova ym8, [smooth_endA]
.w8_loop:
vpbroadcastd m0, [tlq+hq-4]
mova m2, m4
vpshufb m2{k1}, m0, m3
pmaddubsw m0, m2, m5
pmaddubsw m1, m2, m7
paddw m2, m6
paddw m0, m2
paddw m0, m1
vpermb m0, m8, m0
vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_loop
RET
.w16:
movsldup m7, [smooth_shuf]
vbroadcasti32x4 m8, [smooth_weights+16*2]
vbroadcasti32x4 m9, [smooth_weights+16*3]
mova m10, [smooth_endB]
.w16_loop:
vpbroadcastd m0, [tlq+hq-4]
mova m3, m4
vpshufb m3{k1}, m0, m7
pmaddubsw m2, m3, m5
pmaddubsw m0, m3, m8
pmaddubsw m1, m3, m9
paddw m3, m6
paddw m2, m3
paddw m0, m2
paddw m1, m2
vpermt2b m0, m10, m1
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], m0, 2
vextracti32x4 [dstq+strideq*2], ym0, 1
vextracti32x4 [dstq+stride3q ], m0, 3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w16_loop
RET
.w32:
mova m10, [smooth_endA]
vpbroadcastd ym7, [pb_1]
vbroadcasti32x8 m8, [smooth_weights+32*2]
vbroadcasti32x8 m9, [smooth_weights+32*3]
vshufi32x4 m10, m10, q3120
.w32_loop:
vpbroadcastd m0, [tlq+hq-2]
mova m3, m4
vpshufb m3{k1}, m0, m7
pmaddubsw m2, m3, m5
pmaddubsw m0, m3, m8
pmaddubsw m1, m3, m9
paddw m3, m6
paddw m2, m3
paddw m0, m2
paddw m1, m2
vpermt2b m0, m10, m1
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w32_loop
RET
.w64:
mova m7, [smooth_weights+64*2]
mova m8, [smooth_weights+64*3]
mova m9, [smooth_endA]
.w64_loop:
mova m3, m4
vpbroadcastb m3{k1}, [tlq+hq-1]
pmaddubsw m2, m3, m5
pmaddubsw m0, m3, m7
pmaddubsw m1, m3, m8
paddw m3, m6
paddw m2, m3
paddw m0, m2
paddw m1, m2
vpermt2b m0, m9, m1
mova [dstq], m0
add dstq, strideq
dec hd
jg .w64_loop
RET
cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
%define base r5-ipred_smooth_8bpc_avx512icl_table
lea r5, [ipred_smooth_8bpc_avx512icl_table]
mov r6d, wd
tzcnt wd, wd
mov hd, hm
vpbroadcastb m6, [tlq+r6] ; right
sub tlq, hq
movsxd wq, [r5+wq*4]
vpbroadcastd m7, [base+pb_127_m127]
vpbroadcastb m0, [tlq] ; bottom
vpbroadcastd m1, [base+pw_255]
add wq, r5
lea v_weightsq, [base+smooth_weights+hq*2]
vpmovb2m k1, m1
lea stride3q, [strideq*3]
jmp wq
.w4:
vpbroadcastd m8, [tlq+hq+1]
movsldup m4, [smooth_shuf]
movshdup m5, [smooth_shuf]
vpbroadcastq m9, [smooth_weights+4*2]
mova ym11, [smooth_endA]
punpcklbw m8, m0 ; top, bottom
pmaddubsw m10, m8, m7
paddw m1, m8 ; 1 * top + 256 * bottom + 255
paddw m10, m1 ; 128 * top + 129 * bottom + 255
.w4_loop:
vpbroadcastq m1, [tlq+hq-8]
vbroadcasti32x4 m0, [v_weightsq]
add v_weightsq, 16
mova m2, m6
vpshufb m2{k1}, m1, m4 ; left, right
pmaddubsw m1, m2, m7 ; 127 * left - 127 * right
pshufb m0, m5
pmaddubsw m0, m8, m0
paddw m1, m2 ; 128 * left + 129 * right
pmaddubsw m2, m9
paddw m0, m10
paddw m1, m2
pavgw m0, m1
vpermb m0, m11, m0
vextracti32x4 xm1, ym0, 1
movd [dstq+strideq*0], xm0
movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
pextrd [dstq+stride3q ], xm1, 2
sub hd, 8
jl .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
pextrd [dstq+stride3q ], xm1, 3
lea dstq, [dstq+strideq*4]
jg .w4_loop
.ret :
RET
.w8:
vpbroadcastq m8, [tlq+hq+1]
movsldup m4, [smooth_shuf]
movshdup m5, [smooth_shuf]
vbroadcasti32x4 m9, [smooth_weights+8*2]
mova ym11, [smooth_endA]
punpcklbw m8, m0
pmaddubsw m10, m8, m7
paddw m1, m8
paddw m10, m1
.w8_loop:
vpbroadcastd m1, [tlq+hq-4]
vpbroadcastq m0, [v_weightsq]
add v_weightsq, 8
mova m2, m6
vpshufb m2{k1}, m1, m4
pmaddubsw m1, m2, m7
pshufb m0, m5
pmaddubsw m0, m8, m0
paddw m1, m2
pmaddubsw m2, m9
paddw m0, m10
paddw m1, m2
pavgw m0, m1
vpermb m0, m11, m0
vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_loop
RET
.w16:
vbroadcasti32x4 m9, [tlq+hq+1]
movsldup m5, [smooth_shuf]
movshdup m10, [smooth_shuf]
vbroadcasti32x4 m11, [smooth_weights+16*2]
vbroadcasti32x4 m12, [smooth_weights+16*3]
mova m15, [smooth_endB]
punpcklbw m8, m9, m0
punpckhbw m9, m0
pmaddubsw m13, m8, m7
pmaddubsw m14, m9, m7
paddw m0, m1, m8
paddw m1, m9
paddw m13, m0
paddw m14, m1
.w16_loop:
vpbroadcastd m0, [tlq+hq-4]
vpbroadcastq m1, [v_weightsq]
add v_weightsq, 8
mova m4, m6
vpshufb m4{k1}, m0, m5
pmaddubsw m2, m4, m7
pshufb m1, m10
pmaddubsw m0, m8, m1
pmaddubsw m1, m9, m1
paddw m2, m4
pmaddubsw m3, m4, m11
pmaddubsw m4, m12
paddw m0, m13
paddw m1, m14
paddw m3, m2
paddw m4, m2
pavgw m0, m3
pavgw m1, m4
vpermt2b m0, m15, m1
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], m0, 2
vextracti32x4 [dstq+strideq*2], ym0, 1
vextracti32x4 [dstq+stride3q ], m0, 3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w16_loop
RET
.w32:
vbroadcasti32x8 m9, [tlq+hq+1]
movshdup m10, [smooth_shuf]
mova m12, [smooth_weights+32*2]
vpbroadcastd ym5, [pb_1]
mova m15, [smooth_endB]
punpcklbw m8, m9, m0
punpckhbw m9, m0
pmaddubsw m13, m8, m7
pmaddubsw m14, m9, m7
vshufi32x4 m11, m12, m12, q2020
vshufi32x4 m12, m12, q3131
paddw m0, m1, m8
paddw m1, m9
paddw m13, m0
paddw m14, m1
.w32_loop:
vpbroadcastd m0, [tlq+hq-2]
vpbroadcastd m1, [v_weightsq]
add v_weightsq, 4
mova m4, m6
vpshufb m4{k1}, m0, m5
pmaddubsw m2, m4, m7
pshufb m1, m10
pmaddubsw m0, m8, m1
pmaddubsw m1, m9, m1
paddw m2, m4
pmaddubsw m3, m4, m11
pmaddubsw m4, m12
paddw m0, m13
paddw m1, m14
paddw m3, m2
paddw m4, m2
pavgw m0, m3
pavgw m1, m4
vpermt2b m0, m15, m1
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w32_loop
RET
.w64:
movu m9, [tlq+hq+1]
mova m11, [smooth_weights+64*2]
mova m2, [smooth_weights+64*3]
mova m14, [smooth_endB]
punpcklbw m8, m9, m0
punpckhbw m9, m0
pmaddubsw m12, m8, m7
pmaddubsw m13, m9, m7
vshufi32x4 m10, m11, m2, q2020
vshufi32x4 m11, m2, q3131
paddw m0, m1, m8
paddw m1, m9
paddw m12, m0
paddw m13, m1
.w64_loop:
mova m4, m6
vpbroadcastb m4{k1}, [tlq+hq-1]
vpbroadcastw m1, [v_weightsq]
add v_weightsq, 2
pmaddubsw m2, m4, m7
pmaddubsw m0, m8, m1
pmaddubsw m1, m9, m1
paddw m2, m4
pmaddubsw m3, m4, m10
pmaddubsw m4, m11
paddw m0, m12
paddw m1, m13
paddw m3, m2
paddw m4, m2
pavgw m0, m3
pavgw m1, m4
vpermt2b m0, m14, m1
mova [dstq], m0
add dstq, strideq
dec hd
jg .w64_loop
RET
cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3
movifnidn wd, wm
movifnidn hd, hm
lea stride3q, [strideq*3]
cmp wd, 8
jg .w32
movq xmm3, [palq]
je .w8
.w4:
movq xmm0, [idxq]
add idxq, 8
psrlw xmm1, xmm0, 4
punpcklbw xmm0, xmm1
pshufb xmm0, xmm3, xmm0
movd [dstq+strideq*0], xmm0
pextrd [dstq+strideq*1], xmm0, 1
pextrd [dstq+strideq*2], xmm0, 2
pextrd [dstq+stride3q ], xmm0, 3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4
RET
.w8:
movu xmm2, [idxq]
add idxq, 16
pshufb xmm1, xmm3, xmm2
psrlw xmm2, 4
pshufb xmm2, xmm3, xmm2
punpcklbw xmm0, xmm1, xmm2
punpckhbw xmm1, xmm2
movq [dstq+strideq*0], xmm0
movhps [dstq+strideq*1], xmm0
movq [dstq+strideq*2], xmm1
movhps [dstq+stride3q ], xmm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8
RET
.w16:
pmovzxdq m0, [idxq]
add idxq, 32
vpmultishiftqb m0, m3, m0
pshufb m0, m5, m0
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
vextracti32x4 [dstq+strideq*2], m0, 2
vextracti32x4 [dstq+stride3q ], m0, 3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w16
RET
.w32:
vpbroadcastq m3, [pal_unpack+0]
vpbroadcastq m5, [palq]
cmp wd, 32
jl .w16
pmovzxbd m2, [pal_perm]
vpbroadcastq m4, [pal_unpack+8]
jg .w64
.w32_loop:
vpermd m1, m2, [idxq]
add idxq, 64
vpmultishiftqb m0, m3, m1
vpmultishiftqb m1, m4, m1
pshufb m0, m5, m0
pshufb m1, m5, m1
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], ym1
vextracti32x8 [dstq+stride3q ], m1, 1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w32_loop
RET
.w64:
vpermd m1, m2, [idxq]
add idxq, 64
vpmultishiftqb m0, m3, m1
vpmultishiftqb m1, m4, m1
pshufb m0, m5, m0
pshufb m1, m5, m1
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w64
RET
%if WIN64
DECLARE_REG_TMP 4
%else
DECLARE_REG_TMP 8
%endif
cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
%define base r7-z_filter_t0
lea r7, [z_filter_t0]
tzcnt wd, wm
movifnidn angled, anglem
lea t0, [dr_intra_derivative]
movsxd wq, [base+ipred_z1_8bpc_avx512icl_table+wq*4]
inc tlq
mov dxd, angled
and dxd, 0x7e
add angled, 165 ; ~90
movzx dxd, word [t0+dxq]
lea wq, [base+ipred_z1_8bpc_avx512icl_table+wq]
movifnidn hd, hm
xor angled, 0x4ff ; d = 90 - angle
mova m14, [base+z_frac_table]
vpbroadcastd m15, [base+pw_512]
jmp wq
.w4:
mova m9, [pb_0to63]
pminud m8, m9, [base+pb_7] {1to16}
vpbroadcastq m7, [tlq]
pshufb m7, m8
cmp angleb, 40
jae .w4_no_upsample
lea r3d, [angleq-1024]
sar r3d, 7
add r3d, hd
jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
pshufb xmm0, xm7, [base+z_filter_s4]
mova xmm1, [tlq-1]
pshufb xmm1, [base+z_xpos_off2a]
vpbroadcastd xmm2, [base+pb_m4_36]
vpbroadcastq m4, [pb_0to63]
pmaddubsw xmm0, xmm2
pmaddubsw xmm1, xmm2
add dxd, dxd
kxnorw k1, k1, k1
paddw xmm0, xmm1
pmulhrsw xm0, xmm0, xm15
packuswb xm0, xm0
punpcklbw ym7{k1}, ym0
jmp .w4_main2
.w4_no_upsample:
test angled, 0x400
jnz .w4_main ; !enable_intra_edge_filter
lea r3d, [hq+3]
vpbroadcastb xm0, r3d
vpbroadcastb xm1, angled
shr angled, 8 ; is_sm << 1
vpcmpeqb k1, xm0, [base+z_filter_wh]
vpcmpgtb k1{k1}, xm1, [base+z_filter_t0+angleq*8]
kmovw r5d, k1
test r5d, r5d
jz .w4_main
vbroadcasti32x4 ym0, [tlq-1]
pshufb ym0, [base+z_filter4_s1]
popcnt r5d, r5d ; filter_strength
pshufb ym1, ym7, [z_filter_s4]
pshufb ym7, [base+z_filter_s3]
vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0]
vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1]
pmaddubsw ym0, ym11
pmaddubsw ym1, ym11
pmaddubsw ym7, ym12
paddw ym0, ym1
paddw ym7, ym0
pmulhrsw ym7, ym15
cmp hd, 4
je .w4_filter_end
vpbroadcastd m8, [base+pb_9]
pminub m8, m9
.w4_filter_end:
paddb m8, m8
vpermb m7, m8, m7
.w4_main:
vpbroadcastq m4, [base+z_xpos_off1a]
.w4_main2:
movsldup m2, [base+z_xpos_mul]
vpbroadcastw m5, dxd
vbroadcasti32x4 m3, [base+z_xpos_bc]
lea r2, [strideq*3]
pmullw m2, m5 ; xpos
psllw m5, 5 ; dx*8
.w4_loop:
psrlw m1, m2, 3
pshufb m0, m2, m3
vpermw m1, m1, m14 ; 64-frac, frac
paddsb m0, m4 ; base, base+1
vpermb m0, m0, m7 ; top[base], top[base+1]
paddsw m2, m5 ; xpos += dx
pmaddubsw m0, m1 ; v
pmulhrsw m0, m15
packuswb m0, m0
vextracti32x4 xm1, ym0, 1
movd [dstq+strideq*0], xm0
pextrd [dstq+strideq*1], xm0, 1
movd [dstq+strideq*2], xm1
pextrd [dstq+r2 ], xm1, 1
sub hd, 8
jl .w4_end
vextracti32x4 xm1, m0, 2 ; top[max_base_x]
lea dstq, [dstq+strideq*4]
vextracti32x4 xm0, m0, 3
movd [dstq+strideq*0], xm1
pextrd [dstq+strideq*1], xm1, 1
movd [dstq+strideq*2], xm0
pextrd [dstq+r2 ], xm0, 1
lea dstq, [dstq+strideq*4]
jg .w4_loop
.w4_end:
RET
.w8_filter:
mova ym0, [base+z_filter_s1]
popcnt r5d, r5d
vbroadcasti32x4 ym1, [base+z_filter_s2]
vbroadcasti32x4 ym3, [base+z_filter_s3]
vbroadcasti32x4 ym4, [base+z_filter_s4]
vpermi2b ym0, ym7, ym2 ; al bl
mova ym5, [base+z_filter_s5]
pshufb ym1, ym7, ym1 ; ah bh
vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0]
pshufb ym3, ym7, ym3 ; cl ch
vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1]
pshufb ym4, ym7, ym4 ; el dl
vpbroadcastd ym13, [base+z_filter_k+(r5-1)*4+12*2]
vpermb ym5, ym5, ym7 ; eh dh
pmaddubsw ym0, ym11
pmaddubsw ym1, ym11
pmaddubsw ym2, ym3, ym12
pmaddubsw ym3, ym13
pmaddubsw ym4, ym11
pmaddubsw ym5, ym11
paddw ym0, ym2
paddw ym1, ym3
paddw ym0, ym4
paddw ym1, ym5
pmulhrsw ym0, ym15
pmulhrsw ym1, ym15
packuswb ym0, ym1
ret
.w8:
lea r3d, [angleq+216]
mov r3b, hb
cmp r3d, 8
ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
lea r3d, [hq-1]
mova xm1, [base+z_filter_s4]
vpbroadcastb xm2, r3d
mova xm7, [tlq-1]
vinserti32x4 ym7, [tlq+7], 1
vbroadcasti32x4 ym0, [base+z_xpos_off1a]
vpbroadcastd ym3, [base+pb_m4_36]
pminub xm2, xm1
pshufb ym0, ym7, ym0
vinserti32x4 ym1, xm2, 1
psrldq ym7, 1
pshufb ym1, ym7, ym1
pmaddubsw ym0, ym3
pmaddubsw ym1, ym3
vbroadcasti32x4 m8, [pb_0to63]
add dxd, dxd
paddw ym0, ym1
pmulhrsw ym0, ym15
packuswb ym0, ym0
punpcklbw ym7, ym0
jmp .w8_main2
.w8_no_upsample:
lea r3d, [hq+7]
mova m9, [pb_0to63]
vpbroadcastb ym0, r3d
and r3d, 7
vbroadcasti32x4 m7, [tlq]
or r3d, 8 ; imin(h+7, 15)
vpbroadcastb m8, r3d
pminub m8, m9
pshufb m7, m8
test angled, 0x400
jnz .w8_main
vpbroadcastb ym1, angled
shr angled, 8
vpcmpeqb k1, ym0, [base+z_filter_wh]
mova xm0, [base+z_filter_t0+angleq*8]
vpcmpgtb k1{k1}, ym1, ym0
kmovd r5d, k1
test r5d, r5d
jz .w8_main
vpbroadcastd ym2, [tlq-4]
call .w8_filter
cmp hd, 8
jle .w8_filter_end
vpbroadcastd m8, [base+pb_17]
add r3d, 2
pminub m8, m9
.w8_filter_end:
vpermb m7, m8, m0
.w8_main:
vbroadcasti32x4 m8, [base+z_xpos_off1a]
.w8_main2:
movsldup m4, [base+z_xpos_mul]
vpbroadcastw m9, dxd
shl r3d, 6
vpbroadcastd m5, [base+z_xpos_bc+8*0]
pmullw m4, m9 ; xpos
vpbroadcastd m6, [base+z_xpos_bc+8*1]
sub r3d, dxd
shl dxd, 3
psllw m9, 5 ; dx*8
lea r2, [strideq*3]
.w8_loop:
psrlw m3, m4, 3
pshufb m0, m4, m5
pshufb m1, m4, m6
vpermw m3, m3, m14
paddsb m0, m8
paddsb m1, m8
vpermb m0, m0, m7
vpermb m1, m1, m7
paddsw m4, m9
punpcklqdq m2, m3, m3
pmaddubsw m0, m2
punpckhqdq m3, m3
pmaddubsw m1, m3
pmulhrsw m0, m15
pmulhrsw m1, m15
packuswb m0, m1
vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [dstq+strideq*2], xm1
movhps [dstq+r2 ], xm1
sub hd, 8
jl .w8_end
vextracti32x8 ym0, m0, 1
lea dstq, [dstq+strideq*4]
vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [dstq+strideq*2], xm1
movhps [dstq+r2 ], xm1
jz .w8_end
lea dstq, [dstq+strideq*4]
sub r3d, dxd
jg .w8_loop
vextracti32x4 xm7, m7, 3
.w8_end_loop:
movq [dstq+strideq*0], xm7
movq [dstq+strideq*1], xm7
movq [dstq+strideq*2], xm7
movq [dstq+r2 ], xm7
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_end_loop
.w8_end:
RET
.w16_filter:
mova m0, [base+z_filter_s1]
popcnt r5d, r5d
vbroadcasti32x4 m1, [base+z_filter_s2]
vbroadcasti32x4 m3, [base+z_filter_s3]
vbroadcasti32x4 m4, [base+z_filter_s4]
vpermi2b m0, m7, m2 ; al bl
mova m5, [base+z_filter_s5]
pshufb m1, m7, m1 ; ah bh
vpbroadcastd m11, [base+z_filter_k+(r5-1)*4+12*0]
pshufb m3, m7, m3 ; cl ch
vpbroadcastd m12, [base+z_filter_k+(r5-1)*4+12*1]
pshufb m4, m7, m4 ; el dl
vpbroadcastd m13, [base+z_filter_k+(r5-1)*4+12*2]
vpermb m5, m5, m7 ; eh dh
pmaddubsw m0, m11
pmaddubsw m1, m11
pmaddubsw m2, m3, m12
pmaddubsw m3, m13
pmaddubsw m4, m11
pmaddubsw m5, m11
paddw m0, m2
paddw m1, m3
paddw m0, m4
paddw m1, m5
pmulhrsw m0, m15
pmulhrsw m1, m15
packuswb m0, m1
ret
.w16:
lea r3d, [hq+15]
mova m9, [pb_0to63]
vpbroadcastb ym0, r3d
and r3d, 15
movu ym7, [tlq]
or r3d, 16 ; imin(h+15, 31)
vpbroadcastb m8, r3d
pminub m8, m9
vpermb m7, m8, m7
test angled, 0x400
jnz .w16_main
vpbroadcastb ym1, angled
shr angled, 8
vpcmpeqb k1, ym0, [base+z_filter_wh]
mova xm0, [base+z_filter_t0+angleq*8]
vpcmpgtb k1{k1}, ym1, ym0
kmovd r5d, k1
test r5d, r5d
jz .w16_main
vpbroadcastd m2, [tlq-4]
call .w16_filter
cmp hd, 16
jle .w16_filter_end
vpbroadcastd m8, [base+pb_33]
add r3d, 2
pminub m8, m9
.w16_filter_end:
vpermb m7, m8, m0
.w16_main:
movshdup m3, [base+z_xpos_mul]
vpbroadcastw m8, dxd
shl r3d, 6
vpbroadcastd m4, [base+z_xpos_bc]
pmullw m3, m8 ; xpos
vbroadcasti32x4 m5, [base+z_xpos_off1a]
sub r3d, dxd
shl dxd, 2
vbroadcasti32x4 m6, [base+z_xpos_off1b]
psllw m8, 4 ; dx*4
lea r2, [strideq*3]
.w16_loop:
pshufb m1, m3, m4
psrlw m2, m3, 3
paddsb m0, m1, m5
vpermw m2, m2, m14
paddsb m1, m6
vpermb m0, m0, m7
vpermb m1, m1, m7
paddsw m3, m8
pmaddubsw m0, m2
pmaddubsw m1, m2
pmulhrsw m0, m15
pmulhrsw m1, m15
packuswb m0, m1
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
vextracti32x4 [dstq+strideq*2], m0, 2
vextracti32x4 [dstq+r2 ], m0, 3
sub hd, 4
jz .w16_end
lea dstq, [dstq+strideq*4]
sub r3d, dxd
jg .w16_loop
vextracti32x4 xm7, m7, 3
.w16_end_loop:
mova [dstq+strideq*0], xm7
mova [dstq+strideq*1], xm7
mova [dstq+strideq*2], xm7
mova [dstq+r2 ], xm7
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w16_end_loop
.w16_end:
RET
.w32_filter:
mova m0, [base+z_filter_s1]
vbroadcasti32x4 m1, [base+z_filter_s2]
vbroadcasti32x4 m3, [base+z_filter_s3]
vbroadcasti32x4 m4, [base+z_filter_s4]
vpermi2b m0, m7, m2 ; al bl
mova m5, [base+z_filter_s5]
pshufb m1, m7, m1 ; ah bh
vpbroadcastd m11, [base+z_filter_k+4*2+12*0]
pshufb m3, m7, m3 ; cl ch
vpbroadcastd m12, [base+z_filter_k+4*2+12*1]
pshufb m4, m7, m4 ; el dl
vpbroadcastd m13, [base+z_filter_k+4*2+12*2]
vpermi2b m5, m7, m8 ; eh dh
pmaddubsw m0, m11
pmaddubsw m1, m11
pmaddubsw m2, m3, m12
pmaddubsw m3, m13
pmaddubsw m4, m11
pmaddubsw m5, m11
paddw m0, m2
paddw m1, m3
paddw m0, m4
paddw m1, m5
pmulhrsw m0, m15
pmulhrsw m1, m15
packuswb m7, m0, m1
ret
.w32:
lea r3d, [hq+31]
vpbroadcastb m9, r3d
and r3d, 31
pminub m10, m9, [pb_0to63]
or r3d, 32 ; imin(h+31, 63)
vpermb m7, m10, [tlq]
vpbroadcastb m8, [tlq+r3]
test angled, 0x400 ; !enable_intra_edge_filter
jnz .w32_main
vpbroadcastd m2, [tlq-4]
call .w32_filter
cmp hd, 64
je .w32_h64_filter_end
vpermb m8, m9, m7
vpermb m7, m10, m7
jmp .w32_main
.w32_h64_filter_end: ; edge case for 32x64
movd xmm0, [tlq+r3-1]
movd xmm1, [base+pb_8_56_0_0]
add r3d, 2
pmaddubsw xmm0, xmm1
vptestmw k1, xmm1, xmm1 ; 0x01
pmulhrsw xm0, xmm0, xm15
vmovdqu8 m8{k1}, m0
.w32_main:
rorx r2d, dxd, 30
vpbroadcastd m4, [base+z_xpos_bc]
vpbroadcastw m3, r2d
vbroadcasti32x8 m5, [base+z_xpos_off2a]
shl r3d, 6
vbroadcasti32x8 m6, [base+z_xpos_off2b]
sub r3d, dxd
paddw m9, m3, m3
add dxd, dxd
vinserti32x8 m3, ym9, 1
.w32_loop:
pshufb m1, m3, m4
psrlw m2, m3, 3
paddsb m0, m1, m5
vpermw m2, m2, m14
paddsb m1, m6
vpermi2b m0, m7, m8
vpermi2b m1, m7, m8
paddsw m3, m9
pmaddubsw m0, m2
pmaddubsw m1, m2
pmulhrsw m0, m15
pmulhrsw m1, m15
packuswb m0, m1
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
sub hd, 2
jz .w32_end
lea dstq, [dstq+strideq*2]
sub r3d, dxd
jg .w32_loop
punpckhqdq ym8, ym8
.w32_end_loop:
mova [dstq+strideq*0], ym8
mova [dstq+strideq*1], ym8
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w32_end_loop
.w32_end:
RET
.w64_filter:
vbroadcasti32x4 m3, [base+z_filter_s2]
mova m1, [base+z_filter_s1]
pshufb m0, m3 ; al bl
vpermi2b m1, m7, m2
vbroadcasti32x4 m4, [base+z_filter_s4]
pshufb m6, m8, m4 ; el dl
pshufb m9, m7, m4
pminub m10, m13, [base+z_filter_s5]
pshufb m2, m8, m3 ; ah bh
pshufb m3, m7, m3
vbroadcasti32x4 m5, [base+z_filter_s3]
vpermb m10, m10, m8 ; eh dh
pshufb m11, m4
vpbroadcastd m4, [base+z_filter_k+4*2+12*0]
pshufb m8, m5 ; cl ch
pshufb m7, m5
vpbroadcastd m5, [base+z_filter_k+4*2+12*1]
REPX {pmaddubsw x, m4}, m0, m1, m6, m9, m2, m3, m10, m11
pmaddubsw m4, m8, m5
pmaddubsw m5, m7, m5
paddw m0, m6
vpbroadcastd m6, [base+z_filter_k+4*2+12*2]
paddw m1, m9
pmaddubsw m7, m6
pmaddubsw m8, m6
paddw m2, m10
paddw m3, m11
paddw m0, m4
paddw m1, m5
paddw m2, m8
paddw m3, m7
REPX {pmulhrsw x, m15}, m0, m2, m1, m3
packuswb m0, m2
packuswb m7, m1, m3
vpermb m8, m12, m0
ret
.w64:
lea r3d, [hq-1]
movu m7, [tlq+64*0]
vpbroadcastb m13, r3d
pminub m12, m13, [pb_0to63]
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5 C=96 H=68 G=83