; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION_RODATA 16
; Emit one signed byte pair (w-128, 127-w) per argument. Splitting each
; weight this way keeps both multiplicands in signed byte range so a single
; pmaddubsw can evaluate w*a + (256-w)*b (see identity in the SMOOTH macro).
%macro SMOOTH_WEIGHT_TABLE 1-*
%rep %0
db %1-128, 127-%1
%rotate 1
%endrep
%endmacro
; sm_weights[], but modified to precalculate x and 256-x with offsets to
; enable efficient use of pmaddubsw (which requires signed values)
smooth_weights: SMOOTH_WEIGHT_TABLE \
0, 0, 255, 128, 255, 149, 85, 64, \
255, 197, 146, 105, 73, 50, 37, 32, \
255, 225, 196, 170, 145, 123, 102, 84, \
68, 54, 43, 33, 26, 20, 17, 16, \
255, 240, 225, 210, 196, 182, 169, 157, \
145, 133, 122, 111, 101, 92, 83, 74, \
66, 59, 52, 45, 39, 34, 29, 25, \
21, 17, 14, 12, 10, 9, 8, 8, \
255, 248, 240, 233, 225, 218, 210, 203, \
196, 189, 182, 176, 169, 163, 156, 150, \
144, 138, 133, 127, 121, 116, 111, 106, \
101, 96, 91, 86, 82, 77, 73, 69, \
65, 61, 57, 54, 50, 47, 44, 41, \
38, 35, 32, 29, 27, 25, 22, 20, \
18, 16, 15, 13, 12, 10, 9, 8, \
7, 6, 6, 5, 5, 4, 4, 4
; pshufb control vectors (byte indices; -1 zeroes the destination byte)
ipred_v_shuf: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7
ipred_h_shuf: db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
ipred_paeth_shuf: db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8
z_transpose4: db 8, 12, 0, 4, 9, 13, 1, 5, 10, 14, 2, 6, 11, 15, 3, 7
z3_shuf: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
z3_shuf_h4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8
filter_shuf1: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1
filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1
; thresholds consulted by the z1/z3 edge-filter strength selection
z_filter_wh4: db 7, 7, 19, 7,
z_filter_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39
pd_32768: dd 32768
z3_filter_k_tail: db 64, 0, 64, 0, 64, 0, 56, 8
z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
; per-lane x-position increments, pre-scaled by the 6-bit fractional step
z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
z3_base_inc: dw 7*64, 6*64, 5*64, 4*64, 3*64, 2*64, 1*64, 0*64
z_filter_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1
z_filter_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15
db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3
z_filter_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0
z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
db 7, 8, 8, 9, 9, 10, 10, 11
z_filter_k_tail: db 0, 64, 0, 64, 8, 56, 0, 64
z2_h_shuf: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11
z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8
z2_dy_offset: dw 88*64, 88*64, 87*64, 87*64
pw_m1to4: dw -1, -2, -3, -4
; edge-filter kernel coefficient pairs, indexed by filter strength
z_filter_k: times 4 db 0, 16
times 4 db 0, 20
times 4 db 8, 16
times 4 db 32, 16
times 4 db 24, 20
times 4 db 16, 16
times 4 db 0, 0
times 4 db 0, 0
; broadcast constants (pw_* = packed words, pb_* = packed bytes,
; pw_8 spelled as byte pairs 8,0 = little-endian words of value 8)
pw_8: times 8 db 8, 0
pb_3: times 16 db 3
pb_16: times 16 db 16
pw_62: times 8 dw 62
pw_64: times 8 dw 64
pw_256: times 8 dw 256
pw_512: times 8 dw 512
pw_m256: times 8 dw -256
pb_2: times 8 db 2
pb_4: times 8 db 4
pb_8: times 8 db 8
pb_128: times 8 db 128
pb_m16: times 8 db -16
pw_128: times 4 dw 128
pw_255: times 4 dw 255
pb_36_m4: times 4 db 36, -4
pb_127_m127: times 4 db 127, -127
; JMP_TABLE name, suffix, entry1, entry2, ...
; Builds <name>_<suffix>_table: 32-bit offsets to the function's .entryN
; labels, relative to a base biased by -2*4 so callers can index directly
; with tzcnt(width or height) (indices 0 and 1, i.e. sizes 1/2, are unused).
%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
%xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 2
dd %%base %+ .%3 - (%%table - 2*4)
%rotate 1
%endrep
%endmacro
; The dc/cfl tables store height entries, then width entries, then the
; splat store loops; the *_splat_* aliases point straight at the splat
; part (after 10 resp. 8 leading entries) so ipred_v/ipred_dc_128 can
; reuse the same store code.
%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4)
%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4)
JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64
JMP_TABLE ipred_smooth, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3, ssse3, h4, h8, h16, h32, h64
JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32
JMP_TABLE ipred_filter, ssse3, w4, w8, w16, w32
; lookup tables defined elsewhere in the library (C side)
cextern dr_intra_derivative
cextern filter_intra_taps
SECTION .text
;---------------------------------------------------------------------------------------
;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; IPRED_SET width, row_offset, imm8
; Broadcast one word of m0 (selected by the pshuflw immediate) across a
; full row of `width` pixels at dst + row_offset (clobbers m1).
%macro IPRED_SET 3 ; width, row offset, pshuflw imm8
pshuflw m1, m0, %3 ; extend 8 byte for 2 pos
punpcklqdq m1, m1
mova [dstq + %2], m1
%if %1 > 16
mova [dstq + 16 + %2], m1
%endif
%if %1 > 32
mova [dstq + 32 + %2], m1
mova [dstq + 48 + %2], m1
%endif
%endmacro
; IPRED_H width
; Emits four rows of horizontal prediction (each row = its left-neighbour
; pixel replicated across the row), advances dst by 4 rows and loops back
; to the enclosing .w<width> label until h is exhausted; ends with RET.
%macro IPRED_H 1 ; width
sub tlq, 4
movd m0, [tlq] ; get 4 bytes of topleft data
punpcklbw m0, m0 ; extend 2 byte
%if %1 == 4
pshuflw m1, m0, q2233
movd [dstq+strideq*0], m1
psrlq m1, 32
movd [dstq+strideq*1], m1
pshuflw m0, m0, q0011
movd [dstq+strideq*2], m0
psrlq m0, 32
movd [dstq+stride3q ], m0
%elif %1 == 8
punpcklwd m0, m0
punpckhdq m1, m0, m0
punpckldq m0, m0
movq [dstq+strideq*1], m1
movhps [dstq+strideq*0], m1
movq [dstq+stride3q ], m0
movhps [dstq+strideq*2], m0
%else
; w16/w32/w64: one broadcast store per row via IPRED_SET
IPRED_SET %1, 0, q3333
IPRED_SET %1, strideq, q2222
IPRED_SET %1, strideq*2, q1111
IPRED_SET %1, stride3q, q0000
%endif
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w%1
RET
%endmacro
INIT_XMM ssse3
; Horizontal prediction: dispatch on log2(width) through the jump table;
; each .wN label expands IPRED_H, whose trailing `jg .wN` forms the
; 4-rows-per-iteration loop (and whose RET ends the function).
cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3
LEA r5, ipred_h_ssse3_table
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, [r5+wq*4]
add wq, r5
lea stride3q, [strideq*3]
jmp wq
.w4:
IPRED_H 4
.w8:
IPRED_H 8
.w16:
IPRED_H 16
.w32:
IPRED_H 32
.w64:
IPRED_H 64
;---------------------------------------------------------------------------------------
;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; Vertical prediction: copy the top row to every output row. Loads up to
; 64 top pixels into m0..m3 and tail-jumps into the shared .sN splat-store
; loops of ipred_dc (via ipred_dc_splat_ssse3_table); narrower widths
; simply ignore the extra registers.
cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
LEA r5, ipred_dc_splat_ssse3_table
tzcnt wd, wm
movu m0, [tlq+ 1]
movu m1, [tlq+17]
movu m2, [tlq+33]
movu m3, [tlq+49]
movifnidn hd, hm
movsxd wq, [r5+wq*4]
add wq, r5
lea stride3q, [strideq*3]
jmp wq
;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; DC prediction: fill the block with (sum(top row) + sum(left col) + (w+h)/2) / (w+h).
; Dispatch: the .hN entry sums the left column, then jumps to the .wN entry which adds
; the top-row sum, normalizes, and falls into the .sN splat-store loop (also shared by
; ipred_v / ipred_dc_128 via ipred_dc_splat_ssse3_table).
; Pixel sums are accumulated NEGATED (pmaddubsw of unsigned pixels by -1 bytes in m3);
; the sign is folded back by the final pmaddwd with -1 words.
; For rectangular blocks, / (w+h) is done as a shift by ctz(w+h) followed by a
; fixed-point multiply: 0x5556 ~= 65536/3 (2:1 aspect), 0x3334 ~= 65536/5 (4:1 aspect).
cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
movifnidn hd, hm
movifnidn wd, wm
tzcnt r6d, hd
lea r5d, [wq+hq]
movd m4, r5d ; m4 = width + height
tzcnt r5d, r5d
movd m5, r5d ; m5 = ctz(width + height), shift for the power-of-2 part
LEA r5, ipred_dc_ssse3_table
tzcnt wd, wd
movsxd r6, [r5+r6*4] ; height entry (.hN)
movsxd wq, [r5+wq*4+20] ; width entry (.wN), stored after the 5 height entries
pcmpeqd m3, m3 ; m3 = all-ones = -1 bytes/words for pmaddubsw/pmaddwd
psrlw m4, 1 ; dc = (width + height) >> 1;
add r6, r5
add wq, r5
lea stride3q, [strideq*3]
jmp r6
.h4:
movd m0, [tlq-4]
pmaddubsw m0, m3 ; m0 = -(pairwise sums of left pixels)
jmp wq
.w4:
movd m1, [tlq+1]
pmaddubsw m1, m3
psubw m0, m4 ; fold in the rounding bias (negated domain)
paddw m0, m1
pmaddwd m0, m3 ; negate and combine word pairs -> sum + bias
cmp hd, 4
jg .w4_mul
psrlw m0, 3 ; dc >>= ctz(width + height);
jmp .w4_end
.w4_mul:
; rectangular 4xh: shift then multiply by ~1/3 (h==8) or ~1/5 (h==16)
punpckhqdq m1, m0, m0
paddw m0, m1
psrlq m1, m0, 32
paddw m0, m1
psrlw m0, 2
mov r6d, 0x5556
mov r2d, 0x3334
test hd, 8
cmovz r6d, r2d
movd m5, r6d
pmulhuw m0, m5
.w4_end:
pxor m1, m1
pshufb m0, m1 ; broadcast the dc byte to all lanes
.s4:
movd [dstq+strideq*0], m0
movd [dstq+strideq*1], m0
movd [dstq+strideq*2], m0
movd [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s4
RET
ALIGN function_align
.h8:
movq m0, [tlq-8]
pmaddubsw m0, m3
jmp wq
.w8:
movq m1, [tlq+1]
pmaddubsw m1, m3
psubw m4, m0
punpckhqdq m0, m0
psubw m0, m4
paddw m0, m1
pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
paddw m0, m1
pmaddwd m0, m3
psrlw m0, m5
cmp hd, 8
je .w8_end
mov r6d, 0x5556
mov r2d, 0x3334
cmp hd, 32
cmovz r6d, r2d
movd m1, r6d
pmulhuw m0, m1
.w8_end:
pxor m1, m1
pshufb m0, m1
.s8:
movq [dstq+strideq*0], m0
movq [dstq+strideq*1], m0
movq [dstq+strideq*2], m0
movq [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s8
RET
ALIGN function_align
.h16:
mova m0, [tlq-16]
pmaddubsw m0, m3
jmp wq
.w16:
movu m1, [tlq+1]
pmaddubsw m1, m3
paddw m0, m1
psubw m4, m0
punpckhqdq m0, m0
psubw m0, m4
pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
paddw m0, m1
pmaddwd m0, m3
psrlw m0, m5
cmp hd, 16
je .w16_end
mov r6d, 0x5556
mov r2d, 0x3334
test hd, 8|32
cmovz r6d, r2d
movd m1, r6d
pmulhuw m0, m1
.w16_end:
pxor m1, m1
pshufb m0, m1
.s16:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s16
RET
ALIGN function_align
.h32:
mova m0, [tlq-32]
pmaddubsw m0, m3
mova m2, [tlq-16]
pmaddubsw m2, m3
paddw m0, m2
jmp wq
.w32:
movu m1, [tlq+1]
pmaddubsw m1, m3
movu m2, [tlq+17]
pmaddubsw m2, m3
paddw m1, m2
paddw m0, m1
psubw m4, m0
punpckhqdq m0, m0
psubw m0, m4
pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
paddw m0, m1
pmaddwd m0, m3
psrlw m0, m5
cmp hd, 32
je .w32_end
; (a dead `lea r2d, [hq*2]` was removed here: its result was
; unconditionally overwritten by the mov below before any use)
mov r6d, 0x5556
mov r2d, 0x3334
test hd, 64|16
cmovz r6d, r2d
movd m1, r6d
pmulhuw m0, m1
.w32_end:
pxor m1, m1
pshufb m0, m1
mova m1, m0
.s32:
mova [dstq], m0
mova [dstq+16], m1
mova [dstq+strideq], m0
mova [dstq+strideq+16], m1
mova [dstq+strideq*2], m0
mova [dstq+strideq*2+16], m1
mova [dstq+stride3q], m0
mova [dstq+stride3q+16], m1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s32
RET
ALIGN function_align
.h64:
mova m0, [tlq-64]
mova m1, [tlq-48]
pmaddubsw m0, m3
pmaddubsw m1, m3
paddw m0, m1
mova m1, [tlq-32]
pmaddubsw m1, m3
paddw m0, m1
mova m1, [tlq-16]
pmaddubsw m1, m3
paddw m0, m1
jmp wq
.w64:
movu m1, [tlq+ 1]
movu m2, [tlq+17]
pmaddubsw m1, m3
pmaddubsw m2, m3
paddw m1, m2
movu m2, [tlq+33]
pmaddubsw m2, m3
paddw m1, m2
movu m2, [tlq+49]
pmaddubsw m2, m3
paddw m1, m2
paddw m0, m1
psubw m4, m0
punpckhqdq m0, m0
psubw m0, m4
pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
paddw m0, m1
pmaddwd m0, m3
psrlw m0, m5
cmp hd, 64
je .w64_end
mov r6d, 0x5556
mov r2d, 0x3334
test hd, 32
cmovz r6d, r2d
movd m1, r6d
pmulhuw m0, m1
.w64_end:
pxor m1, m1
pshufb m0, m1
mova m1, m0
mova m2, m0
mova m3, m0
.s64:
mova [dstq], m0
mova [dstq+16], m1
mova [dstq+32], m2
mova [dstq+48], m3
mova [dstq+strideq], m0
mova [dstq+strideq+16], m1
mova [dstq+strideq+32], m2
mova [dstq+strideq+48], m3
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .s64
RET
;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; DC prediction from the left column only: sum h left pixels, then
; dc = (sum + h/2) / h via pmulhrsw with 32768 >> log2(h). The .hN labels
; form a fall-through reduction chain (.h64 -> .h32 -> .h16 -> .h8 -> .h4)
; that ipred_dc_top also jumps into; ends by tail-jumping to the ipred_dc
; splat stores.
cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
LEA r5, ipred_dc_left_ssse3_table
mov hd, hm ; zero upper half
tzcnt r6d, hd
sub tlq, hq
tzcnt wd, wm
movu m0, [tlq]
movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
movd m2, r6d
psrld m3, m2 ; m3 = 32768 >> log2(h), pmulhrsw factor for /h
movsxd r6, [r5+r6*4]
pcmpeqd m2, m2 ; m2 = -1 bytes/words (negating pmadd multiplier)
pmaddubsw m0, m2
add r6, r5
add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
movsxd wq, [r5+wq*4]
add wq, r5
jmp r6
.h64:
movu m1, [tlq+48] ; unaligned when jumping here from dc_top
pmaddubsw m1, m2
paddw m0, m1
movu m1, [tlq+32] ; unaligned when jumping here from dc_top
pmaddubsw m1, m2
paddw m0, m1
.h32:
movu m1, [tlq+16] ; unaligned when jumping here from dc_top
pmaddubsw m1, m2
paddw m0, m1
.h16:
pshufd m1, m0, q3232 ; m1 = high qword of m0 (i.e. m0 >> 64)
paddw m0, m1
.h8:
pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
paddw m0, m1
.h4:
pmaddwd m0, m2 ; negate + combine word pairs -> total sum
pmulhrsw m0, m3 ; dc = (sum + (h >> 1)) / h
lea stride3q, [strideq*3]
pxor m1, m1
pshufb m0, m1 ; broadcast dc byte
mova m1, m0
mova m2, m0
mova m3, m0
jmp wq ; tail-call the ipred_dc .sN splat stores
;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; DC prediction with no available neighbours: fill with the mid-value 128
; by broadcasting pb_128 into m0..m3 and tail-jumping to the shared
; ipred_dc .sN splat stores.
cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
LEA r5, ipred_dc_splat_ssse3_table
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, [r5+wq*4]
movddup m0, [r5-ipred_dc_splat_ssse3_table+pb_128]
mova m1, m0
mova m2, m0
mova m3, m0
add wq, r5
lea stride3q, [strideq*3]
jmp wq
;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; DC prediction from the top row only: sums w top pixels, then reuses the
; .hN reduction chain of ipred_dc_left (hence the "unaligned when jumping
; here from dc_top" loads there) and the ipred_dc splat stores.
cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
LEA r5, ipred_dc_left_ssse3_table
tzcnt wd, wm
inc tlq
movu m0, [tlq]
movifnidn hd, hm
movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
movd m2, wd
psrld m3, m2 ; m3 = 32768 >> log2(w), pmulhrsw factor for /w
movsxd r6, [r5+wq*4]
pcmpeqd m2, m2 ; m2 = -1 bytes/words (negating pmadd multiplier)
pmaddubsw m0, m2
add r6, r5
add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
movsxd wq, [r5+wq*4]
add wq, r5
jmp r6
;---------------------------------------------------------------------------------------
;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; SMOOTH src1, src2, mul1, mul2, add1, add2
; Blends the interleaved pixel pairs in m<mul1>/m<mul2> with the signed
; split weights in m<src1>/m<src2>, adding the precomputed correction
; terms m<add1>/m<add2>; the 16 packed result pixels land in m6
; (clobbers m0).
%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
; w * a = (w - 128) * a + 128 * a
; (256 - w) * b = (127 - w) * b + 129 * b
; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b]
pmaddubsw m6, m%3, m%1
pmaddubsw m0, m%4, m%2 ; (w - 128) * a + (127 - w) * b
paddw m6, m%5
paddw m0, m%6 ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128]
psrlw m6, 8
psrlw m0, 8
packuswb m6, m0
%endmacro
; Vertical smooth prediction: every output pixel is a weighted blend of
; the pixel directly above (top row) and the bottom-left pixel, with the
; per-row weight taken from smooth_weights[] (indexed by height). Uses
; the pmaddubsw identity documented in the SMOOTH macro.
cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_ssse3_table
LEA r6, ipred_smooth_v_ssse3_table
tzcnt wd, wm
mov hd, hm
movsxd wq, [r6+wq*4]
movddup m0, [base+pb_127_m127]
movddup m1, [base+pw_128]
lea weightsq, [base+smooth_weights+hq*4]
neg hq ; h counts up from -height to 0
movd m5, [tlq+hq] ; bottom-left pixel
pxor m2, m2
pshufb m5, m2 ; broadcast it to all 16 bytes
add wq, r6
jmp wq
.w4:
movd m2, [tlq+1]
punpckldq m2, m2
punpcklbw m2, m5 ; top, bottom
lea r3, [strideq*3]
mova m4, [base+ipred_v_shuf]
mova m5, m4
punpckldq m4, m4
punpckhdq m5, m5
pmaddubsw m3, m2, m0 ; m3: 127 * top - 127 * bottom
paddw m1, m2 ; m1: 1 * top + 256 * bottom + 128, overflow is ok
paddw m3, m1 ; m3: 128 * top + 129 * bottom + 128
.w4_loop:
movu m1, [weightsq+hq*2]
pshufb m0, m1, m4 ;m2, m3, m4 and m5 should be stable in loop
pshufb m1, m5
SMOOTH 0, 1, 2, 2, 3, 3
movd [dstq+strideq*0], m6
pshuflw m1, m6, q1032
movd [dstq+strideq*1], m1
punpckhqdq m6, m6
movd [dstq+strideq*2], m6
psrlq m6, 32
movd [dstq+r3 ], m6
lea dstq, [dstq+strideq*4]
add hq, 4
jl .w4_loop
RET
ALIGN function_align
.w8:
movq m2, [tlq+1]
punpcklbw m2, m5
mova m5, [base+ipred_v_shuf]
lea r3, [strideq*3]
pshufd m4, m5, q0000
pshufd m5, m5, q1111
pmaddubsw m3, m2, m0
paddw m1, m2
paddw m3, m1 ; m3 is output for loop
.w8_loop:
movq m1, [weightsq+hq*2]
pshufb m0, m1, m4
pshufb m1, m5
SMOOTH 0, 1, 2, 2, 3, 3
movq [dstq+strideq*0], m6
movhps [dstq+strideq*1], m6
lea dstq, [dstq+strideq*2]
add hq, 2
jl .w8_loop
RET
ALIGN function_align
.w16:
movu m3, [tlq+1]
punpcklbw m2, m3, m5
punpckhbw m3, m5
pmaddubsw m4, m2, m0
pmaddubsw m5, m3, m0
paddw m0, m1, m2
paddw m1, m3
paddw m4, m0
paddw m5, m1 ; m4 and m5 is output for loop
.w16_loop:
movd m1, [weightsq+hq*2]
pshuflw m1, m1, q0000
punpcklqdq m1, m1 ; broadcast this row's weight pair
SMOOTH 1, 1, 2, 3, 4, 5
mova [dstq], m6
add dstq, strideq
add hq, 1
jl .w16_loop
RET
ALIGN function_align
.w32:
WIN64_PUSH_XMM 8, 7
mova m7, m5 ; keep the broadcast bottom pixel live across the loop
.w32_loop_init:
mov r3d, 2 ; two 16-pixel column slices per row
.w32_loop:
; constants and top-derived terms are recomputed per slice since all
; eight registers are in use
movddup m0, [base+pb_127_m127]
movddup m1, [base+pw_128]
movu m3, [tlq+1]
punpcklbw m2, m3, m7
punpckhbw m3, m7
pmaddubsw m4, m2, m0
pmaddubsw m5, m3, m0
paddw m0, m1, m2
paddw m1, m3
paddw m4, m0
paddw m5, m1
movd m1, [weightsq+hq*2]
pshuflw m1, m1, q0000
punpcklqdq m1, m1
SMOOTH 1, 1, 2, 3, 4, 5
mova [dstq], m6
add tlq, 16
add dstq, 16
dec r3d
jg .w32_loop
lea dstq, [dstq-32+strideq]
sub tlq, 32
add hq, 1
jl .w32_loop_init
RET
ALIGN function_align
.w64:
WIN64_PUSH_XMM 8, 7
mova m7, m5 ; keep the broadcast bottom pixel live across the loop
.w64_loop_init:
mov r3d, 4 ; four 16-pixel column slices per row
.w64_loop:
movddup m0, [base+pb_127_m127]
movddup m1, [base+pw_128]
movu m3, [tlq+1]
punpcklbw m2, m3, m7
punpckhbw m3, m7
pmaddubsw m4, m2, m0
pmaddubsw m5, m3, m0
paddw m0, m1, m2
paddw m1, m3
paddw m4, m0
paddw m5, m1
movd m1, [weightsq+hq*2]
pshuflw m1, m1, q0000
punpcklqdq m1, m1
SMOOTH 1, 1, 2, 3, 4, 5
mova [dstq], m6
add tlq, 16
add dstq, 16
dec r3d
jg .w64_loop
lea dstq, [dstq-64+strideq]
sub tlq, 64
add hq, 1
jl .w64_loop_init
RET
;---------------------------------------------------------------------------------------
;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; Horizontal smooth prediction: every output pixel is a weighted blend of
; the pixel to its left (left column) and the top-right pixel, with the
; per-column weight taken from smooth_weights[] (indexed by width).
cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h
%define base r6-ipred_smooth_h_ssse3_table
LEA r6, ipred_smooth_h_ssse3_table
mov wd, wm
movd m3, [tlq+wq]
pxor m1, m1
pshufb m3, m1 ; right
tzcnt wd, wd
mov hd, hm
movsxd wq, [r6+wq*4]
movddup m4, [base+pb_127_m127]
movddup m5, [base+pw_128]
add wq, r6
jmp wq
.w4:
movddup m6, [base+smooth_weights+4*2]
mova m7, [base+ipred_h_shuf]
sub tlq, 4
sub tlq, hq ; tlq + hq walks up the left edge as hd shrinks
lea r3, [strideq*3]
.w4_loop:
movd m2, [tlq+hq] ; left
pshufb m2, m7
punpcklbw m1, m2, m3 ; left, right
punpckhbw m2, m3
pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
paddw m0, m1 ; 128 * left + 129 * right
pmaddubsw m1, m6
paddw m1, m5
paddw m0, m1
pmaddubsw m1, m2, m4
paddw m1, m2
pmaddubsw m2, m6
paddw m2, m5
paddw m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
movd [dstq+strideq*0], m0
pshuflw m1, m0, q1032
movd [dstq+strideq*1], m1
punpckhqdq m0, m0
movd [dstq+strideq*2], m0
psrlq m0, 32
movd [dstq+r3 ], m0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4_loop
RET
ALIGN function_align
.w8:
mova m6, [base+smooth_weights+8*2]
mova m7, [base+ipred_h_shuf]
sub tlq, 4
sub tlq, hq
punpckldq m7, m7 ; shuffle for 2 rows of 8 instead of 4 rows of 4
.w8_loop:
movd m2, [tlq+hq] ; left
pshufb m2, m7
punpcklbw m1, m2, m3 ; left, right
punpckhbw m2, m3
pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
paddw m0, m1 ; 128 * left + 129 * right
pmaddubsw m1, m6
paddw m1, m5
paddw m0, m1
pmaddubsw m1, m2, m4
paddw m1, m2
pmaddubsw m2, m6
paddw m2, m5
paddw m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w8_loop
RET
ALIGN function_align
.w16:
mova m6, [base+smooth_weights+16*2]
mova m7, [base+smooth_weights+16*3]
sub tlq, 1
sub tlq, hq
.w16_loop:
pxor m1, m1
movd m2, [tlq+hq] ; left
pshufb m2, m1 ; broadcast this row's left pixel
punpcklbw m1, m2, m3 ; left, right
punpckhbw m2, m3
pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
paddw m0, m1 ; 128 * left + 129 * right
pmaddubsw m1, m6
paddw m1, m5
paddw m0, m1
pmaddubsw m1, m2, m4
paddw m1, m2
pmaddubsw m2, m7
paddw m2, m5
paddw m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq], m0
lea dstq, [dstq+strideq]
sub hd, 1
jg .w16_loop
RET
ALIGN function_align
.w32:
sub tlq, 1
sub tlq, hq
pxor m6, m6
.w32_loop_init:
mov r5, 2 ; two 16-pixel column slices per row
lea r3, [base+smooth_weights+16*4] ; r3 walks the per-column weights
.w32_loop:
mova m7, [r3]
add r3, 16
movd m2, [tlq+hq] ; left
pshufb m2, m6
punpcklbw m1, m2, m3 ; left, right
punpckhbw m2, m3
pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
paddw m0, m1 ; 128 * left + 129 * right
pmaddubsw m1, m7
paddw m1, m5
paddw m0, m1
pmaddubsw m1, m2, m4
paddw m1, m2
mova m7, [r3]
add r3, 16
pmaddubsw m2, m7
paddw m2, m5
paddw m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq], m0
add dstq, 16
dec r5
jg .w32_loop
lea dstq, [dstq-32+strideq]
sub hd, 1
jg .w32_loop_init
RET
ALIGN function_align
.w64:
sub tlq, 1
sub tlq, hq
pxor m6, m6
.w64_loop_init:
mov r5, 4 ; four 16-pixel column slices per row
lea r3, [base+smooth_weights+16*8]
.w64_loop:
mova m7, [r3]
add r3, 16
movd m2, [tlq+hq] ; left
pshufb m2, m6
punpcklbw m1, m2, m3 ; left, right
punpckhbw m2, m3
pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
paddw m0, m1 ; 128 * left + 129 * right
pmaddubsw m1, m7
paddw m1, m5
paddw m0, m1
pmaddubsw m1, m2, m4
paddw m1, m2
mova m7, [r3]
add r3, 16
pmaddubsw m2, m7
paddw m2, m5
paddw m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq], m0
add dstq, 16
dec r5
jg .w64_loop
lea dstq, [dstq-64+strideq]
sub hd, 1
jg .w64_loop_init
RET
;---------------------------------------------------------------------------------------
;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; SMOOTH_2D_END src1, src2, mul1, mul2, add1, add2, m3
; Final stage of the 2-D smooth blend: forms the two partial blends in
; m0/m1 (args %5/%6 may be register numbers or memory operands), averages
; against m2/m3 with pavgw (rounding), shifts down and packs the 16
; result pixels into m0. %7 optionally reloads m3 from memory first.
%macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3
pmaddubsw m6, m%3, m%1
mova m0, m6
pmaddubsw m6, m%4, m%2
mova m1, m6
%ifnum %5
paddw m0, m%5
%else
paddw m0, %5
%endif
%ifnum %6
paddw m1, m%6
%else
paddw m1, %6
%endif
%ifnum %7
%else
mova m3, %7
%endif
pavgw m0, m2
pavgw m1, m3
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
%endmacro
; SMOOTH_OUTPUT_16B top, buf1..buf4, w1, w2, spill, shuf, rec0, rec4, rec5
; Emits one 16-pixel-wide slice of one row of the 2-D smooth predictor:
; %1 = stack slot holding the cached top pixels for this slice,
; %2-%5 = scratch stack slots for the vertical-blend terms,
; %6/%7 = horizontal-weight memory operands,
; %8 = spill slot for the horizontal partial sum,
; %9 = slot holding the pshufb broadcast control,
; %10-%12 = slots used to restore m0/m4/m5 afterwards ("recovery").
%macro SMOOTH_OUTPUT_16B 12 ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5]
mova m1, [rsp+16*%1] ; top
punpckhbw m6, m1, m0 ; top, bottom
punpcklbw m1, m0 ; top, bottom
pmaddubsw m2, m1, m5
mova [rsp+16*%2], m1
paddw m1, m3 ; 1 * top + 255 * bottom + 255
paddw m2, m1 ; 128 * top + 129 * bottom + 255
mova [rsp+16*%3], m2
pmaddubsw m2, m6, m5
mova [rsp+16*%4], m6
paddw m6, m3 ; 1 * top + 255 * bottom + 255
paddw m2, m6 ; 128 * top + 129 * bottom + 255
mova [rsp+16*%5], m2
movd m1, [tlq+hq] ; left
pshufb m1, [base+pb_3] ; topleft[-(1 + y)]
punpcklbw m1, m4 ; left, right
pmaddubsw m2, m1, m5 ; 127 * left - 127 * right
paddw m2, m1 ; 128 * left + 129 * right
mova m3, m2
pmaddubsw m0, m1, %6 ; weights_hor = &dav1d_sm_weights[width];
pmaddubsw m1, %7
paddw m2, m3, m0
paddw m3, m1
movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
mova m7, [rsp+16*%9]
pshufb m1, m7
mova [rsp+16*%8], m3
mova m4, [rsp+16*%2]
mova m5, [rsp+16*%3]
mova m3, [rsp+16*%4]
mova m7, [rsp+16*%5]
SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*%8]
mova [dstq], m0
movddup m3, [base+pw_255] ; recovery
mova m0, [rsp+16*%10] ; recovery
mova m4, [rsp+16*%11] ; recovery
mova m5, [rsp+16*%12] ; recovery
%endmacro
; 2-D smooth prediction: per pixel, average of the vertical blend
; (top row vs bottom-left pixel, weighted by row) and the horizontal
; blend (left column vs top-right pixel, weighted by column), with
; rounding done by pavgw in SMOOTH_2D_END. Only 8 xmm registers are
; used, so broadcast constants and per-block partial terms are cached
; in the 13*16-byte stack frame.
cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_ssse3_table
mov wd, wm
mov hd, hm
LEA r6, ipred_smooth_ssse3_table
movd m4, [tlq+wq] ; right
pxor m2, m2
pshufb m4, m2
tzcnt wd, wd
mov r5, tlq
sub r5, hq
movsxd wq, [r6+wq*4]
movddup m5, [base+pb_127_m127]
movd m0, [r5]
pshufb m0, m2 ; bottom
movddup m3, [base+pw_255]
add wq, r6
lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height]
jmp wq
.w4:
mova m7, [base+ipred_v_shuf]
movd m1, [tlq+1] ; left
pshufd m1, m1, q0000
sub tlq, 4
lea r3, [strideq*3]
sub tlq, hq
punpcklbw m1, m0 ; top, bottom
pshufd m6, m7, q1100
pshufd m7, m7, q3322
pmaddubsw m2, m1, m5
paddw m3, m1 ; 1 * top + 255 * bottom + 255
paddw m2, m3 ; 128 * top + 129 * bottom + 255
mova [rsp+16*0], m1
mova [rsp+16*1], m2
movq m1, [base+smooth_weights+4*2] ; weights_hor = &dav1d_sm_weights[width];
punpcklqdq m1, m1
mova [rsp+16*2], m1
mova [rsp+16*3], m4
mova [rsp+16*4], m6
mova [rsp+16*5], m5
.w4_loop:
movd m1, [tlq+hq] ; left
pshufb m1, [base+ipred_h_shuf]
punpcklbw m0, m1, m4 ; left, right
punpckhbw m1, m4
pmaddubsw m2, m0, m5 ; 127 * left - 127 * right
pmaddubsw m3, m1, m5
paddw m2, m0 ; 128 * left + 129 * right
paddw m3, m1
mova m4, [rsp+16*2]
pmaddubsw m0, m4
pmaddubsw m1, m4
paddw m2, m0
paddw m3, m1
movq m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
add v_weightsq, 8
pshufb m0, m1, m6
pshufb m1, m7
mova m4, [rsp+16*0]
mova m5, [rsp+16*1]
SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
mova m4, [rsp+16*3] ; restore registers clobbered above
mova m6, [rsp+16*4]
mova m5, [rsp+16*5]
movd [dstq+strideq*0], m0
pshuflw m1, m0, q1032
movd [dstq+strideq*1], m1
punpckhqdq m0, m0
movd [dstq+strideq*2], m0
psrlq m0, 32
movd [dstq+r3 ], m0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4_loop
RET
ALIGN function_align
.w8:
mova m7, [base+ipred_v_shuf]
movq m1, [tlq+1] ; left
punpcklqdq m1, m1
sub tlq, 4
sub tlq, hq
punpcklbw m1, m0
pshufd m6, m7, q0000
pshufd m7, m7, q1111
pmaddubsw m2, m1, m5
paddw m3, m1
paddw m2, m3
mova [rsp+16*0], m1
mova [rsp+16*1], m2
mova m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width];
mova [rsp+16*2], m1
mova [rsp+16*3], m4
mova [rsp+16*4], m6
mova [rsp+16*5], m5
.w8_loop:
movd m1, [tlq+hq] ; left
pshufb m1, [base+ipred_h_shuf]
pshufd m1, m1, q1100
punpcklbw m0, m1, m4
punpckhbw m1, m4
pmaddubsw m2, m0, m5
pmaddubsw m3, m1, m5
paddw m2, m0
paddw m3, m1
mova m4, [rsp+16*2]
pmaddubsw m0, m4
pmaddubsw m1, m4
paddw m2, m0
paddw m3, m1
movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
add v_weightsq, 4
pshufb m0, m1, m6
pshufb m1, m7
mova m4, [rsp+16*0]
mova m5, [rsp+16*1]
SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
mova m4, [rsp+16*3] ; restore registers clobbered above
mova m6, [rsp+16*4]
mova m5, [rsp+16*5]
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w8_loop
RET
ALIGN function_align
.w16:
mova m7, [base+ipred_v_shuf]
movu m1, [tlq+1] ; left
sub tlq, 4
sub tlq, hq
punpckhbw m6, m1, m0 ; top, bottom
punpcklbw m1, m0 ; top, bottom
pshufd m7, m7, q0000
mova [rsp+16*2], m7
pmaddubsw m2, m6, m5
mova [rsp+16*5], m6
paddw m6, m3 ; 1 * top + 255 * bottom + 255
paddw m2, m6 ; 128 * top + 129 * bottom + 255
mova [rsp+16*6], m2
pmaddubsw m2, m1, m5
paddw m3, m1 ; 1 * top + 255 * bottom + 255
mova [rsp+16*0], m1
paddw m2, m3 ; 128 * top + 129 * bottom + 255
mova [rsp+16*1], m2
mova [rsp+16*3], m4
mova [rsp+16*4], m5
.w16_loop:
movd m1, [tlq+hq] ; left
pshufb m1, [base+pb_3] ; topleft[-(1 + y)]
punpcklbw m1, m4 ; left, right
pmaddubsw m2, m1, m5 ; 127 * left - 127 * right
paddw m2, m1 ; 128 * left + 129 * right
mova m0, m1
mova m3, m2
pmaddubsw m0, [base+smooth_weights+16*2] ; weights_hor = &dav1d_sm_weights[width];
pmaddubsw m1, [base+smooth_weights+16*3]
paddw m2, m0
paddw m3, m1
movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
add v_weightsq, 2
mova m7, [rsp+16*2]
pshufb m1, m7
mova [rsp+16*7], m3
mova m4, [rsp+16*0]
mova m5, [rsp+16*1]
mova m3, [rsp+16*5]
mova m7, [rsp+16*6]
SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7]
mova m4, [rsp+16*3] ; restore registers clobbered above
mova m5, [rsp+16*4]
mova [dstq], m0
lea dstq, [dstq+strideq]
sub hd, 1
jg .w16_loop
RET
ALIGN function_align
.w32:
; cache the two 16-pixel top strips, then emit each row as two
; SMOOTH_OUTPUT_16B slices
movu m1, [tlq+1] ; top topleft[1 + x]
movu m2, [tlq+17] ; top
mova [rsp+16*0], m1
mova [rsp+16*1], m2
sub tlq, 4
sub tlq, hq
mova m7, [base+ipred_v_shuf]
pshufd m7, m7, q0000
mova [rsp+16*2], m7
mova [rsp+16*3], m0
mova [rsp+16*4], m4
mova [rsp+16*5], m5
.w32_loop:
SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5
add dstq, 16
SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5
lea dstq, [dstq-16+strideq]
add v_weightsq, 2
sub hd, 1
jg .w32_loop
RET
ALIGN function_align
.w64:
; cache the four 16-pixel top strips, then emit each row as four
; SMOOTH_OUTPUT_16B slices
movu m1, [tlq+1] ; top topleft[1 + x]
movu m2, [tlq+17] ; top
mova [rsp+16*0], m1
mova [rsp+16*1], m2
movu m1, [tlq+33] ; top
movu m2, [tlq+49] ; top
mova [rsp+16*11], m1
mova [rsp+16*12], m2
sub tlq, 4
sub tlq, hq
mova m7, [base+ipred_v_shuf]
pshufd m7, m7, q0000
mova [rsp+16*2], m7
mova [rsp+16*3], m0
mova [rsp+16*4], m4
mova [rsp+16*5], m5
.w64_loop:
SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*8], [base+smooth_weights+16*9], 10, 2, 3, 4, 5
add dstq, 16
SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5
add dstq, 16
SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5
add dstq, 16
SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5
lea dstq, [dstq-48+strideq]
add v_weightsq, 2
sub hd, 1
jg .w64_loop
RET
%if ARCH_X86_64
cglobal ipred_z1_8bpc, 3, 8, 11, 16*12, dst, stride, tl, w, h, angle, dx
%define base r7-$$
lea r7, [$$]
mova m8, [base+pw_62]
mova m9, [base+pw_64]
mova m10, [base+pw_512]
%else
cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w, h, angle, dx
%define base r1-$$
%define m8 [base+pw_62]
%define m9 [base+pw_64]
%define m10 [base+pw_512]
%define strideq r3
%define stridemp dword [rsp+16*12]
mov stridemp, r1
LEA r1, $$
%endif
tzcnt wd, wm
movifnidn angled, anglem
movifnidn hd, hm
inc tlq
movsxd wq, [base+ipred_z1_ssse3_table+wq*4]
mov dxd, angled
and dxd, 0x7e
add angled, 165 ; ~90
lea wq, [base+wq+ipred_z1_ssse3_table]
movzx dxd, word [base+dr_intra_derivative+dxq]
xor angled, 0x4ff ; d = 90 - angle
jmp wq
.w4:
lea r3d, [angleq+88]
test r3d, 0x480
jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40
sar r3d, 9
add r3d, hd
cmp r3d, 8
jg .w4_no_upsample ; h > 8 || (w == h && is_sm)
mova m1, [tlq-1]
pshufb m0, m1, [base+z_upsample1]
pshufb m1, [base+z_upsample2]
movddup m2, [base+pb_36_m4]
add dxd, dxd
pmaddubsw m0, m2
pshufd m7, m1, q3333
movd [rsp+16], m7 ; top[max_base_x]
pmaddubsw m1, m2
movd m6, dxd
mov r5d, dxd ; xpos
pshufb m6, [base+pw_256]
paddw m1, m0
movq m0, [tlq]
pmulhrsw m1, m10
paddw m7, m6, m6
punpcklqdq m6, m7 ; xpos0 xpos1
packuswb m1, m1
punpcklbw m0, m1
movifnidn strideq, stridemp
mova [rsp], m0
.w4_upsample_loop:
lea r2d, [r5+dxq]
shr r5d, 6 ; base0
movq m0, [rsp+r5]
lea r5d, [r2+dxq]
shr r2d, 6 ; base1
movhps m0, [rsp+r2]
pand m2, m8, m6 ; frac
psubw m1, m9, m2 ; 64-frac
psllw m2, 8
por m1, m2 ; 64-frac, frac
pmaddubsw m0, m1
paddw m6, m7 ; xpos += dx
pmulhrsw m0, m10
packuswb m0, m0
movd [dstq+strideq*0], m0
pshuflw m0, m0, q1032
movd [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w4_upsample_loop
RET
; 4-wide without upsampling: compute the edge filter strength from the
; (angle, block size) tables; if non-zero, smooth the top edge with a
; 3-stage pmaddubsw filter and work from a filtered copy on the stack.
.w4_no_upsample:
mov r3d, 7 ; max_base
test angled, 0x400 ; !enable_intra_edge_filter
jnz .w4_main
lea r3d, [hq+3]
movd m0, r3d
movd m2, angled
shr angled, 8 ; is_sm << 1
pxor m1, m1
pshufb m0, m1 ; broadcast h+3
pshufb m2, m1 ; broadcast angle byte
pcmpeqb m1, m0, [base+z_filter_wh4] ; select the row matching this block size
pand m1, m2
pcmpgtb m1, [base+z_filter_t_w48+angleq*8] ; angle above per-size threshold?
pmovmskb r5d, m1
mov r3d, 7
test r5d, r5d
jz .w4_main ; filter_strength == 0
; Apply the edge filter: three coefficient pairs from z_filter_k selected
; by filter_strength, accumulated with pmaddubsw.
mova m3, [tlq-1]
imul r5d, 0x55555555 ; ones-count trick: strength ends up in the top 2 bits
movu m7, [base+z_filter_s+8]
shr r5d, 30 ; filter_strength
movddup m0, [base+pb_8]
pminub m7, m0 ; clamp gather indices at the last valid edge sample
pshufb m0, m3, [base+z_filter_s]
movddup m4, [base+z_filter_k-8+r5*8+24*0]
pshufb m3, m7
movddup m5, [base+z_filter_k-8+r5*8+24*1]
shufps m2, m0, m3, q2121
movddup m6, [base+z_filter_k-8+r5*8+24*2]
pmaddubsw m0, m4
pmaddubsw m1, m2, m4
pmaddubsw m2, m5
paddd m5, m6
pmaddubsw m4, m3, m5
pmaddubsw m3, m6
paddw m0, m2
paddw m1, m4
paddw m0, m3
pshufd m1, m1, q3333 ; replicate the last filtered sample
pmulhrsw m0, m10 ; round (m10 = rounding const, set up by caller)
pmulhrsw m1, m10
mov r5d, 9
mov tlq, rsp ; continue from the filtered copy on the stack
cmp hd, 4
cmovne r3d, r5d ; max_base = 9 when h != 4
packuswb m0, m1
mova [tlq], m0
; 4-wide main loop. xpos tracks the 6.6 fixed-point x position per row;
; pixels past max_base_x are clamped to top[max_base_x] (held in m7).
; r5 is biased by -max_base_x*64 so r5 >= 0 means "fully clamped from here".
.w4_main:
add tlq, r3 ; tlq -> top[max_base_x]
movd m5, dxd
movddup m0, [base+z_base_inc] ; base_inc << 6
movd m7, [tlq] ; top[max_base_x]
shl r3d, 6
movd m4, r3d
pshufb m5, [base+pw_256] ; broadcast dx
mov r5d, dxd ; xpos
pshufb m7, [base+pw_m256] ; broadcast the clamp pixel
sub r5, r3 ; bias xpos by -max_base_x*64
pshufb m4, [base+pw_256]
mova m3, [base+z1_shuf_w4]
paddw m6, m5, m5 ; 2*dx row increment
psubw m4, m0 ; max_base_x
punpcklqdq m5, m6 ; xpos0 xpos1
.w4_loop:
lea r3, [r5+dxq]
sar r5, 6 ; base0
movq m0, [tlq+r5]
lea r5, [r3+dxq]
sar r3, 6 ; base1
movhps m0, [tlq+r3]
pand m2, m8, m5 ; frac
psubw m1, m9, m2 ; 64-frac
psllw m2, 8
pshufb m0, m3 ; pair adjacent edge samples for pmaddubsw
por m1, m2 ; 64-frac, frac
pmaddubsw m0, m1 ; linear interpolation
movifnidn strideq, stridemp
pcmpgtw m1, m4, m5 ; base < max_base_x
pmulhrsw m0, m10
paddw m5, m6 ; xpos += dx
pand m0, m1 ; keep interpolated pixels inside the edge,
pandn m1, m7 ; replace the rest with top[max_base_x]
por m0, m1
packuswb m0, m0
movd [dstq+strideq*0], m0
pshuflw m0, m0, q1032
movd [dstq+strideq*1], m0
sub hd, 2
jz .w4_end
lea dstq, [dstq+strideq*2]
test r5d, r5d
jl .w4_loop ; still rows with unclamped pixels
packuswb m7, m7
; All remaining rows are fully past the edge: fill with the clamp pixel.
.w4_end_loop:
movd [dstq+strideq*0], m7
movd [dstq+strideq*1], m7
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w4_end_loop
.w4_end:
RET
; 8-wide: 2x edge-upsampling path (same structure as .w4, but the edge is
; 16 samples so the filter runs on two halves).
.w8:
lea r3d, [angleq+88]
and r3d, ~0x7f
or r3d, hd
cmp r3d, 8
ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
mova m5, [base+z_upsample1]
movu m3, [base+z_filter_s+6]
movd m4, hd
mova m0, [tlq-1]
movu m1, [tlq+7]
pxor m7, m7
pshufb m4, m7 ; broadcast h
movddup m7, [base+pb_36_m4] ; upsample filter coefficients {36, -4}
pminub m4, m3 ; clamp gather indices at the last valid sample
add dxd, dxd ; doubled edge resolution -> doubled step
pshufb m2, m0, m5
pmaddubsw m2, m7
pshufb m0, m3
pmaddubsw m0, m7
movd m6, dxd
pshufb m3, m1, m5
pmaddubsw m3, m7
pshufb m1, m4
pmaddubsw m1, m7
pshufb m6, [base+pw_256] ; broadcast dx
mov r5d, dxd ; xpos
paddw m2, m0
paddw m7, m6, m6 ; 2*dx for the second row
paddw m3, m1
punpcklqdq m6, m7 ; xpos0 xpos1
movu m1, [tlq]
pmulhrsw m2, m10 ; round filter output
pmulhrsw m3, m10
packuswb m2, m3
punpcklbw m0, m1, m2 ; interleave original / upsampled samples
punpckhbw m1, m2
movifnidn strideq, stridemp
mova [rsp+16*0], m0 ; 32-byte upsampled edge buffer on the stack
mova [rsp+16*1], m1
; Two 8-pixel rows per iteration; base = xpos >> 6, frac = xpos & 63.
.w8_upsample_loop:
lea r2d, [r5+dxq]
shr r5d, 6 ; base0
movu m0, [rsp+r5]
lea r5d, [r2+dxq]
shr r2d, 6 ; base1
movu m1, [rsp+r2]
pand m2, m8, m6
psubw m3, m9, m2
psllw m2, 8
por m3, m2 ; 64-frac, frac
punpcklqdq m2, m3, m3 ; frac0
pmaddubsw m0, m2
punpckhqdq m3, m3 ; frac1
pmaddubsw m1, m3
paddw m6, m7 ; xpos += dx
pmulhrsw m0, m10
pmulhrsw m1, m10
packuswb m0, m1
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w8_upsample_loop
RET
; 8-wide without upsampling: determine filter strength, and if non-zero copy
; the edge to a padded stack buffer and smooth it via the shared
; .filter_edge helper (defined elsewhere in this function).
.w8_no_upsample:
lea r3d, [hq+7]
movd m0, r3d
and r3d, 7
or r3d, 8 ; imin(h+7, 15)
test angled, 0x400
jnz .w8_main ; edge filtering disabled
movd m2, angled
shr angled, 8 ; is_sm << 1
pxor m1, m1
pshufb m0, m1 ; broadcast h+7
pshufb m2, m1 ; broadcast angle byte
movu m1, [base+z_filter_wh8]
psrldq m3, [base+z_filter_t_w48+angleq*8], 4
pcmpeqb m1, m0 ; select the row matching this block size
pand m1, m2
pcmpgtb m1, m3 ; angle above per-size threshold?
pmovmskb r5d, m1
test r5d, r5d
jz .w8_main ; filter_strength == 0
movd m3, [tlq-1]
movu m0, [tlq+16*0]
imul r5d, 0x55555555 ; ones-count trick: strength in the top 2 bits
movu m1, [tlq+16*1]
shr r5d, 30 ; filter_strength
movd m2, [tlq+r3]
lea tlq, [rsp+16*4] ; filtered edge lives on the stack from here on
sub r5, 3 ; filter_strength - 3, consumed by .filter_edge
mova [tlq-16*1], m0
pxor m7, m7
mova [tlq+16*0], m1
pshufb m3, m7 ; replicate tl pixel before the edge
pshufb m2, m7 ; replicate last pixel after the edge
mova [tlq-16*2], m3
movq [tlq+r3-15], m2
call .filter_edge ; shared edge smoother (defined outside this chunk)
sar r5d, 1
add r5d, 17
cmp hd, 8
cmova r3d, r5d ; extend max_base for h > 8
; 8-wide main loop: one 8-pixel row per iteration, with the same
; past-the-edge clamp scheme as .w4_main (r5 biased by -max_base_x*64).
.w8_main:
add tlq, r3 ; tlq -> top[max_base_x]
movd m5, dxd
movd m7, [tlq] ; top[max_base_x]
shl r3d, 6
movu m3, [base+z_filter_s+2] ; shuffle pairing adjacent edge samples
movd m4, r3d
pshufb m5, [base+pw_256] ; broadcast dx
mov r5d, dxd ; xpos
pshufb m7, [base+pw_m256] ; broadcast the clamp pixel
sub r5, r3 ; bias xpos by -max_base_x*64
pshufb m4, [base+pw_256]
psubw m4, [base+z_base_inc] ; max_base_x - base_inc<<6
mova m6, m5
.w8_loop:
mov r3, r5
sar r3, 6 ; base
movu m0, [tlq+r3]
pand m1, m8, m5 ; frac
psubw m2, m9, m1 ; 64-frac
psllw m1, 8
pshufb m0, m3
por m1, m2 ; 64-frac, frac
pmaddubsw m0, m1 ; linear interpolation
pcmpgtw m1, m4, m5 ; base < max_base_x
paddw m5, m6 ; xpos += dx
pmulhrsw m0, m10
pand m0, m1 ; keep in-range pixels,
pandn m1, m7 ; clamp the rest to top[max_base_x]
por m0, m1
packuswb m0, m0
movq [dstq], m0
dec hd
jz .w8_end
movifnidn strideq, stridemp
add dstq, strideq
add r5, dxq
jl .w8_loop ; still rows with unclamped pixels
packuswb m7, m7
; Remaining rows are fully past the edge: fill with the clamp pixel.
.w8_end_loop:
movq [dstq], m7
add dstq, strideq
dec hd
jg .w8_end_loop
.w8_end:
RET
; 16-wide: edge-filter setup. No upsampling at this size; only the
; strength-dependent smoothing applies, plus an extra filtered tail sample
; when h > 16 extends max_base past the stored edge.
.w16:
lea r3d, [hq+15]
movd m0, r3d
and r3d, 15
or r3d, 16 ; imin(h+15, 31)
test angled, 0x400
jnz .w16_main ; edge filtering disabled
movd m2, angled
shr angled, 8 ; is_sm << 1
pxor m1, m1
pshufb m0, m1 ; broadcast h+15
pshufb m2, m1 ; broadcast angle byte
movq m3, [base+z_filter_t_w16+angleq*4]
pcmpeqb m0, [base+z_filter_wh16] ; select the row matching this block size
pand m0, m2
pcmpgtb m0, m3 ; angle above per-size threshold?
pmovmskb r5d, m0
test r5d, r5d
jz .w16_main ; filter_strength == 0
movd m4, [tlq-1]
movu m0, [tlq+16*0]
imul r5d, 0x24924924 ; ones-count trick for the 3-bit mask
movu m1, [tlq+16*1]
shr r5d, 30 ; (filter_strength, rounded up) in 2 bits...
movd m2, [tlq+30]
adc r5, -4 ; ...carry folds in bit 29: filter_strength-3
movd m3, [tlq+r3]
lea tlq, [rsp+16*4] ; filtered edge lives on the stack from here on
mova [tlq-16*1], m0
pxor m7, m7
mova [tlq+16*0], m1
pshufb m4, m7 ; replicate tl pixel before the edge
movd [rsp], m2 ; save tlq[30..] for the h>16 tail filter below
pshufb m3, m7 ; replicate last pixel after the edge
mova [tlq-16*2], m4
movd [tlq+r3-16], m3
call .filter_edge ; shared edge smoother (defined outside this chunk)
cmp hd, 16
jle .w16_main
; h > 16: synthesize one extra filtered sample at edge position 32.
pshuflw m0, [rsp], q0000
sar r5, 1
movd m1, [base+z_filter_k_tail+4+r5*4]
lea r3d, [r5+33] ; extended max_base
pmaddubsw m0, m1
%if ARCH_X86_64
pmulhrsw m0, m10
%else
pmulhrsw m0, m4 ; x86-32 has only 8 xmm regs; m4 holds the rounding const here
%endif
packuswb m0, m0
movd [tlq+32], m0
; 16-wide main loop: one full 16-pixel row per iteration. Unlike the
; narrower sizes, the past-the-edge clamp is done in the byte domain:
; bases are packed to bytes and compared against max_base - pb_0to15.
.w16_main:
add tlq, r3 ; tlq -> top[max_base_x]
movd m5, dxd
movd m7, [tlq] ; top[max_base_x]
movd m4, r3d
shl r3d, 6
pshufb m5, [base+pw_256] ; broadcast dx
pxor m6, m6
pshufb m7, m6 ; broadcast the clamp pixel (bytes)
mov r5d, dxd ; xpos
pshufb m4, m6 ; broadcast max_base (bytes)
sub r5, r3 ; bias xpos by -max_base_x*64
psubb m4, [base+pb_0to15] ; per-lane max_base threshold
mova m6, m5
.w16_loop:
mov r3, r5
sar r3, 6 ; base
movu m1, [tlq+r3+0]
pand m0, m8, m5 ; frac
movu m2, [tlq+r3+1]
psubw m3, m9, m0 ; 64-frac
psllw m0, 8
por m3, m0 ; 64-frac, frac
punpcklbw m0, m1, m2 ; interleave edge sample pairs for pmaddubsw
pmaddubsw m0, m3
punpckhbw m1, m2
pmaddubsw m1, m3
psrlw m3, m5, 6 ; per-lane base
packsswb m3, m3
pmulhrsw m0, m10
pmulhrsw m1, m10
paddw m5, m6 ; xpos += dx
pcmpgtb m2, m4, m3 ; base < max_base_x (byte compare)
packuswb m0, m1
pand m0, m2 ; keep in-range pixels,
pandn m2, m7 ; clamp the rest to top[max_base_x]
por m0, m2
mova [dstq], m0
dec hd
jz .w16_end
movifnidn strideq, stridemp
add dstq, strideq
add r5, dxq
jl .w16_loop ; still rows with unclamped pixels
; Remaining rows are fully past the edge: fill with the clamp pixel.
.w16_end_loop:
mova [dstq], m7
add dstq, strideq
dec hd
jg .w16_end_loop
.w16_end:
RET
.w32:
lea r3d, [hq+31]
and r3d, 31
or r3d, 32 ; imin(h+31, 63)
test angled, 0x400 ; !enable_intra_edge_filter
jnz .w32_main
movd m6, [tlq-1]
movu m0, [tlq+16*0]
movu m1, [tlq+16*1]
movu m2, [tlq+16*2]
movu m3, [tlq+16*3]
movd m4, [tlq+62]
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5 C=89 H=79 G=83
¤ Dauer der Verarbeitung: 0.24 Sekunden
(vorverarbeitet)
¤
*© Formatika GbR, Deutschland