; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; Copyright © 2021, Matthias Dressel
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 32
; itx4_shuf: dword indices for vpermd; after "psrld m, 4" the same data
; doubles as a pshufb byte mask (see the 4x4 pass1 tails that use it for
; the word transpose).
itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6
dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7
; vpermd lane-reorder masks used by the 12bpc paths
idct4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7
idct4_12_shuf2: dd 2, 0, 6, 4, 3, 1, 7, 5
iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
idct16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6
iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5
; rounding constant with per-half sign flip (negates one half of the
; packed iadst outputs while rounding)
pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048
; pshufb masks reordering packed 16-bit coefficients
idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11
idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
; COEF_PAIR a, b [, emit_neg]: store the coefficient pair as
; {a, a, b, b} dwords and define the scalar aliases pd_<a>/pd_<b>
; pointing into that storage (so a vpbroadcastd of either alias works).
; With the optional third argument non-zero, -b is appended twice so a
; vbroadcasti128 of pd_<b>_m<b> (aliased to pd_<b>) yields {b, b, -b, -b}.
%macro COEF_PAIR 2-3 0
pd_%1_%2: dd %1, %1, %2, %2
%define pd_%1 (pd_%1_%2 + 4*0)
%define pd_%2 (pd_%1_%2 + 4*2)
%if %3
dd -%2, -%2
%define pd_%2_m%2 pd_%2
%endif
%endmacro
; Paired transform coefficients; see COEF_PAIR for layout and the pd_*
; aliases it defines.
COEF_PAIR 201, 995
COEF_PAIR 401, 1931
COEF_PAIR 799, 3406
COEF_PAIR 1380, 601
COEF_PAIR 1751, 2440
COEF_PAIR 2598, 1189
COEF_PAIR 2751, 2106
COEF_PAIR 2896, 1567, 1
COEF_PAIR 2896, 3784, 1
COEF_PAIR 3035, 3513
COEF_PAIR 3166, 3920
COEF_PAIR 3703, 3290
COEF_PAIR 3857, 4052
COEF_PAIR 4017, 2276
COEF_PAIR 4076, 3612
COEF_PAIR 4091, 3973
; Standalone dword constants: negated coefficients and combined rounding
; biases (the trailing comments give each bias decomposition).
pd_8: dd 8
pd_m601: dd -601
pd_m1189: dd -1189
pd_m1380: dd -1380
pd_m2106: dd -2106
pd_m2598: dd -2598
pd_m2751: dd -2751
pd_m3344: dd -3344
pd_1024: dd 1024
pd_1321: dd 1321
pd_1448: dd 1448
pd_1697: dd 1697
pd_2482: dd 2482
pd_3072: dd 3072 ; 1024 + 2048
pd_3803: dd 3803
pd_5119: dd 5119 ; 1024 + 4096 - 1
pd_5120: dd 5120 ; 1024 + 4096
pd_5793: dd 5793
pd_6144: dd 6144 ; 2048 + 4096
pd_17408: dd 17408 ; 1024 + 16384
; Per-bitdepth clamp ceilings and DC-only saturation biases
pixel_10bpc_max: times 2 dw 0x03ff
pixel_12bpc_max: times 2 dw 0x0fff
dconly_10bpc: times 2 dw 0x7c00
dconly_12bpc: times 2 dw 0x7000
; Signed clip bounds for the 18/20-bit intermediate coefficient domains
clip_18b_min: dd -0x20000
clip_18b_max: dd 0x1ffff
clip_20b_min: dd -0x80000
clip_20b_max: dd 0x7ffff
; Coefficient table for the 64-point inverse DCT, 16bpc variant;
; presumably consumed by the Nx64 kernels elsewhere in this file —
; not referenced in this chunk (exported via `const`).
const idct64_mul_16bpc
dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406
cextern deint_shuf
cextern idct64_mul
cextern pw_1697x8
cextern pw_1697x16
cextern pw_1567_3784
cextern pw_m1567_m3784
cextern pw_m3784_1567
cextern pw_2896_2896
cextern pw_m2896_2896
cextern pw_5
cextern pw_2048
cextern pw_4096
cextern pw_8192
cextern pw_16384
cextern pw_2896x8
cextern pd_2048
cextern idct_4x8_internal_8bpc_avx2.main
cextern idct_4x16_internal_8bpc_avx2.main
cextern idct_8x8_internal_8bpc_avx2.main
cextern idct_8x16_internal_8bpc_avx2.main
cextern idct_16x4_internal_8bpc_avx2.main
cextern idct_16x8_internal_8bpc_avx2.main
cextern idct_16x16_internal_8bpc_avx2.main
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal
cextern iadst_4x4_internal_8bpc_avx2.main
cextern iadst_4x8_internal_8bpc_avx2.main_pass2
cextern iadst_4x16_internal_8bpc_avx2.main2
cextern iadst_8x4_internal_8bpc_avx2.main
cextern iadst_8x8_internal_8bpc_avx2.main_pass2
cextern iadst_8x16_internal_8bpc_avx2.main
cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end
cextern iadst_16x4_internal_8bpc_avx2.main
cextern iadst_16x8_internal_8bpc_avx2.main
cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end
cextern iadst_16x16_internal_8bpc_avx2.main
cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end
SECTION .text
; m(x) expands to the mangled symbol for function x with the current ISA
; suffix (e.g. _avx2), as set up by x86inc.
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
; Assemble the given statement(s) with XMM-sized registers, then restore
; YMM as the default register width.
%macro WRAP_XMM 1+
INIT_XMM cpuname
%1
INIT_YMM cpuname
%endmacro
; One dimension of the 4-point inverse Walsh-Hadamard transform on two
; packed rows of dwords. The vpermq duplications spread the butterfly
; terms across both 128-bit lanes so both halves are processed at once;
; the caller interleaves m0/m2/m3 to recover the output rows.
%macro IWHT4_1D_PACKED 0
; m0 = in0 in2, m1 = in1 in3
psubd m2, m0, m1 ; t2
paddd xm0, xm1 ; t0
vpermq m2, m2, q3322
vpermq m0, m0, q1100
vpermq m1, m1, q3120
psubd m3, m0, m2
psrad m3, 1
psubd m3, m1 ; t1 t3
psubd m0, m3 ; ____ out0
paddd m2, m3 ; out3 ____
%endmacro
INIT_YMM avx2
; Inverse 4x4 Walsh-Hadamard transform + add, 16bpc:
; two IWHT4_1D_PACKED passes with a word transpose between them, result
; added to dst and clamped to [0, bdmax]. The coefficient buffer is
; zeroed on the way. bdmax arrives in a register or, on win64, on the
; stack (see the %ifidn below).
cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
mova xm0, [cq+16*0]
vinserti128 m0, [cq+16*2], 1
mova xm1, [cq+16*1]
vinserti128 m1, [cq+16*3], 1
pxor m4, m4
mova [cq+32*0], m4 ; clear coefficients
mova [cq+32*1], m4
lea r6, [dstq+strideq*2]
psrad m0, 2 ; input downshift
psrad m1, 2
IWHT4_1D_PACKED
punpckhdq m0, m3
punpckldq m3, m2
punpckhqdq m1, m0, m3
punpcklqdq m0, m3
IWHT4_1D_PACKED
vpblendd m0, m2, 0x33
packssdw m0, m3
vextracti128 xm2, m0, 1
punpckhdq xm1, xm0, xm2 ; out2 out1
punpckldq xm0, xm2 ; out3 out0
movq xm2, [r6 +strideq*1]
movhps xm2, [dstq+strideq*0]
movq xm3, [r6 +strideq*0]
movhps xm3, [dstq+strideq*1]
%ifidn bdmaxd, bdmaxm
movd xm5, bdmaxd
vpbroadcastw xm5, xm5
%else ; win64: load from stack
vpbroadcastw xm5, bdmaxm
%endif
paddsw xm0, xm2
paddsw xm1, xm3
pmaxsw xm0, xm4 ; clamp to [0, bdmax]
pmaxsw xm1, xm4
pminsw xm0, xm5
pminsw xm1, xm5
movhps [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm1
movq [r6 +strideq*0], xm1
movq [r6 +strideq*1], xm0
RET
; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
; flags: 1 = packed, 2 = inv_dst2
; skip round/shift if rnd is not a number
; A coef argument < 32 is a register index (m<coef> already holds the
; broadcast constant); otherwise it names a pd_* constant that is
; broadcast here (128-bit broadcast for packed coef pairs when flags&1).
%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
%if %8 < 32
pmulld m%4, m%1, m%8
pmulld m%3, m%2, m%8
%else
%if %9 & 1
vbroadcasti128 m%3, [pd_%8]
%else
vpbroadcastd m%3, [pd_%8]
%endif
pmulld m%4, m%1, m%3
pmulld m%3, m%2
%endif
%if %7 < 32
pmulld m%1, m%7
pmulld m%2, m%7
%else
%if %9 & 1
vbroadcasti128 m%5, [pd_%7]
%else
vpbroadcastd m%5, [pd_%7]
%endif
pmulld m%1, m%5
pmulld m%2, m%5
%endif
%if %9 & 2
psubd m%4, m%6, m%4 ; inv_dst2: negate the sum term
psubd m%2, m%4, m%2
%else
%ifnum %6
paddd m%4, m%6
%endif
paddd m%2, m%4
%endif
%ifnum %6
paddd m%1, m%6
%endif
psubd m%1, m%3
%ifnum %6
psrad m%2, 12
psrad m%1, 12
%endif
%endmacro
; INV_TXFM_FN type1, type2, eob_offset, size[, bitdepth]
; Emits the public inv_txfm_add_<t1>_<t2>_<size>_<bpc>bpc entry point.
; tx2q is loaded with the second transform's .pass2 label; the first
; transform's pass1 jumps through it when done. For dct_dct, eob == 0
; falls through into the DC-only fast path supplied by the caller macro.
%macro INV_TXFM_FN 4-5 10 ; type1, type2, eob_offset, size, bitdepth
cglobal inv_txfm_add_%1_%2_%4_%5bpc, 4, 5, 0, dst, stride, c, eob, tx2
%define %%p1 m(i%1_%4_internal_%5bpc)
; Jump to the 1st txfm function if we're not taking the fast path, which
; in turn performs an indirect jump to the 2nd txfm function.
lea tx2q, [m(i%2_%4_internal_%5bpc).pass2]
%ifidn %1_%2, dct_dct
test eobd, eobd
jnz %%p1
%else
%if %3
add eobd, %3
%endif
; jump to the 1st txfm function unless it's located directly after this
times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endmacro
; INV_TXFM_4X4_FN type1, type2[, bitdepth]
; Instantiates the 4x4 wrapper. For dct_dct it also emits the shared
; DC-only path: dc' = ((dc*181 + 128) >> 8); out = (dc'*181 + 2176) >> 12
; (2176 = 2048 + 128), broadcast as words; in the loop the dconly bias is
; added with signed saturation and removed with unsigned saturation,
; clamping the result to the bitdepth range. The 12bpc variant tail-calls
; the 10bpc .dconly with its own dconly_12bpc bias already in xm2.
%macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 4x4, %3
%ifidn %1_%2, dct_dct
vpbroadcastd xm2, [dconly_%3bpc]
%if %3 = 10
.dconly:
imul r6d, [cq], 181
mov [cq], eobd ; 0
or r3d, 4
.dconly2:
add r6d, 128
sar r6d, 8
.dconly3:
imul r6d, 181
add r6d, 2176
sar r6d, 12
movd xm0, r6d
paddsw xm0, xm2
vpbroadcastw xm0, xm0
.dconly_loop:
movq xm1, [dstq+strideq*0]
movhps xm1, [dstq+strideq*1]
paddsw xm1, xm0
psubusw xm1, xm2
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
lea dstq, [dstq+strideq*2]
sub r3d, 2
jg .dconly_loop
WRAP_XMM RET
%else
jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly
%endif
%endif
%endmacro
; 4-point inverse DCT on packed dword rows (two rows per register),
; using the combined {2896,1567}/{2896,3784} coefficient pairs; the
; qword unpacks regroup the rotation results into (t0 t1)/(t3 t2)
; before the final butterfly.
%macro IDCT4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd
ITX_MULSUB_2D %1, %2, %3, %4, %5, %6, 2896_1567, 2896_3784, 1
punpckhqdq m%3, m%2, m%1 ; t3 t2
punpcklqdq m%2, m%1 ; t0 t1
paddd m%1, m%2, m%3 ; out0 out1
psubd m%2, m%3 ; out3 out2
%endmacro
; 16-bit variant of the packed 4-point inverse DCT: words are
; interleaved so pmaddwd with the paired pw_* constants performs both
; multiplies of each rotation at once; results are rounded (>> 12) and
; re-packed before the final saturating butterfly.
%macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd
vpbroadcastd m%5, [pw_m3784_1567]
punpckhwd m%3, m%2, m%1
vpbroadcastd m%4, [pw_1567_3784]
punpcklwd m%2, m%1
vpbroadcastd m%1, [pw_m2896_2896]
pmaddwd m%5, m%3
pmaddwd m%3, m%4
vpbroadcastd m%4, [pw_2896_2896]
pmaddwd m%1, m%2
pmaddwd m%2, m%4
REPX {paddd x, m%6}, m%5, m%3, m%1, m%2
REPX {psrad x, 12 }, m%5, m%3, m%1, m%2
packssdw m%3, m%5 ; t3 t2
packssdw m%2, m%1 ; t0 t1
paddsw m%1, m%2, m%3 ; out0 out1
psubsw m%2, m%3 ; out3 out2
%endmacro
INV_TXFM_4X4_FN dct, dct
INV_TXFM_4X4_FN dct, identity
INV_TXFM_4X4_FN dct, adst
INV_TXFM_4X4_FN dct, flipadst
; 4x4 inverse DCT, 10bpc. Pass1 runs the packed dword idct4 (.main,
; shared with the 12bpc version) and packs/shuffles to words; pass2 runs
; the 16-bit packed idct4, rounds via pmulhrsw with pw_2048 (recreated
; by packing the pd_2048 still live in m5), then adds to dst with a
; clamp to the 10-bit ceiling. Coefficients are zeroed in pass2.
cglobal idct_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
call .main
vbroadcasti128 m2, [idct4_shuf]
packssdw m0, m1
pshufb m0, m2
jmp tx2q
.pass2:
vextracti128 xm1, m0, 1
WRAP_XMM IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5
packssdw xm5, xm5 ; pw_2048
pmulhrsw xm0, xm5
pmulhrsw xm1, xm5
movq xm2, [dstq+strideq*0]
movhps xm2, [dstq+strideq*1]
lea r6, [dstq+strideq*2]
movq xm3, [r6 +strideq*1]
movhps xm3, [r6 +strideq*0]
vpbroadcastd xm5, [pixel_10bpc_max]
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
paddw xm0, xm2
paddw xm1, xm3
pmaxsw xm0, xm4
pmaxsw xm1, xm4
pminsw xm0, xm5
pminsw xm1, xm5
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movhps [r6 +strideq*0], xm1
movq [r6 +strideq*1], xm1
RET
ALIGN function_align
.main:
vpermq m0, [cq+32*0], q3120
vpermq m1, [cq+32*1], q3120
vpbroadcastd m5, [pd_2048]
.main2:
IDCT4_1D_PACKED 0, 1, 2, 3, 4, 5
ret
INV_TXFM_4X4_FN adst, dct
INV_TXFM_4X4_FN adst, adst
INV_TXFM_4X4_FN adst, flipadst
INV_TXFM_4X4_FN adst, identity
; 4-point inverse ADST on dwords (in0..in3 = m0..m3). Outputs land
; unrounded in m4 (out0), m6 (out1), m2 (out2), m3 (out3); the caller
; applies the rounding and shift. Per-line comments track the terms.
%macro IADST4_1D 0
vpbroadcastd m5, [pd_1321]
vpbroadcastd m7, [pd_2482]
pmulld m4, m0, m5 ; 1321*in0
pmulld m6, m3, m7 ; 2482*in3
paddd m4, m6 ; 1321*in0 + 2482*in3
pmulld m6, m0, m7 ; 2482*in0
paddd m0, m3 ; in0 + in3
paddd m7, m5 ; pd_3803
pmulld m5, m2 ; 1321*in2
pmulld m3, m7 ; 3803*in3
pmulld m7, m2 ; 3803*in2
psubd m2, m0 ; in2 - in0 - in3
vpbroadcastd m0, [pd_m3344]
pmulld m1, m0 ; -t3
pmulld m2, m0 ; out2 (unrounded)
psubd m6, m5 ; 2482*in0 - 1321*in2
paddd m4, m7 ; t0
psubd m6, m3 ; t1
paddd m3, m4, m6
psubd m4, m1 ; out0 (unrounded)
psubd m6, m1 ; out1 (unrounded)
paddd m3, m1 ; out3 (unrounded)
%endmacro
; 4x4 inverse ADST, 10bpc. Pass1 runs the dword IADST4_1D (via .main,
; shared with flipadst and the 12bpc code), rounds (>> 12) and word-
; transposes through itx4_shuf; pass2 reuses the 8bpc iadst4 kernel.
; On win64, .main spills xmm6/xmm7 at [rsp+16]/[rsp+32]; .pass1_end
; reloads them at [rsp+8]/[rsp+24] because .main runs with rsp 8 lower
; (the call's return address).
cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
call .main
vinserti128 m0, m4, xm6, 1
vinserti128 m1, m2, xm3, 1
.pass1_end:
vpbroadcastd m5, [pd_2048]
mova m2, [itx4_shuf]
paddd m0, m5
paddd m1, m5
psrad m0, 12
psrad m1, 12
packssdw m0, m1
vpermd m0, m2, m0
psrld m2, 4
pshufb m0, m2
%if WIN64
movaps xmm6, [rsp+ 8]
movaps xmm7, [rsp+24]
%endif
jmp tx2q
.pass2:
lea r6, [deint_shuf+128]
vextracti128 xm1, m0, 1
call m(iadst_4x4_internal_8bpc).main
.end:
vpbroadcastd xm4, [pw_2048]
movq xm2, [dstq+strideq*0]
movhps xm2, [dstq+strideq*1]
lea r6, [dstq+strideq*2]
movq xm3, [r6 +strideq*0]
movhps xm3, [r6 +strideq*1]
vpbroadcastd xm5, [pixel_10bpc_max]
pmulhrsw xm0, xm4
pmulhrsw xm1, xm4
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
paddw xm0, xm2
paddw xm1, xm3
pmaxsw xm0, xm4
pmaxsw xm1, xm4
pminsw xm0, xm5
pminsw xm1, xm5
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [r6 +strideq*0], xm1
movhps [r6 +strideq*1], xm1
RET
ALIGN function_align
.main:
mova xm0, [cq+16*0]
mova xm1, [cq+16*1]
mova xm2, [cq+16*2]
mova xm3, [cq+16*3]
%if WIN64
movaps [rsp+16], xmm6
movaps [rsp+32], xmm7
%endif
.main2:
WRAP_XMM IADST4_1D
ret
INV_TXFM_4X4_FN flipadst, dct
INV_TXFM_4X4_FN flipadst, adst
INV_TXFM_4X4_FN flipadst, flipadst
INV_TXFM_4X4_FN flipadst, identity
; 4x4 inverse flip-ADST, 10bpc: same kernels as iadst, but pass1
; assembles the outputs in reversed order and pass2 loads/stores the
; destination rows mirrored (note the swapped movq/movhps row pairs).
cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
call m(iadst_4x4_internal_10bpc).main
vinserti128 m0, m3, xm2, 1
vinserti128 m1, m6, xm4, 1
jmp m(iadst_4x4_internal_10bpc).pass1_end
.pass2:
lea r6, [deint_shuf+128]
vextracti128 xm1, m0, 1
call m(iadst_4x4_internal_8bpc).main
vpbroadcastd xm4, [pw_2048]
movq xm3, [dstq+strideq*1]
movhps xm3, [dstq+strideq*0]
lea r6, [dstq+strideq*2]
movq xm2, [r6 +strideq*1]
movhps xm2, [r6 +strideq*0]
vpbroadcastd xm5, [pixel_10bpc_max]
pmulhrsw xm0, xm4
pmulhrsw xm1, xm4
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
paddw xm0, xm2
paddw xm1, xm3
pmaxsw xm0, xm4
pmaxsw xm1, xm4
pminsw xm0, xm5
pminsw xm1, xm5
movhps [dstq+strideq*0], xm1
movq [dstq+strideq*1], xm1
movhps [r6 +strideq*0], xm0
movq [r6 +strideq*1], xm0
RET
INV_TXFM_4X4_FN identity, dct
INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity
; 4x4 identity transform, 10bpc. Pass1 scales by 5793/4096 (pd_5793,
; rounded >> 12) and word-transposes through itx4_shuf. Pass2 applies
; the same scale in 16-bit form: pmulhrsw with pw_1697x8 computes
; x*1697/4096, and paddsw adds it back, giving x*5793/4096 total; m5
; still holds pd_2048 from pass1 and is packed into pw_2048 for the
; final rounding multiply.
cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
vpbroadcastd m1, [pd_5793]
pmulld m0, m1, [cq+32*0]
pmulld m1, [cq+32*1]
vpbroadcastd m5, [pd_2048]
mova m3, [itx4_shuf]
paddd m0, m5
paddd m1, m5
psrad m0, 12
psrad m1, 12
packssdw m0, m1
vpermd m0, m3, m0
psrld m3, 4
pshufb m0, m3
jmp tx2q
.pass2:
vpbroadcastd m1, [pw_1697x8]
movq xm2, [dstq+strideq*0]
movhps xm2, [dstq+strideq*1]
lea r6, [dstq+strideq*2]
pmulhrsw m1, m0
paddsw m0, m1
movq xm3, [r6 +strideq*0]
movhps xm3, [r6 +strideq*1]
vpbroadcastd xm4, [pixel_10bpc_max]
packssdw m5, m5 ; pw_2048
pmulhrsw m0, m5
pxor m5, m5
mova [cq+32*0], m5
mova [cq+32*1], m5
vextracti128 xm1, m0, 1
paddw xm0, xm2
paddw xm1, xm3
pmaxsw xm0, xm5
pmaxsw xm1, xm5
pminsw xm0, xm4
pminsw xm1, xm4
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [r6 +strideq*0], xm1
movhps [r6 +strideq*1], xm1
RET
INV_TXFM_4X4_FN dct, dct, 12
INV_TXFM_4X4_FN dct, identity, 12
INV_TXFM_4X4_FN dct, adst, 12
INV_TXFM_4X4_FN dct, flipadst, 12
; 4x4 inverse DCT, 12bpc. Pass1 reuses the 10bpc dword idct4, then
; reorders lanes and merges into the iadst 12bpc tail (.pass1_end2)
; for the clip_18b clamp. Pass2 re-runs the dword idct4 at full
; precision and exits through the shared iadst 12bpc .end store path.
cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(idct_4x4_internal_10bpc).main
mova m3, [idct4_12_shuf]
mova m4, [idct4_12_shuf2]
vpermd m2, m4, m1
vpermd m1, m3, m0
jmp m(iadst_4x4_internal_12bpc).pass1_end2
.pass2:
vpbroadcastd m5, [pd_2048]
vpermq m0, m0, q3120
vpermq m1, m1, q3120
call m(idct_4x4_internal_10bpc).main2
vpermq m0, m0, q3120
vpermq m1, m1, q2031
jmp m(iadst_4x4_internal_12bpc).end
INV_TXFM_4X4_FN adst, dct, 12
INV_TXFM_4X4_FN adst, adst, 12
INV_TXFM_4X4_FN adst, flipadst, 12
INV_TXFM_4X4_FN adst, identity, 12
; 4x4 inverse ADST, 12bpc. Pass1 reuses the 10bpc dword IADST4_1D, then
; keeps one extra bit of precision (>> 1, round with pd_1024, >> 11
; instead of >> 12) before clamping to the 18-bit domain (.pass1_end2,
; also entered by the dct/identity 12bpc pass1 tails). Pass2 re-runs
; IADST4_1D on the transposed data and rounds; .end (shared with the
; sibling 12bpc 4x4 functions) downshifts (>> 3 then pmulhrsw pw_16384,
; i.e. a rounded >> 4 overall), adds to dst and clamps to 12-bit range.
; .end restores win64 xmm regs early and caps xmm_regs_used at 6 since
; the store tail only touches m0-m5.
cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(iadst_4x4_internal_10bpc).main
vinserti128 m1, m4, xm6, 1
vinserti128 m2, xm3, 1
.pass1_end:
mova m3, [itx4_shuf]
vpbroadcastd m5, [pd_1024]
psrad m1, 1
psrad m2, 1
vpermd m1, m3, m1
vpermd m2, m3, m2
paddd m1, m5
paddd m2, m5
psrad m1, 11
psrad m2, 11
.pass1_end2:
vpbroadcastd m3, [clip_18b_min]
vpbroadcastd m4, [clip_18b_max]
punpcklqdq m0, m1, m2
punpckhqdq m1, m2
pmaxsd m0, m3
pmaxsd m1, m3
pminsd m0, m4
pminsd m1, m4
jmp tx2q
.pass2:
call .main_pass2
vinserti128 m0, m4, xm6, 1
vinserti128 m1, m2, xm3, 1
.pass2_end:
vpbroadcastd m5, [pd_2048]
paddd m0, m5
paddd m1, m5
psrad m0, 12
psrad m1, 12
.end:
%if WIN64
WIN64_RESTORE_XMM_INTERNAL
%assign xmm_regs_used 6
%endif
.end2:
vpbroadcastd m4, [pw_16384]
movq xm2, [dstq+strideq*0]
movq xm3, [dstq+strideq*1]
lea r6, [dstq+strideq*2]
movhps xm2, [r6 +strideq*0] ; dst0 dst2
movhps xm3, [r6 +strideq*1] ; dst1 dst3
vpbroadcastd m5, [pixel_12bpc_max]
vinserti128 m2, xm3, 1
psrad m0, 3
psrad m1, 3
packssdw m0, m1 ; t0 t2 t1 t3
pmulhrsw m0, m4
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
paddw m0, m2 ; out0 out2 out1 out3
pmaxsw m0, m4
pminsw m0, m5
vextracti128 xm1, m0, 1 ; out1 out3
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm1
movhps [r6 +strideq*0], xm0
movhps [r6 +strideq*1], xm1
RET
; Redistribute the clipped pass1 rows into xm0-xm3 for IADST4_1D.
.main_pass2:
vextracti128 xm3, m1, 1
mova xm2, xm1
vextracti128 xm1, m0, 1
jmp m(iadst_4x4_internal_10bpc).main2
INV_TXFM_4X4_FN flipadst, dct, 12
INV_TXFM_4X4_FN flipadst, adst, 12
INV_TXFM_4X4_FN flipadst, flipadst, 12
INV_TXFM_4X4_FN flipadst, identity, 12
; 4x4 inverse flip-ADST, 12bpc: the iadst 12bpc kernels with the
; output registers gathered in reversed order in both passes.
cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(iadst_4x4_internal_10bpc).main
vinserti128 m1, m3, xm2, 1
vinserti128 m2, m6, xm4, 1
jmp m(iadst_4x4_internal_12bpc).pass1_end
.pass2:
call m(iadst_4x4_internal_12bpc).main_pass2
vinserti128 m0, m3, xm2, 1
vinserti128 m1, m6, xm4, 1
jmp m(iadst_4x4_internal_12bpc).pass2_end
INV_TXFM_4X4_FN identity, dct, 12
INV_TXFM_4X4_FN identity, adst, 12
INV_TXFM_4X4_FN identity, flipadst, 12
INV_TXFM_4X4_FN identity, identity, 12
; 4x4 identity transform, 12bpc. Pass1 computes x + x*1697/4096
; (= x*5793/4096) on dwords, transposing via vpermd up front, and joins
; the iadst 12bpc clip tail. Pass2 multiplies by pd_5793 (rounded >> 12)
; and exits through the shared iadst 12bpc .end store path.
cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
mova m2, [itx4_shuf]
vpbroadcastd m3, [pd_1697]
vpermd m0, m2, [cq+32*0]
vpermd m2, m2, [cq+32*1]
vpbroadcastd m5, [pd_2048]
pmulld m1, m3, m0
pmulld m3, m2
paddd m1, m5
paddd m3, m5
psrad m1, 12
psrad m3, 12
paddd m1, m0
paddd m2, m3
jmp m(iadst_4x4_internal_12bpc).pass1_end2
.pass2:
; m0 = in0 in1
; m1 = in2 in3
vpbroadcastd m3, [pd_5793]
vpbroadcastd m5, [pd_2048]
pmulld m0, m3
pmulld m1, m3
paddd m0, m5 ; 2048
paddd m1, m5
psrad m0, 12
psrad m1, 12
jmp m(iadst_4x4_internal_12bpc).end
; INV_TXFM_4X8_FN type1, type2[, bitdepth]
; Instantiates the 4x8 wrapper. The dct_dct DC-only path applies the
; first 181/256 scale here (the rectangular 2:1 adjustment) and shares
; the remaining scale/store loop with the 4x4 .dconly2 tail; the 12bpc
; variant tail-calls the 10bpc path with dconly_12bpc already in xm2.
%macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 4x8, %3
%ifidn %1_%2, dct_dct
vpbroadcastd xm2, [dconly_%3bpc]
%if %3 = 10
.dconly:
imul r6d, [cq], 181
mov [cq], eobd ; 0
or r3d, 8
add r6d, 128
sar r6d, 8
imul r6d, 181
jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2
%else
jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly
%endif
%endif
%endmacro
; Full-width 4-point inverse DCT on dword vectors.
; In:  m%1..m%4 = in0..in3, m%8 = pd_2048.
; Out: m%1 = out0 (t0+t3), m%2 = out1 (t1+t2),
;      m%3 = out2 (t1-t2), m%4 = out3 (t0-t3).
%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; t2, t3
vpbroadcastd m%5, [pd_2896]
pmulld m%1, m%5
pmulld m%3, m%5
paddd m%1, m%8
paddd m%5, m%1, m%3
psubd m%1, m%3
psrad m%5, 12 ; t0
psrad m%1, 12 ; t1
psubd m%3, m%1, m%2
paddd m%2, m%1
paddd m%1, m%5, m%4
psubd m%4, m%5, m%4
%endmacro
INV_TXFM_4X8_FN dct, dct
INV_TXFM_4X8_FN dct, identity
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst
; 4x8 inverse DCT, 10bpc. Pass1 (shared with the 12bpc version via
; .pass1) applies the 2896/4096 rectangular scale, rounds, and runs the
; dword idct4. Pass2 packs to words, transposes, calls the 8bpc idct8
; kernel, rounds with pw_2048 and stores with a 10-bit clamp; rows come
; back in 0 1 / 3 2 / 4 5 / 7 6 order (see the load/store pairing).
cglobal idct_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
.pass1:
vpbroadcastd m3, [pd_2896]
pmulld m0, m3, [cq+32*0]
pmulld m1, m3, [cq+32*1]
pmulld m2, m3, [cq+32*2]
pmulld m3, m3, [cq+32*3]
vpbroadcastd m7, [pd_2048]
REPX {paddd x, m7}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7
jmp tx2q
.pass2:
packssdw m0, m2
packssdw m1, m3
lea r6, [deint_shuf+128]
punpckhwd m2, m0, m1
punpcklwd m0, m1
punpckhdq m1, m0, m2 ; 2 3
punpckldq m0, m2 ; 0 1
vextracti128 xm2, m0, 1 ; 4 5
vextracti128 xm3, m1, 1 ; 6 7
call m(idct_4x8_internal_8bpc).main
vpbroadcastd xm4, [pw_2048]
REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
lea r3, [strideq*3]
lea r6, [dstq+strideq*4]
movq xm4, [dstq+strideq*0]
movhps xm4, [dstq+strideq*1]
movq xm5, [dstq+r3 ]
movhps xm5, [dstq+strideq*2]
movq xm6, [r6 +strideq*0]
movhps xm6, [r6 +strideq*1]
movq xm7, [r6 +r3 ]
movhps xm7, [r6 +strideq*2]
paddw xm0, xm4 ; 0 1
paddw xm1, xm5 ; 3 2
paddw xm2, xm6 ; 4 5
paddw xm3, xm7 ; 7 6
vpbroadcastd xm5, [pixel_10bpc_max]
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3
REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movhps [dstq+strideq*2], xm1
movq [dstq+r3 ], xm1
movq [r6 +strideq*0], xm2
movhps [r6 +strideq*1], xm2
movhps [r6 +strideq*2], xm3
movq [r6 +r3 ], xm3
RET
INV_TXFM_4X8_FN adst, dct
INV_TXFM_4X8_FN adst, adst
INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity
; 4x8 inverse ADST, 10bpc. Pass1 reuses the 8x4 dword iadst main and
; rounds; pass2 transposes to words and calls the 8bpc iadst kernel,
; with pw_2048_m2048 rounding while negating the second half of each
; register. The dword iadst8 in .main/.main2/.main3 is shared with the
; 12bpc path; it expects the clip_18b bounds in m8/m9 at .main2/.main3
; (loaded by .main itself, or by the 12bpc caller).
cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(iadst_8x4_internal_10bpc).main
vpbroadcastd m5, [pd_2048]
paddd m0, m5, m4
paddd m1, m5, m6
paddd m2, m5
paddd m3, m5
.pass1_end:
REPX {psrad x, 12}, m0, m1, m2, m3
jmp tx2q
.pass2:
call .pass2_main
mova xm4, [pw_2048_m2048]
REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
.end:
lea r3, [strideq*3]
lea r6, [dstq+strideq*4]
movq xm4, [dstq+strideq*0]
movhps xm4, [dstq+strideq*1]
movq xm5, [dstq+strideq*2]
movhps xm5, [dstq+r3 ]
movq xm6, [r6 +strideq*0]
movhps xm6, [r6 +strideq*1]
movq xm7, [r6 +strideq*2]
movhps xm7, [r6 +r3 ]
paddw xm0, xm4 ; 0 1
paddw xm1, xm5 ; 2 3
paddw xm2, xm6 ; 4 5
paddw xm3, xm7 ; 6 7
vpbroadcastd xm5, [pixel_10bpc_max]
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3
REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [dstq+strideq*2], xm1
movhps [dstq+r3 ], xm1
movq [r6 +strideq*0], xm2
movhps [r6 +strideq*1], xm2
movq [r6 +strideq*2], xm3
movhps [r6 +r3 ], xm3
RET
ALIGN function_align
; Pack pass1 output to words, transpose, and enter the 8bpc iadst8.
.pass2_main:
packssdw m0, m2
packssdw m1, m3
lea r6, [deint_shuf+128]
punpcklwd m4, m0, m1
punpckhwd m0, m1
punpckhdq m5, m4, m0
punpckldq m4, m0
vextracti128 xm2, m4, 1 ; 4 5
vextracti128 xm3, m5, 1 ; 6 7
pshufd xm4, xm4, q1032 ; 1 0
pshufd xm5, xm5, q1032 ; 3 2
jmp m(iadst_4x8_internal_8bpc).main_pass2
ALIGN function_align
; Dword iadst8 on packed pairs: loads the permuted input rows, applies
; the 2896/4096 scale, then the butterfly rotations with 18-bit clips.
.main:
vpbroadcastd m8, [clip_18b_min]
vpbroadcastd m9, [clip_18b_max]
.main2:
vbroadcasti128 m0, [cq+16*0]
vbroadcasti128 m2, [cq+16*2]
vbroadcasti128 m3, [cq+16*5]
vbroadcasti128 m1, [cq+16*7]
vpbroadcastd m6, [pd_2896]
shufpd m0, m2, 0x0c ; 0 2
shufpd m1, m3, 0x0c ; 7 5
vbroadcasti128 m2, [cq+16*4]
vbroadcasti128 m4, [cq+16*6]
vbroadcasti128 m5, [cq+16*1]
vbroadcasti128 m3, [cq+16*3]
vpbroadcastd m7, [pd_2048]
shufpd m2, m4, 0x0c ; 4 6
shufpd m3, m5, 0x0c ; 3 1
REPX {pmulld x, m6}, m0, m1, m2, m3
REPX {paddd x, m7}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
.main3:
ITX_MULSUB_2D 1, 0, 4, 5, 6, 7, 401_1931, 4076_3612, 1
ITX_MULSUB_2D 3, 2, 4, 5, 6, 7, 3166_3920, 2598_1189, 1
psubd m4, m0, m2 ; t4 t6
paddd m0, m2 ; t0 t2
psubd m2, m1, m3 ; t5 t7
paddd m1, m3 ; t1 t3
REPX {pmaxsd x, m8}, m4, m2, m0, m1
REPX {pminsd x, m9}, m4, m2, m0, m1
pxor m5, m5
psubd m5, m4
vpblendd m4, m2, 0xcc ; t4 t7
vpblendd m2, m5, 0xcc ; t5 -t6
ITX_MULSUB_2D 4, 2, 3, 5, 6, 7, 1567, 3784
vpbroadcastd m5, [pd_2896]
vbroadcasti128 m6, [pw_2048_m2048] ; + + - -
punpckhqdq m3, m0, m1
punpcklqdq m0, m1
psubd m1, m0, m3 ; t2 t3
paddd m0, m3 ; out0 -out7
punpckhqdq m3, m4, m2 ; t7a t6a
punpcklqdq m4, m2 ; t5a t4a
psubd m2, m4, m3 ; t7 t6
paddd m4, m3 ; out6 -out1
REPX {pmaxsd x, m8}, m1, m2
REPX {pminsd x, m9}, m1, m2
vpblendd m3, m1, m2, 0xcc
shufpd m1, m2, 0x05
pmulld m3, m5
pmulld m5, m1
psignd m0, m6 ; out0 out7
psignd m4, m6 ; out6 out1
paddd m3, m7
psubd m2, m3, m5
paddd m5, m3
psrad m2, 12 ; out4 -out5
psrad m5, 12 ; -out3 out2
ret
INV_TXFM_4X8_FN flipadst, dct
INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity
; 4x8 inverse flip-ADST, 10bpc: iadst kernels with row order reversed —
; pass1 gathers the outputs backwards, pass2 stores the rows mirrored
; (note the swapped movq/movhps halves).
cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(iadst_8x4_internal_10bpc).main
vpbroadcastd m5, [pd_2048]
paddd m0, m5, m3
paddd m1, m5, m2
paddd m2, m5, m6
paddd m3, m5, m4
jmp m(iadst_4x8_internal_10bpc).pass1_end
.pass2:
call m(iadst_4x8_internal_10bpc).pass2_main
mova xm4, [pw_2048_m2048]
REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0
lea r3, [strideq*3]
lea r6, [dstq+strideq*4]
movq xm4, [dstq+strideq*1]
movhps xm4, [dstq+strideq*0]
movq xm5, [dstq+r3 ]
movhps xm5, [dstq+strideq*2]
movq xm6, [r6 +strideq*1]
movhps xm6, [r6 +strideq*0]
movq xm7, [r6 +r3 ]
movhps xm7, [r6 +strideq*2]
paddw xm3, xm4 ; 1 0
paddw xm2, xm5 ; 3 2
paddw xm1, xm6 ; 5 4
paddw xm0, xm7 ; 7 6
vpbroadcastd xm5, [pixel_10bpc_max]
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
REPX {pmaxsw x, xm4}, xm3, xm2, xm1, xm0
REPX {pminsw x, xm5}, xm3, xm2, xm1, xm0
movhps [dstq+strideq*0], xm3
movq [dstq+strideq*1], xm3
movhps [dstq+strideq*2], xm2
movq [dstq+r3 ], xm2
movhps [r6 +strideq*0], xm1
movq [r6 +strideq*1], xm1
movhps [r6 +strideq*2], xm0
movq [r6 +r3 ], xm0
RET
INV_TXFM_4X8_FN identity, dct
INV_TXFM_4X8_FN identity, adst
INV_TXFM_4X8_FN identity, flipadst
INV_TXFM_4X8_FN identity, identity
; 4x8 identity transform, 10bpc. Pass1 (.pass1, shared with the 12bpc
; version) applies the 2896/4096 rectangular scale followed by the
; 5793/4096 identity scale, both rounded >> 12. Pass2 (.pass2_end,
; shared with 12bpc via the clamp ceiling preloaded in m6) packs,
; rounds with pw_4096, half-transposes, and adds to dst; rows 4-7 are
; blended into the upper 128-bit lanes so stores run 2 rows at a time.
cglobal iidentity_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
.pass1:
vpbroadcastd m3, [pd_2896]
pmulld m0, m3, [cq+32*0]
pmulld m1, m3, [cq+32*1]
pmulld m2, m3, [cq+32*2]
pmulld m3, [cq+32*3]
vpbroadcastd m5, [pd_2048]
vpbroadcastd m4, [pd_5793]
REPX {paddd x, m5}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
REPX {pmulld x, m4}, m0, m1, m2, m3
REPX {paddd x, m5}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
jmp tx2q
.pass2:
vpbroadcastd m6, [pixel_10bpc_max]
call .pass2_end
RET
ALIGN function_align
.pass2_end:
vpbroadcastd m4, [pw_4096]
packssdw m0, m2
packssdw m1, m3
punpckhwd m2, m0, m1
punpcklwd m0, m1
pmulhrsw m2, m4
pmulhrsw m0, m4
punpckhdq m1, m0, m2 ; 2 3 6 7
punpckldq m0, m2 ; 0 1 4 5
lea r3, [strideq*3]
lea r6, [dstq+strideq*4]
movq xm2, [dstq+strideq*0]
movhps xm2, [dstq+strideq*1]
vpbroadcastq m4, [r6 +strideq*0]
vpbroadcastq m5, [r6 +strideq*1]
movq xm3, [dstq+strideq*2]
movhps xm3, [dstq+r3 ]
vpblendd m2, m4, 0x30
vpblendd m2, m5, 0xc0
vpbroadcastq m4, [r6 +strideq*2]
vpbroadcastq m5, [r6 +r3 ]
vpblendd m3, m4, 0x30
vpblendd m3, m5, 0xc0
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
paddw m0, m2 ; out0 out1 out4 out5
paddw m1, m3 ; out2 out3 out6 out7
pmaxsw m0, m4
pmaxsw m1, m4
pminsw m0, m6
pminsw m1, m6
vextracti128 xm2, m0, 1 ; out4 out5
vextracti128 xm3, m1, 1 ; out6 out7
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [dstq+strideq*2], xm1
movhps [dstq+r3 ], xm1
movq [r6 +strideq*0], xm2
movhps [r6 +strideq*1], xm2
movq [r6 +strideq*2], xm3
movhps [r6 +r3 ], xm3
ret
INV_TXFM_4X8_FN dct, dct, 12
INV_TXFM_4X8_FN dct, identity, 12
INV_TXFM_4X8_FN dct, adst, 12
INV_TXFM_4X8_FN dct, flipadst, 12
; 4x8 inverse DCT, 12bpc. Pass1 is the 10bpc dword pass1. Pass2 clips
; to the 18-bit domain, transposes/interleaves entirely in dwords, runs
; the 8x4 dword idct main, forms the final butterflies, and exits
; through the shared iadst 12bpc .end (which applies the >> 4 rounding
; and the 12-bit clamped store).
cglobal idct_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
jmp m(idct_4x8_internal_10bpc).pass1
.pass2:
vpbroadcastd m8, [clip_18b_min]
vpbroadcastd m9, [clip_18b_max]
REPX {pmaxsd x, m8}, m0, m1, m2, m3
REPX {pminsd x, m9}, m0, m1, m2, m3
; transpose & interleave
pshufd m0, m0, q1320
pshufd m1, m1, q1320
pshufd m2, m2, q1320
pshufd m3, m3, q1320
punpckldq m4, m0, m1
punpckhdq m0, m1
punpckldq m5, m2, m3
punpckhdq m2, m3
vpermq m0, m0, q3102
vpermq m2, m2, q3102
vperm2i128 m1, m0, m2, 0x31 ; 1 5 (interleaved)
vperm2i128 m3, m0, m2, 0x20 ; 7 3 (interleaved)
vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved)
vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved)
vpbroadcastd m7, [pd_2048]
call m(idct_8x4_internal_10bpc).main
psubd m3, m0, m4 ; out7 out6
paddd m0, m4 ; out0 out1
paddd m1, m2, m5 ; out3 out2
psubd m2, m5 ; out4 out5
pshufd m1, m1, q1032
pshufd m3, m3, q1032
jmp m(iadst_4x8_internal_12bpc).end
INV_TXFM_4X8_FN adst, dct, 12
INV_TXFM_4X8_FN adst, adst, 12
INV_TXFM_4X8_FN adst, flipadst, 12
INV_TXFM_4X8_FN adst, identity, 12
; 4x8 inverse ADST, 12bpc. Pass1 reuses the 8x4 dword iadst and keeps
; an extra precision bit (>> 1, then round with pd_1024 >> 11). Pass2
; clips to 18 bits, dword-transposes (.pass2_main, also used by
; flipadst) and re-enters the shared dword iadst8 at .main3; the sign
; fixups below recover out0..out7 row order. The .end tail (also the
; exit for the dct/flipadst/identity 12bpc 4x8 functions) downshifts
; >> 3 + pmulhrsw pw_16384 (net rounded >> 4), reorders through
; iadst8_12_shuf, and stores with a 12-bit clamp.
cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
call m(iadst_8x4_internal_10bpc).main
psrad m0, m4, 1
psrad m1, m6, 1
psrad m2, 1
psrad m3, 1
.pass1_end:
vpbroadcastd m5, [pd_1024]
REPX {paddd x, m5}, m0, m1, m2, m3
REPX {psrad x, 11}, m0, m1, m2, m3
jmp tx2q
.pass2:
vpbroadcastd m8, [clip_18b_min]
vpbroadcastd m9, [clip_18b_max]
REPX {pmaxsd x, m8}, m0, m1, m2, m3
REPX {pminsd x, m9}, m0, m1, m2, m3
call .pass2_main
vpblendd m3, m0, m4, 0x33 ; out6 out7
vpblendd m0, m4, 0xcc ; out0 out1
pshufd m1, m5, q1032
psignd m2, m6 ; out4 out5
psignd m1, m6 ; out2 out3
.end:
vpbroadcastd m4, [pw_16384]
REPX {psrad x, 3}, m0, m1, m2, m3
packssdw m0, m2 ; 0 1 4 5 (interleaved)
packssdw m1, m3 ; 2 3 6 7 (interleaved)
mova m2, [iadst8_12_shuf]
vpermd m0, m2, m0 ; 0 1 4 5
vpermd m1, m2, m1 ; 2 3 6 7
pmulhrsw m0, m4
pmulhrsw m1, m4
lea r3, [strideq*3]
lea r6, [dstq+strideq*4]
movq xm4, [dstq+strideq*0]
movhps xm4, [dstq+strideq*1]
movq xm5, [dstq+strideq*2]
movhps xm5, [dstq+r3 ]
movq xm6, [r6 +strideq*0]
movhps xm6, [r6 +strideq*1]
vinserti128 m4, xm6, 1
movq xm7, [r6 +strideq*2]
movhps xm7, [r6 +r3 ]
vinserti128 m5, xm7, 1
paddw m0, m4 ; 0 1 4 5
paddw m1, m5 ; 2 3 6 7
vpbroadcastd m5, [pixel_12bpc_max]
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
REPX {pmaxsw x, m4}, m0, m1
REPX {pminsw x, m5}, m0, m1
vextracti128 xm2, m0, 1 ; out4 out5
vextracti128 xm3, m1, 1 ; out6 out7
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [dstq+strideq*2], xm1
movhps [dstq+r3 ], xm1
movq [r6 +strideq*0], xm2
movhps [r6 +strideq*1], xm2
movq [r6 +strideq*2], xm3
movhps [r6 +r3 ], xm3
RET
ALIGN function_align
.pass2_main:
; transpose & interleave
pshufd m0, m0, q1320
pshufd m1, m1, q1320
pshufd m2, m2, q1320
pshufd m3, m3, q1320
punpckldq m4, m0, m1
punpckhdq m0, m1
punpckldq m5, m2, m3
punpckhdq m2, m3
vperm2i128 m1, m0, m2, 0x31 ; 7 5 (interleaved)
vperm2i128 m3, m0, m2, 0x20 ; 3 1 (interleaved)
vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved)
vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved)
vpbroadcastd m7, [pd_2048]
jmp m(iadst_4x8_internal_10bpc).main3
INV_TXFM_4X8_FN flipadst, dct, 12
INV_TXFM_4X8_FN flipadst, adst, 12
INV_TXFM_4X8_FN flipadst, flipadst, 12
INV_TXFM_4X8_FN flipadst, identity, 12
; 4x8 inverse flip-ADST, 12bpc: the iadst 12bpc kernels with the row
; order reversed — pass1 gathers the outputs backwards, pass2 applies
; mirrored shuffles/sign fixups before the shared iadst .end store.
cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
call m(iadst_8x4_internal_10bpc).main
psrad m0, m3, 1
psrad m1, m2, 1
psrad m2, m6, 1
psrad m3, m4, 1
jmp m(iadst_4x8_internal_12bpc).pass1_end
.pass2:
vpbroadcastd m8, [clip_18b_min]
vpbroadcastd m9, [clip_18b_max]
REPX {pmaxsd x, m8}, m0, m1, m2, m3
REPX {pminsd x, m9}, m0, m1, m2, m3
call m(iadst_4x8_internal_12bpc).pass2_main
shufpd m3, m4, m0, 0x05 ; out1 out0
shufpd m0, m4, 0x05 ; out7 out6
psignd m2, m6
pshufd m6, m6, q1032
pshufd m1, m2, q1032 ; out5 out4
psignd m2, m5, m6 ; out3 out2
jmp m(iadst_4x8_internal_12bpc).end
INV_TXFM_4X8_FN identity, dct, 12
INV_TXFM_4X8_FN identity, adst, 12
INV_TXFM_4X8_FN identity, flipadst, 12
INV_TXFM_4X8_FN identity, identity, 12
; 4x8 identity transform, 12bpc: pass1 and the store tail are shared
; with the 10bpc version; only the clamp ceiling (m6) differs.
cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
jmp m(iidentity_4x8_internal_10bpc).pass1
.pass2:
; m0 = in0 in1
; m1 = in2 in3
; m2 = in4 in5
; m3 = in6 in7
vpbroadcastd m6, [pixel_12bpc_max]
call m(iidentity_4x8_internal_10bpc).pass2_end
RET
; INV_TXFM_4X16_FN type1, type2[, bitdepth]
; Instantiates the 4x16 wrapper. The dct_dct DC-only path folds the
; 16-row scaling into (dc*181 + 384) >> 9 and joins the 4x4 .dconly3
; tail (final 181/4096 scale + clamped store over r3d = 16 rows).
%macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 4x16, %3
%ifidn %1_%2, dct_dct
imul r6d, [cq], 181
vpbroadcastd xm2, [dconly_%3bpc]
mov [cq], eobd ; 0
or r3d, 16
add r6d, 384
sar r6d, 9
jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3
%endif
%endmacro
INV_TXFM_4X16_FN dct, dct
INV_TXFM_4X16_FN dct, identity
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst
; 4x16 inverse DCT, 10bpc. Pass1 (.pass1, reused by the 12bpc version)
; runs the odd-coefficient rotations (.pass1_main) and the even half
; with the 1448 (= 2896/2) scale folded in (.pass1_main2), keeping one
; extra bit that the final ">> 1" drops. Pass2 packs to words,
; transposes, calls the 8bpc 4x16 idct, and stores 4 rows at a time via
; .write_4x4 (which also zeroes the coefficient buffer as it walks cq).
cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
.pass1:
vpbroadcastd m10, [pd_3072]
mova m1, [cq+32*2]
mova m3, [cq+32*6]
mova m5, [cq+32*3]
mova m7, [cq+32*7]
call .pass1_main
pmulld m0, m6, [cq+32*0]
pmulld m2, m6, [cq+32*4]
pmulld m4, m6, [cq+32*1]
pmulld m6, [cq+32*5]
call .pass1_main2
REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
jmp tx2q
.pass2:
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
packssdw m3, m7
lea r6, [deint_shuf+128]
punpcklwd m4, m2, m3
punpckhwd m2, m3
punpckhwd m5, m0, m1
punpcklwd m0, m1
punpckhdq m1, m0, m4 ; 2 3
punpckldq m0, m4 ; 0 1
punpckldq m4, m5, m2 ; 8 9
punpckhdq m5, m2 ; a b
vextracti128 xm2, m0, 1 ; 4 5
vextracti128 xm3, m1, 1 ; 6 7
vextracti128 xm6, m4, 1 ; c d
vextracti128 xm7, m5, 1 ; e f
call m(idct_4x16_internal_8bpc).main
vpbroadcastd m9, [pw_2048]
vinserti128 m0, m0, xm1, 1 ; 0 1 3 2
vinserti128 m1, m2, xm3, 1 ; 4 5 7 6
vinserti128 m2, m4, xm5, 1 ; 8 9 b a
vinserti128 m3, m6, xm7, 1 ; c d f e
vpbroadcastd m8, [pixel_10bpc_max]
call .pass2_end
RET
ALIGN function_align
; Odd-row rotations: 1567/3784 butterflies on rows 2,6 (low) and 3,7
; (high); m6 is left holding pd_1448 for the caller's even half.
.pass1_main:
vpbroadcastd m4, [pd_3784]
vpbroadcastd m8, [pd_1567]
vpbroadcastd m9, [pd_2048]
vpbroadcastd m6, [pd_1448]
ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l
ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h
ret
ALIGN function_align
; Even half (rows 0,4 low / 1,5 high, pre-scaled by 1448) rounded with
; pd_3072 and >> 11, then the final idct4 butterflies against t2/t3.
.pass1_main2:
paddd m0, m10
paddd m4, m10
paddd m8, m0, m2
psubd m0, m2
paddd m9, m4, m6
psubd m4, m6
REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h
psubd m2, m0, m1
paddd m1, m0
psubd m6, m4, m5
paddd m5, m4
paddd m0, m8, m3
psubd m3, m8, m3
paddd m4, m9, m7
psubd m7, m9, m7
ret
ALIGN function_align
.pass2_end:
lea r6, [strideq*3]
pxor m7, m7
pmulhrsw m0, m9
call .write_4x4
pmulhrsw m0, m1, m9
call .write_4x4
pmulhrsw m0, m2, m9
call .write_4x4
pmulhrsw m0, m3, m9
call .write_4x4
ret
ALIGN function_align
; Add m0 (4 packed rows, pw_2048-rounded) to dst with clamp against m8;
; clears and advances cq, advances dstq by 4 rows. m7 must be zero.
.write_4x4:
movq xm4, [dstq+strideq*0]
movhps xm4, [dstq+strideq*1]
vpbroadcastq m5, [dstq+strideq*2]
vpbroadcastq m6, [dstq+r6 ]
mova [cq+32*0], m7
mova [cq+32*1], m7
add cq, 32*2
vpblendd m4, m5, 0xc0
vpblendd m4, m6, 0x30
paddw m4, m0
pmaxsw m4, m7
pminsw m4, m8
vextracti128 xm5, m4, 1
movq [dstq+strideq*0], xm4
movhps [dstq+strideq*1], xm4
movhps [dstq+strideq*2], xm5
movq [dstq+r6 ], xm5
lea dstq, [dstq+strideq*4]
ret
INV_TXFM_4X16_FN adst, dct
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity

;-----------------------------------------------------------------------
; 4x16 inverse ADST, 10 bpc.
; Pass 1 reuses the 16x4 ADST row kernel; pass 2 packs/transposes
; (.pass2_main, also used by flipadst) and runs the 8bpc 16-point ADST.
; The ADST kernel leaves alternating output signs, so writeout
; multiplies by a mixed +/-2048 vector to round and flip signs at once.
; .main/.main2 implement the 32-bit 16-point ADST used by the 12bpc
; paths as well; .main2 requires m12/m13 = clip min/max.
;-----------------------------------------------------------------------
cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
call m(iadst_16x4_internal_10bpc).main
vpbroadcastd m6, [pd_6144]
call m(iadst_16x4_internal_10bpc).main_end
psrad m0, m4, 13
psrad m1, m5, 13
psrad m2, 13
psrad m3, 13
psrad m4, m8, 13
psrad m5, m9, 13
psrad m6, 13
psrad m7, 13
jmp tx2q
.pass2:
call .pass2_main
vpbroadcastd m5, [pw_2048]
vpbroadcastd m8, [pixel_10bpc_max]
lea r6, [strideq*3]
vpblendd m4, m3, m0, 0xcc ; -out3 out0 out2 -out1
pshufd m2, m2, q1032 ; -out11 out8 out10 -out9
vpblendd m3, m0, 0x33 ; -out15 out12 out14 -out13
pxor m7, m7
psubw m9, m7, m5 ; m9 = -2048
vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048
pmulhrsw m0, m4, m9 ; round + undo the ADST output signs
call .write_4x4
pmulhrsw m0, m1, m9
call .write_4x4
pmulhrsw m0, m2, m9
call .write_4x4
pmulhrsw m0, m3, m9
call .write_4x4
RET
ALIGN function_align
.write_4x4:
; As idct's .write_4x4 but with the row load/store order matching the
; permuted ADST output layout (row 3 first, row 0 in the high qword).
movq xm4, [dstq+r6 ]
movhps xm4, [dstq+strideq*0]
vpbroadcastq m5, [dstq+strideq*1]
vpbroadcastq m6, [dstq+strideq*2]
mova [cq+32*0], m7
mova [cq+32*1], m7
add cq, 32*2
vpblendd m4, m5, 0xc0
vpblendd m4, m6, 0x30
paddw m4, m0
pmaxsw m4, m7
pminsw m4, m8
vextracti128 xm5, m4, 1
movhps [dstq+strideq*0], xm4
movhps [dstq+strideq*1], xm5
movq [dstq+strideq*2], xm5
movq [dstq+r6 ], xm4
lea dstq, [dstq+strideq*4]
ret
ALIGN function_align
.pass2_main:
; Pack pass-1 output to words and transpose into the permuted row
; order the 8bpc 16-point ADST kernel expects (hex row indices in the
; lane comments), then run it and finish outputs 4-11 with the final
; sqrt(2) scaling.
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
packssdw m3, m7
lea r6, [deint_shuf+128]
punpcklwd m4, m2, m3
punpckhwd m2, m3
punpckhwd m5, m0, m1
punpcklwd m0, m1
punpckhdq m1, m0, m4
punpckldq m0, m4
punpckldq m4, m5, m2
punpckhdq m5, m2
vpblendd m3, m0, m1, 0x33
vpblendd m0, m1, 0xcc
shufpd m2, m5, m4, 0x05
shufpd m4, m5, 0x05
vperm2i128 m1, m0, m3, 0x31 ; 4 7 6 5
vinserti128 m0, xm3, 1 ; 0 3 2 1
vperm2i128 m3, m2, m4, 0x31 ; c f e d
vinserti128 m2, xm4, 1 ; b 8 9 a
call m(iadst_4x16_internal_8bpc).main2
vpbroadcastd m5, [pw_2896x8]
paddsw m1, m2, m4
psubsw m2, m4
pmulhrsw m1, m5 ; -out7 out4 out6 -out5
pmulhrsw m2, m5 ; out8 -out11 -out9 out10
ret
ALIGN function_align
.main:
; Load the 16 ADST inputs from cq (4-wide rows, broadcast to both
; lanes) and pair them per qword as the butterfly network expects.
vbroadcasti128 m0, [cq+16* 0]
vbroadcasti128 m4, [cq+16* 2]
vbroadcasti128 m1, [cq+16*15]
vbroadcasti128 m5, [cq+16*13]
vbroadcasti128 m2, [cq+16* 4]
vbroadcasti128 m6, [cq+16* 6]
vbroadcasti128 m3, [cq+16*11]
vbroadcasti128 m7, [cq+16* 9]
shufpd m0, m4, 0x0c ; 0 2
shufpd m1, m5, 0x0c ; 15 13
shufpd m2, m6, 0x0c ; 4 6
shufpd m3, m7, 0x0c ; 11 9
vbroadcasti128 m4, [cq+16* 8]
vbroadcasti128 m6, [cq+16*10]
vbroadcasti128 m5, [cq+16* 7]
vbroadcasti128 m7, [cq+16* 5]
shufpd m4, m6, 0x0c ; 8 10
shufpd m5, m7, 0x0c ; 7 5
vbroadcasti128 m6, [cq+16*12]
vbroadcasti128 m7, [cq+16*14]
shufpd m6, m7, 0x0c ; 12 14
vbroadcasti128 m7, [cq+16* 3]
vbroadcasti128 m8, [cq+16* 1]
shufpd m7, m8, 0x0c ; 3 1
.main2:
; 32-bit 16-point ADST, two 4-wide columns per register.
; expects: m12 = clip_min m13 = clip_max
vpbroadcastd m11, [pd_2048]
ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201_995, 4091_3973, 1
ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1
ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1
ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857_4052, 1380_601, 1
psubd m8, m0, m4 ; t8a t10a
paddd m0, m4 ; t0a t2a
psubd m4, m1, m5 ; t9a t11a
paddd m1, m5 ; t1a t3a
psubd m5, m2, m6 ; t12a t14a
paddd m2, m6 ; t4a t6a
psubd m6, m3, m7 ; t13a t15a
paddd m3, m7 ; t5a t7a
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m8
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m8
ITX_MULSUB_2D 8, 4, 7, 9, 10, 11, 799_3406, 4017_2276, 1
ITX_MULSUB_2D 6, 5, 7, 9, 10, 11, 4017_2276, 10, 1 ; reuses coeff in m10
psubd m7, m0, m2 ; t4 t6
paddd m0, m2 ; t0 t2
psubd m2, m1, m3 ; t5 t7
paddd m1, m3 ; t1 t3
psubd m3, m4, m6 ; t12a t14a
paddd m4, m6 ; t8a t10a
psubd m6, m8, m5 ; t13a t15a
paddd m8, m5 ; t9a t11a
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m6, m7, m8
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m6, m7, m8
punpcklqdq m5, m3, m7 ; t12a t4
punpckhqdq m3, m7 ; t14a t6
punpckhqdq m7, m6, m2 ; t15a t7
punpcklqdq m6, m2 ; t13a t5
ITX_MULSUB_2D 7, 3, 2, 9, 10, 11, 3784, 1567
ITX_MULSUB_2D 5, 6, 2, 9, 10, 11, 1567, 10 ; reuses coeff in m10
vpbroadcastd m10, [pd_2896]
vbroadcasti128 m9, [pw_2048_m2048] ; + + - -
punpckhqdq m2, m4, m0 ; t10a t2
punpcklqdq m4, m0 ; t8a t0
punpckhqdq m0, m8, m1 ; t11a t3
punpcklqdq m8, m1 ; t9a t1
paddd m1, m6, m7 ; out2 -out3
psubd m6, m7 ; t14a t6
paddd m7, m5, m3 ; -out13 out12
psubd m5, m3 ; t15a t7
psubd m3, m8, m0 ; t11 t3a
paddd m8, m0 ; out14 -out15
paddd m0, m4, m2 ; -out1 out0
psubd m4, m2 ; t10 t2a
REPX {pmaxsd x, m12}, m6, m5, m3, m4
REPX {pminsd x, m13}, m6, m5, m3, m4
REPX {pmulld x, m10}, m6, m5, m3, m4 ; * 2896 (sqrt(2) stage)
paddd m6, m11
paddd m4, m11
paddd m2, m6, m5 ; -out5 out4
psubd m6, m5 ; out10 -out11
psubd m5, m4, m3 ; -out9 out8
paddd m3, m4 ; out6 -out7
REPX {psrad x, 12}, m2, m3, m5, m6
REPX {psignd x, m9}, m1, m8, m3, m6 ; apply the +/- output signs
pshufd m9, m9, q1032
REPX {psignd x, m9}, m0, m7, m2, m5
ret
INV_TXFM_4X16_FN flipadst, dct
INV_TXFM_4X16_FN flipadst, adst
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity

;-----------------------------------------------------------------------
; 4x16 inverse flipped ADST, 10 bpc. Identical math to iadst_4x16 but
; with the output row order reversed: pass 1 swaps register pairs after
; the shared 16x4 ADST kernel, and pass 2 uses a .write_4x4 variant
; that stores its four rows in the mirrored order.
;-----------------------------------------------------------------------
cglobal iflipadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
.pass1:
call m(iadst_16x4_internal_10bpc).main
vpbroadcastd m6, [pd_6144]
call m(iadst_16x4_internal_10bpc).main_end
psrad m0, m3, 13 ; reversed register order vs. iadst pass 1
psrad m1, m2, 13
psrad m2, m5, 13
psrad m3, m4, 13
psrad m4, m7, 13
psrad m5, m6, 13
psrad m6, m9, 13
psrad m7, m8, 13
jmp tx2q
.pass2:
call m(iadst_4x16_internal_10bpc).pass2_main
vpbroadcastd m5, [pw_2048]
vpbroadcastd m8, [pixel_10bpc_max]
lea r6, [strideq*3]
vpblendd m4, m3, m0, 0x33 ; -out0 out3 out1 -out2
pshufd m2, m2, q1032 ; -out11 out8 out10 -out9
vpblendd m3, m0, 0xcc ; -out12 out15 out13 -out14
pxor m7, m7
psubw m9, m7, m5
vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048
pmulhrsw m0, m4, m9
call .write_4x4
pmulhrsw m0, m2, m9 ; note: quarters written in 0,2,1,3 order
call .write_4x4
pmulhrsw m0, m1, m9
call .write_4x4
pmulhrsw m0, m3, m9
call .write_4x4
RET
ALIGN function_align
.write_4x4:
; Mirrored-order variant of the idct .write_4x4: same clamp-and-add,
; different row <-> lane mapping to realize the vertical flip.
movq xm4, [dstq+strideq*0]
movhps xm4, [dstq+r6 ]
vpbroadcastq m5, [dstq+strideq*1]
vpbroadcastq m6, [dstq+strideq*2]
mova [cq+32*0], m7
mova [cq+32*1], m7
add cq, 32*2
vpblendd m4, m5, 0x30
vpblendd m4, m6, 0xc0
paddw m4, m0
pmaxsw m4, m7 ; clamp to 0
pminsw m4, m8 ; clamp to pixel max
vextracti128 xm5, m4, 1
movq [dstq+strideq*0], xm4
movq [dstq+strideq*1], xm5
movhps [dstq+strideq*2], xm5
movhps [dstq+r6 ], xm4
lea dstq, [dstq+strideq*4]
ret
INV_TXFM_4X16_FN identity, dct
INV_TXFM_4X16_FN identity, adst
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity

;-----------------------------------------------------------------------
; 4x16 identity transform, 10 bpc.
; Pass 1: identity4 scaling — multiply by pd_5793 (~sqrt(2) in .12
;         fixed point) and round with pd_6144 / >> 13.
; Pass 2: identity16 scaling — out = 2*x + x*1697/2048 (~2*sqrt(2)),
;         built from pw_1697x16 pmulhrsw plus a saturating doubling,
;         then transpose and clipped writeout via .pass2_end.
;-----------------------------------------------------------------------
cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
vpbroadcastd m7, [pd_5793]
pmulld m0, m7, [cq+32*0]
pmulld m4, m7, [cq+32*1]
pmulld m1, m7, [cq+32*2]
pmulld m5, m7, [cq+32*3]
pmulld m2, m7, [cq+32*4]
pmulld m6, m7, [cq+32*5]
pmulld m3, m7, [cq+32*6]
pmulld m7, [cq+32*7]
vpbroadcastd m8, [pd_6144] ; rounding bias for the >> 13
REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7
REPX {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7
jmp tx2q
.pass2:
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
packssdw m3, m7
vpbroadcastd m7, [pw_1697x16]
vpbroadcastd m8, [pw_2048]
pmulhrsw m4, m7, m0 ; x * 1697/2048 term
pmulhrsw m5, m7, m1
pmulhrsw m6, m7, m2
pmulhrsw m7, m3
REPX {paddsw x, x}, m0, m1, m2, m3 ; 2*x term (saturating)
paddsw m0, m4
paddsw m1, m5
paddsw m2, m6
paddsw m3, m7
vpbroadcastd m4, [pixel_10bpc_max]
call .pass2_end
RET
ALIGN function_align
.pass2_end:
; Transpose to row pairs and write out via .write_2x4x2, which handles
; two interleaved 4-pixel rows at a time (hence the stride*5 offset in
; r6 and the stride*4 hop midway). m8 = rounding multiplier, m4 = max.
punpckhwd m7, m0, m1
punpcklwd m0, m1
punpckhwd m1, m2, m3
punpcklwd m2, m3
lea r6, [strideq*5]
pxor m3, m3
punpckhdq m5, m0, m2 ; 2 3 6 7
punpckldq m0, m2 ; 0 1 4 5
punpckldq m6, m7, m1 ; 8 9 c d
punpckhdq m7, m1 ; a b e f
pmulhrsw m0, m8
call .write_2x4x2
pmulhrsw m0, m5, m8
call .write_2x4x2
pmulhrsw m0, m6, m8
lea dstq, [dstq+strideq*4]
call .write_2x4x2
pmulhrsw m0, m7, m8
call .write_2x4x2
ret
ALIGN function_align
.write_2x4x2:
; Add m0 to rows dst+0, dst+1, dst+4, dst+5 (two 2-row groups four
; rows apart), clamp to [0, m4], and clear two coefficient rows.
movq xm1, [dstq+strideq*0]
movhps xm1, [dstq+strideq*1]
vpbroadcastq m2, [dstq+strideq*4]
vpblendd m1, m2, 0x30
vpbroadcastq m2, [dstq+r6 ]
vpblendd m1, m2, 0xc0
mova [cq+32*0], m3
mova [cq+32*1], m3
add cq, 32*2
paddw m1, m0
pmaxsw m1, m3
pminsw m1, m4
vextracti128 xm2, m1, 1
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
movq [dstq+strideq*4], xm2
movhps [dstq+r6 ], xm2
lea dstq, [dstq+strideq*2]
ret
INV_TXFM_4X16_FN dct, dct, 12
INV_TXFM_4X16_FN dct, identity, 12
INV_TXFM_4X16_FN dct, adst, 12
INV_TXFM_4X16_FN dct, flipadst, 12

;-----------------------------------------------------------------------
; 4x16 inverse DCT, 12 bpc. Pass 1 is shared with the 10bpc version.
; Pass 2 stays in 32-bit precision: transpose the dwords, clip to the
; 18-bit intermediate range, run the 16x4 10bpc column kernels, shift,
; pack, reorder with idct16_12_shuf and reuse the 10bpc writeout with
; pw_16384 rounding and the 12bpc pixel max.
;-----------------------------------------------------------------------
cglobal idct_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
jmp m(idct_4x16_internal_10bpc).pass1
.pass2:
; 32-bit transpose of the 4x16 pass-1 output into paired-row layout
; (lane comments give the row indices each register ends up holding).
punpckldq m8, m0, m1
punpckhdq m0, m1
punpckldq m9, m2, m3
punpckhdq m2, m3
punpckldq m1, m4, m5
punpckhdq m4, m5
punpckldq m3, m6, m7
punpckhdq m6, m7
punpcklqdq m5, m0, m2 ; 2 6
punpckhqdq m12, m0, m2 ; 3 7
punpcklqdq m0, m8, m9 ; 0 4
punpckhqdq m10, m8, m9 ; 1 5
punpcklqdq m2, m1, m3 ; 8 12
punpckhqdq m13, m1, m3 ; 9 13
punpcklqdq m9, m4, m6 ; 10 14
punpckhqdq m4, m6 ; 11 15
vperm2i128 m1, m5, m9, 0x20 ; 2 10
vperm2i128 m3, m9, m5, 0x31 ; 14 6
vpermq m11, m4, q1302 ; 15 11
; interleave
REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m10
vpbroadcastd m8, [clip_18b_min]
vpbroadcastd m9, [clip_18b_max]
REPX {pmaxsd x, m8}, m0, m1, m2, m3, m10, m11, m12, m13
REPX {pminsd x, m9}, m0, m1, m2, m3, m10, m11, m12, m13
call m(idct_16x4_internal_10bpc).pass1_main
vpermq m6, m12, q1302 ; 7 3
vpermq m5, m13, q3120 ; 9 13
call m(idct_16x4_internal_10bpc).pass1_main2
call m(idct_16x4_internal_10bpc).pass1_main3
REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 ; extra 12bpc shift
packssdw m0, m1
packssdw m1, m2, m3
packssdw m2, m4, m5
packssdw m3, m6, m7
mova m4, [idct16_12_shuf] ; restore output row order
REPX {vpermd x, m4, x}, m0, m1, m2, m3
vpbroadcastd m9, [pw_16384]
vpbroadcastd m8, [pixel_12bpc_max]
call m(idct_4x16_internal_10bpc).pass2_end
RET
INV_TXFM_4X16_FN adst, dct, 12
INV_TXFM_4X16_FN adst, adst, 12
INV_TXFM_4X16_FN adst, flipadst, 12
INV_TXFM_4X16_FN adst, identity, 12

;-----------------------------------------------------------------------
; 4x16 inverse ADST, 12 bpc. Pass 1 uses .main_pass1 (shared with
; flipadst below) with a 12-bit downshift. Pass 2 stays in 32-bit:
; clip to the 18-bit range, transpose (.transpose_16x4, also shared),
; run the 10bpc 32-bit ADST kernel, fix up lane order and shift, then
; pack/permute and write out via the 10bpc iadst .write_4x4.
; .pass2_end is a shared tail entered by iflipadst_4x16_internal_12bpc.
;-----------------------------------------------------------------------
cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
call .main_pass1
psrad m0, m4, 12
psrad m1, m5, 12
psrad m2, 12
psrad m3, 12
psrad m4, m8, 12
psrad m5, m9, 12
psrad m6, 12
psrad m7, 12
jmp tx2q
.pass2:
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
call .transpose_16x4
call m(iadst_4x16_internal_10bpc).main2 ; uses m12/m13 for clipping
; Reorder qword lanes and apply the extra 12bpc >> 3 before packing.
pshufd m4, m5, q1032
psrad m5, m6, 3
pshufd m6, m7, q1032
psrad m7, m8, 3
REPX {pshufd x, x, q1032}, m0, m2
REPX {psrad x, 3}, m0, m1, m2, m3, m4, m6
.pass2_end:
packssdw m0, m1
packssdw m1, m2, m3
packssdw m2, m4, m5
packssdw m3, m6, m7
mova m4, [iadst16_12_shuf] ; restore output row order
REPX {vpermd x, m4, x}, m0, m1, m2, m3
vpbroadcastd m9, [pw_16384]
vpbroadcastd m8, [pixel_12bpc_max]
lea r6, [strideq*3]
pxor m7, m7
pmulhrsw m0, m9
call m(iadst_4x16_internal_10bpc).write_4x4
pmulhrsw m0, m9, m1
call m(iadst_4x16_internal_10bpc).write_4x4
pmulhrsw m0, m9, m2
call m(iadst_4x16_internal_10bpc).write_4x4
pmulhrsw m0, m9, m3
call m(iadst_4x16_internal_10bpc).write_4x4
RET
ALIGN function_align
.transpose_16x4:
; transpose & interleave
; 32-bit 16x4 transpose into the paired-row order the ADST kernel's
; .main2 entry expects (lane comments give the resulting row indices).
punpckldq m8, m0, m1
punpckhdq m0, m1
punpckldq m9, m2, m3
punpckhdq m2, m3
punpckldq m1, m4, m5
punpckhdq m4, m5
punpckldq m3, m6, m7
punpckhdq m6, m7
punpcklqdq m10, m8, m0
punpckhqdq m0, m8
punpcklqdq m11, m9, m2
punpckhqdq m2, m9
punpcklqdq m8, m1, m4
punpckhqdq m4, m1
punpcklqdq m9, m3, m6
punpckhqdq m6, m3
vperm2i128 m5, m0, m2, 0x31 ; 7 5
vperm2i128 m7, m0, m2, 0x20 ; 3 1
vperm2i128 m0, m10, m11, 0x20 ; 0 2
vperm2i128 m2, m10, m11, 0x31 ; 4 6
vperm2i128 m1, m4, m6, 0x31 ; 15 13
vperm2i128 m3, m4, m6, 0x20 ; 11 9
vperm2i128 m4, m8, m9, 0x20 ; 8 10
vperm2i128 m6, m8, m9, 0x31 ; 12 14
ret
ALIGN function_align
.main_pass1:
; 12bpc pass-1 core shared by adst and flipadst: run the 16x4 10bpc
; ADST, then combine/round with a pd_3072 bias after a >> 1; callers
; apply the final >> 12 and pick their own output register order.
call m(iadst_16x4_internal_10bpc).main
vpbroadcastd m6, [pd_3072]
paddd m10, m4, m5
psubd m4, m3
psubd m5, m3
paddd m3, m10
psubd m8, m7, m1
paddd m7, m9
psubd m9, m1
paddd m7, m1
REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7
REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7
paddd m6, m0 ; m6 = m0 + bias (m0 itself keeps no bias)
ret
INV_TXFM_4X16_FN flipadst, dct, 12
INV_TXFM_4X16_FN flipadst, adst, 12
INV_TXFM_4X16_FN flipadst, flipadst, 12
INV_TXFM_4X16_FN flipadst, identity, 12

;-----------------------------------------------------------------------
; 4x16 inverse flipped ADST, 12 bpc. Same math as iadst_4x16 12bpc
; (shares .main_pass1, .transpose_16x4 and .pass2_end) but with the
; output register/lane order reversed to realize the vertical flip.
;-----------------------------------------------------------------------
cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
call m(iadst_4x16_internal_12bpc).main_pass1
psrad m0, m3, 12 ; reversed register order vs. the adst variant
psrad m1, m2, 12
psrad m2, m5, 12
psrad m3, m4, 12
psrad m4, m7, 12
psrad m5, m6, 12
psrad m6, m9, 12
psrad m7, m8, 12
jmp tx2q
.pass2:
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
call m(iadst_4x16_internal_12bpc).transpose_16x4
call m(iadst_4x16_internal_10bpc).main2 ; uses m12/m13 for clipping
; Mirrored lane fix-up + extra >> 3, then fall into the shared tail.
pshufd m4, m3, q1032
psrad m3, m5, 3
psrad m5, m2, 3
pshufd m2, m6, q1032
pshufd m6, m1, q1032
psrad m1, m7, 3
psrad m7, m0, 3
pshufd m0, m8, q1032
REPX {psrad x, 3}, m0, m2, m4, m6
jmp m(iadst_4x16_internal_12bpc).pass2_end
; Entry points for the 4x16 identity transform at 12 bpc (the internal
; implementation follows below).
INV_TXFM_4X16_FN identity, dct, 12
INV_TXFM_4X16_FN identity, adst, 12
INV_TXFM_4X16_FN identity, flipadst, 12
INV_TXFM_4X16_FN identity, identity, 12
cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
vpbroadcastd m8, [pd_1697]
mova m0, [cq+32*0]
mova m4, [cq+32*1]
; NOTE: file truncated here by the extraction tooling — the remainder of
; iidentity_4x16_internal_12bpc is not included in this chunk.