/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/loongarch/loongson_asm.S"
#include "src/loongarch/loongson_util.S"
.macro PUSH_REG
addi.d sp, sp, -64
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
fst.d f29, sp, 40
fst.d f30, sp, 48
fst.d f31, sp, 56
.endm
.macro POP_REG
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
fld.d f29, sp, 40
fld.d f30, sp, 48
fld.d f31, sp, 56
addi.d sp, sp, 64
.endm
.macro malloc_space number
li.w t0, \number
sub.d sp, sp, t0
addi.d sp, sp, -64
PUSH_REG
.endm
.macro free_space number
POP_REG
li.w t0, \number
add.d sp, sp, t0
addi.d sp, sp, 64
.endm
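// In-place 4-point inverse Walsh-Hadamard transform (lifting form) on the
// four rows/columns held in vr0-vr3; vr4 and vr5 are used as temporaries.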
.macro iwht4
vadd.h vr0, vr0, vr1
vsub.h vr4, vr2, vr3
vsub.h vr5, vr0, vr4
vsrai.h vr5, vr5, 1
vsub.h vr2, vr5, vr1
vsub.h vr1, vr5, vr3
vadd.h vr3, vr4, vr2
vsub.h vr0, vr0, vr1
.endm
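// Add a 4x4 block of 16-bit residuals to the destination: \in0-\in3 hold the
// four loaded dst rows, \in4/\in5 the residuals for rows 0-1 and 2-3. The dst
// bytes are widened to 16 bits, summed, clipped back to u8 and stored four
// pixels per row at a0 with stride a1.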
.macro DST_ADD_W4 in0, in1, in2, in3, in4, in5
vilvl.w \in0, \in1, \in0 // 0 1 2 3 4 5 6 7 x ...
vilvl.w \in2, \in3, \in2 // 8 9 10 11 12 13 14 15 x ...
vsllwil.hu.bu \in0, \in0, 0
vsllwil.hu.bu \in2, \in2, 0
vadd.h \in0, \in4, \in0
vadd.h \in2, \in5, \in2
vssrani.bu.h \in2, \in0, 0
vstelm.w \in2, a0, 0, 0
vstelmx.w \in2, a0, a1, 1
vstelmx.w \in2, a0, a1, 2
vstelmx.w \in2, a0, a1, 3
.endm
.macro VLD_DST_ADD_W4 in0, in1
vld vr0, a0, 0
vldx vr1, a0, a1
vld vr2, t2, 0
vldx vr3, t2, a1
DST_ADD_W4 vr0, vr1, vr2, vr3, \in0, \in1
.endm
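// inv_txfm_add_wht_wht_4x4: a0 = dst pointer, a1 = dst stride in bytes,
// a2 = coefficient buffer (zeroed as it is consumed); a3 carries the eob for
// the generic entry points below, which use the same argument layout.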
function inv_txfm_add_wht_wht_4x4_8bpc_lsx
vld vr0, a2, 0
vld vr2, a2, 16
vxor.v vr20, vr20, vr20
vsrai.h vr0, vr0, 2
vsrai.h vr2, vr2, 2
vst vr20, a2, 0
vpickod.d vr1, vr0, vr0
vpickod.d vr3, vr2, vr2
vst vr20, a2, 16
iwht4
LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5
iwht4
vilvl.d vr4, vr1, vr0
vilvl.d vr5, vr3, vr2
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W4 vr4, vr5
endfunc
const idct_coeffs, align=4
.word 2896, 2896*8, 1567, 3784
.word 799, 4017, 3406, 2276
.word 401, 4076, 3166, 2598
.word 1931, 3612, 3920, 1189
.word 201, 4091, 3035, 2751
.word 1751, 3703, 3857, 1380
.word 995, 3973, 3513, 2106
.word 2440, 3290, 4052, 601
endconst
.macro vsrari_h_x4 in0, in1, in2, in3, out0, out1, out2, out3, shift
vsrari.h \out0, \in0, \shift
vsrari.h \out1, \in1, \shift
vsrari.h \out2, \in2, \shift
vsrari.h \out3, \in3, \shift
.endm
.macro vsrari_h_x8 in0, in1, in2, in3, in4, in5, in6, in7, out0, \
out1, out2, out3, out4, out5, out6, out7, shift
vsrari.h \out0, \in0, \shift
vsrari.h \out1, \in1, \shift
vsrari.h \out2, \in2, \shift
vsrari.h \out3, \in3, \shift
vsrari.h \out4, \in4, \shift
vsrari.h \out5, \in5, \shift
vsrari.h \out6, \in6, \shift
vsrari.h \out7, \in7, \shift
.endm
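// Butterfly multiply-accumulate: per 16-bit lane compute the 32-bit sum
// \in0*\in2 + \in1*\in3, producing even and odd lanes separately and then
// re-interleaving them into element order. For .4h only \out0 (4 words) is
// valid; for .8h the result spans \out0 (low half) and \out1 (high half),
// and vr22 is clobbered.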
.macro vmulev_vmaddod_lsx in0, in1, in2, in3, out0, out1, sz
vmulwev.w.h \out0, \in0, \in2
vmulwod.w.h \out1, \in0, \in2
vmaddwev.w.h \out0, \in1, \in3
vmaddwod.w.h \out1, \in1, \in3
.ifc \sz, .4h
vilvl.w \out0, \out1, \out0
.else
vilvl.w vr22, \out1, \out0
vilvh.w \out1, \out1, \out0
vor.v \out0, vr22, vr22
.endif
.endm
const idct_coeffs_h, align=4
.short 2896, 2896*8, 1567, 3784
.short 799, 4017, 3406, 2276
.short 401, 4076, 3166, 2598
.short 1931, 3612, 3920, 1189
.short 201, 4091, 3035, 2751
.short 1751, 3703, 3857, 1380
.short 995, 3973, 3513, 2106
.short 2440, 3290, 4052, 601
endconst
const iadst4_coeffs, align=4
.word 1321, 3803, 2482, 3344
endconst
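// 4-point inverse DCT: t0/t1 come from (in0 +/- in2) scaled by 2896
// (1/sqrt(2) in Q12), t2/t3 from the 1567/3784 rotation of in1/in3; all
// products are rounded by 12 bits before the output butterfly.
// Clobbers vr16-vr22 as temporaries.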
.macro inv_dct4_lsx in0, in1, in2, in3, out0, out1, out2, out3, sz
la.local t0, idct_coeffs_h
vldrepl.h vr20, t0, 0 // 2896
vmulev_vmaddod_lsx \in0, \in2, vr20, vr20, vr16, vr18, \sz
vneg.h vr21, vr20
vmulev_vmaddod_lsx \in0, \in2, vr20, vr21, vr17, vr19, \sz
vssrarni.h.w vr18, vr16, 12 // t0
vssrarni.h.w vr19, vr17, 12 // t1
vldrepl.h vr20, t0, 4 // 1567
vldrepl.h vr21, t0, 6 // 3784
vmulev_vmaddod_lsx \in1, \in3, vr21, vr20, \in0, vr16, \sz
vneg.h vr21, vr21
vmulev_vmaddod_lsx \in1, \in3, vr20, vr21, \in2, vr17, \sz
vssrarni.h.w vr16, \in0, 12 // t3
vssrarni.h.w vr17, \in2, 12 // t2
vsadd.h \out0, vr18, vr16
vsadd.h \out1, vr19, vr17
vssub.h \out2, vr19, vr17
vssub.h \out3, vr18, vr16
.endm
functionl inv_dct_4h_x4_lsx
inv_dct4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, .4h
endfuncl
functionl inv_dct_8h_x4_lsx
inv_dct4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, .8h
endfuncl
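// Core of the 4-point inverse ADST on 32-bit lanes, using the AV1 constants
// 1321, 3803, 2482 and 3344 preloaded in vr20-vr23; the results are left
// unrounded, the callers narrow them with a 12-bit rounding shift.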
.macro inv_adst4_core_lsx in0, in1, in2, in3, out0, out1, out2, out3
vsub.w vr16, \in0, \in2 // in0-in2
vmul.w vr17, \in0, vr20 // in0*1321
vmul.w vr19, \in0, vr22 // in0*2482
vmul.w vr18, \in1, vr23 // in1*3344
vmadd.w vr17, \in2, vr21 // in0*1321+in2*3803
vmsub.w vr19, \in2, vr20 // in2*1321
vadd.w vr16, vr16, \in3 // in0-in2+in3
vmadd.w vr17, \in3, vr22 // in0*1321+in2*3803+in3*2482
vmsub.w vr19, \in3, vr21 // in0*2482-in2*1321-in3*3803
vadd.w vr15, vr17, vr19
vmul.w \out2, vr16, vr23 // out[2] 8 9 10 11
vadd.w \out0, vr17, vr18 // out[0] 0 1 2 3
vadd.w \out1, vr19, vr18 // out[1] 4 5 6 7
vsub.w \out3, vr15, vr18 // out[3] 12 13 14 15
.endm
.macro inv_adst4_lsx in0, in1, in2, in3, out0, out1, out2, out3
la.local t0, iadst4_coeffs
vldrepl.w vr20, t0, 0 // 1321
vldrepl.w vr21, t0, 4 // 3803
vldrepl.w vr22, t0, 8 // 2482
vldrepl.w vr23, t0, 12 // 3344
vsllwil.w.h vr0, \in0, 0
vsllwil.w.h vr1, \in1, 0
vsllwil.w.h vr2, \in2, 0
vsllwil.w.h vr3, \in3, 0
inv_adst4_core_lsx vr0, vr1, vr2, vr3, \out0, \out1, \out2, \out3
vssrarni.h.w \out0, \out0, 12
vssrarni.h.w \out1, \out1, 12
vssrarni.h.w \out2, \out2, 12
vssrarni.h.w \out3, \out3, 12
.endm
functionl inv_adst_4h_x4_lsx
inv_adst4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3
endfuncl
functionl inv_flipadst_4h_x4_lsx
inv_adst4_lsx vr0, vr1, vr2, vr3, vr3, vr2, vr1, vr0
endfuncl
.macro inv_adst_8x4_lsx in0, in1, in2, in3, out0, out1, out2, out3
la.local t0, iadst4_coeffs
vldrepl.w vr20, t0, 0 // 1321
vldrepl.w vr21, t0, 4 // 3803
vldrepl.w vr22, t0, 8 // 2482
vldrepl.w vr23, t0, 12 // 3344
vsllwil.w.h vr10, \in0, 0 // in0
vsllwil.w.h vr11, \in1, 0 // in1
vsllwil.w.h vr12, \in2, 0 // in2
vsllwil.w.h vr13, \in3, 0 // in3
inv_adst4_core_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
vexth.w.h \in0, \in0 // in0
vexth.w.h \in1, \in1 // in1
vexth.w.h \in2, \in2 // in2
vexth.w.h \in3, \in3 // in3
inv_adst4_core_lsx \in0, \in1, \in2, \in3, \out0, \out1, \out2, \out3
vssrarni.h.w \out0, vr10, 12
vssrarni.h.w \out1, vr11, 12
vssrarni.h.w \out2, vr12, 12
vssrarni.h.w \out3, vr13, 12
.endm
functionl inv_adst_8h_x4_lsx
inv_adst_8x4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3
endfuncl
functionl inv_flipadst_8h_x4_lsx
inv_adst_8x4_lsx vr0, vr1, vr2, vr3, vr3, vr2, vr1, vr0
endfuncl
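// 4-point identity transform: out = x + ((x * 1697 + 2048) >> 12), i.e. a
// scale by roughly sqrt(2). The four 4h rows arrive in vr0-vr3 and are
// returned in place.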
functionl inv_identity_4h_x4_lsx
li.w t0, 1697
vreplgr2vr.h vr20, t0
vilvl.d vr0, vr1, vr0
vilvl.d vr2, vr3, vr2
vmulwev.w.h vr16, vr0, vr20
vmulwod.w.h vr17, vr0, vr20
vmulwev.w.h vr18, vr2, vr20
vmulwod.w.h vr19, vr2, vr20
vilvl.w vr1, vr17, vr16
vilvh.w vr3, vr17, vr16
vilvl.w vr22, vr19, vr18
vilvh.w vr23, vr19, vr18
vssrarni.h.w vr3, vr1, 12
vssrarni.h.w vr23, vr22, 12
vsadd.h vr0, vr3, vr0 // t0
vsadd.h vr2, vr23, vr2 // t2
vilvh.d vr1, vr0, vr0 // t1
vilvh.d vr3, vr2, vr2 // t3
endfuncl
.macro inv_identity4_lsx1 in0, in1, in2, out0, out1
vsllwil.w.h vr16, \in0, 0
vexth.w.h vr17, \in1
vmul.w vr18, vr16, \in2
vmul.w vr19, vr17, \in2
vsrari.w vr18, vr18, 12
vsrari.w vr19, vr19, 12
vadd.w \out0, vr18, vr16
vadd.w \out1, vr19, vr17
vssrarni.h.w \out1, \out0, 1
.endm
functionl inv_identity_8h_x4_lsx
li.w t0, 1697
vreplgr2vr.h vr20, t0
vmulwev.w.h vr16, vr0, vr20
vmulwod.w.h vr17, vr0, vr20
vmulwev.w.h vr18, vr1, vr20
vmulwod.w.h vr19, vr1, vr20
vilvl.w vr21, vr17, vr16
vilvh.w vr22, vr17, vr16
vilvl.w vr23, vr19, vr18
vilvh.w vr16, vr19, vr18
vssrarni.h.w vr22, vr21, 12
vssrarni.h.w vr16, vr23, 12
vsadd.h vr0, vr22, vr0 // t0
vsadd.h vr1, vr16, vr1 // t1
vmulwev.w.h vr16, vr2, vr20
vmulwod.w.h vr17, vr2, vr20
vmulwev.w.h vr18, vr3, vr20
vmulwod.w.h vr19, vr3, vr20
vilvl.w vr21, vr17, vr16
vilvh.w vr22, vr17, vr16
vilvl.w vr23, vr19, vr18
vilvh.w vr16, vr19, vr18
vssrarni.h.w vr22, vr21, 12
vssrarni.h.w vr16, vr23, 12
vsadd.h vr2, vr22, vr2 // t2
vsadd.h vr3, vr16, vr3 // t3
endfuncl
functionl inv_identity_8h_x4_lsx1
li.w t0, 1697
vreplgr2vr.w vr20, t0
.irp i, vr0, vr1, vr2, vr3
inv_identity4_lsx1 \i, \i, vr20, vr21, \i
.endr
endfuncl
functionl inv_txfm_add_4x4_lsx
vxor.v vr23, vr23, vr23
vld vr0, a2, 0
vld vr2, a2, 16
vilvh.d vr1, vr0, vr0
vilvh.d vr3, vr2, vr2
vst vr23, a2, 0
vst vr23, a2, 16
move t6, ra
jirl ra, t7, 0
move ra, t6
LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5
move t6, ra
jirl ra, t8, 0
move ra, t6
vilvl.d vr4, vr1, vr0
vilvl.d vr5, vr3, vr2
vsrari.h vr4, vr4, 4
vsrari.h vr5, vr5, 4
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W4 vr4, vr5
endfuncl
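// DC-only fast path: scale the single DC coefficient by 181/256 (~1/sqrt(2)),
// once more for rectangular (2:1) blocks, apply the first-pass down-shift,
// then form the final pixel offset ((dc * 181 + 128 + 2048) >> 12) replicated
// across vr20. Four destination rows are preloaded into vr10-vr13 on the way.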
.macro idct_dc w, h, shift
ld.h t2, a2, 0 // dc
vldi vr0, 0x8b5 // 181
vreplgr2vr.w vr1, t2
vldi vr20, 0x880 // 128
vmul.w vr2, vr0, vr1 // dc * 181
st.h zero, a2, 0
vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
vld vr10, a0, 0 // 0 1 2 3 4 5 6 7
.if (2*\w == \h) || (2*\h == \w)
vmul.w vr2, vr0, vr2
vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
.endif
.if \shift>0
vsrari.w vr2, vr2, \shift // (dc + rnd) >> shift
.endif
vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15
alsl.d t2, a1, a0, 1
vmadd.w vr20, vr2, vr0
vld vr12, t2, 0 // 16 17 18 19 20 21 22 23
vssrarni.h.w vr20, vr20, 12
vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31
.endm
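// Emit one 4x4 entry point per (txfm1, txfm2) pair. For dct_dct an eob of 0
// (a3 == 0) takes the DC-only shortcut above; otherwise the row/column
// transform helpers are reached through t7/t8 and the shared
// inv_txfm_add_4x4_lsx tail.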
.macro fun4x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
bnez a3, 1f
idct_dc 4, 4, 0
DST_ADD_W4 vr10, vr11, vr12, vr13, vr20, vr20
b .\txfm1\()_\txfm2\()_4X4_END
1:
.endif
la.local t7, inv_\txfm1\()_4h_x4_lsx
la.local t8, inv_\txfm2\()_4h_x4_lsx
b inv_txfm_add_4x4_lsx
.\txfm1\()_\txfm2\()_4X4_END:
endfunc
.endm
fun4x4 dct, dct
fun4x4 identity, identity
fun4x4 adst, dct
fun4x4 dct, adst
fun4x4 adst, adst
fun4x4 dct, flipadst
fun4x4 flipadst, adst
fun4x4 adst, flipadst
fun4x4 flipadst, dct
fun4x4 flipadst, flipadst
fun4x4 dct, identity
fun4x4 identity, dct
fun4x4 flipadst, identity
fun4x4 identity, flipadst
fun4x4 identity, adst
fun4x4 adst, identity
const iadst8_coeffs_h, align=4
.short 4076, 401, 3612, 1931
.short 2598, 3166, 1189, 3920
.short 2896, 0, 1567, 3784, 0, 0, 0, 0
endconst
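// 8-point inverse ADST. The eight inputs arrive in vr0-vr7; results are
// written to \out0-\out7, so the flipadst variants simply pass the output
// registers in reverse order. Outputs 1, 3, 5 and 7 are negated as the ADST
// definition requires.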
.macro inv_adst8_lsx out0, out1, out2, out3, out4, out5, out6, out7, sz
la.local t0, iadst8_coeffs_h
vldrepl.h vr20, t0, 0 // 4076
vldrepl.h vr21, t0, 2 // 401
vmulev_vmaddod_lsx vr7, vr0, vr20, vr21, vr16, vr17, \sz
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr7, vr0, vr21, vr20, vr18, vr19, \sz
vssrarni.h.w vr17, vr16, 12 // t0a
vssrarni.h.w vr19, vr18, 12 // t1a
vldrepl.h vr20, t0, 4 // 3612
vldrepl.h vr21, t0, 6 // 1931
vmulev_vmaddod_lsx vr5, vr2, vr20, vr21, vr0, vr16, \sz
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr5, vr2, vr21, vr20, vr7, vr18, \sz
vssrarni.h.w vr16, vr0, 12 // t2a
vssrarni.h.w vr18, vr7, 12 // t3a
vldrepl.h vr20, t0, 8 // 2598
vldrepl.h vr21, t0, 10 // 3166
vmulev_vmaddod_lsx vr3, vr4, vr20, vr21, vr2, vr0, \sz
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr3, vr4, vr21, vr20, vr5, vr7, \sz
vssrarni.h.w vr0, vr2, 12 // t4a
vssrarni.h.w vr7, vr5, 12 // t5a
vldrepl.h vr20, t0, 12 // 1189
vldrepl.h vr21, t0, 14 // 3920
vmulev_vmaddod_lsx vr1, vr6, vr20, vr21, vr3, vr2, \sz
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr1, vr6, vr21, vr20, vr4, vr5, \sz
vssrarni.h.w vr2, vr3, 12 // t6a
vssrarni.h.w vr5, vr4, 12 // t7a
vsadd.h vr3, vr17, vr0 // t0
vssub.h vr4, vr17, vr0 // t4
vsadd.h vr1, vr19, vr7 // t1
vssub.h vr6, vr19, vr7 // t5
vsadd.h vr17, vr16, vr2 // t2
vssub.h vr19, vr16, vr2 // t6
vsadd.h vr0, vr18, vr5 // t3
vssub.h vr7, vr18, vr5 // t7
la.local t0, idct_coeffs_h
vldrepl.h vr20, t0, 4 // 1567
vldrepl.h vr21, t0, 6 // 3784
vmulev_vmaddod_lsx vr4, vr6, vr21, vr20, vr16, vr5, \sz
vneg.h vr21, vr21
vmulev_vmaddod_lsx vr4, vr6, vr20, vr21, vr18, vr2, \sz
vssrarni.h.w vr5, vr16, 12 // t4a
vssrarni.h.w vr2, vr18, 12 // t5a
vneg.h vr21, vr21
vmulev_vmaddod_lsx vr7, vr19, vr20, vr21, vr4, vr16, \sz
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr7, vr19, vr21, vr20, vr6, vr18, \sz
vssrarni.h.w vr16, vr4, 12 // t7a
vssrarni.h.w vr18, vr6, 12 // t6a
vsadd.h vr4, vr5, vr18 // out1
vssub.h vr19, vr5, vr18 // t6
vsadd.h vr20, vr1, vr0 // out7
vssub.h vr18, vr1, vr0 // t3
vsadd.h \out0, vr3, vr17 // out0
vssub.h vr5, vr3, vr17 // t2
vsadd.h \out6, vr2, vr16 // out6
vssub.h vr23, vr2, vr16 // t7
vsllwil.w.h vr3, vr20, 0 // out7
vexth.w.h \out7, vr20 // out7
vsllwil.w.h vr21, vr4, 0 // out1
vexth.w.h \out1, vr4 // out1
vneg.w vr3, vr3
vneg.w \out7, \out7
vneg.w vr21, vr21
vneg.w \out1, \out1
vssrarni.h.w \out7, vr3, 0
vssrarni.h.w \out1, vr21, 0
la.local t0, idct_coeffs_h
vldrepl.h vr20, t0, 0 // 2896
vmulev_vmaddod_lsx vr5, vr18, vr20, vr20, vr16, \out3, \sz
vneg.h vr21, vr20
vmulev_vmaddod_lsx vr5, vr18, vr20, vr21, vr17, \out4, \sz
vsrari.w vr16, vr16, 12
vsrari.w \out3, \out3, 12
vneg.w vr16, vr16
vneg.w \out3, \out3
vssrarni.h.w \out3, vr16, 0 // out3
vssrarni.h.w \out4, vr17, 12 // out4
vmulev_vmaddod_lsx vr19, vr23, vr20, vr20, vr16, \out2, \sz
vmulev_vmaddod_lsx vr19, vr23, vr20, vr21, vr17, \out5, \sz
vssrarni.h.w \out2, vr16, 12 // out2
vsrari.w vr17, vr17, 12
vsrari.w \out5, \out5, 12
vneg.w vr17, vr17
vneg.w \out5, \out5
vssrarni.h.w \out5, vr17, 0 // out5
.endm
functionl inv_adst_8h_x8_lsx
inv_adst8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
endfuncl
functionl inv_flipadst_8h_x8_lsx
inv_adst8_lsx vr7, vr6, vr5, vr4, vr3, vr2, vr1, vr0, .8h
endfuncl
functionl inv_adst_4h_x8_lsx
inv_adst8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
endfuncl
functionl inv_flipadst_4h_x8_lsx
inv_adst8_lsx vr7, vr6, vr5, vr4, vr3, vr2, vr1, vr0, .8h
endfuncl
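// 8-point inverse DCT: the even inputs go through inv_dct4_lsx, the odd ones
// through the 799/4017 and 3406/2276 rotations plus a 2896 (1/sqrt(2))
// rotation for t5/t6, followed by the output butterfly. Operates in place on
// \in0-\in7; vr16-vr23 are clobbered.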
.macro inv_dct8_lsx in0, in1, in2, in3, in4, in5, in6, in7, sz
inv_dct4_lsx \in0, \in2, \in4, \in6, \in0, \in2, \in4, \in6, \sz
la.local t0, idct_coeffs_h
vldrepl.h vr20, t0, 8 // 799
vldrepl.h vr21, t0, 10 // 4017
vmulev_vmaddod_lsx \in1, \in7, vr21, vr20, vr16, vr17, \sz
vneg.h vr21, vr21
vmulev_vmaddod_lsx \in1, \in7, vr20, vr21, vr18, vr19, \sz
vssrarni.h.w vr17, vr16, 12 // t7a
vssrarni.h.w vr19, vr18, 12 // t4a
vldrepl.h vr20, t0, 12 // 3406
vldrepl.h vr21, t0, 14 // 2276
vmulev_vmaddod_lsx \in5, \in3, vr21, vr20, \in1, vr16, \sz
vneg.h vr21, vr21
vmulev_vmaddod_lsx \in5, \in3, vr20, vr21, \in7, vr18, \sz
vssrarni.h.w vr16, \in1, 12 // t6a
vssrarni.h.w vr18, \in7, 12 // t5a
vssub.h \in7, vr19, vr18 // t5a
vsadd.h vr18, vr19, vr18 // t4
vssub.h \in5, vr17, vr16 // t6a
vsadd.h vr16, vr17, vr16 // t7
vldrepl.h vr20, t0, 0 // 2896
vmulev_vmaddod_lsx \in5, \in7, vr20, vr20, \in1, vr17, \sz
vneg.h vr21, vr20
vmulev_vmaddod_lsx \in5, \in7, vr20, vr21, vr23, vr19, \sz
vssrarni.h.w vr17, \in1, 12 // t6
vssrarni.h.w vr19, vr23, 12 // t5
vssub.h \in7, \in0, vr16 //c[7]
vsadd.h \in0, \in0, vr16 //c[0]
vssub.h \in5, \in4, vr19 //c[5]
vsadd.h vr23, \in4, vr19 //c[2]
vssub.h \in4, \in6, vr18 //c[4]
vsadd.h \in3, \in6, vr18 //c[3]
vssub.h \in6, \in2, vr17 //c[6]
vsadd.h \in1, \in2, vr17 //c[1]
vor.v \in2, vr23, vr23
.endm
functionl inv_dct_8h_x8_lsx
inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
endfuncl
functionl inv_dct_4h_x8_lsx
inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .4h
endfuncl
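// Add four rows of 8 16-bit residuals (\in4-\in7) to the loaded dst rows
// \in0-\in3, clip to u8 and store 8 pixels per row; the vstelmx stores
// advance a0 by the stride, so a0 ends up pointing at the last stored row,
// which the callers account for.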
.macro DST_ADD_W8 in0, in1, in2, in3, in4, in5, in6, in7
vsllwil.hu.bu vr0, \in0, 0
vsllwil.hu.bu vr1, \in1, 0
vsllwil.hu.bu vr2, \in2, 0
vsllwil.hu.bu vr3, \in3, 0
vadd.h vr0, \in4, vr0
vadd.h vr1, \in5, vr1
vadd.h vr2, \in6, vr2
vadd.h vr3, \in7, vr3
vssrani.bu.h vr1, vr0, 0
vssrani.bu.h vr3, vr2, 0
vstelm.d vr1, a0, 0, 0
vstelmx.d vr1, a0, a1, 1
vstelmx.d vr3, a0, a1, 0
vstelmx.d vr3, a0, a1, 1
.endm
.macro VLD_DST_ADD_W8 in0, in1, in2, in3
vld vr0, a0, 0
vldx vr1, a0, a1
vld vr2, t2, 0
vldx vr3, t2, a1
DST_ADD_W8 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3
.endm
functionl inv_identity_8h_x8_lsx
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
vsadd.h \i, \i, \i
.endr
endfuncl
functionl inv_identity_4h_x8_lsx
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
vsadd.h \i, \i, \i
.endr
endfuncl
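// Shared 8x8 add paths. The identity_ variant skips the first (row) pass
// entirely, since the identity x2 scale and the inter-pass >>1 rounding shift
// cancel, and jumps straight into the transpose/column-pass epilog of the
// generic variant.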
.macro def_fn_8x8_base variant
functionl inv_txfm_\variant\()add_8x8_lsx
vxor.v vr23, vr23, vr23
vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
vst vr23, a2, \i
.endr
.ifc \variant, identity_
// The identity transform's left shift by 1 and the inter-pass rounding
// right shift by 1 cancel out
b .itx_8x8_epilog
.else
move t6, ra
jirl ra, t7, 0
move ra, t6
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
vsrari.h \i, \i, 1
.endr
.itx_8x8_epilog:
LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
move t6, ra
jirl ra, t8, 0
move ra, t6
vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
add.d a0, a0, a1
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
.endif
endfuncl
.endm
def_fn_8x8_base identity_
def_fn_8x8_base
.macro fn8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
bnez a3, .NO_HAS_DCONLY_8x8
idct_dc 8, 8, 1
DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20
add.d a0, a1, a0
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W8 vr20, vr20, vr20, vr20
b .\txfm1\()_\txfm2\()_8X8_END
.NO_HAS_DCONLY_8x8:
.endif
la.local t8, inv_\txfm2\()_8h_x8_lsx
.ifc \txfm1, identity
b inv_txfm_identity_add_8x8_lsx
.else
la.local t7, inv_\txfm1\()_8h_x8_lsx
b inv_txfm_add_8x8_lsx
.endif
.\txfm1\()_\txfm2\()_8X8_END:
endfunc
.endm
fn8x8 dct, dct
fn8x8 identity, identity
fn8x8 dct, adst
fn8x8 dct, flipadst
fn8x8 dct, identity
fn8x8 adst, dct
fn8x8 adst, adst
fn8x8 adst, flipadst
fn8x8 flipadst, dct
fn8x8 flipadst, adst
fn8x8 flipadst, flipadst
fn8x8 identity, dct
fn8x8 adst, identity
fn8x8 flipadst, identity
fn8x8 identity, adst
fn8x8 identity, flipadst
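// Scale one vector of coefficients by \in1 (2896, i.e. 1/sqrt(2) in Q12) with
// a 12-bit rounding shift, as required for rectangular transforms whose sides
// differ by a factor of two. vr22 is clobbered.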
.macro rect2_lsx in0, in1, out0
vsllwil.w.h vr22, \in0, 0 // in1
vexth.w.h \in0, \in0 // in1
vmul.w vr22, vr22, \in1
vmul.w \out0, \in0, \in1
vssrarni.h.w \out0, vr22, 12
.endm
.macro LSX_TRANSPOSE8x4_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
out2, out3, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5
vilvl.h \tmp0, \in1, \in0
vilvl.h \tmp1, \in3, \in2
vilvl.w \tmp2, \tmp1, \tmp0
vilvh.w \tmp3, \tmp1, \tmp0
vilvl.h \tmp0, \in5, \in4
vilvl.h \tmp1, \in7, \in6
vilvl.w \tmp4, \tmp1, \tmp0
vilvh.w \tmp5, \tmp1, \tmp0
vilvl.d \out0, \tmp4, \tmp2
vilvh.d \out1, \tmp4, \tmp2
vilvl.d \out2, \tmp5, \tmp3
vilvh.d \out3, \tmp5, \tmp3
.endm
functionl inv_txfm_add_8x4_lsx
vxor.v vr23, vr23, vr23
vld vr0, a2, 0
vld vr2, a2, 16
vld vr4, a2, 32
vld vr6, a2, 48
.irp i, 0, 16, 32, 48
vst vr23, a2, \i
.endr
li.w t0, 2896
vreplgr2vr.w vr23, t0
rect2_lsx vr0, vr23, vr0
rect2_lsx vr2, vr23, vr2
rect2_lsx vr4, vr23, vr4
rect2_lsx vr6, vr23, vr6
vilvh.d vr1, vr0, vr0
vilvh.d vr3, vr2, vr2
vilvh.d vr5, vr4, vr4
vilvh.d vr7, vr6, vr6
move t6, ra
jirl ra, t7, 0
move ra, t6
LSX_TRANSPOSE8x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr0, vr1, \
vr2, vr3, vr16, vr17, vr18, vr19, vr20, vr21
move t6, ra
jirl ra, t8, 0
move ra, t6
vsrari_h_x4 vr0, vr1, vr2, vr3, vr16, vr17, vr18, vr19, 4
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
endfuncl
.macro LSX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, out4, \
out5, out6, out7, tmp0, tmp1, tmp2, tmp3
vilvl.h \tmp0, \in1, \in0
vilvl.h \tmp1, \in3, \in2
vilvh.h \tmp2, \in1, \in0
vilvh.h \tmp3, \in3, \in2
vilvl.w \out0, \tmp1, \tmp0
vilvh.w \out2, \tmp1, \tmp0
vilvl.w \out4, \tmp3, \tmp2
vilvh.w \out6, \tmp3, \tmp2
vbsrl.v \out1, \out0, 8
vbsrl.v \out3, \out2, 8
vbsrl.v \out5, \out4, 8
vbsrl.v \out7, \out6, 8
vinsgr2vr.d \out0, zero, 1
vinsgr2vr.d \out2, zero, 1
vinsgr2vr.d \out4, zero, 1
vinsgr2vr.d \out6, zero, 1
.endm
functionl inv_txfm_add_4x8_lsx
vxor.v vr23, vr23, vr23
vld vr0, a2, 0
vld vr1, a2, 16
vld vr2, a2, 32
vld vr3, a2, 48
.irp i, 0, 16, 32, 48
vst vr23, a2, \i
.endr
li.w t0, 2896
vreplgr2vr.w vr23, t0
rect2_lsx vr0, vr23, vr0
rect2_lsx vr1, vr23, vr1
rect2_lsx vr2, vr23, vr2
rect2_lsx vr3, vr23, vr3
move t6, ra
jirl ra, t7, 0
move ra, t6
LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5, \
vr6, vr7, vr16, vr17, vr18, vr19
move t6, ra
jirl ra, t8, 0
move ra, t6
vilvl.d vr0, vr1, vr0
vilvl.d vr1, vr3, vr2
vilvl.d vr2, vr5, vr4
vilvl.d vr3, vr7, vr6
vsrari_h_x4 vr0, vr1, vr2, vr3, vr16, vr17, vr18, vr19, 4
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W4 vr16, vr17
add.d a0, a1, a0
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W4 vr18, vr19
endfuncl
.macro fn8x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x4_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
bnez a3, .NO_HAS_DCONLY_8x4
idct_dc 8, 4, 0
DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20
b .\txfm1\()_\txfm2\()_8X4_END
.NO_HAS_DCONLY_8x4:
.endif
la.local t7, inv_\txfm1\()_4h_x8_lsx
la.local t8, inv_\txfm2\()_8h_x4_lsx
b inv_txfm_add_8x4_lsx
.\txfm1\()_\txfm2\()_8X4_END:
endfunc
.endm
fn8x4 dct, dct
fn8x4 identity, identity
fn8x4 dct, adst
fn8x4 dct, flipadst
fn8x4 dct, identity
fn8x4 adst, dct
fn8x4 adst, adst
fn8x4 adst, flipadst
fn8x4 flipadst, dct
fn8x4 flipadst, adst
fn8x4 flipadst, flipadst
fn8x4 identity, dct
fn8x4 adst, identity
fn8x4 flipadst, identity
fn8x4 identity, adst
fn8x4 identity, flipadst
.macro fn4x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x8_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
bnez a3, .NO_HAS_DCONLY_4x8
idct_dc 4, 8, 0
DST_ADD_W4 vr10, vr11, vr12, vr13, vr20, vr20
add.d a0, a0, a1
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W4 vr20, vr20
b .\txfm1\()_\txfm2\()_4X8_END
.NO_HAS_DCONLY_4x8:
.endif
la.local t7, inv_\txfm1\()_8h_x4_lsx
la.local t8, inv_\txfm2\()_4h_x8_lsx
b inv_txfm_add_4x8_lsx
.\txfm1\()_\txfm2\()_4X8_END:
endfunc
.endm
fn4x8 dct, dct
fn4x8 identity, identity
fn4x8 dct, adst
fn4x8 dct, flipadst
fn4x8 dct, identity
fn4x8 adst, dct
fn4x8 adst, adst
fn4x8 adst, flipadst
fn4x8 flipadst, dct
fn4x8 flipadst, adst
fn4x8 flipadst, flipadst
fn4x8 identity, dct
fn4x8 adst, identity
fn4x8 flipadst, identity
fn4x8 identity, adst
fn4x8 identity, flipadst
.macro inv_identity4_lsx_x2 in0, in1, in2, in3, in4, out0, out1
vsllwil.w.h vr4, \in0, 0
vexth.w.h vr5, \in0
vsllwil.w.h vr6, \in1, 0
vexth.w.h vr7, \in1
vmul.w vr4, vr4, \in2
vmul.w vr5, vr5, \in2
vmul.w vr6, vr6, \in2
vmul.w vr7, vr7, \in2
vssrarni.h.w vr5, vr4, 12
vssrarni.h.w vr7, vr6, 12
vsadd.h \out0, vr5, \in3
vsadd.h \out1, vr7, \in4
.endm
.macro vmul_vmadd_w in0, in1, in2, in3, out0, out1
vsllwil.w.h vr22, \in0, 0
vexth.w.h vr23, \in0
vmul.w \out0, vr22, \in2
vmul.w \out1, vr23, \in2
vsllwil.w.h vr22, \in1, 0
vexth.w.h vr23, \in1
vmadd.w \out0, vr22, \in3
vmadd.w \out1, vr23, \in3
.endm
.macro vmul_vmsub_w in0, in1, in2, in3, out0, out1
vsllwil.w.h vr22, \in0, 0
vexth.w.h vr23, \in0
vmul.w \out0, vr22, \in2
vmul.w \out1, vr23, \in2
vsllwil.w.h vr22, \in1, 0
vexth.w.h vr23, \in1
vmsub.w \out0, vr22, \in3
vmsub.w \out1, vr23, \in3
.endm
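// 16-point inverse DCT on vr0-vr15: the even half reuses inv_dct8_lsx, the
// odd half applies the 401/4076, 3166/2598, 1931/3612 and 3920/1189 rotations
// followed by the 1567/3784 and 2896 stages and the final output butterfly.
// vr16-vr23 are used as temporaries.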
.macro inv_dct16_lsx sz
inv_dct8_lsx vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14, \sz
la.local t0, idct_coeffs_h
vldrepl.h vr20, t0, 16 // 401
vldrepl.h vr21, t0, 18 // 4076
vmulev_vmaddod_lsx vr1, vr15, vr21, vr20, vr16, vr17, \sz
vneg.h vr21, vr21
vmulev_vmaddod_lsx vr1, vr15, vr20, vr21, vr18, vr19, \sz
vssrarni.h.w vr17, vr16, 12 // t15a
vssrarni.h.w vr19, vr18, 12 // t8a
vldrepl.h vr20, t0, 20 // 3166 -> 1583
vldrepl.h vr21, t0, 22 // 2598 -> 1299
vmulev_vmaddod_lsx vr9, vr7, vr21, vr20, vr1, vr16, \sz
vneg.h vr21, vr21
vmulev_vmaddod_lsx vr9, vr7, vr20, vr21, vr15, vr18, \sz
vssrarni.h.w vr16, vr1, 12 // t14a
vssrarni.h.w vr18, vr15, 12 // t9a
vldrepl.h vr20, t0, 24 // 1931
vldrepl.h vr21, t0, 26 // 3612
vmulev_vmaddod_lsx vr5, vr11, vr21, vr20, vr7, vr1, \sz
vneg.h vr21, vr21
vmulev_vmaddod_lsx vr5, vr11, vr20, vr21, vr9, vr15, \sz
vssrarni.h.w vr1, vr7, 12 // t13a
vssrarni.h.w vr15, vr9, 12 // t10a
vldrepl.h vr20, t0, 28 // 3920
vldrepl.h vr21, t0, 30 // 1189
vmulev_vmaddod_lsx vr13, vr3, vr21, vr20, vr5, vr7, \sz
vneg.h vr21, vr21
vmulev_vmaddod_lsx vr13, vr3, vr20, vr21, vr11, vr9, \sz
vssrarni.h.w vr7, vr5, 12 // t12a
vssrarni.h.w vr9, vr11, 12 // t11a
vsadd.h vr5, vr19, vr18 // t8
vssub.h vr11, vr19, vr18 // t9
vssub.h vr3, vr9, vr15 // t10
vsadd.h vr13, vr9, vr15 // t11
vsadd.h vr18, vr7, vr1 // t12
vssub.h vr19, vr7, vr1 // t13
vssub.h vr9, vr17, vr16 // t14
vsadd.h vr15, vr17, vr16 // t15
vldrepl.h vr20, t0, 4 // 1567
vldrepl.h vr21, t0, 6 // 3784
vmulev_vmaddod_lsx vr9, vr11, vr21, vr20, vr1, vr16, \sz
vneg.h vr21, vr21
vmulev_vmaddod_lsx vr9, vr11, vr20, vr21, vr7, vr17, \sz
vssrarni.h.w vr16, vr1, 12 // t14a
vssrarni.h.w vr17, vr7, 12 // t9a
vneg.h vr21, vr21
vmulev_vmaddod_lsx vr19, vr3, vr21, vr20, vr9, vr1, \sz
vneg.h vr21, vr21
vmulev_vmaddod_lsx vr19, vr3, vr20, vr21, vr11, vr7, \sz
vneg.w vr1, vr1
vneg.w vr9, vr9
vssrarni.h.w vr7, vr11, 12 // t13a
vssrarni.h.w vr1, vr9, 12 // t10a
vsadd.h vr9, vr5, vr13 // t8a
vssub.h vr11, vr5, vr13 // t11a
vssub.h vr3, vr15, vr18 // t12a
vsadd.h vr19, vr15, vr18 // t15a
vsadd.h vr5, vr17, vr1 // t9
vssub.h vr13, vr17, vr1 // t10
vssub.h vr15, vr16, vr7 // t13
vsadd.h vr18, vr16, vr7 // t14
vldrepl.h vr20, t0, 0 // 2896
vmulev_vmaddod_lsx vr15, vr13, vr20, vr20, vr1, vr7, \sz
vneg.h vr21, vr20
vmulev_vmaddod_lsx vr15, vr13, vr20, vr21, vr17, vr16, \sz
vssrarni.h.w vr7, vr1, 12 // t13a
vssrarni.h.w vr16, vr17, 12 // t10a
vmulev_vmaddod_lsx vr3, vr11, vr20, vr20, vr13, vr23, \sz
vmulev_vmaddod_lsx vr3, vr11, vr20, vr21, vr15, vr17, \sz
vssrarni.h.w vr23, vr13, 12 // t12
vssrarni.h.w vr17, vr15, 12 // t11
vssub.h vr15, vr0, vr19 // c[15]
vsadd.h vr0, vr0, vr19 // c[0]
vsadd.h vr1, vr2, vr18 // c[1]
vssub.h vr20, vr2, vr18 // c[14]
vsadd.h vr2, vr4, vr7 // c[2]
vssub.h vr13, vr4, vr7 // c[13]
vsadd.h vr3, vr6, vr23 // c[3]
vssub.h vr21, vr6, vr23 // c[12]
vsadd.h vr4, vr8, vr17 // c[4]
vssub.h vr11, vr8, vr17 // c[11]
vsadd.h vr7, vr14, vr9 // c[7]
vssub.h vr8, vr14, vr9 // c[8]
vsadd.h vr6, vr12, vr5 // c[6]
vssub.h vr9, vr12, vr5 // c[9]
vsadd.h vr5, vr10, vr16 // c[5]
vssub.h vr10, vr10, vr16 // c[10]
vor.v vr14, vr20, vr20
vor.v vr12, vr21, vr21
.endm
functionl inv_dct_8h_x16_lsx
inv_dct16_lsx .8h
endfuncl
functionl inv_dct_4h_x16_lsx
inv_dct16_lsx .4h
endfuncl
.macro VLD_DST_ADD_W4_x4 in0, in1, in2, in3, in4, in5, in6, in7
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W4 \in0, \in1
add.d a0, a1, a0
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W4 \in2, \in3
add.d a0, a1, a0
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W4 \in4, \in5
add.d a0, a1, a0
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W4 \in6, \in7
.endm
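// Shared 4x16 add paths. The coefficient vectors at byte offsets 16/48/80/112
// are only run through the first-pass transform (t7) when eob (a3) reaches
// the per-function threshold in t5; below it the corresponding registers are
// simply zeroed. The vectors at 0/32/64/96 are always processed, and both
// halves then go through the 16-point second pass (t8) and the 4-wide add.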
.macro def_fn_4x16_base txfm
functionl inv_txfm_\txfm\()add_4x16_lsx
PUSH_REG
blt a3, t5, 416f
vld vr0, a2, 16
vld vr1, a2, 48
vld vr2, a2, 80
vld vr3, a2, 112
vxor.v vr23, vr23, vr23
.irp i, 16, 48, 80, 112
vst vr23, a2, \i
.endr
move t6, ra
jirl ra, t7, 0
move ra, t6
.ifnc \txfm, identity_
vsrari.h vr0, vr0, 1
vsrari.h vr1, vr1, 1
vsrari.h vr2, vr2, 1
vsrari.h vr3, vr3, 1
.endif
LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr8, vr9, vr24, vr25, vr26, \
vr27, vr14, vr28, vr10, vr11, vr12, vr13
416:
ble t5, a3, 416416f
.irp i, vr8, vr9, vr24, vr25, vr26, vr27, vr14, vr28
vxor.v \i, \i, \i
.endr
416416:
vld vr0, a2, 0
vld vr1, a2, 32
vld vr2, a2, 64
vld vr3, a2, 96
vxor.v vr23, vr23, vr23
.irp i, 0, 32, 64, 96
vst vr23, a2, \i
.endr
move t6, ra
jirl ra, t7, 0
move ra, t6
.ifnc \txfm, identity_
vsrari.h vr0, vr0, 1
vsrari.h vr1, vr1, 1
vsrari.h vr2, vr2, 1
vsrari.h vr3, vr3, 1
.endif
LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5, \
vr6, vr7, vr16, vr17, vr18, vr19
vor.v vr10, vr24, vr24
vor.v vr11, vr25, vr25
vor.v vr12, vr26, vr26
vor.v vr13, vr27, vr27
vor.v vr15, vr28, vr28
move t6, ra
jirl ra, t8, 0
move ra, t6
vilvl.d vr16, vr1, vr0
vilvl.d vr17, vr3, vr2
vilvl.d vr18, vr5, vr4
vilvl.d vr19, vr7, vr6
vilvl.d vr20, vr9, vr8
vilvl.d vr21, vr11, vr10
vilvl.d vr22, vr13, vr12
vilvl.d vr23, vr15, vr14
.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
vsrari.h \i, \i, 4
.endr
VLD_DST_ADD_W4_x4 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
POP_REG
endfuncl
.endm
def_fn_4x16_base identity_
def_fn_4x16_base
.macro fn4x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_4x16_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
bnez a3, .NO_HAS_DCONLY_4x16
idct_dc 4, 16, 1
DST_ADD_W4 vr10, vr11, vr12, vr13, vr20, vr20
.rept 3
add.d a0, a1, a0
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W4 vr20, vr20
.endr
b .\txfm1\()_\txfm2\()_4X16_END
.NO_HAS_DCONLY_4x16:
.endif
li.w t5, \eob_half
la.local t7, inv_\txfm1\()_8h_x4_lsx
.ifc \txfm1, identity
la.local t7, inv_\txfm1\()_8h_x4_lsx1
.endif
la.local t8, inv_\txfm2\()_4h_x16_lsx
.ifc \txfm1, identity
b inv_txfm_identity_add_4x16_lsx
.else
b inv_txfm_add_4x16_lsx
.endif
.\txfm1\()_\txfm2\()_4X16_END:
endfunc
.endm
fn4x16 dct, dct, 29
fn4x16 identity, identity, 29
fn4x16 dct, adst, 29
fn4x16 dct, flipadst, 29
fn4x16 dct, identity, 8
fn4x16 adst, dct, 29
fn4x16 adst, adst, 29
fn4x16 adst, flipadst, 29
fn4x16 flipadst, dct, 29
fn4x16 flipadst, adst, 29
fn4x16 flipadst, flipadst, 29
fn4x16 identity, dct, 32
fn4x16 adst, identity, 8
fn4x16 flipadst, identity, 8
fn4x16 identity, adst, 32
fn4x16 identity, flipadst, 32
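// 16-point identity transform: out = 2*x + ((x * 1697 + 1024) >> 11), i.e. a
// scale by roughly 2*sqrt(2), applied in place to each of vr0-vr15 by the
// functions below.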
.macro inv_identity16_lsx in0, in1, in2, out0, sz
.ifc \sz, .8h
vsllwil.w.h vr16, \in0, 0
vexth.w.h vr17, \in0
vmul.w vr16, vr16, \in1
vmul.w vr17, vr17, \in1
vsadd.h \in2, \in2, \in2
vssrarni.h.w vr17, vr16, 11
vsadd.h \out0, vr17, \in2
.else
vsllwil.w.h vr16, \in0, 0
vmul.w vr16, vr16, \in1
vsadd.h \in2, \in2, \in2
vssrarni.h.w vr16, vr16, 11
vsadd.h \out0, vr16, \in2
.endif
.endm
.macro inv_identity16_lsx1 in0, in1, in2, out0
vsllwil.w.h vr16, \in0, 0
vexth.w.h vr17, \in1
vmul.w vr18, vr16, \in2
vmul.w vr19, vr17, \in2
vsrari.w vr18, vr18, 11
vsrari.w vr19, vr19, 11
vslli.w vr16, vr16, 1
vslli.w vr17, vr17, 1
vadd.w vr16, vr18, vr16
vadd.w \out0, vr19, vr17
vssrarni.h.w \out0, vr16, 1
.endm
functionl inv_identity_8h_x16_lsx
li.w t0, 1697
vreplgr2vr.w vr20, t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \
vr9, vr10, vr11, vr12, vr13, vr14, vr15
inv_identity16_lsx \i, vr20, \i, \i, .8h
.endr
endfuncl
functionl inv_identity_4h_x16_lsx
li.w t0, 1697
vreplgr2vr.w vr20, t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \
vr9, vr10, vr11, vr12, vr13, vr14, vr15
inv_identity16_lsx \i, vr20, \i, \i, .4h
.endr
endfuncl
functionl inv_identity_8h_x16_lsx1
li.w t0, 1697
vreplgr2vr.w vr20, t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \
vr9, vr10, vr11, vr12, vr13, vr14, vr15
inv_identity16_lsx1 \i, \i, vr20, \i
.endr
endfuncl
const iadst16_coeffs_h, align=4
.short 4091, 201, 3973, 995
.short 3703, 1751, 3290, 2440
.short 2751, 3035, 2106, 3513
.short 1380, 3857, 601, 4052
endconst
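// 16-point inverse ADST on vr0-vr15, using the iadst16 rotation constants
// followed by the shared idct 799/4017, 3406/2276, 1567/3784 and 2896 stages.
// The trailing register shuffle maps the results back into vr0-vr15 in output
// order for adst, or in reversed order for flipadst.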
.macro inv_adst16_lsx txfm, sz
la.local t0, iadst16_coeffs_h
vldrepl.h vr20, t0, 0 // 4091
vldrepl.h vr21, t0, 2 // 201
vmulev_vmaddod_lsx vr15, vr0, vr20, vr21, vr16, vr18, \sz
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr15, vr0, vr21, vr20, vr17, vr19, \sz
vssrarni.h.w vr18, vr16, 12 // t0
vssrarni.h.w vr19, vr17, 12 // t1
vldrepl.h vr20, t0, 4 // 3973
vldrepl.h vr21, t0, 6 // 995
vmulev_vmaddod_lsx vr13, vr2, vr20, vr21, vr16, vr0, \sz
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr13, vr2, vr21, vr20, vr17, vr15, \sz
vssrarni.h.w vr0, vr16, 12 // t2
vssrarni.h.w vr15, vr17, 12 // t3
vldrepl.h vr20, t0, 8 // 3703
vldrepl.h vr21, t0, 10 // 1751
vmulev_vmaddod_lsx vr11, vr4, vr20, vr21, vr16, vr2, \sz
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr11, vr4, vr21, vr20, vr17, vr13, \sz
vssrarni.h.w vr2, vr16, 12 // t4
vssrarni.h.w vr13, vr17, 12 // t5
vldrepl.h vr20, t0, 12 // 3290 -> 1645
vldrepl.h vr21, t0, 14 // 2440 -> 1220
vmulev_vmaddod_lsx vr9, vr6, vr20, vr21, vr16, vr4, \sz
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr9, vr6, vr21, vr20, vr17, vr11, \sz
vssrarni.h.w vr4, vr16, 12 // t6
vssrarni.h.w vr11, vr17, 12 // t7
vldrepl.h vr20, t0, 16 // 2751
vldrepl.h vr21, t0, 18 // 3035
vmulev_vmaddod_lsx vr7, vr8, vr20, vr21, vr16, vr6, \sz
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr7, vr8, vr21, vr20, vr17, vr9, \sz
vssrarni.h.w vr6, vr16, 12 // t8
vssrarni.h.w vr9, vr17, 12 // t9
vldrepl.h vr20, t0, 20 // 2106
vldrepl.h vr21, t0, 22 // 3513
vmulev_vmaddod_lsx vr5, vr10, vr20, vr21, vr16, vr7, \sz
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr5, vr10, vr21, vr20, vr17, vr8, \sz
vssrarni.h.w vr7, vr16, 12 // t10
vssrarni.h.w vr8, vr17, 12 // t11
vldrepl.h vr20, t0, 24 // 1380
vldrepl.h vr21, t0, 26 // 3857
vmulev_vmaddod_lsx vr3, vr12, vr20, vr21, vr16, vr5, \sz
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr3, vr12, vr21, vr20, vr17, vr10, \sz
vssrarni.h.w vr5, vr16, 12 // t12
vssrarni.h.w vr10, vr17, 12 // t13
vldrepl.h vr20, t0, 28 // 601
vldrepl.h vr21, t0, 30 // 4052
vmulev_vmaddod_lsx vr1, vr14, vr20, vr21, vr16, vr3, \sz
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr1, vr14, vr21, vr20, vr17, vr12, \sz
vssrarni.h.w vr3, vr16, 12 // t14
vssrarni.h.w vr12, vr17, 12 // t15
vsadd.h vr1, vr18, vr6 // t0a
vssub.h vr14, vr18, vr6 // t8a
vsadd.h vr16, vr19, vr9 // t1a
vssub.h vr17, vr19, vr9 // t9a
vsadd.h vr6, vr0, vr7 // t2a
vssub.h vr18, vr0, vr7 // t10a
vsadd.h vr9, vr15, vr8 // t3a
vssub.h vr19, vr15, vr8 // t11a
vsadd.h vr0, vr2, vr5 // t4a
vssub.h vr7, vr2, vr5 // t12a
vsadd.h vr8, vr13, vr10 // t5a
vssub.h vr15, vr13, vr10 // t13a
vsadd.h vr2, vr4, vr3 // t6a
vssub.h vr5, vr4, vr3 // t14a
vsadd.h vr10, vr11, vr12 // t7a
vssub.h vr13, vr11, vr12 // t15a
la.local t0, idct_coeffs_h
vldrepl.h vr20, t0, 8 // 799
vldrepl.h vr21, t0, 10 // 4017
vmulev_vmaddod_lsx vr14, vr17, vr21, vr20, vr3, vr11, \sz
vneg.h vr21, vr21
vmulev_vmaddod_lsx vr14, vr17, vr20, vr21, vr4, vr12, \sz
vssrarni.h.w vr11, vr3, 12 // t8
vssrarni.h.w vr12, vr4, 12 // t9
vneg.h vr21, vr21
vmulev_vmaddod_lsx vr15, vr7, vr20, vr21, vr3, vr14, \sz
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr15, vr7, vr21, vr20, vr4, vr17, \sz
vssrarni.h.w vr14, vr3, 12 // t13
vssrarni.h.w vr17, vr4, 12 // t12
vldrepl.h vr20, t0, 12 // 3406
vldrepl.h vr21, t0, 14 // 2276
vmulev_vmaddod_lsx vr18, vr19, vr21, vr20, vr3, vr7, \sz
vneg.h vr21, vr21
vmulev_vmaddod_lsx vr18, vr19, vr20, vr21, vr4, vr15, \sz
vssrarni.h.w vr7, vr3, 12 // t10
vssrarni.h.w vr15, vr4, 12 // t11
vneg.h vr21, vr21
vmulev_vmaddod_lsx vr13, vr5, vr20, vr21, vr3, vr18, \sz
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr13, vr5, vr21, vr20, vr4, vr19, \sz
vssrarni.h.w vr18, vr3, 12 // t15
vssrarni.h.w vr19, vr4, 12 // t14
vsadd.h vr5, vr1, vr0 // t0
vssub.h vr13, vr1, vr0 // t4
vsadd.h vr3, vr16, vr8 // t1
vssub.h vr4, vr16, vr8 // t5
vsadd.h vr0, vr6, vr2 // t2
vssub.h vr1, vr6, vr2 // t6
vsadd.h vr8, vr9, vr10 // t3
vssub.h vr16, vr9, vr10 // t7
vsadd.h vr2, vr11, vr17 // t8a
vssub.h vr6, vr11, vr17 // t12a
vsadd.h vr9, vr12, vr14 // t9a
vssub.h vr10, vr12, vr14 // t13a
vsadd.h vr11, vr7, vr19 // t10a
vssub.h vr17, vr7, vr19 // t14a
vsadd.h vr12, vr15, vr18 // t11a
vssub.h vr14, vr15, vr18 // t15a
vldrepl.h vr20, t0, 4 // 1567
vldrepl.h vr21, t0, 6 // 3784
vmulev_vmaddod_lsx vr13, vr4, vr21, vr20, vr7, vr18, \sz
vneg.h vr21, vr21
vmulev_vmaddod_lsx vr13, vr4, vr20, vr21, vr15, vr19, \sz
vssrarni.h.w vr18, vr7, 12 // t4a
vssrarni.h.w vr19, vr15, 12 // t5a
vneg.h vr21, vr21
vmulev_vmaddod_lsx vr16, vr1, vr20, vr21, vr7, vr4, \sz
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr16, vr1, vr21, vr20, vr15, vr13, \sz
vssrarni.h.w vr4, vr7, 12 // t7a
vssrarni.h.w vr13, vr15, 12 // t6a
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr6, vr10, vr21, vr20, vr7, vr1, \sz
vneg.h vr21, vr21
vmulev_vmaddod_lsx vr6, vr10, vr20, vr21, vr15, vr16, \sz
vssrarni.h.w vr1, vr7, 12 // t12
vssrarni.h.w vr16, vr15, 12 // t13
vneg.h vr21, vr21
vmulev_vmaddod_lsx vr14, vr17, vr20, vr21, vr7, vr6, \sz
vneg.h vr20, vr20
vmulev_vmaddod_lsx vr14, vr17, vr21, vr20, vr15, vr10, \sz
vssrarni.h.w vr6, vr7, 12 // t15
vssrarni.h.w vr10, vr15, 12 // t14
vssub.h vr17, vr5, vr0 // t2a
vsadd.h vr14, vr5, vr0 // out[0]
vssub.h vr7, vr3, vr8 // t3a
vsadd.h vr15, vr3, vr8 // out[15]
vsllwil.w.h vr22, vr15, 0
vexth.w.h vr15, vr15
vneg.w vr22, vr22
vneg.w vr15, vr15
vssrarni.h.w vr15, vr22, 0 // out[15]
vsadd.h vr3, vr19, vr4 // out[12]
vssub.h vr8, vr19, vr4 // t7
vssub.h vr0, vr18, vr13 // t6
vsadd.h vr5, vr18, vr13 // out[3]
vsllwil.w.h vr22, vr5, 0
vexth.w.h vr5, vr5
vneg.w vr22, vr22
vneg.w vr5, vr5
vssrarni.h.w vr5, vr22, 0 // out[3]
vsadd.h vr13, vr9, vr12 // out[14]
vssub.h vr19, vr9, vr12 // t11
vssub.h vr4, vr2, vr11 // t10
vsadd.h vr18, vr2, vr11 // out[1]
vsllwil.w.h vr22, vr18, 0
vexth.w.h vr18, vr18
vneg.w vr22, vr22
vneg.w vr18, vr18
vssrarni.h.w vr18, vr22, 0 // out[1]
vsadd.h vr2, vr1, vr10 // out[2]
vssub.h vr11, vr1, vr10 // t14a
vssub.h vr12, vr16, vr6 // t15a
vsadd.h vr9, vr16, vr6 // out[13]
vsllwil.w.h vr22, vr9, 0
vexth.w.h vr9, vr9
vneg.w vr22, vr22
vneg.w vr9, vr9
vssrarni.h.w vr9, vr22, 0 // out[13]
vldrepl.h vr20, t0, 0 // 2896
vmulev_vmaddod_lsx vr17, vr7, vr20, vr20, vr6, vr10, \sz
vneg.h vr21, vr20
vmulev_vmaddod_lsx vr17, vr7, vr20, vr21, vr16, vr1, \sz
vssrarni.h.w vr1, vr16, 12 // out[8]
vsrari.w vr6, vr6, 12
vsrari.w vr10, vr10, 12
vneg.w vr6, vr6
vneg.w vr10, vr10
vssrarni.h.w vr10, vr6, 0 // out[7]
vmulev_vmaddod_lsx vr0, vr8, vr20, vr21, vr16, vr17, \sz
vmulev_vmaddod_lsx vr0, vr8, vr20, vr20, vr6, vr7, \sz
vssrarni.h.w vr7, vr6, 12 // out[4]
vsrari.w vr16, vr16, 12
vsrari.w vr17, vr17, 12
vneg.w vr16, vr16
vneg.w vr17, vr17
vssrarni.h.w vr17, vr16, 0 // out[11]
vmulev_vmaddod_lsx vr4, vr19, vr20, vr21, vr16, vr0, \sz
vmulev_vmaddod_lsx vr4, vr19, vr20, vr20, vr6, vr8, \sz
vssrarni.h.w vr8, vr6, 12 // out[6]
vsrari.w vr16, vr16, 12
vsrari.w vr0, vr0, 12
vneg.w vr16, vr16
vneg.w vr0, vr0
vssrarni.h.w vr0, vr16, 0 // out[9]
vmulev_vmaddod_lsx vr11, vr12, vr20, vr20, vr6, vr4, \sz
vmulev_vmaddod_lsx vr11, vr12, vr20, vr21, vr16, vr19, \sz
vssrarni.h.w vr19, vr16, 12 // out[10]
vsrari.w vr6, vr6, 12
vsrari.w vr4, vr4, 12
vneg.w vr6, vr6
vneg.w vr4, vr4
vssrarni.h.w vr4, vr6, 0 // out[5]
.ifc \txfm, adst
vor.v vr12, vr3, vr3
vor.v vr3, vr5, vr5
vor.v vr5, vr4, vr4
vor.v vr4, vr7, vr7
vor.v vr7, vr10, vr10
vor.v vr10, vr19, vr19
vor.v vr6, vr8, vr8
vor.v vr8, vr1, vr1
vor.v vr11, vr17, vr17
vor.v vr20, vr13, vr13
vor.v vr13, vr9, vr9
vor.v vr9, vr0, vr0
vor.v vr0, vr14, vr14
vor.v vr14, vr20, vr20
vor.v vr1, vr18, vr18
.else
vor.v vr6, vr0, vr0
vor.v vr0, vr15, vr15
vor.v vr15, vr14, vr14
vor.v vr14, vr18, vr18
vor.v vr11, vr7, vr7
vor.v vr7, vr1, vr1
vor.v vr1, vr13, vr13
vor.v vr13, vr2, vr2
vor.v vr2, vr9, vr9
vor.v vr9, vr8, vr8
vor.v vr8, vr10, vr10
vor.v vr10, vr4, vr4
vor.v vr4, vr17, vr17
vor.v vr12, vr5, vr5
vor.v vr5, vr19, vr19
.endif
.endm // inv_adst16_lsx
functionl inv_adst_8h_x16_lsx
inv_adst16_lsx adst, .8h
endfuncl
functionl inv_flipadst_8h_x16_lsx
inv_adst16_lsx flipadst, .8h
endfuncl
functionl inv_adst_4h_x16_lsx
inv_adst16_lsx adst, .4h
endfuncl
functionl inv_flipadst_4h_x16_lsx
inv_adst16_lsx flipadst, .4h
endfuncl
.macro VLD_DST_ADD_W8_x4 in0, in1, in2, in3, in4, in5, in6, in7, in8, \
in9, in10, in11, in12, in13, in14, in15
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W8 \in0, \in1, \in2, \in3
add.d a0, a1, a0
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W8 \in4, \in5, \in6, \in7
add.d a0, a1, a0
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W8 \in8, \in9, \in10, \in11
add.d a0, a1, a0
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W8 \in12, \in13, \in14, \in15
.endm
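// Shared 8x16 add paths. Both coefficient halves are rect2-scaled by 2896;
// the vectors at byte offsets 16, 48, ..., 240 are only transformed when eob
// (a3) reaches the threshold in t5, mirroring the 4x16 case. For the
// identity_ variant the first pass reduces to the transpose alone, since the
// x2 identity scale and the >>1 inter-pass shift cancel.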
.macro def_base_8x16 txfm1
functionl inv_txfm_\txfm1\()add_8x16_lsx
blt a3, t5, 816f
vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
vxor.v vr23, vr23, vr23
.irp i, 16, 48, 80, 112, 144, 176, 208, 240
vst vr23, a2, \i
.endr
li.w t0, 2896
vreplgr2vr.w vr23, t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
rect2_lsx \i, vr23, \i
.endr
.ifc \txfm1, identity_
LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
.else
move t6, ra
jirl ra, t7, 0
move ra, t6
vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 1
LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
.endif
816:
ble t5, a3, 816816f
.irp i, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
vxor.v \i, \i, \i
.endr
816816:
vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
vxor.v vr23, vr23, vr23
.irp i, 0, 32, 64, 96, 128, 160, 192, 224
vst vr23, a2, \i
.endr
li.w t0, 2896
vreplgr2vr.w vr23, t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
rect2_lsx \i, vr23, \i
.endr
.ifc \txfm1, identity_
.else
move t6, ra
jirl ra, t7, 0
move ra, t6
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
vsrari.h \i, \i, 1
.endr
.endif
LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
move t6, ra
jirl ra, t8, 0
move ra, t6
vor.v vr0, vr0, vr0
vsrari_h_x8 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4
vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 4
VLD_DST_ADD_W8_x4 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
endfuncl
.endm
def_base_8x16 identity_
def_base_8x16
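// Add four rows of 16 residuals to the dst: each dst row is split into low
// and high 8-pixel halves (\in4-\in11 hold the matching residual halves),
// widened, summed, clipped to u8 and stored with vst/vstx.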
.macro DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11
vsllwil.hu.bu vr4, \in0, 0
vexth.hu.bu vr0, \in0
vsllwil.hu.bu vr5, \in1, 0
vexth.hu.bu vr1, \in1
vsllwil.hu.bu vr6, \in2, 0
vexth.hu.bu vr2, \in2
vsllwil.hu.bu vr7, \in3, 0
vexth.hu.bu vr3, \in3
vadd.h vr4, vr4, \in4
vadd.h vr0, vr0, \in5
vadd.h vr5, vr5, \in6
vadd.h vr1, vr1, \in7
vadd.h vr6, vr6, \in8
vadd.h vr2, vr2, \in9
vadd.h vr7, vr7, \in10
vadd.h vr3, vr3, \in11
vssrani.bu.h vr0, vr4, 0
vssrani.bu.h vr1, vr5, 0
vssrani.bu.h vr2, vr6, 0
vssrani.bu.h vr3, vr7, 0
vst vr0, a0, 0
vstx vr1, a0, a1
vst vr2, t2, 0
vstx vr3, t2, a1
.endm
.macro VLD_DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7
vld vr0, a0, 0
vldx vr1, a0, a1
vld vr2, t2, 0
vldx vr3, t2, a1
DST_ADD_W16 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3, \
\in4, \in5, \in6, \in7
.endm
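// Shared 16x8 add paths: all sixteen coefficient vectors are rect2-scaled,
// run through the width-16 first pass (t7), then transposed 8x8 at a time and
// fed to the height-8 second pass (t8) before the 16-wide add. Uses vr24-vr31,
// hence the PUSH_REG/POP_REG pair.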
.macro def_fn_16x8 txfm1
functionl inv_txfm_\txfm1\()add_16x8_lsx
PUSH_REG
vld_x16 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
vxor.v vr23, vr23, vr23
.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, \
176, 192, 208, 224, 240
vst vr23, a2, \i
.endr
li.w t0, 2896
vreplgr2vr.w vr23, t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
rect2_lsx \i, vr23, \i
.endr
move t6, ra
jirl ra, t7, 0
move ra, t6
.ifnc \txfm1, identity_
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
vsrari.h \i, \i, 1
.endr
.endif
LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
move t6, ra
jirl ra, t8, 0
move ra, t6
vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, 4
LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
move t6, ra
jirl ra, t8, 0
move ra, t6
vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 4
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W16 vr24, vr8, vr25, vr9, vr26, vr10, vr27, vr11
alsl.d a0, a1, a0, 2
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W16 vr28, vr12, vr29, vr13, vr30, vr14, vr31, vr15
POP_REG
endfuncl
.endm
def_fn_16x8 identity_
def_fn_16x8
.macro fun16x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_16x8_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
bnez a3, .NO_HAS_DCONLY_16x8
idct_dc 16, 8, 1
DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
vr20, vr20, vr20, vr20, vr20
alsl.d a0, a1, a0, 2
alsl.d t2, a1, a0, 1
VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
b .\txfm1\()_\txfm2\()_16x8_END
.NO_HAS_DCONLY_16x8:
.endif
la.local t7, inv_\txfm1\()_8h_x16_lsx
.ifc \txfm1, identity
la.local t7, inv_identity_8h_x16_lsx1
.endif
la.local t8, inv_\txfm2\()_8h_x8_lsx
.ifc \txfm1, identity
b inv_txfm_identity_add_16x8_lsx
.else
b inv_txfm_add_16x8_lsx
.endif
.\txfm1\()_\txfm2\()_16x8_END:
endfunc
.endm