/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/loongarch/loongson_asm.S"
/*
static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
const pixel *src, const ptrdiff_t src_stride,
const int16_t *const abcd, int mx, int my
HIGHBD_DECL_SUFFIX)
*/
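/*
 * A scalar sketch of the lookup vld_filter_row performs below; tmx carries
 * 10 fractional bits, and each row of dav1d_mc_warp_filter holds 8 signed
 * 8-bit taps (hence the << 3 before the 8-byte load):
 *
 *   const int8_t *f = dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
 *   tmx += step; // abcd[0] in the horizontal pass, abcd[2] in the vertical
 */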
.macro vld_filter_row dst, src, inc
addi.w t3, \src, 512
srai.w t3, t3, 10
add.w \src, \src, \inc
addi.w t3, t3, 64
slli.w t3, t3, 3
fldx.d \dst, t4, t3
.endm
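/*
 * One horizontal pass over a source row: vr10 holds 16 input pixels, vbsrl.v
 * forms the eight 8-pixel windows, and f0-f7 hold the per-column filters.
 * XORing the pixels with 0x80 (vr20) converts unsigned bytes to signed ones
 * (x ^ 0x80 == x - 128), so signed widening multiplies can be used; the -128
 * bias is cancelled after the vertical pass (see vr21 in the warp macro).
 */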
.macro warp_filter_horz_lsx
addi.w t5, a5, 0
vld vr10, a2, 0
add.d a2, a2, a3
vld_filter_row f0, t5, t0
vld_filter_row f1, t5, t0
vld_filter_row f2, t5, t0
vld_filter_row f3, t5, t0
vld_filter_row f4, t5, t0
vld_filter_row f5, t5, t0
vld_filter_row f6, t5, t0
vld_filter_row f7, t5, t0
vxor.v vr10, vr10, vr20
vbsrl.v vr8, vr10, 1
vbsrl.v vr9, vr10, 2
vilvl.d vr8, vr8, vr10
vilvl.d vr0, vr1, vr0
vmulwev.h.b vr11, vr8, vr0
vmulwod.h.b vr12, vr8, vr0
vbsrl.v vr8, vr10, 3
vbsrl.v vr19, vr10, 4
vilvl.d vr8, vr8, vr9
vilvl.d vr2, vr3, vr2
vmulwev.h.b vr13, vr8, vr2
vmulwod.h.b vr14, vr8, vr2
vbsrl.v vr8, vr10, 5
vbsrl.v vr9, vr10, 6
vilvl.d vr8, vr8, vr19
vilvl.d vr4, vr5, vr4
vmulwev.h.b vr15, vr8, vr4
vmulwod.h.b vr16, vr8, vr4
vbsrl.v vr8, vr10, 7
vilvl.d vr8, vr8, vr9
vilvl.d vr6, vr7, vr6
vmulwev.h.b vr17, vr8, vr6
vmulwod.h.b vr18, vr8, vr6
vadd.h vr11, vr11, vr12
vadd.h vr13, vr13, vr14
vadd.h vr15, vr15, vr16
vadd.h vr17, vr17, vr18
vpickev.h vr12, vr13, vr11
vpickod.h vr14, vr13, vr11
vpickev.h vr16, vr17, vr15
vpickod.h vr18, vr17, vr15
vadd.h vr11, vr12, vr14
vadd.h vr15, vr16, vr18
vpickev.h vr12, vr15, vr11
vpickod.h vr14, vr15, vr11
vadd.h vr11, vr12, vr14
add.d a5, a5, t1
.endm
.macro transpose_8x8b_extend_lsx in0, in1, in2, in3, in4, in5, in6, in7
vilvl.b \in0, \in1, \in0
vilvl.b \in2, \in3, \in2
vilvl.b \in4, \in5, \in4
vilvl.b \in6, \in7, \in6
vpackev.h \in1, \in2, \in0
vpackod.h \in3, \in2, \in0
vpackev.h \in5, \in6, \in4
vpackod.h \in7, \in6, \in4
vpackev.w \in0, \in5, \in1
vpackod.w \in2, \in5, \in1
vpackev.w \in1, \in7, \in3
vpackod.w \in3, \in7, \in3
vexth.h.b \in4, \in0
vsllwil.h.b \in0, \in0, 0
vexth.h.b \in5, \in1
vsllwil.h.b \in1, \in1, 0
vexth.h.b \in6, \in2
vsllwil.h.b \in2, \in2, 0
vexth.h.b \in7, \in3
vsllwil.h.b \in3, \in3, 0
.endm
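/*
 * warp_affine_8x8: seven horizontal passes prime the row pipeline in
 * vr24-vr30, then each loop iteration filters one new row into vr31 and runs
 * an 8-tap vertical filter down the eight row registers. Since the filter
 * taps sum to 128, the final add of vr21 (128 for the pixel variant, 2048
 * for the 't' variant) cancels the -128 bias from warp_filter_horz_lsx.
 */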
.macro warp t, shift
function warp_affine_8x8\t\()_8bpc_lsx
addi.d sp, sp, -64
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
fst.d f29, sp, 40
fst.d f30, sp, 48
fst.d f31, sp, 56
ld.h t0, a4, 0 // abcd[0]
ld.h t1, a4, 2 // abcd[1]
ld.h t2, a4, 4 // abcd[2]
ld.h a4, a4, 6 // abcd[3]
li.d t7, 8
alsl.w t3, a3, a3, 1
sub.d a2, a2, t3
addi.d a2, a2, -3
la.local t4, dav1d_mc_warp_filter
.ifnb \t
slli.d a1, a1, 1
.endif
li.w t3, 128
vreplgr2vr.b vr20, t3
.ifb \t
vreplgr2vr.h vr21, t3
.else
li.w t3, 2048
vreplgr2vr.h vr21, t3
.endif
warp_filter_horz_lsx
vsrari.h vr24, vr11, 3
warp_filter_horz_lsx
vsrari.h vr25, vr11, 3
warp_filter_horz_lsx
vsrari.h vr26, vr11, 3
warp_filter_horz_lsx
vsrari.h vr27, vr11, 3
warp_filter_horz_lsx
vsrari.h vr28, vr11, 3
warp_filter_horz_lsx
vsrari.h vr29, vr11, 3
warp_filter_horz_lsx
vsrari.h vr30, vr11, 3
1:
addi.d t6, a6, 0
warp_filter_horz_lsx
vsrari.h vr31, vr11, 3
vld_filter_row f0, t6, t2
vld_filter_row f1, t6, t2
vld_filter_row f2, t6, t2
vld_filter_row f3, t6, t2
vld_filter_row f4, t6, t2
vld_filter_row f5, t6, t2
vld_filter_row f6, t6, t2
vld_filter_row f7, t6, t2
transpose_8x8b_extend_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
vmulwev.w.h vr16, vr24, vr0
vmulwod.w.h vr17, vr24, vr0
vmaddwev.w.h vr16, vr25, vr1
vmaddwod.w.h vr17, vr25, vr1
vmaddwev.w.h vr16, vr26, vr2
vmaddwod.w.h vr17, vr26, vr2
vmaddwev.w.h vr16, vr27, vr3
vmaddwod.w.h vr17, vr27, vr3
vmaddwev.w.h vr16, vr28, vr4
vmaddwod.w.h vr17, vr28, vr4
vmaddwev.w.h vr16, vr29, vr5
vmaddwod.w.h vr17, vr29, vr5
vmaddwev.w.h vr16, vr30, vr6
vmaddwod.w.h vr17, vr30, vr6
vmaddwev.w.h vr16, vr31, vr7
vmaddwod.w.h vr17, vr31, vr7
vssrarni.h.w vr16, vr16, \shift
vssrarni.h.w vr17, vr17, \shift
vilvl.h vr16, vr17, vr16
vadd.h vr16, vr16, vr21
vor.v vr24, vr25, vr25
vor.v vr25, vr26, vr26
vor.v vr26, vr27, vr27
vor.v vr27, vr28, vr28
vor.v vr28, vr29, vr29
vor.v vr29, vr30, vr30
vor.v vr30, vr31, vr31
.ifb \t
vssrarni.bu.h vr16, vr16, 0
.endif
addi.d t7, t7, -1
.ifnb \t
vst vr16, a0, 0
.else
vstelm.d vr16, a0, 0, 0
.endif
add.d a0, a1, a0
add.d a6, a6, a4
blt zero, t7, 1b
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
fld.d f29, sp, 40
fld.d f30, sp, 48
fld.d f31, sp, 56
addi.d sp, sp, 64
endfunc
.endm
warp , 11
warp t, 7
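/*
 * Two instantiations: the plain variant narrows by 11 and stores 8-bit
 * pixels, while the 't' variant narrows by 7 and stores 16-bit intermediates
 * for compound prediction. The LASX versions below follow the same split.
 */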
.macro FILTER_WARP_RND_P_LASX in0, in1, in2, out0, out1, out2, out3
xvshuf.b xr2, \in0, \in0, \in2
addi.w t4, \in1, 512
srai.w t4, t4, 10
addi.w t4, t4, 64
slli.w t4, t4, 3
vldx vr3, t5, t4
add.w t3, t3, t0 // tmx += abcd[0]
addi.w t4, t3, 512
srai.w t4, t4, 10
addi.w t4, t4, 64
slli.w t4, t4, 3
vldx vr4, t5, t4
add.w t3, t3, t0 // tmx += abcd[0]
addi.w t4, t3, 512
srai.w t4, t4, 10
addi.w t4, t4, 64
slli.w t4, t4, 3
vldx vr5, t5, t4
add.w t3, t3, t0 // tmx += abcd[0]
addi.w t4, t3, 512
srai.w t4, t4, 10
addi.w t4, t4, 64
slli.w t4, t4, 3
vldx vr6, t5, t4
add.w t3, t3, t0 // tmx += abcd[0]
xvinsve0.d xr3, xr5, 1
xvinsve0.d xr3, xr4, 2
xvinsve0.d xr3, xr6, 3
xvmulwev.h.bu.b xr4, xr2, xr3
xvmulwod.h.bu.b xr5, xr2, xr3
xvilvl.d xr2, xr5, xr4
xvilvh.d xr3, xr5, xr4
xvhaddw.w.h xr2, xr2, xr2
xvhaddw.w.h xr3, xr3, xr3
xvhaddw.d.w xr2, xr2, xr2
xvhaddw.d.w xr3, xr3, xr3
xvhaddw.q.d xr2, xr2, xr2
xvhaddw.q.d xr3, xr3, xr3
xvextrins.w \out0, xr2, \out1
xvextrins.w \out2, xr3, \out3
.endm
.macro FILTER_WARP_CLIP_LASX in0, in1, in2, out0, out1
add.w \in0, \in0, \in1
addi.w t6, \in0, 512
srai.w t6, t6, 10
addi.w t6, t6, 64
slli.w t6, t6, 3
fldx.d f1, t5, t6
add.w t2, t2, t7
addi.w t6, t2, 512
srai.w t6, t6, 10
addi.w t6, t6, 64
slli.w t6, t6, 3
fldx.d f2, t5, t6
vilvl.d vr0, vr2, vr1
vext2xv.h.b xr0, xr0
xvmulwev.w.h xr3, \in2, xr0
xvmaddwod.w.h xr3, \in2, xr0
xvhaddw.d.w xr3, xr3, xr3
xvhaddw.q.d xr3, xr3, xr3
xvextrins.w \out0, xr3, \out1
.endm
const shuf0
.byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
.byte 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10
endconst
const warp_sh
.rept 2
.byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
.endr
.rept 2
.byte 18, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.endr
endconst
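/*
 * shuf0 provides, per 128-bit half, two byte-shifted views of the source row
 * so xvshuf.b can form four filter windows at once. warp_sh is the sliding
 * control vector for the vertical pipeline: indices 2..17 shift the eight
 * row accumulators down by one 16-bit lane while pulling in a lane from the
 * newly filtered rows, and xvextrins.h/xvaddi.bu advance it between rows.
 */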
.macro warp_lasx t, shift
function warp_affine_8x8\t\()_8bpc_lasx
addi.d sp, sp, -16
ld.h t0, a4, 0 // abcd[0]
ld.h t1, a4, 2 // abcd[1]
fst.d f24, sp, 0
fst.d f25, sp, 8
alsl.w t2, a3, a3, 1
addi.w t3, a5, 0
la.local t4, warp_sh
la.local t5, dav1d_mc_warp_filter
sub.d a2, a2, t2
addi.d a2, a2, -3
vld vr0, a2, 0
xvld xr24, t4, 0
xvld xr25, t4, 32
la.local t2, shuf0
xvld xr1, t2, 0
xvpermi.q xr0, xr0, 0x00
xvaddi.bu xr9, xr1, 4
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x00, xr13, 0x00
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x00, xr15, 0x00
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x10, xr13, 0x10
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x10, xr15, 0x10
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x20, xr13, 0x20
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x20, xr15, 0x20
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x30, xr13, 0x30
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x30, xr15, 0x30
xvsrarni.h.w xr12, xr7, 3
xvsrarni.h.w xr13, xr8, 3
xvsrarni.h.w xr14, xr10, 3
xvsrarni.h.w xr15, xr11, 3
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x00, xr17, 0x00
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x00, xr19, 0x00
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x10, xr17, 0x10
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x10, xr19, 0x10
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x20, xr17, 0x20
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x20, xr19, 0x20
xvsrarni.h.w xr16, xr7, 3
xvsrarni.h.w xr17, xr8, 3
xvsrarni.h.w xr18, xr10, 3
xvsrarni.h.w xr19, xr11, 3
addi.w t2, a6, 0 // my
ld.h t7, a4, 4 // abcd[2]
ld.h t8, a4, 6 // abcd[3]
.ifnb \t
slli.d a1, a1, 1
.endif
// y = 0
FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
xvshuf.b xr12, xr16, xr12, xr24
xvshuf.b xr13, xr17, xr13, xr24
xvshuf.b xr14, xr18, xr14, xr24
xvshuf.b xr15, xr19, xr15, xr24
xvextrins.h xr24, xr25, 0x70
add.w a6, a6, t8
addi.w t2, a6, 0
FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
.ifnb \t
xvssrarni.h.w xr21, xr20, \shift
xvpermi.q xr22, xr21, 0x01
vilvl.h vr23, vr22, vr21
vilvh.h vr21, vr22, vr21
vst vr23, a0, 0
vstx vr21, a0, a1
.else
xvssrarni.hu.w xr21, xr20, \shift
xvssrlni.bu.h xr22, xr21, 0
xvpermi.q xr23, xr22, 0x01
vilvl.b vr21, vr23, vr22
fst.d f21, a0, 0
add.d a0, a0, a1
vstelm.d vr21, a0, 0, 1
.endif
xvaddi.bu xr25, xr25, 2
xvshuf.b xr12, xr16, xr12, xr24
xvshuf.b xr13, xr17, xr13, xr24
xvshuf.b xr14, xr18, xr14, xr24
xvshuf.b xr15, xr19, xr15, xr24
xvextrins.h xr24, xr25, 0x70
add.w a6, a6, t8
addi.w t2, a6, 0
FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
xvaddi.bu xr25, xr25, 2
xvshuf.b xr12, xr16, xr12, xr24
xvshuf.b xr13, xr17, xr13, xr24
xvshuf.b xr14, xr18, xr14, xr24
xvshuf.b xr15, xr19, xr15, xr24
xvextrins.h xr24, xr25, 0x70
add.w a6, a6, t8
addi.w t2, a6, 0
FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
.ifnb \t
xvssrarni.h.w xr21, xr20, \shift
alsl.d a0, a1, a0, 1
xvpermi.q xr22, xr21, 0x01
vilvl.h vr23, vr22, vr21
vilvh.h vr21, vr22, vr21
vst vr23, a0, 0
vstx vr21, a0, a1
.else
xvssrarni.hu.w xr21, xr20, \shift
xvssrlni.bu.h xr22, xr21, 0
xvpermi.q xr23, xr22, 0x01
vilvl.b vr21, vr23, vr22
add.d a0, a0, a1
fst.d f21, a0, 0
add.d a0, a0, a1
vstelm.d vr21, a0, 0, 1
.endif
xvaddi.bu xr25, xr25, 2
xvshuf.b xr12, xr16, xr12, xr24
xvshuf.b xr13, xr17, xr13, xr24
xvshuf.b xr14, xr18, xr14, xr24
xvshuf.b xr15, xr19, xr15, xr24
xvextrins.h xr24, xr25, 0x70
add.w a6, a6, t8
addi.w t2, a6, 0
FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
xvaddi.bu xr25, xr25, 2
xvshuf.b xr12, xr16, xr12, xr24
xvshuf.b xr13, xr17, xr13, xr24
xvshuf.b xr14, xr18, xr14, xr24
xvshuf.b xr15, xr19, xr15, xr24
xvextrins.h xr24, xr25, 0x70
add.w a6, a6, t8
addi.w t2, a6, 0
FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
.ifnb \t
xvssrarni.h.w xr21, xr20, \shift
alsl.d a0, a1, a0, 1
xvpermi.q xr22, xr21, 0x01
vilvl.h vr23, vr22, vr21
vilvh.h vr21, vr22, vr21
vst vr23, a0, 0
vstx vr21, a0, a1
.else
xvssrarni.hu.w xr21, xr20, \shift
xvssrlni.bu.h xr22, xr21, 0
xvpermi.q xr23, xr22, 0x01
vilvl.b vr21, vr23, vr22
add.d a0, a0, a1
fst.d f21, a0, 0
add.d a0, a0, a1
vstelm.d vr21, a0, 0, 1
.endif
xvaddi.bu xr25, xr25, 2
xvshuf.b xr12, xr16, xr12, xr24
xvshuf.b xr13, xr17, xr13, xr24
xvshuf.b xr14, xr18, xr14, xr24
xvshuf.b xr15, xr19, xr15, xr24
xvextrins.h xr24, xr25, 0x70
add.w a6, a6, t8
addi.w t2, a6, 0
FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
xvshuf.b xr12, xr16, xr12, xr24
xvshuf.b xr13, xr17, xr13, xr24
xvshuf.b xr14, xr18, xr14, xr24
xvshuf.b xr15, xr19, xr15, xr24
add.w a6, a6, t8
addi.w t2, a6, 0
FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
.ifnb \t
xvssrarni.h.w xr21, xr20, \shift
alsl.d a0, a1, a0, 1
xvpermi.q xr22, xr21, 0x01
vilvl.h vr23, vr22, vr21
vilvh.h vr21, vr22, vr21
vst vr23, a0, 0
vstx vr21, a0, a1
.else
xvssrarni.hu.w xr21, xr20, \shift
xvssrlni.bu.h xr22, xr21, 0
xvpermi.q xr23, xr22, 0x01
vilvl.b vr21, vr23, vr22
add.d a0, a0, a1
fst.d f21, a0, 0
add.d a0, a0, a1
vstelm.d vr21, a0, 0, 1
.endif
fld.d f24, sp, 0
fld.d f25, sp, 8
addi.d sp, sp, 16
endfunc
.endm
warp_lasx , 11
warp_lasx t, 7
/*
static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
const int16_t *tmp1, const int16_t *tmp2,
const int w, int h,
const int weight HIGHBD_DECL_SUFFIX)
*/
#define bpc8_sh 5 // sh = intermediate_bits + 1
#define bpcw8_sh 8 // sh = intermediate_bits + 4
#define bpc_sh bpc8_sh
#define bpcw_sh bpcw8_sh
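/*
 * A scalar sketch of the rounding done below for 8 bpc (intermediate_bits =
 * 4, PREP_BIAS = 0, weight w in 0..16):
 *
 *   dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + (1 << (bpc_sh - 1))) >> bpc_sh);
 *   dst[x] = iclip_pixel((tmp1[x] * w + tmp2[x] * (16 - w)
 *                         + (1 << (bpcw_sh - 1))) >> bpcw_sh);
 *
 * The vssrarni/xvssrarni forms fold the rounding add, the shift and the
 * unsigned saturation into a single instruction.
 */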
function avg_8bpc_lsx
addi.d t8, a0, 0
clz.w t0, a4
li.w t1, 24
sub.w t0, t0, t1 // index = clz(w) - 24
la.local t1, .AVG_LSX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0 // The jump offsets are relative to .AVG_LSX_JRTABLE
add.d t1, t1, t2 // Get absolute address
jirl $r0, t1, 0
.align 3
.AVG_LSX_JRTABLE:
.hword .AVG_W128_LSX - .AVG_LSX_JRTABLE
.hword .AVG_W64_LSX - .AVG_LSX_JRTABLE
.hword .AVG_W32_LSX - .AVG_LSX_JRTABLE
.hword .AVG_W16_LSX - .AVG_LSX_JRTABLE
.hword .AVG_W8_LSX - .AVG_LSX_JRTABLE
.hword .AVG_W4_LSX - .AVG_LSX_JRTABLE
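/*
 * Dispatch sketch: the index computed above is clz(w) - 24, so the widths
 * 128, 64, 32, 16, 8 and 4 select entries 0..5. Each .hword stores the
 * label's offset from the table base; adding it back yields the target.
 */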
.AVG_W4_LSX:
vld vr0, a2, 0
vld vr1, a3, 0
vadd.h vr2, vr0, vr1
vssrarni.bu.h vr3, vr2, bpc_sh
vstelm.w vr3, a0, 0, 0
add.d a0, a0, a1
vstelm.w vr3, a0, 0, 1
addi.w a5, a5, -2
addi.d a2, a2, 16
addi.d a3, a3, 16
add.d a0, a0, a1
blt zero, a5, .AVG_W4_LSX
b .AVG_END_LSX
.AVG_W8_LSX:
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vadd.h vr4, vr0, vr1
vadd.h vr5, vr2, vr3
vssrarni.bu.h vr5, vr4, bpc_sh
addi.w a5, a5, -2
addi.d a2, a2, 32
vstelm.d vr5, a0, 0, 0
add.d a0, a0, a1
vstelm.d vr5, a0, 0, 1
addi.d a3, a3, 32
add.d a0, a0, a1
blt zero, a5, .AVG_W8_LSX
b .AVG_END_LSX
.AVG_W16_LSX:
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vadd.h vr4, vr0, vr1
vadd.h vr5, vr2, vr3
vssrarni.bu.h vr5, vr4, bpc_sh
addi.w a5, a5, -1
addi.d a2, a2, 32
vst vr5, a0, 0
addi.d a3, a3, 32
add.d a0, a0, a1
blt zero, a5, .AVG_W16_LSX
b .AVG_END_LSX
.AVG_W32_LSX:
vld vr0, a2, 0
vld vr2, a2, 16
vld vr4, a2, 32
vld vr6, a2, 48
vld vr1, a3, 0
vld vr3, a3, 16
vld vr5, a3, 32
vld vr7, a3, 48
vadd.h vr0, vr0, vr1
vadd.h vr2, vr2, vr3
vadd.h vr4, vr4, vr5
vadd.h vr6, vr6, vr7
vssrarni.bu.h vr2, vr0, bpc_sh
vssrarni.bu.h vr6, vr4, bpc_sh
addi.w a5, a5, -1
addi.d a2, a2, 64
vst vr2, a0, 0
vst vr6, a0, 16
addi.d a3, a3, 64
add.d a0, a0, a1
blt zero, a5, .AVG_W32_LSX
b .AVG_END_LSX
.AVG_W64_LSX:
.rept 4
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vadd.h vr0, vr0, vr1
vadd.h vr2, vr2, vr3
vssrarni.bu.h vr2, vr0, bpc_sh
addi.d a2, a2, 32
addi.d a3, a3, 32
vst vr2, a0, 0
addi.d a0, a0, 16
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .AVG_W64_LSX
b .AVG_END_LSX
.AVG_W128_LSX:
.rept 8
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vadd.h vr0, vr0, vr1
vadd.h vr2, vr2, vr3
vssrarni.bu.h vr2, vr0, bpc_sh
addi.d a2, a2, 32
addi.d a3, a3, 32
vst vr2, a0, 0
addi.d a0, a0, 16
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .AVG_W128_LSX
.AVG_END_LSX:
endfunc
function avg_8bpc_lasx
clz.w t0, a4
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .AVG_LASX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0
add.d t1, t1, t2
jirl $r0, t1, 0
.align 3
.AVG_LASX_JRTABLE:
.hword .AVG_W128_LASX - .AVG_LASX_JRTABLE
.hword .AVG_W64_LASX - .AVG_LASX_JRTABLE
.hword .AVG_W32_LASX - .AVG_LASX_JRTABLE
.hword .AVG_W16_LASX - .AVG_LASX_JRTABLE
.hword .AVG_W8_LASX - .AVG_LASX_JRTABLE
.hword .AVG_W4_LASX - .AVG_LASX_JRTABLE
.AVG_W4_LASX:
vld vr0, a2, 0
vld vr1, a3, 0
vadd.h vr0, vr0, vr1
vssrarni.bu.h vr1, vr0, bpc_sh
vstelm.w vr1, a0, 0, 0
add.d a0, a0, a1
vstelm.w vr1, a0, 0, 1
addi.w a5, a5, -2
addi.d a2, a2, 16
addi.d a3, a3, 16
add.d a0, a0, a1
blt zero, a5, .AVG_W4_LASX
b .AVG_END_LASX
.AVG_W8_LASX:
xvld xr0, a2, 0
xvld xr1, a3, 0
xvadd.h xr2, xr0, xr1
xvssrarni.bu.h xr1, xr2, bpc_sh
xvstelm.d xr1, a0, 0, 0
add.d a0, a0, a1
xvstelm.d xr1, a0, 0, 2
addi.w a5, a5, -2
addi.d a2, a2, 32
addi.d a3, a3, 32
add.d a0, a1, a0
blt zero, a5, .AVG_W8_LASX
b .AVG_END_LASX
.AVG_W16_LASX:
xvld xr0, a2, 0
xvld xr2, a2, 32
xvld xr1, a3, 0
xvld xr3, a3, 32
xvadd.h xr4, xr0, xr1
xvadd.h xr5, xr2, xr3
xvssrarni.bu.h xr5, xr4, bpc_sh
xvpermi.d xr2, xr5, 0xd8
xvpermi.d xr3, xr5, 0x8d
vst vr2, a0, 0
vstx vr3, a0, a1
addi.w a5, a5, -2
addi.d a2, a2, 64
addi.d a3, a3, 64
alsl.d a0, a1, a0, 1
blt zero, a5, .AVG_W16_LASX
b .AVG_END_LASX
.AVG_W32_LASX:
xvld xr0, a2, 0
xvld xr2, a2, 32
xvld xr1, a3, 0
xvld xr3, a3, 32
xvadd.h xr4, xr0, xr1
xvadd.h xr5, xr2, xr3
xvssrarni.bu.h xr5, xr4, bpc_sh
xvpermi.d xr6, xr5, 0xd8
xvst xr6, a0, 0
addi.w a5, a5, -1
addi.d a2, a2, 64
addi.d a3, a3, 64
add.d a0, a0, a1
blt zero, a5, .AVG_W32_LASX
b .AVG_END_LASX
.AVG_W64_LASX:
xvld xr0, a2, 0
xvld xr2, a2, 32
xvld xr4, a2, 64
xvld xr6, a2, 96
xvld xr1, a3, 0
xvld xr3, a3, 32
xvld xr5, a3, 64
xvld xr7, a3, 96
xvadd.h xr0, xr0, xr1
xvadd.h xr2, xr2, xr3
xvadd.h xr4, xr4, xr5
xvadd.h xr6, xr6, xr7
xvssrarni.bu.h xr2, xr0, bpc_sh
xvssrarni.bu.h xr6, xr4, bpc_sh
xvpermi.d xr1, xr2, 0xd8
xvpermi.d xr3, xr6, 0xd8
xvst xr1, a0, 0
xvst xr3, a0, 32
addi.w a5, a5, -1
addi.d a2, a2, 128
addi.d a3, a3, 128
add.d a0, a0, a1
blt zero, a5, .AVG_W64_LASX
b .AVG_END_LASX
.AVG_W128_LASX:
xvld xr0, a2, 0
xvld xr2, a2, 32
xvld xr4, a2, 64
xvld xr6, a2, 96
xvld xr8, a2, 128
xvld xr10, a2, 160
xvld xr12, a2, 192
xvld xr14, a2, 224
xvld xr1, a3, 0
xvld xr3, a3, 32
xvld xr5, a3, 64
xvld xr7, a3, 96
xvld xr9, a3, 128
xvld xr11, a3, 160
xvld xr13, a3, 192
xvld xr15, a3, 224
xvadd.h xr0, xr0, xr1
xvadd.h xr2, xr2, xr3
xvadd.h xr4, xr4, xr5
xvadd.h xr6, xr6, xr7
xvadd.h xr8, xr8, xr9
xvadd.h xr10, xr10, xr11
xvadd.h xr12, xr12, xr13
xvadd.h xr14, xr14, xr15
xvssrarni.bu.h xr2, xr0, bpc_sh
xvssrarni.bu.h xr6, xr4, bpc_sh
xvssrarni.bu.h xr10, xr8, bpc_sh
xvssrarni.bu.h xr14, xr12, bpc_sh
xvpermi.d xr1, xr2, 0xd8
xvpermi.d xr3, xr6, 0xd8
xvpermi.d xr5, xr10, 0xd8
xvpermi.d xr7, xr14, 0xd8
xvst xr1, a0, 0
xvst xr3, a0, 32
xvst xr5, a0, 64
xvst xr7, a0, 96
addi.w a5, a5, -1
addi.d a2, a2, 256
addi.d a3, a3, 256
add.d a0, a0, a1
blt zero, a5, .AVG_W128_LASX
.AVG_END_LASX:
endfunc
function w_avg_8bpc_lsx
addi.d t8, a0, 0
li.w t2, 16
sub.w t2, t2, a6 // 16 - weight
vreplgr2vr.h vr21, a6
vreplgr2vr.h vr22, t2
clz.w t0, a4
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .W_AVG_LSX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0
add.d t1, t1, t2
jirl $r0, t1, 0
.align 3
.W_AVG_LSX_JRTABLE:
.hword .W_AVG_W128_LSX - .W_AVG_LSX_JRTABLE
.hword .W_AVG_W64_LSX - .W_AVG_LSX_JRTABLE
.hword .W_AVG_W32_LSX - .W_AVG_LSX_JRTABLE
.hword .W_AVG_W16_LSX - .W_AVG_LSX_JRTABLE
.hword .W_AVG_W8_LSX - .W_AVG_LSX_JRTABLE
.hword .W_AVG_W4_LSX - .W_AVG_LSX_JRTABLE
.W_AVG_W4_LSX:
vld vr0, a2, 0
vld vr1, a3, 0
vmulwev.w.h vr2, vr0, vr21
vmulwod.w.h vr3, vr0, vr21
vmaddwev.w.h vr2, vr1, vr22
vmaddwod.w.h vr3, vr1, vr22
vssrarni.hu.w vr3, vr2, bpcw_sh
vssrlni.bu.h vr1, vr3, 0
vpickod.w vr4, vr2, vr1
vilvl.b vr0, vr4, vr1
fst.s f0, a0, 0
add.d a0, a0, a1
vstelm.w vr0, a0, 0, 1
addi.w a5, a5, -2
addi.d a2, a2, 16
addi.d a3, a3, 16
add.d a0, a1, a0
blt zero, a5, .W_AVG_W4_LSX
b .W_AVG_END_LSX
.W_AVG_W8_LSX:
vld vr0, a2, 0
vld vr1, a3, 0
vmulwev.w.h vr2, vr0, vr21
vmulwod.w.h vr3, vr0, vr21
vmaddwev.w.h vr2, vr1, vr22
vmaddwod.w.h vr3, vr1, vr22
vssrarni.hu.w vr3, vr2, bpcw_sh
vssrlni.bu.h vr1, vr3, 0
vpickod.w vr4, vr2, vr1
vilvl.b vr0, vr4, vr1
fst.d f0, a0, 0
addi.w a5, a5, -1
addi.d a2, a2, 16
addi.d a3, a3, 16
add.d a0, a0, a1
blt zero, a5, .W_AVG_W8_LSX
b .W_AVG_END_LSX
.W_AVG_W16_LSX:
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vmulwev.w.h vr4, vr0, vr21
vmulwod.w.h vr5, vr0, vr21
vmulwev.w.h vr6, vr2, vr21
vmulwod.w.h vr7, vr2, vr21
vmaddwev.w.h vr4, vr1, vr22
vmaddwod.w.h vr5, vr1, vr22
vmaddwev.w.h vr6, vr3, vr22
vmaddwod.w.h vr7, vr3, vr22
vssrarni.hu.w vr6, vr4, bpcw_sh
vssrarni.hu.w vr7, vr5, bpcw_sh
vssrlrni.bu.h vr7, vr6, 0
vshuf4i.w vr8, vr7, 0x4E
vilvl.b vr0, vr8, vr7
vst vr0, a0, 0
addi.w a5, a5, -1
addi.d a2, a2, 32
addi.d a3, a3, 32
add.d a0, a0, a1
blt zero, a5, .W_AVG_W16_LSX
b .W_AVG_END_LSX
.W_AVG_W32_LSX:
.rept 2
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vmulwev.w.h vr4, vr0, vr21
vmulwod.w.h vr5, vr0, vr21
vmulwev.w.h vr6, vr2, vr21
vmulwod.w.h vr7, vr2, vr21
vmaddwev.w.h vr4, vr1, vr22
vmaddwod.w.h vr5, vr1, vr22
vmaddwev.w.h vr6, vr3, vr22
vmaddwod.w.h vr7, vr3, vr22
vssrarni.hu.w vr6, vr4, bpcw_sh
vssrarni.hu.w vr7, vr5, bpcw_sh
vssrlrni.bu.h vr7, vr6, 0
vshuf4i.w vr8, vr7, 0x4E
vilvl.b vr0, vr8, vr7
vst vr0, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a0, a0, 16
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .W_AVG_W32_LSX
b .W_AVG_END_LSX
.W_AVG_W64_LSX:
.rept 4
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vmulwev.w.h vr4, vr0, vr21
vmulwod.w.h vr5, vr0, vr21
vmulwev.w.h vr6, vr2, vr21
vmulwod.w.h vr7, vr2, vr21
vmaddwev.w.h vr4, vr1, vr22
vmaddwod.w.h vr5, vr1, vr22
vmaddwev.w.h vr6, vr3, vr22
vmaddwod.w.h vr7, vr3, vr22
vssrarni.hu.w vr6, vr4, bpcw_sh
vssrarni.hu.w vr7, vr5, bpcw_sh
vssrlrni.bu.h vr7, vr6, 0
vshuf4i.w vr8, vr7, 0x4E
vilvl.b vr0, vr8, vr7
vst vr0, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a0, a0, 16
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .W_AVG_W64_LSX
b .W_AVG_END_LSX
.W_AVG_W128_LSX:
.rept 8
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vmulwev.w.h vr4, vr0, vr21
vmulwod.w.h vr5, vr0, vr21
vmulwev.w.h vr6, vr2, vr21
vmulwod.w.h vr7, vr2, vr21
vmaddwev.w.h vr4, vr1, vr22
vmaddwod.w.h vr5, vr1, vr22
vmaddwev.w.h vr6, vr3, vr22
vmaddwod.w.h vr7, vr3, vr22
vssrarni.hu.w vr6, vr4, bpcw_sh
vssrarni.hu.w vr7, vr5, bpcw_sh
vssrlrni.bu.h vr7, vr6, 0
vshuf4i.w vr8, vr7, 0x4E
vilvl.b vr0, vr8, vr7
vst vr0, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a0, a0, 16
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .W_AVG_W128_LSX
.W_AVG_END_LSX:
endfunc
function w_avg_8bpc_lasx
addi.d t8, a0, 0
li.w t2, 16
sub.w t2, t2, a6 // 16 - weight
xvreplgr2vr.h xr21, a6
xvreplgr2vr.h xr22, t2
clz.w t0, a4
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .W_AVG_LASX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0
add.d t1, t1, t2
jirl $r0, t1, 0
.align 3
.W_AVG_LASX_JRTABLE:
.hword .W_AVG_W128_LASX - .W_AVG_LASX_JRTABLE
.hword .W_AVG_W64_LASX - .W_AVG_LASX_JRTABLE
.hword .W_AVG_W32_LASX - .W_AVG_LASX_JRTABLE
.hword .W_AVG_W16_LASX - .W_AVG_LASX_JRTABLE
.hword .W_AVG_W8_LASX - .W_AVG_LASX_JRTABLE
.hword .W_AVG_W4_LASX - .W_AVG_LASX_JRTABLE
.W_AVG_W4_LASX:
vld vr0, a2, 0
vld vr1, a3, 0
xvpermi.d xr2, xr0, 0xD8
xvpermi.d xr3, xr1, 0xD8
xvilvl.h xr4, xr3, xr2
xvmulwev.w.h xr0, xr4, xr21
xvmaddwod.w.h xr0, xr4, xr22
xvssrarni.hu.w xr1, xr0, bpcw_sh
xvssrlni.bu.h xr0, xr1, 0
fst.s f0, a0, 0
add.d a0, a0, a1
xvstelm.w xr0, a0, 0, 4
addi.w a5, a5, -2
addi.d a2, a2, 16
addi.d a3, a3, 16
add.d a0, a1, a0
blt zero, a5, .W_AVG_W4_LASX
b .W_AVG_END_LASX
.W_AVG_W8_LASX:
xvld xr0, a2, 0
xvld xr1, a3, 0
xvmulwev.w.h xr2, xr0, xr21
xvmulwod.w.h xr3, xr0, xr21
xvmaddwev.w.h xr2, xr1, xr22
xvmaddwod.w.h xr3, xr1, xr22
xvssrarni.hu.w xr3, xr2, bpcw_sh
xvssrlni.bu.h xr1, xr3, 0
xvpickod.w xr4, xr2, xr1
xvilvl.b xr0, xr4, xr1
xvstelm.d xr0, a0, 0, 0
add.d a0, a0, a1
xvstelm.d xr0, a0, 0, 2
addi.w a5, a5, -2
addi.d a2, a2, 32
addi.d a3, a3, 32
add.d a0, a0, a1
blt zero, a5, .W_AVG_W8_LASX
b .W_AVG_END_LASX
.W_AVG_W16_LASX:
xvld xr0, a2, 0
xvld xr1, a3, 0
xvmulwev.w.h xr2, xr0, xr21
xvmulwod.w.h xr3, xr0, xr21
xvmaddwev.w.h xr2, xr1, xr22
xvmaddwod.w.h xr3, xr1, xr22
xvssrarni.hu.w xr3, xr2, bpcw_sh
xvssrlni.bu.h xr1, xr3, 0
xvpickod.w xr4, xr2, xr1
xvilvl.b xr0, xr4, xr1
xvpermi.d xr1, xr0, 0xD8
vst vr1, a0, 0
addi.w a5, a5, -1
addi.d a2, a2, 32
addi.d a3, a3, 32
add.d a0, a0, a1
blt zero, a5, .W_AVG_W16_LASX
b .W_AVG_END_LASX
.W_AVG_W32_LASX:
xvld xr0, a2, 0
xvld xr2, a2, 32
xvld xr1, a3, 0
xvld xr3, a3, 32
xvmulwev.w.h xr4, xr0, xr21
xvmulwod.w.h xr5, xr0, xr21
xvmulwev.w.h xr6, xr2, xr21
xvmulwod.w.h xr7, xr2, xr21
xvmaddwev.w.h xr4, xr1, xr22
xvmaddwod.w.h xr5, xr1, xr22
xvmaddwev.w.h xr6, xr3, xr22
xvmaddwod.w.h xr7, xr3, xr22
xvssrarni.hu.w xr6, xr4, bpcw_sh
xvssrarni.hu.w xr7, xr5, bpcw_sh
xvssrlni.bu.h xr7, xr6, 0
xvshuf4i.w xr8, xr7, 0x4E
xvilvl.b xr9, xr8, xr7
xvpermi.d xr0, xr9, 0xD8
xvst xr0, a0, 0
addi.w a5, a5, -1
addi.d a2, a2, 64
addi.d a3, a3, 64
add.d a0, a0, a1
blt zero, a5, .W_AVG_W32_LASX
b .W_AVG_END_LASX
.W_AVG_W64_LASX:
.rept 2
xvld xr0, a2, 0
xvld xr2, a2, 32
xvld xr1, a3, 0
xvld xr3, a3, 32
xvmulwev.w.h xr4, xr0, xr21
xvmulwod.w.h xr5, xr0, xr21
xvmulwev.w.h xr6, xr2, xr21
xvmulwod.w.h xr7, xr2, xr21
xvmaddwev.w.h xr4, xr1, xr22
xvmaddwod.w.h xr5, xr1, xr22
xvmaddwev.w.h xr6, xr3, xr22
xvmaddwod.w.h xr7, xr3, xr22
xvssrarni.hu.w xr6, xr4, bpcw_sh
xvssrarni.hu.w xr7, xr5, bpcw_sh
xvssrlni.bu.h xr7, xr6, 0
xvshuf4i.w xr8, xr7, 0x4E
xvilvl.b xr9, xr8, xr7
xvpermi.d xr0, xr9, 0xD8
xvst xr0, a0, 0
addi.d a2, a2, 64
addi.d a3, a3, 64
addi.d a0, a0, 32
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .W_AVG_W64_LASX
b .W_AVG_END_LASX
.W_AVG_W128_LASX:
.rept 4
xvld xr0, a2, 0
xvld xr2, a2, 32
xvld xr1, a3, 0
xvld xr3, a3, 32
xvmulwev.w.h xr4, xr0, xr21
xvmulwod.w.h xr5, xr0, xr21
xvmulwev.w.h xr6, xr2, xr21
xvmulwod.w.h xr7, xr2, xr21
xvmaddwev.w.h xr4, xr1, xr22
xvmaddwod.w.h xr5, xr1, xr22
xvmaddwev.w.h xr6, xr3, xr22
xvmaddwod.w.h xr7, xr3, xr22
xvssrarni.hu.w xr6, xr4, bpcw_sh
xvssrarni.hu.w xr7, xr5, bpcw_sh
xvssrlni.bu.h xr7, xr6, 0
xvshuf4i.w xr8, xr7, 0x4E
xvilvl.b xr9, xr8, xr7
xvpermi.d xr0, xr9, 0xD8
xvst xr0, a0, 0
addi.d a2, a2, 64
addi.d a3, a3, 64
addi.d a0, a0, 32
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .W_AVG_W128_LASX
.W_AVG_END_LASX:
endfunc
#undef bpc_sh
#undef bpcw_sh
#define mask_sh 10
/*
static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
const uint8_t *mask HIGHBD_DECL_SUFFIX)
*/
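/*
 * A scalar sketch of the blend below (mask_sh = 10 for 8 bpc, m[x] in 0..64):
 *
 *   dst[x] = iclip_pixel((tmp1[x] * m[x] + tmp2[x] * (64 - m[x])
 *                         + (1 << (mask_sh - 1))) >> mask_sh);
 */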
function mask_8bpc_lsx
vldi vr21, 0x440 // 64
vxor.v vr19, vr19, vr19
addi.d t8, a0, 0
clz.w t0, a4
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .MASK_LSX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0
add.d t1, t1, t2
jirl $r0, t1, 0
.align 3
.MASK_LSX_JRTABLE:
.hword .MASK_W128_LSX - .MASK_LSX_JRTABLE
.hword .MASK_W64_LSX - .MASK_LSX_JRTABLE
.hword .MASK_W32_LSX - .MASK_LSX_JRTABLE
.hword .MASK_W16_LSX - .MASK_LSX_JRTABLE
.hword .MASK_W8_LSX - .MASK_LSX_JRTABLE
.hword .MASK_W4_LSX - .MASK_LSX_JRTABLE
.MASK_W4_LSX:
vld vr0, a2, 0
vld vr1, a3, 0
fld.d f22, a6, 0
vilvl.b vr2, vr19, vr22
vsub.h vr3, vr21, vr2
vmulwev.w.h vr4, vr0, vr2
vmulwod.w.h vr5, vr0, vr2
vmaddwev.w.h vr4, vr1, vr3
vmaddwod.w.h vr5, vr1, vr3
vssrarni.hu.w vr5, vr4, mask_sh
vssrlrni.bu.h vr1, vr5, 0
vpickod.w vr4, vr2, vr1
vilvl.b vr0, vr4, vr1
fst.s f0, a0, 0
add.d a0, a0, a1
vstelm.w vr0, a0, 0, 1
addi.d a2, a2, 16
addi.d a3, a3, 16
addi.d a6, a6, 8
add.d a0, a0, a1
addi.w a5, a5, -2
blt zero, a5, .MASK_W4_LSX
b .MASK_END_LSX
.MASK_W8_LSX:
vld vr0, a2, 0
vld vr10, a2, 16
vld vr1, a3, 0
vld vr11, a3, 16
vld vr22, a6, 0
vilvl.b vr2, vr19, vr22
vilvh.b vr12, vr19, vr22
vsub.h vr3, vr21, vr2
vsub.h vr13, vr21, vr12
vmulwev.w.h vr4, vr0, vr2
vmulwod.w.h vr5, vr0, vr2
vmulwev.w.h vr14, vr10, vr12
vmulwod.w.h vr15, vr10, vr12
vmaddwev.w.h vr4, vr1, vr3
vmaddwod.w.h vr5, vr1, vr3
vmaddwev.w.h vr14, vr11, vr13
vmaddwod.w.h vr15, vr11, vr13
vssrarni.hu.w vr14, vr4, mask_sh
vssrarni.hu.w vr15, vr5, mask_sh
vssrlrni.bu.h vr15, vr14, 0
vshuf4i.w vr6, vr15, 0x4E
vilvl.b vr0, vr6, vr15
fst.d f0, a0, 0
add.d a0, a0, a1
vstelm.d vr0, a0, 0, 1
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a6, a6, 16
add.d a0, a0, a1
addi.w a5, a5, -2
blt zero, a5, .MASK_W8_LSX
b .MASK_END_LSX
.MASK_W16_LSX:
vld vr0, a2, 0
vld vr10, a2, 16
vld vr1, a3, 0
vld vr11, a3, 16
vld vr22, a6, 0
vilvl.b vr2, vr19, vr22
vilvh.b vr12, vr19, vr22
vsub.h vr3, vr21, vr2
vsub.h vr13, vr21, vr12
vmulwev.w.h vr4, vr0, vr2
vmulwod.w.h vr5, vr0, vr2
vmulwev.w.h vr14, vr10, vr12
vmulwod.w.h vr15, vr10, vr12
vmaddwev.w.h vr4, vr1, vr3
vmaddwod.w.h vr5, vr1, vr3
vmaddwev.w.h vr14, vr11, vr13
vmaddwod.w.h vr15, vr11, vr13
vssrarni.hu.w vr14, vr4, mask_sh
vssrarni.hu.w vr15, vr5, mask_sh
vssrlrni.bu.h vr15, vr14, 0
vshuf4i.w vr6, vr15, 0x4E
vilvl.b vr0, vr6, vr15
vst vr0, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a6, a6, 16
add.d a0, a0, a1
addi.w a5, a5, -1
blt zero, a5, .MASK_W16_LSX
b .MASK_END_LSX
.MASK_W32_LSX:
.rept 2
vld vr0, a2, 0
vld vr10, a2, 16
vld vr1, a3, 0
vld vr11, a3, 16
vld vr22, a6, 0
vilvl.b vr2, vr19, vr22
vilvh.b vr12, vr19, vr22
vsub.h vr3, vr21, vr2
vsub.h vr13, vr21, vr12
vmulwev.w.h vr4, vr0, vr2
vmulwod.w.h vr5, vr0, vr2
vmulwev.w.h vr14, vr10, vr12
vmulwod.w.h vr15, vr10, vr12
vmaddwev.w.h vr4, vr1, vr3
vmaddwod.w.h vr5, vr1, vr3
vmaddwev.w.h vr14, vr11, vr13
vmaddwod.w.h vr15, vr11, vr13
vssrarni.hu.w vr14, vr4, mask_sh
vssrarni.hu.w vr15, vr5, mask_sh
vssrlrni.bu.h vr15, vr14, 0
vshuf4i.w vr6, vr15, 0x4E
vilvl.b vr0, vr6, vr15
vst vr0, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a6, a6, 16
addi.d a0, a0, 16
.endr
add.d t8, t8, a1
add.d a0, t8, zero
addi.w a5, a5, -1
blt zero, a5, .MASK_W32_LSX
b .MASK_END_LSX
.MASK_W64_LSX:
.rept 4
vld vr0, a2, 0
vld vr10, a2, 16
vld vr1, a3, 0
vld vr11, a3, 16
vld vr22, a6, 0
vilvl.b vr2, vr19, vr22
vilvh.b vr12, vr19, vr22
vsub.h vr3, vr21, vr2
vsub.h vr13, vr21, vr12
vmulwev.w.h vr4, vr0, vr2
vmulwod.w.h vr5, vr0, vr2
vmulwev.w.h vr14, vr10, vr12
vmulwod.w.h vr15, vr10, vr12
vmaddwev.w.h vr4, vr1, vr3
vmaddwod.w.h vr5, vr1, vr3
vmaddwev.w.h vr14, vr11, vr13
vmaddwod.w.h vr15, vr11, vr13
vssrarni.hu.w vr14, vr4, mask_sh
vssrarni.hu.w vr15, vr5, mask_sh
vssrlrni.bu.h vr15, vr14, 0
vshuf4i.w vr6, vr15, 0x4E
vilvl.b vr0, vr6, vr15
vst vr0, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a6, a6, 16
addi.d a0, a0, 16
.endr
add.d t8, t8, a1
add.d a0, t8, zero
addi.w a5, a5, -1
blt zero, a5, .MASK_W64_LSX
b .MASK_END_LSX
.MASK_W128_LSX:
.rept 8
vld vr0, a2, 0
vld vr10, a2, 16
vld vr1, a3, 0
vld vr11, a3, 16
vld vr22, a6, 0
vilvl.b vr2, vr19, vr22
vilvh.b vr12, vr19, vr22
vsub.h vr3, vr21, vr2
vsub.h vr13, vr21, vr12
vmulwev.w.h vr4, vr0, vr2
vmulwod.w.h vr5, vr0, vr2
vmulwev.w.h vr14, vr10, vr12
vmulwod.w.h vr15, vr10, vr12
vmaddwev.w.h vr4, vr1, vr3
vmaddwod.w.h vr5, vr1, vr3
vmaddwev.w.h vr14, vr11, vr13
vmaddwod.w.h vr15, vr11, vr13
vssrarni.hu.w vr14, vr4, mask_sh
vssrarni.hu.w vr15, vr5, mask_sh
vssrlrni.bu.h vr15, vr14, 0
vshuf4i.w vr6, vr15, 0x4E
vilvl.b vr0, vr6, vr15
vst vr0, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a6, a6, 16
addi.d a0, a0, 16
.endr
add.d t8, t8, a1
add.d a0, t8, zero
addi.w a5, a5, -1
blt zero, a5, .MASK_W128_LSX
.MASK_END_LSX:
endfunc
function mask_8bpc_lasx
xvldi xr21, 0x440 // 64
xvxor.v xr19, xr19, xr19
addi.d t8, a0, 0
clz.w t0, a4
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .MASK_LASX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0
add.d t1, t1, t2
jirl $r0, t1, 0
.align 3
.MASK_LASX_JRTABLE:
.hword .MASK_W128_LASX - .MASK_LASX_JRTABLE
.hword .MASK_W64_LASX - .MASK_LASX_JRTABLE
.hword .MASK_W32_LASX - .MASK_LASX_JRTABLE
.hword .MASK_W16_LASX - .MASK_LASX_JRTABLE
.hword .MASK_W8_LASX - .MASK_LASX_JRTABLE
.hword .MASK_W4_LASX - .MASK_LASX_JRTABLE
.MASK_W4_LASX:
vld vr0, a2, 0
vld vr1, a3, 0
fld.d f22, a6, 0
vilvl.h vr4, vr1, vr0
vilvh.h vr14, vr1, vr0
vilvl.b vr2, vr19, vr22
vsub.h vr3, vr21, vr2
xvpermi.q xr14, xr4, 0x20
vilvl.h vr5, vr3, vr2
vilvh.h vr15, vr3, vr2
xvpermi.q xr15, xr5, 0x20
xvmulwev.w.h xr0, xr14, xr15
xvmaddwod.w.h xr0, xr14, xr15
xvssrarni.hu.w xr1, xr0, mask_sh
xvssrlni.bu.h xr2, xr1, 0
fst.s f2, a0, 0
add.d a0, a0, a1
xvstelm.w xr2, a0, 0, 4
addi.d a2, a2, 16
addi.d a3, a3, 16
addi.d a6, a6, 8
add.d a0, a0, a1
addi.w a5, a5, -2
blt zero, a5, .MASK_W4_LASX
b .MASK_END_LASX
.MASK_W8_LASX:
xvld xr0, a2, 0
xvld xr1, a3, 0
vld vr22, a6, 0
vext2xv.hu.bu xr2, xr22
xvsub.h xr3, xr21, xr2
xvmulwev.w.h xr4, xr0, xr2
xvmulwod.w.h xr5, xr0, xr2
xvmaddwev.w.h xr4, xr1, xr3
xvmaddwod.w.h xr5, xr1, xr3
xvssrarni.hu.w xr5, xr4, mask_sh
xvssrlni.bu.h xr1, xr5, 0
xvpickod.w xr4, xr2, xr1
xvilvl.b xr0, xr4, xr1
fst.d f0, a0, 0
add.d a0, a0, a1
xvstelm.d xr0, a0, 0, 2
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a6, a6, 16
add.d a0, a0, a1
addi.w a5, a5, -2
blt zero, a5, .MASK_W8_LASX
b .MASK_END_LASX
.MASK_W16_LASX:
xvld xr0, a2, 0
xvld xr1, a3, 0
vld vr22, a6, 0
vext2xv.hu.bu xr2, xr22
xvsub.h xr3, xr21, xr2
xvmulwev.w.h xr4, xr0, xr2
xvmulwod.w.h xr5, xr0, xr2
xvmaddwev.w.h xr4, xr1, xr3
xvmaddwod.w.h xr5, xr1, xr3
xvssrarni.hu.w xr5, xr4, mask_sh
xvssrlni.bu.h xr1, xr5, 0
xvpickod.w xr4, xr2, xr1
xvilvl.b xr0, xr4, xr1
xvpermi.d xr1, xr0, 0xD8
vst vr1, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a6, a6, 16
add.d a0, a0, a1
addi.w a5, a5, -1
blt zero, a5, .MASK_W16_LASX
b .MASK_END_LASX
.MASK_W32_LASX:
xvld xr0, a2, 0
xvld xr10, a2, 32
xvld xr1, a3, 0
xvld xr11, a3, 32
xvld xr22, a6, 0
vext2xv.hu.bu xr2, xr22
xvpermi.q xr4, xr22, 0x01
vext2xv.hu.bu xr12, xr4
xvsub.h xr3, xr21, xr2
xvsub.h xr13, xr21, xr12
xvmulwev.w.h xr4, xr0, xr2
xvmulwod.w.h xr5, xr0, xr2
xvmulwev.w.h xr14, xr10, xr12
xvmulwod.w.h xr15, xr10, xr12
xvmaddwev.w.h xr4, xr1, xr3
xvmaddwod.w.h xr5, xr1, xr3
xvmaddwev.w.h xr14, xr11, xr13
xvmaddwod.w.h xr15, xr11, xr13
xvssrarni.hu.w xr14, xr4, mask_sh
xvssrarni.hu.w xr15, xr5, mask_sh
xvssrlni.bu.h xr15, xr14, 0
xvshuf4i.w xr6, xr15, 0x4E
xvilvl.b xr1, xr6, xr15
xvpermi.d xr0, xr1, 0xD8
xvst xr0, a0, 0
addi.d a2, a2, 64
addi.d a3, a3, 64
addi.d a6, a6, 32
add.d a0, a0, a1
addi.w a5, a5, -1
blt zero, a5, .MASK_W32_LASX
b .MASK_END_LASX
.MASK_W64_LASX:
.rept 2
xvld xr0, a2, 0
xvld xr10, a2, 32
xvld xr1, a3, 0
xvld xr11, a3, 32
xvld xr22, a6, 0
vext2xv.hu.bu xr2, xr22
xvpermi.q xr4, xr22, 0x01
vext2xv.hu.bu xr12, xr4
xvsub.h xr3, xr21, xr2
xvsub.h xr13, xr21, xr12
xvmulwev.w.h xr4, xr0, xr2
xvmulwod.w.h xr5, xr0, xr2
xvmulwev.w.h xr14, xr10, xr12
xvmulwod.w.h xr15, xr10, xr12
xvmaddwev.w.h xr4, xr1, xr3
xvmaddwod.w.h xr5, xr1, xr3
xvmaddwev.w.h xr14, xr11, xr13
xvmaddwod.w.h xr15, xr11, xr13
xvssrarni.hu.w xr14, xr4, mask_sh
xvssrarni.hu.w xr15, xr5, mask_sh
xvssrlni.bu.h xr15, xr14, 0
xvshuf4i.w xr6, xr15, 0x4E
xvilvl.b xr1, xr6, xr15
xvpermi.d xr0, xr1, 0xD8
xvst xr0, a0, 0
addi.d a2, a2, 64
addi.d a3, a3, 64
addi.d a6, a6, 32
addi.d a0, a0, 32
.endr
add.d t8, t8, a1
add.d a0, t8, zero
addi.w a5, a5, -1
blt zero, a5, .MASK_W64_LASX
b .MASK_END_LASX
.MASK_W128_LASX:
.rept 4
xvld xr0, a2, 0
xvld xr10, a2, 32
xvld xr1, a3, 0
xvld xr11, a3, 32
xvld xr22, a6, 0
vext2xv.hu.bu xr2, xr22
xvpermi.q xr4, xr22, 0x01
vext2xv.hu.bu xr12, xr4
xvsub.h xr3, xr21, xr2
xvsub.h xr13, xr21, xr12
xvmulwev.w.h xr4, xr0, xr2
xvmulwod.w.h xr5, xr0, xr2
xvmulwev.w.h xr14, xr10, xr12
xvmulwod.w.h xr15, xr10, xr12
xvmaddwev.w.h xr4, xr1, xr3
xvmaddwod.w.h xr5, xr1, xr3
xvmaddwev.w.h xr14, xr11, xr13
xvmaddwod.w.h xr15, xr11, xr13
xvssrarni.hu.w xr14, xr4, mask_sh
xvssrarni.hu.w xr15, xr5, mask_sh
xvssrlni.bu.h xr15, xr14, 0
xvshuf4i.w xr6, xr15, 0x4E
xvilvl.b xr1, xr6, xr15
xvpermi.d xr0, xr1, 0xD8
xvst xr0, a0, 0
addi.d a2, a2, 64
addi.d a3, a3, 64
addi.d a6, a6, 32
addi.d a0, a0, 32
.endr
add.d t8, t8, a1
add.d a0, t8, zero
addi.w a5, a5, -1
blt zero, a5, .MASK_W128_LASX
.MASK_END_LASX:
endfunc
/*
static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
uint8_t *mask, const int sign,
const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX)
*/
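/*
 * A scalar sketch of the 420 variant below: the mask is derived from the
 * difference of the two intermediates (vr22 = 38, vr20 = 64, rounding bias
 * 8 for 8 bpc), the pixels are blended with it, and it is then 2x2
 * subsampled with the sign correction:
 *
 *   int m = imin(38 + ((abs(tmp1[x] - tmp2[x]) + 8) >> 8), 64);
 *   dst[x] = iclip_pixel((tmp1[x] * m + tmp2[x] * (64 - m) + 512) >> 10);
 *   mask[x >> 1] = (m00 + m01 + m10 + m11 + 2 - sign) >> 2;
 */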
function w_mask_420_8bpc_lsx
addi.d sp, sp, -24
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
vldi vr20, 0x440 // 64
vreplgr2vr.h vr21, a7 // sign
vldi vr22, 0x426 // 38
clz.w t0, a4
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .WMASK420_LSX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t8, t0, 0
add.d t1, t1, t8
jirl $r0, t1, 0
.align 3
.WMASK420_LSX_JRTABLE:
.hword .WMASK420_W128_LSX - .WMASK420_LSX_JRTABLE
.hword .WMASK420_W64_LSX - .WMASK420_LSX_JRTABLE
.hword .WMASK420_W32_LSX - .WMASK420_LSX_JRTABLE
.hword .WMASK420_W16_LSX - .WMASK420_LSX_JRTABLE
.hword .WMASK420_W8_LSX - .WMASK420_LSX_JRTABLE
.hword .WMASK420_W4_LSX - .WMASK420_LSX_JRTABLE
.WMASK420_W4_LSX:
vld vr0, a2, 0
vld vr1, a2, 16
vld vr2, a3, 0
vld vr3, a3, 16
addi.w a5, a5, -4
vabsd.h vr4, vr0, vr2
vabsd.h vr5, vr1, vr3
vaddi.hu vr4, vr4, 8