/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Janne Grunau
* Copyright © 2020, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
#define PREP_BIAS 8192
// avg: average 16 int16_t intermediate pixels from the two prep buffers
// at [x2] and [x3] (both advanced by 32 bytes), producing two 8-lane
// 16-bit results in \d0/\d1. Expects constants set up by bidir_fn:
//   v28.8h = -2*PREP_BIAS - (1 << intermediate_bits)
//   v29.8h = -(intermediate_bits+1)  (negative shift => right shift via sshl)
// i.e. computes (tmp1 + tmp2 - v28) >> (intermediate_bits+1).
.macro avg d0, d1, t0, t1, t2, t3
ld1 {\t0\().8h,\t1\().8h}, [x2], 32
ld1 {\t2\().8h,\t3\().8h}, [x3], 32
sqadd \t0\().8h, \t0\().8h, \t2\().8h
sqadd \t1\().8h, \t1\().8h, \t3\().8h
// Clamp the saturated sum to >= v28 first, so that subtracting v28
// below yields a non-negative value.
smax \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
smax \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
sqsub \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
sqsub \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
sshl \d0\().8h, \t0\().8h, v29.8h // -(intermediate_bits+1)
sshl \d1\().8h, \t1\().8h, v29.8h // -(intermediate_bits+1)
.endm
// w_avg: weighted average of 16 int16_t intermediate pixels from [x2]
// and [x3] into \d0/\d1, clamped to [0, bitdepth_max]. Expects:
//   v27.4s = -weight (negated w6, set up by bidir_fn)
//   v28.8h = PREP_BIAS >> intermediate_bits
//   v29.8h = -intermediate_bits (right shift via srshl)
//   v30.8h = 0, v31.8h = bitdepth_max
// Computes tmp2 - (((tmp2 - tmp1) * weight) >> 4), rescales and clamps.
.macro w_avg d0, d1, t0, t1, t2, t3
ld1 {\t0\().8h,\t1\().8h}, [x2], 32
ld1 {\t2\().8h,\t3\().8h}, [x3], 32
// This difference requires a 17 bit range, and all bits are
// significant for the following multiplication.
ssubl \d0\().4s, \t2\().4h, \t0\().4h
ssubl2 \t0\().4s, \t2\().8h, \t0\().8h
ssubl \d1\().4s, \t3\().4h, \t1\().4h
ssubl2 \t1\().4s, \t3\().8h, \t1\().8h
mul \d0\().4s, \d0\().4s, v27.4s
mul \t0\().4s, \t0\().4s, v27.4s
mul \d1\().4s, \d1\().4s, v27.4s
mul \t1\().4s, \t1\().4s, v27.4s
sshr \d0\().4s, \d0\().4s, #4
sshr \t0\().4s, \t0\().4s, #4
sshr \d1\().4s, \d1\().4s, #4
sshr \t1\().4s, \t1\().4s, #4
// Add tmp2 back in (widening add of the original 16-bit values).
saddw \d0\().4s, \d0\().4s, \t2\().4h
saddw2 \t0\().4s, \t0\().4s, \t2\().8h
saddw \d1\().4s, \d1\().4s, \t3\().4h
saddw2 \t1\().4s, \t1\().4s, \t3\().8h
uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto
srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
smax \d0\().8h, \d0\().8h, v30.8h // 0
smax \d1\().8h, \d1\().8h, v30.8h // 0
.endm
// mask: masked blend of 16 int16_t intermediate pixels from [x2] and
// [x3], using 16 per-pixel 8-bit mask values loaded from [x6], into
// \d0/\d1, clamped to [0, bitdepth_max]. Same rescale constants as
// w_avg (v28/v29/v30/v31). Computes tmp2 - (((tmp2-tmp1)*m) >> 6) with
// the negated mask widened to 32 bit in v24-v27. Clobbers v24-v27.
.macro mask d0, d1, t0, t1, t2, t3
ld1 {v27.16b}, [x6], 16
ld1 {\t0\().8h,\t1\().8h}, [x2], 32
neg v27.16b, v27.16b
ld1 {\t2\().8h,\t3\().8h}, [x3], 32
// Sign-extend the (negated) mask bytes out to four 4s registers.
sxtl v26.8h, v27.8b
sxtl2 v27.8h, v27.16b
sxtl v24.4s, v26.4h
sxtl2 v25.4s, v26.8h
sxtl v26.4s, v27.4h
sxtl2 v27.4s, v27.8h
// tmp2 - tmp1 requires a 17 bit range, hence the widening subtract.
ssubl \d0\().4s, \t2\().4h, \t0\().4h
ssubl2 \t0\().4s, \t2\().8h, \t0\().8h
ssubl \d1\().4s, \t3\().4h, \t1\().4h
ssubl2 \t1\().4s, \t3\().8h, \t1\().8h
mul \d0\().4s, \d0\().4s, v24.4s
mul \t0\().4s, \t0\().4s, v25.4s
mul \d1\().4s, \d1\().4s, v26.4s
mul \t1\().4s, \t1\().4s, v27.4s
sshr \d0\().4s, \d0\().4s, #6
sshr \t0\().4s, \t0\().4s, #6
sshr \d1\().4s, \d1\().4s, #6
sshr \t1\().4s, \t1\().4s, #6
saddw \d0\().4s, \d0\().4s, \t2\().4h
saddw2 \t0\().4s, \t0\().4s, \t2\().8h
saddw \d1\().4s, \d1\().4s, \t3\().4h
saddw2 \t1\().4s, \t1\().4s, \t3\().8h
uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto
srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
smax \d0\().8h, \d0\().8h, v30.8h // 0
smax \d1\().8h, \d1\().8h, v30.8h // 0
.endm
// bidir_fn: emit one of the bidirectional compound functions
// \type\()_16bpc_neon (type = avg / w_avg / mask). Register usage as
// seen below:
//   x0 = dst, x1 = dst stride (bytes), x2 = tmp1, x3 = tmp2,
//   w4 = width (consumed by the clz dispatch), w5 = height,
//   w6 = weight (w_avg) / x6 = mask pointer (mask),
//   \bdmax = the register holding bitdepth_max (w6 or w7, per
//   instantiation site).
// Each loop iteration stores one 16-pixel result computed by the
// previous iteration and kicks off the next (software pipelining).
.macro bidir_fn type, bdmax
function \type\()_16bpc_neon, export=1
clz w4, w4
.ifnc \type, avg
dup v31.8h, \bdmax // bitdepth_max
movi v30.8h, #0
.endif
clz w7, \bdmax
sub w7, w7, #18 // intermediate_bits = clz(bitdepth_max) - 18
.ifc \type, avg
// avg uses a combined bias+rounding constant and a single shift.
mov w9, #1
mov w8, #-2*PREP_BIAS
lsl w9, w9, w7 // 1 << intermediate_bits
add w7, w7, #1
sub w8, w8, w9 // -2*PREP_BIAS - 1 << intermediate_bits
neg w7, w7 // -(intermediate_bits+1)
dup v28.8h, w8 // -2*PREP_BIAS - 1 << intermediate_bits
dup v29.8h, w7 // -(intermediate_bits+1)
.else
mov w8, #PREP_BIAS
lsr w8, w8, w7 // PREP_BIAS >> intermediate_bits
neg w7, w7 // -intermediate_bits
dup v28.8h, w8 // PREP_BIAS >> intermediate_bits
dup v29.8h, w7 // -intermediate_bits
.endif
.ifc \type, w_avg
dup v27.4s, w6
neg v27.4s, v27.4s // -weight, used by the w_avg macro
.endif
// Dispatch on log2(width) via the relative jump table below.
movrel x7, \type\()_tbl
sub w4, w4, #24
// Compute the first 16 output pixels before dispatching.
\type v4, v5, v0, v1, v2, v3
ldrsw x4, [x7, x4, lsl #2]
add x7, x7, x4
br x7
40: // width 4: v4/v5 each hold two 4-pixel rows; 4 rows per iteration
AARCH64_VALID_JUMP_TARGET
add x7, x0, x1
lsl x1, x1, #1 // write two interleaved rows via x0/x7
4:
subs w5, w5, #4
st1 {v4.8b}, [x0], x1
st1 {v4.d}[1], [x7], x1
st1 {v5.8b}, [x0], x1
st1 {v5.d}[1], [x7], x1
b.le 0f
\type v4, v5, v0, v1, v2, v3
b 4b
80: // width 8: one row per register, 2 rows per iteration
AARCH64_VALID_JUMP_TARGET
add x7, x0, x1
lsl x1, x1, #1
8:
st1 {v4.8h}, [x0], x1
subs w5, w5, #2
st1 {v5.8h}, [x7], x1
b.le 0f
\type v4, v5, v0, v1, v2, v3
b 8b
160: // width 16: 2 rows per iteration
AARCH64_VALID_JUMP_TARGET
16:
\type v6, v7, v0, v1, v2, v3
st1 {v4.8h, v5.8h}, [x0], x1
subs w5, w5, #2
st1 {v6.8h, v7.8h}, [x0], x1
b.le 0f
\type v4, v5, v0, v1, v2, v3
b 16b
320: // width 32: 1 row per iteration
AARCH64_VALID_JUMP_TARGET
32:
\type v6, v7, v0, v1, v2, v3
subs w5, w5, #1
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
b.le 0f
\type v4, v5, v0, v1, v2, v3
b 32b
640: // width 64: 1 row per iteration, split across x0 and x7 (+64B)
AARCH64_VALID_JUMP_TARGET
add x7, x0, #64
64:
\type v6, v7, v0, v1, v2, v3
\type v16, v17, v0, v1, v2, v3
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
\type v18, v19, v0, v1, v2, v3
subs w5, w5, #1
st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
b.le 0f
\type v4, v5, v0, v1, v2, v3
b 64b
1280: // width 128: 1 row per iteration, four 64-byte stores
AARCH64_VALID_JUMP_TARGET
add x7, x0, #64
mov x8, #128
sub x1, x1, #128 // stride minus the #128 advanced mid-row via x8
128:
\type v6, v7, v0, v1, v2, v3
\type v16, v17, v0, v1, v2, v3
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x8
\type v18, v19, v0, v1, v2, v3
st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8
\type v4, v5, v0, v1, v2, v3
\type v6, v7, v0, v1, v2, v3
\type v16, v17, v0, v1, v2, v3
subs w5, w5, #1
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
\type v18, v19, v0, v1, v2, v3
st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
b.le 0f
\type v4, v5, v0, v1, v2, v3
b 128b
0:
ret
endfunc
// Relative jump table, indexed by clz(w)-24 (largest width first).
jumptable \type\()_tbl
.word 1280b - \type\()_tbl
.word 640b - \type\()_tbl
.word 320b - \type\()_tbl
.word 160b - \type\()_tbl
.word 80b - \type\()_tbl
.word 40b - \type\()_tbl
endjumptable
.endm
// Instantiate the three bidirectional compound functions; the second
// macro argument names the register holding bitdepth_max.
bidir_fn avg, w6
bidir_fn w_avg, w7
bidir_fn mask, w7
// w_mask_fn: emit w_mask_\type\()_16bpc_neon (type = 444 / 422 / 420).
// Blends the intermediate buffers at [x2] (tmp1) and [x3] (tmp2) into
// dst [x0] (stride x1), deriving the per-pixel blend mask from
// abs(tmp1 - tmp2), and stores the mask to [x6]:
//   444 stores it at full resolution,
//   422 averages horizontal pairs (half-width output),
//   420 averages 2x2 blocks (half-width, half-height output).
// Registers: w4 = width, w5 = height, w7 = sign, bitdepth_max on the
// stack. Constants: v0.8h = 27615 (mask bias), v29.4s = -sh,
// v30.4s = PREP_BIAS*64, v31.8h = bitdepth_max; v1/v2/v3 hold
// per-type mask post-processing constants.
.macro w_mask_fn type
function w_mask_\type\()_16bpc_neon, export=1
ldr w8, [sp]
clz w9, w4
movrel x10, w_mask_\type\()_tbl
dup v31.8h, w8 // bitdepth_max
sub w9, w9, #24
clz w8, w8 // clz(bitdepth_max)
ldrsw x9, [x10, x9, lsl #2]
add x10, x10, x9
sub w8, w8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
mov w9, #PREP_BIAS*64
neg w8, w8 // -sh
mov w11, #27615 // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
dup v30.4s, w9 // PREP_BIAS*64
dup v29.4s, w8 // -sh
dup v0.8h, w11
.if \type == 444
movi v1.16b, #64
.elseif \type == 422
dup v2.8b, w7
movi v3.8b, #129
sub v3.8b, v3.8b, v2.8b // 129 - sign
.elseif \type == 420
dup v2.8h, w7
movi v3.8h, #1, lsl #8 // 256
sub v3.8h, v3.8h, v2.8h // 256 - sign
.endif
add x12, x0, x1
lsl x1, x1, #1 // write two interleaved rows via x0/x12
br x10
40: // width 4: process four 4-pixel rows (16 pixels) per iteration
AARCH64_VALID_JUMP_TARGET
4:
ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
subs w5, w5, #4
sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2)
sabd v21.8h, v5.8h, v7.8h
ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
ssubl2 v17.4s, v6.8h, v4.8h
ssubl v18.4s, v7.4h, v5.4h
ssubl2 v19.4s, v7.8h, v5.8h
uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
uqsub v21.8h, v0.8h, v21.8h
sshll2 v7.4s, v5.8h, #6 // tmp1 << 6
sshll v6.4s, v5.4h, #6
sshll2 v5.4s, v4.8h, #6
sshll v4.4s, v4.4h, #6
ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
ushr v21.8h, v21.8h, #10
add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
add v5.4s, v5.4s, v30.4s
add v6.4s, v6.4s, v30.4s
add v7.4s, v7.4s, v30.4s
uxtl v22.4s, v20.4h
uxtl2 v23.4s, v20.8h
uxtl v24.4s, v21.4h
uxtl2 v25.4s, v21.8h
mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m)
mla v5.4s, v17.4s, v23.4s
mla v6.4s, v18.4s, v24.4s
mla v7.4s, v19.4s, v25.4s
srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
srshl v5.4s, v5.4s, v29.4s
srshl v6.4s, v6.4s, v29.4s
srshl v7.4s, v7.4s, v29.4s
sqxtun v4.4h, v4.4s // iclip_pixel
sqxtun2 v4.8h, v5.4s
sqxtun v5.4h, v6.4s
sqxtun2 v5.8h, v7.4s
umin v4.8h, v4.8h, v31.8h // iclip_pixel
umin v5.8h, v5.8h, v31.8h
.if \type == 444
uzp1 v20.16b, v20.16b, v21.16b // 64 - m
sub v20.16b, v1.16b, v20.16b // m
st1 {v20.16b}, [x6], #16
.elseif \type == 422
addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
xtn v20.8b, v20.8h
uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
st1 {v20.8b}, [x6], #8
.elseif \type == 420
// Pair up vertically adjacent rows before the horizontal addp.
trn1 v24.2d, v20.2d, v21.2d
trn2 v25.2d, v20.2d, v21.2d
add v24.8h, v24.8h, v25.8h // (64 - my1) + (64 - my2) (row wise addition)
addp v20.8h, v24.8h, v24.8h // (128 - m) + (128 - n) (column wise addition)
sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n))
rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
str s20, [x6], #4
.endif
st1 {v4.8b}, [x0], x1
st1 {v4.d}[1], [x12], x1
st1 {v5.8b}, [x0], x1
st1 {v5.d}[1], [x12], x1
b.gt 4b
ret
80: // width 8: two rows per iteration
AARCH64_VALID_JUMP_TARGET
8:
ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1
ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2
subs w5, w5, #2
sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2)
sabd v21.8h, v5.8h, v7.8h
ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
ssubl2 v17.4s, v6.8h, v4.8h
ssubl v18.4s, v7.4h, v5.4h
ssubl2 v19.4s, v7.8h, v5.8h
uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
uqsub v21.8h, v0.8h, v21.8h
sshll2 v7.4s, v5.8h, #6 // tmp1 << 6
sshll v6.4s, v5.4h, #6
sshll2 v5.4s, v4.8h, #6
sshll v4.4s, v4.4h, #6
ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
ushr v21.8h, v21.8h, #10
add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
add v5.4s, v5.4s, v30.4s
add v6.4s, v6.4s, v30.4s
add v7.4s, v7.4s, v30.4s
uxtl v22.4s, v20.4h
uxtl2 v23.4s, v20.8h
uxtl v24.4s, v21.4h
uxtl2 v25.4s, v21.8h
mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m)
mla v5.4s, v17.4s, v23.4s
mla v6.4s, v18.4s, v24.4s
mla v7.4s, v19.4s, v25.4s
srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
srshl v5.4s, v5.4s, v29.4s
srshl v6.4s, v6.4s, v29.4s
srshl v7.4s, v7.4s, v29.4s
sqxtun v4.4h, v4.4s // iclip_pixel
sqxtun2 v4.8h, v5.4s
sqxtun v5.4h, v6.4s
sqxtun2 v5.8h, v7.4s
umin v4.8h, v4.8h, v31.8h // iclip_pixel
umin v5.8h, v5.8h, v31.8h
.if \type == 444
uzp1 v20.16b, v20.16b, v21.16b // 64 - m
sub v20.16b, v1.16b, v20.16b // m
st1 {v20.16b}, [x6], #16
.elseif \type == 422
addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
xtn v20.8b, v20.8h
uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
st1 {v20.8b}, [x6], #8
.elseif \type == 420
add v20.8h, v20.8h, v21.8h // (64 - my1) + (64 - my2) (row wise addition)
addp v20.8h, v20.8h, v20.8h // (128 - m) + (128 - n) (column wise addition)
sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n))
rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
str s20, [x6], #4
.endif
st1 {v4.8h}, [x0], x1
st1 {v5.8h}, [x12], x1
b.gt 8b
ret
1280: // widths >= 16 share one looping implementation
640:
320:
160:
AARCH64_VALID_JUMP_TARGET
mov w11, w4
sub x1, x1, w4, uxtw #1 // stride minus bytes advanced within the row
.if \type == 444
add x10, x6, w4, uxtw // mask row 1 output pointer
.elseif \type == 422
add x10, x6, x11, lsr #1
.endif
add x9, x3, w4, uxtw #1 // tmp2 row 1
add x7, x2, w4, uxtw #1 // tmp1 row 1
161: // outer loop: two rows at a time
mov w8, w4
16: // inner loop: 16 pixels of each of the two rows per iteration
ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1
ld1 {v16.8h, v17.8h}, [x3], #32 // tmp2
ld1 {v6.8h, v7.8h}, [x7], #32
ld1 {v18.8h, v19.8h}, [x9], #32
subs w8, w8, #16
sabd v20.8h, v4.8h, v16.8h // abs(tmp1 - tmp2)
sabd v21.8h, v5.8h, v17.8h
ssubl v22.4s, v16.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
ssubl2 v23.4s, v16.8h, v4.8h
ssubl v24.4s, v17.4h, v5.4h
ssubl2 v25.4s, v17.8h, v5.8h
uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
uqsub v21.8h, v0.8h, v21.8h
sshll2 v27.4s, v5.8h, #6 // tmp1 << 6
sshll v26.4s, v5.4h, #6
sshll2 v5.4s, v4.8h, #6
sshll v4.4s, v4.4h, #6
ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
ushr v21.8h, v21.8h, #10
add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
add v5.4s, v5.4s, v30.4s
add v26.4s, v26.4s, v30.4s
add v27.4s, v27.4s, v30.4s
uxtl v16.4s, v20.4h
uxtl2 v17.4s, v20.8h
uxtl v28.4s, v21.4h
mla v4.4s, v22.4s, v16.4s // (tmp2-tmp1)*(64-m)
uxtl2 v16.4s, v21.8h
mla v5.4s, v23.4s, v17.4s
mla v26.4s, v24.4s, v28.4s
mla v27.4s, v25.4s, v16.4s
srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
srshl v5.4s, v5.4s, v29.4s
srshl v26.4s, v26.4s, v29.4s
srshl v27.4s, v27.4s, v29.4s
sqxtun v4.4h, v4.4s // iclip_pixel
sqxtun2 v4.8h, v5.4s
sqxtun v5.4h, v26.4s
sqxtun2 v5.8h, v27.4s
// Start of other half
sabd v22.8h, v6.8h, v18.8h // abs(tmp1 - tmp2)
sabd v23.8h, v7.8h, v19.8h
umin v4.8h, v4.8h, v31.8h // iclip_pixel
umin v5.8h, v5.8h, v31.8h
ssubl v16.4s, v18.4h, v6.4h // tmp2 - tmp1 (requires 17 bit)
ssubl2 v17.4s, v18.8h, v6.8h
ssubl v18.4s, v19.4h, v7.4h
ssubl2 v19.4s, v19.8h, v7.8h
uqsub v22.8h, v0.8h, v22.8h // 27615 - abs()
uqsub v23.8h, v0.8h, v23.8h
sshll v24.4s, v6.4h, #6 // tmp1 << 6
sshll2 v25.4s, v6.8h, #6
sshll v26.4s, v7.4h, #6
sshll2 v27.4s, v7.8h, #6
ushr v22.8h, v22.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
ushr v23.8h, v23.8h, #10
add v24.4s, v24.4s, v30.4s // += PREP_BIAS*64
add v25.4s, v25.4s, v30.4s
add v26.4s, v26.4s, v30.4s
add v27.4s, v27.4s, v30.4s
uxtl v6.4s, v22.4h
uxtl2 v7.4s, v22.8h
uxtl v28.4s, v23.4h
mla v24.4s, v16.4s, v6.4s // (tmp2-tmp1)*(64-m)
uxtl2 v6.4s, v23.8h
mla v25.4s, v17.4s, v7.4s
mla v26.4s, v18.4s, v28.4s
mla v27.4s, v19.4s, v6.4s
srshl v24.4s, v24.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
srshl v25.4s, v25.4s, v29.4s
srshl v26.4s, v26.4s, v29.4s
srshl v27.4s, v27.4s, v29.4s
sqxtun v6.4h, v24.4s // iclip_pixel
sqxtun2 v6.8h, v25.4s
sqxtun v7.4h, v26.4s
sqxtun2 v7.8h, v27.4s
umin v6.8h, v6.8h, v31.8h // iclip_pixel
umin v7.8h, v7.8h, v31.8h
.if \type == 444
uzp1 v20.16b, v20.16b, v21.16b // 64 - m
uzp1 v21.16b, v22.16b, v23.16b
sub v20.16b, v1.16b, v20.16b // m
sub v21.16b, v1.16b, v21.16b
st1 {v20.16b}, [x6], #16
st1 {v21.16b}, [x10], #16
.elseif \type == 422
addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
addp v21.8h, v22.8h, v23.8h
xtn v20.8b, v20.8h
xtn v21.8b, v21.8h
uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
uhsub v21.8b, v3.8b, v21.8b
st1 {v20.8b}, [x6], #8
st1 {v21.8b}, [x10], #8
.elseif \type == 420
add v20.8h, v20.8h, v22.8h // (64 - my1) + (64 - my2) (row wise addition)
add v21.8h, v21.8h, v23.8h
addp v20.8h, v20.8h, v21.8h // (128 - m) + (128 - n) (column wise addition)
sub v20.8h, v3.8h, v20.8h // (256 - sign) - ((128 - m) + (128 - n))
rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
st1 {v20.8b}, [x6], #8
.endif
st1 {v4.8h, v5.8h}, [x0], #32
st1 {v6.8h, v7.8h}, [x12], #32
b.gt 16b
// Advance all pointers past the second row they already covered.
subs w5, w5, #2
add x2, x2, w4, uxtw #1
add x3, x3, w4, uxtw #1
add x7, x7, w4, uxtw #1
add x9, x9, w4, uxtw #1
.if \type == 444
add x6, x6, w4, uxtw
add x10, x10, w4, uxtw
.elseif \type == 422
add x6, x6, x11, lsr #1
add x10, x10, x11, lsr #1
.endif
add x0, x0, x1
add x12, x12, x1
b.gt 161b
ret
endfunc
// Relative jump table, indexed by clz(w)-24 (largest width first).
jumptable w_mask_\type\()_tbl
.word 1280b - w_mask_\type\()_tbl
.word 640b - w_mask_\type\()_tbl
.word 320b - w_mask_\type\()_tbl
.word 160b - w_mask_\type\()_tbl
.word 80b - w_mask_\type\()_tbl
.word 40b - w_mask_\type\()_tbl
endjumptable
.endm
// Instantiate the three w_mask variants (full / 422 / 420 mask output).
w_mask_fn 444
w_mask_fn 422
w_mask_fn 420
// blend_16bpc_neon: blend the int16 buffer at [x2] into dst [x0]
// (stride x1) using a per-pixel 8-bit mask from [x5]; w3 = width
// (dispatched via clz), w4 = height.
// The blend dst + (((tmp-dst)*m + 32) >> 6) is done with sqrdmulh on
// (-m << 9): sqrdmulh computes (x*y*2 + (1<<15)) >> 16, which for
// y = -m << 9 equals ((a-b)*-m + 32) >> 6 (see inline comments).
function blend_16bpc_neon, export=1
movrel x6, blend_tbl
clz w3, w3
sub w3, w3, #26
ldrsw x3, [x6, x3, lsl #2]
add x6, x6, x3
add x8, x0, x1 // x8 = second-row pointer
br x6
40: // width 4: two rows per iteration
AARCH64_VALID_JUMP_TARGET
lsl x1, x1, #1
4:
ld1 {v2.8b}, [x5], #8
ld1 {v1.8h}, [x2], #16
ldr d0, [x0]
neg v2.8b, v2.8b // -m
subs w4, w4, #2
ld1 {v0.d}[1], [x8]
sxtl v2.8h, v2.8b
shl v2.8h, v2.8h, #9 // -m << 9
sub v1.8h, v0.8h, v1.8h // a - b
sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
add v0.8h, v0.8h, v1.8h
st1 {v0.8b}, [x0], x1
st1 {v0.d}[1], [x8], x1
b.gt 4b
ret
80: // width 8: two rows per iteration
AARCH64_VALID_JUMP_TARGET
lsl x1, x1, #1
8:
ld1 {v4.16b}, [x5], #16
ld1 {v2.8h, v3.8h}, [x2], #32
neg v5.16b, v4.16b // -m
ld1 {v0.8h}, [x0]
ld1 {v1.8h}, [x8]
sxtl v4.8h, v5.8b
sxtl2 v5.8h, v5.16b
shl v4.8h, v4.8h, #9 // -m << 9
shl v5.8h, v5.8h, #9
sub v2.8h, v0.8h, v2.8h // a - b
sub v3.8h, v1.8h, v3.8h
subs w4, w4, #2
sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v3.8h, v3.8h, v5.8h
add v0.8h, v0.8h, v2.8h
add v1.8h, v1.8h, v3.8h
st1 {v0.8h}, [x0], x1
st1 {v1.8h}, [x8], x1
b.gt 8b
ret
160: // width 16: two rows per iteration
AARCH64_VALID_JUMP_TARGET
lsl x1, x1, #1
16:
ld1 {v16.16b, v17.16b}, [x5], #32
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
subs w4, w4, #2
neg v18.16b, v16.16b // -m
neg v19.16b, v17.16b
ld1 {v0.8h, v1.8h}, [x0]
sxtl v16.8h, v18.8b
sxtl2 v17.8h, v18.16b
sxtl v18.8h, v19.8b
sxtl2 v19.8h, v19.16b
ld1 {v2.8h, v3.8h}, [x8]
shl v16.8h, v16.8h, #9 // -m << 9
shl v17.8h, v17.8h, #9
shl v18.8h, v18.8h, #9
shl v19.8h, v19.8h, #9
sub v4.8h, v0.8h, v4.8h // a - b
sub v5.8h, v1.8h, v5.8h
sub v6.8h, v2.8h, v6.8h
sub v7.8h, v3.8h, v7.8h
sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v5.8h, v5.8h, v17.8h
sqrdmulh v6.8h, v6.8h, v18.8h
sqrdmulh v7.8h, v7.8h, v19.8h
add v0.8h, v0.8h, v4.8h
add v1.8h, v1.8h, v5.8h
add v2.8h, v2.8h, v6.8h
add v3.8h, v3.8h, v7.8h
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v2.8h, v3.8h}, [x8], x1
b.gt 16b
ret
320: // width 32: one row per iteration
AARCH64_VALID_JUMP_TARGET
32:
ld1 {v16.16b, v17.16b}, [x5], #32
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
subs w4, w4, #1
neg v18.16b, v16.16b // -m
neg v19.16b, v17.16b
sxtl v16.8h, v18.8b
sxtl2 v17.8h, v18.16b
sxtl v18.8h, v19.8b
sxtl2 v19.8h, v19.16b
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
shl v16.8h, v16.8h, #9 // -m << 9
shl v17.8h, v17.8h, #9
shl v18.8h, v18.8h, #9
shl v19.8h, v19.8h, #9
sub v4.8h, v0.8h, v4.8h // a - b
sub v5.8h, v1.8h, v5.8h
sub v6.8h, v2.8h, v6.8h
sub v7.8h, v3.8h, v7.8h
sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v5.8h, v5.8h, v17.8h
sqrdmulh v6.8h, v6.8h, v18.8h
sqrdmulh v7.8h, v7.8h, v19.8h
add v0.8h, v0.8h, v4.8h
add v1.8h, v1.8h, v5.8h
add v2.8h, v2.8h, v6.8h
add v3.8h, v3.8h, v7.8h
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
b.gt 32b
ret
endfunc
// Relative jump table, indexed by clz(w)-26 (largest width first).
jumptable blend_tbl
.word 320b - blend_tbl
.word 160b - blend_tbl
.word 80b - blend_tbl
.word 40b - blend_tbl
endjumptable
// blend_h_16bpc_neon: OBMC horizontal-edge blend. dst = x0, stride =
// x1, tmp = x2, w3 = width (clz dispatch), w4 = height. The per-row
// mask comes from the obmc_masks table indexed by height; only the
// first h - h/4 rows are blended (see the w4 adjustment below). Uses
// the same sqrdmulh-by-(-m << 9) trick as blend_16bpc_neon.
function blend_h_16bpc_neon, export=1
movrel x6, blend_h_tbl
movrel x5, X(obmc_masks)
add x5, x5, w4, uxtw
sub w4, w4, w4, lsr #2 // blend h - h/4 rows
clz w7, w3
add x8, x0, x1 // x8 = second-row pointer
lsl x1, x1, #1
sub w7, w7, #24
ldrsw x7, [x6, x7, lsl #2]
add x6, x6, x7
br x6
20: // width 2: two rows per iteration
AARCH64_VALID_JUMP_TARGET
2:
ld2r {v2.8b, v3.8b}, [x5], #2 // replicate the two row masks
ld1 {v1.4h}, [x2], #8
ext v2.8b, v2.8b, v3.8b, #6 // lanes 0-1 = m0, lanes 2-7 = m1
subs w4, w4, #2
neg v2.8b, v2.8b // -m
ldr s0, [x0]
ld1 {v0.s}[1], [x8]
sxtl v2.8h, v2.8b
shl v2.4h, v2.4h, #9 // -m << 9
sub v1.4h, v0.4h, v1.4h // a - b
sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6
add v0.4h, v0.4h, v1.4h
st1 {v0.s}[0], [x0], x1
st1 {v0.s}[1], [x8], x1
b.gt 2b
ret
40: // width 4: two rows per iteration
AARCH64_VALID_JUMP_TARGET
4:
ld2r {v2.8b, v3.8b}, [x5], #2
ld1 {v1.8h}, [x2], #16
ext v2.8b, v2.8b, v3.8b, #4 // lanes 0-3 = m0, lanes 4-7 = m1
subs w4, w4, #2
neg v2.8b, v2.8b // -m
ldr d0, [x0]
ld1 {v0.d}[1], [x8]
sxtl v2.8h, v2.8b
shl v2.8h, v2.8h, #9 // -m << 9
sub v1.8h, v0.8h, v1.8h // a - b
sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
add v0.8h, v0.8h, v1.8h
st1 {v0.8b}, [x0], x1
st1 {v0.d}[1], [x8], x1
b.gt 4b
ret
80: // width 8: two rows per iteration
AARCH64_VALID_JUMP_TARGET
8:
ld2r {v4.8b, v5.8b}, [x5], #2
ld1 {v2.8h, v3.8h}, [x2], #32
neg v4.8b, v4.8b // -m
neg v5.8b, v5.8b
ld1 {v0.8h}, [x0]
subs w4, w4, #2
sxtl v4.8h, v4.8b
sxtl v5.8h, v5.8b
ld1 {v1.8h}, [x8]
shl v4.8h, v4.8h, #9 // -m << 9
shl v5.8h, v5.8h, #9
sub v2.8h, v0.8h, v2.8h // a - b
sub v3.8h, v1.8h, v3.8h
sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v3.8h, v3.8h, v5.8h
add v0.8h, v0.8h, v2.8h
add v1.8h, v1.8h, v3.8h
st1 {v0.8h}, [x0], x1
st1 {v1.8h}, [x8], x1
b.gt 8b
ret
160: // width 16: two rows per iteration
AARCH64_VALID_JUMP_TARGET
16:
ld2r {v16.8b, v17.8b}, [x5], #2
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
neg v16.8b, v16.8b // -m
neg v17.8b, v17.8b
ld1 {v0.8h, v1.8h}, [x0]
ld1 {v2.8h, v3.8h}, [x8]
subs w4, w4, #2
sxtl v16.8h, v16.8b
sxtl v17.8h, v17.8b
shl v16.8h, v16.8h, #9 // -m << 9
shl v17.8h, v17.8h, #9
sub v4.8h, v0.8h, v4.8h // a - b
sub v5.8h, v1.8h, v5.8h
sub v6.8h, v2.8h, v6.8h
sub v7.8h, v3.8h, v7.8h
sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v5.8h, v5.8h, v16.8h
sqrdmulh v6.8h, v6.8h, v17.8h
sqrdmulh v7.8h, v7.8h, v17.8h
add v0.8h, v0.8h, v4.8h
add v1.8h, v1.8h, v5.8h
add v2.8h, v2.8h, v6.8h
add v3.8h, v3.8h, v7.8h
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v2.8h, v3.8h}, [x8], x1
b.gt 16b
ret
1280: // widths >= 32 share one looping implementation
640:
320:
AARCH64_VALID_JUMP_TARGET
sub x1, x1, w3, uxtw #1 // stride minus bytes advanced within the row
add x7, x2, w3, uxtw #1 // tmp pointer for the second row
321: // outer loop: two rows, one mask value each
ld2r {v24.8b, v25.8b}, [x5], #2
mov w6, w3
neg v24.8b, v24.8b // -m
neg v25.8b, v25.8b
sxtl v24.8h, v24.8b
sxtl v25.8h, v25.8b
shl v24.8h, v24.8h, #9 // -m << 9
shl v25.8h, v25.8h, #9
32: // inner loop: 32 pixels of each row per iteration
ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
subs w6, w6, #32
sub v16.8h, v0.8h, v16.8h // a - b
sub v17.8h, v1.8h, v17.8h
sub v18.8h, v2.8h, v18.8h
sub v19.8h, v3.8h, v19.8h
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8]
sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v17.8h, v17.8h, v24.8h
sqrdmulh v18.8h, v18.8h, v24.8h
sqrdmulh v19.8h, v19.8h, v24.8h
sub v20.8h, v4.8h, v20.8h // a - b
sub v21.8h, v5.8h, v21.8h
sub v22.8h, v6.8h, v22.8h
sub v23.8h, v7.8h, v23.8h
add v0.8h, v0.8h, v16.8h
add v1.8h, v1.8h, v17.8h
add v2.8h, v2.8h, v18.8h
add v3.8h, v3.8h, v19.8h
sqrdmulh v20.8h, v20.8h, v25.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v21.8h, v21.8h, v25.8h
sqrdmulh v22.8h, v22.8h, v25.8h
sqrdmulh v23.8h, v23.8h, v25.8h
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
add v4.8h, v4.8h, v20.8h
add v5.8h, v5.8h, v21.8h
add v6.8h, v6.8h, v22.8h
add v7.8h, v7.8h, v23.8h
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8], #64
b.gt 32b
subs w4, w4, #2
add x0, x0, x1
add x8, x8, x1
add x2, x2, w3, uxtw #1 // skip the row x7 already consumed
add x7, x7, w3, uxtw #1
b.gt 321b
ret
endfunc
// Relative jump table, indexed by clz(w)-24 (largest width first).
jumptable blend_h_tbl
.word 1280b - blend_h_tbl
.word 640b - blend_h_tbl
.word 320b - blend_h_tbl
.word 160b - blend_h_tbl
.word 80b - blend_h_tbl
.word 40b - blend_h_tbl
.word 20b - blend_h_tbl
endjumptable
// blend_v_16bpc_neon: OBMC vertical-edge blend. dst = x0, stride = x1,
// tmp = x2, w3 = width (clz dispatch), w4 = height. The per-column
// mask comes from the obmc_masks table indexed by width; the mask is
// constant across rows, so it is prepared once per width case. Only
// the left 3/4 of each row is blended and stored (visible from the
// partial stores below). Same sqrdmulh-by-(-m << 9) trick as blend.
function blend_v_16bpc_neon, export=1
movrel x6, blend_v_tbl
movrel x5, X(obmc_masks)
add x5, x5, w3, uxtw
clz w3, w3
add x8, x0, x1 // x8 = second-row pointer
lsl x1, x1, #1
sub w3, w3, #26
ldrsw x3, [x6, x3, lsl #2]
add x6, x6, x3
br x6
20: // width 2: blend only column 0; two rows per iteration
AARCH64_VALID_JUMP_TARGET
ld1r {v2.8b}, [x5]
neg v2.8b, v2.8b // -m
sxtl v2.8h, v2.8b
shl v2.4h, v2.4h, #9 // -m << 9
2:
ldr s1, [x2], #4
ldr h0, [x0]
subs w4, w4, #2
ld1 {v1.h}[1], [x2] // row 1 pixel 0 into lane 1, next to row 0's
ld1 {v0.h}[1], [x8]
add x2, x2, #4
sub v1.4h, v0.4h, v1.4h // a - b
sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6
add v0.4h, v0.4h, v1.4h
st1 {v0.h}[0], [x0], x1
st1 {v0.h}[1], [x8], x1
b.gt 2b
ret
40: // width 4: store 3 of 4 pixels per row; two rows per iteration
AARCH64_VALID_JUMP_TARGET
ld1r {v2.2s}, [x5]
sub x1, x1, #4 // compensate for the #4 post-increment below
neg v2.8b, v2.8b // -m
sxtl v2.8h, v2.8b
shl v2.8h, v2.8h, #9 // -m << 9
4:
ld1 {v1.8h}, [x2], #16
ldr d0, [x0]
ld1 {v0.d}[1], [x8]
subs w4, w4, #2
sub v1.8h, v0.8h, v1.8h // a - b
sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
add v0.8h, v0.8h, v1.8h
str s0, [x0], #4
st1 {v0.s}[2], [x8], #4
st1 {v0.h}[2], [x0], x1
st1 {v0.h}[6], [x8], x1
b.gt 4b
ret
80: // width 8: store 6 of 8 pixels per row; two rows per iteration
AARCH64_VALID_JUMP_TARGET
ld1 {v4.8b}, [x5]
sub x1, x1, #8 // compensate for the #8 post-increment below
neg v4.8b, v4.8b // -m
sxtl v4.8h, v4.8b
shl v4.8h, v4.8h, #9 // -m << 9
8:
ld1 {v2.8h, v3.8h}, [x2], #32
ld1 {v0.8h}, [x0]
ld1 {v1.8h}, [x8]
subs w4, w4, #2
sub v2.8h, v0.8h, v2.8h // a - b
sub v3.8h, v1.8h, v3.8h
sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v3.8h, v3.8h, v4.8h
add v0.8h, v0.8h, v2.8h
add v1.8h, v1.8h, v3.8h
str d0, [x0], #8
str d1, [x8], #8
st1 {v0.s}[2], [x0], x1
st1 {v1.s}[2], [x8], x1
b.gt 8b
ret
160: // width 16: store 12 of 16 pixels per row; two rows per iteration
AARCH64_VALID_JUMP_TARGET
ld1 {v16.16b}, [x5]
sub x1, x1, #16 // compensate for the #16 post-increment below
neg v17.16b, v16.16b // -m
sxtl v16.8h, v17.8b
sxtl2 v17.8h, v17.16b
shl v16.8h, v16.8h, #9 // -m << 9
shl v17.4h, v17.4h, #9 // only 4 lanes needed (12 of 16 columns)
16:
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
ld1 {v0.8h, v1.8h}, [x0]
subs w4, w4, #2
ld1 {v2.8h, v3.8h}, [x8]
sub v4.8h, v0.8h, v4.8h // a - b
sub v5.4h, v1.4h, v5.4h
sub v6.8h, v2.8h, v6.8h
sub v7.4h, v3.4h, v7.4h
sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v5.4h, v5.4h, v17.4h
sqrdmulh v6.8h, v6.8h, v16.8h
sqrdmulh v7.4h, v7.4h, v17.4h
add v0.8h, v0.8h, v4.8h
add v1.4h, v1.4h, v5.4h
add v2.8h, v2.8h, v6.8h
add v3.4h, v3.4h, v7.4h
st1 {v0.8h}, [x0], #16
st1 {v2.8h}, [x8], #16
st1 {v1.4h}, [x0], x1
st1 {v3.4h}, [x8], x1
b.gt 16b
ret
320: // width 32: store 24 of 32 pixels per row; two rows per iteration
AARCH64_VALID_JUMP_TARGET
ld1 {v24.16b, v25.16b}, [x5]
neg v26.16b, v24.16b // -m
neg v27.8b, v25.8b // only low 8 mask bytes used (24 columns)
sxtl v24.8h, v26.8b
sxtl2 v25.8h, v26.16b
sxtl v26.8h, v27.8b
shl v24.8h, v24.8h, #9 // -m << 9
shl v25.8h, v25.8h, #9
shl v26.8h, v26.8h, #9
32:
ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
ld1 {v0.8h, v1.8h, v2.8h}, [x0]
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
ld1 {v4.8h, v5.8h, v6.8h}, [x8]
subs w4, w4, #2
sub v16.8h, v0.8h, v16.8h // a - b
sub v17.8h, v1.8h, v17.8h
sub v18.8h, v2.8h, v18.8h
sub v20.8h, v4.8h, v20.8h
sub v21.8h, v5.8h, v21.8h
sub v22.8h, v6.8h, v22.8h
sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6
sqrdmulh v17.8h, v17.8h, v25.8h
sqrdmulh v18.8h, v18.8h, v26.8h
sqrdmulh v20.8h, v20.8h, v24.8h
sqrdmulh v21.8h, v21.8h, v25.8h
sqrdmulh v22.8h, v22.8h, v26.8h
add v0.8h, v0.8h, v16.8h
add v1.8h, v1.8h, v17.8h
add v2.8h, v2.8h, v18.8h
add v4.8h, v4.8h, v20.8h
add v5.8h, v5.8h, v21.8h
add v6.8h, v6.8h, v22.8h
st1 {v0.8h, v1.8h, v2.8h}, [x0], x1
st1 {v4.8h, v5.8h, v6.8h}, [x8], x1
b.gt 32b
ret
endfunc
// Relative jump table, indexed by clz(w)-26 (largest width first).
jumptable blend_v_tbl
.word 320b - blend_v_tbl
.word 160b - blend_v_tbl
.word 80b - blend_v_tbl
.word 40b - blend_v_tbl
.word 20b - blend_v_tbl
endjumptable
// This has got the same signature as the put_8tap functions,
// and assumes that x9 is set to (clz(w)-24).
// Plain copy (no filtering): x0 = dst, x1 = dst stride, x2 = src,
// x3 = src stride, w5 = height. Widths 16/32 copy through GPR
// ldp/stp pairs; 64/128 through q-register pairs.
function put_16bpc_neon, export=1
movrel x10, put_16bpc_tbl
ldrsw x9, [x10, x9, lsl #2]
add x10, x10, x9
br x10
20: // width 2: 4 bytes per row, two rows per iteration
AARCH64_VALID_JUMP_TARGET
2:
ld1r {v0.4s}, [x2], x3 // replicate-load; only lane 0 is stored
ld1r {v1.4s}, [x2], x3
subs w5, w5, #2
st1 {v0.s}[0], [x0], x1
st1 {v1.s}[0], [x0], x1
b.gt 2b
ret
40: // width 4: two rows per iteration
AARCH64_VALID_JUMP_TARGET
4:
ld1 {v0.4h}, [x2], x3
ld1 {v1.4h}, [x2], x3
subs w5, w5, #2
st1 {v0.4h}, [x0], x1
st1 {v1.4h}, [x0], x1
b.gt 4b
ret
80: // width 8: two interleaved row pointers, two rows per iteration
AARCH64_VALID_JUMP_TARGET
add x8, x0, x1
lsl x1, x1, #1
add x9, x2, x3
lsl x3, x3, #1
8:
ld1 {v0.8h}, [x2], x3
ld1 {v1.8h}, [x9], x3
subs w5, w5, #2
st1 {v0.8h}, [x0], x1
st1 {v1.8h}, [x8], x1
b.gt 8b
ret
160: // width 16: 32 bytes per row via GPR pairs
AARCH64_VALID_JUMP_TARGET
16:
ldp x6, x7, [x2]
ldp x8, x9, [x2, #16]
stp x6, x7, [x0]
subs w5, w5, #1
stp x8, x9, [x0, #16]
add x2, x2, x3
add x0, x0, x1
b.gt 16b
ret
320: // width 32: 64 bytes per row via GPR pairs
AARCH64_VALID_JUMP_TARGET
32:
ldp x6, x7, [x2]
ldp x8, x9, [x2, #16]
stp x6, x7, [x0]
ldp x10, x11, [x2, #32]
stp x8, x9, [x0, #16]
subs w5, w5, #1
ldp x12, x13, [x2, #48]
stp x10, x11, [x0, #32]
stp x12, x13, [x0, #48]
add x2, x2, x3
add x0, x0, x1
b.gt 32b
ret
640: // width 64: 128 bytes per row via q-register pairs
AARCH64_VALID_JUMP_TARGET
64:
ldp q0, q1, [x2]
ldp q2, q3, [x2, #32]
stp q0, q1, [x0]
ldp q4, q5, [x2, #64]
stp q2, q3, [x0, #32]
ldp q6, q7, [x2, #96]
subs w5, w5, #1
stp q4, q5, [x0, #64]
stp q6, q7, [x0, #96]
add x2, x2, x3
add x0, x0, x1
b.gt 64b
ret
1280: // width 128: 256 bytes per row via q-register pairs
AARCH64_VALID_JUMP_TARGET
128:
ldp q0, q1, [x2]
ldp q2, q3, [x2, #32]
stp q0, q1, [x0]
ldp q4, q5, [x2, #64]
stp q2, q3, [x0, #32]
ldp q6, q7, [x2, #96]
subs w5, w5, #1
stp q4, q5, [x0, #64]
ldp q16, q17, [x2, #128]
stp q6, q7, [x0, #96]
ldp q18, q19, [x2, #160]
stp q16, q17, [x0, #128]
ldp q20, q21, [x2, #192]
stp q18, q19, [x0, #160]
ldp q22, q23, [x2, #224]
stp q20, q21, [x0, #192]
stp q22, q23, [x0, #224]
add x2, x2, x3
add x0, x0, x1
b.gt 128b
ret
endfunc
// Relative jump table, indexed by clz(w)-24 (largest width first).
jumptable put_16bpc_tbl
.word 1280b - put_16bpc_tbl
.word 640b - put_16bpc_tbl
.word 320b - put_16bpc_tbl
.word 160b - put_16bpc_tbl
.word 80b - put_16bpc_tbl
.word 40b - put_16bpc_tbl
.word 20b - put_16bpc_tbl
endjumptable
// This has got the same signature as the prep_8tap functions,
// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
// x8 to w*2.
function prep_16bpc_neon
// Convert 16bpc source pixels to the intermediate int16_t format:
// each pixel is shifted left by intermediate_bits and PREP_BIAS is
// subtracted, then rows are stored contiguously to the tmp buffer.
// Register setup (provided by the caller):
//   x0 = tmp dst, x1 = src, x2 = src stride (bytes), w4 = h,
//   w7 = intermediate_bits, x8 = w*2 (dst row stride, used for w >= 64),
//   x9 = clz(w)-24 (jump table index).
movrel x10, prep_16bpc_tbl
ldrsw x9, [x10, x9, lsl #2] // relative branch offset for this width
dup v31.8h, w7 // intermediate_bits
movi v30.8h, #(PREP_BIAS >> 8), lsl #8 // v30 = PREP_BIAS
add x10, x10, x9
br x10
40: // w == 4: two rows per iteration, packed into one q register
AARCH64_VALID_JUMP_TARGET
add x9, x1, x2 // x9 = second row pointer
lsl x2, x2, #1 // both pointers step by 2*stride
4:
ld1 {v0.8b}, [x1], x2 // row 0 (4 pixels) into low half
ld1 {v0.d}[1], [x9], x2 // row 1 into high half
subs w4, w4, #2
sshl v0.8h, v0.8h, v31.8h // << intermediate_bits
sub v0.8h, v0.8h, v30.8h // -= PREP_BIAS
st1 {v0.8h}, [x0], #16
b.gt 4b
ret
80: // w == 8: two rows per iteration
AARCH64_VALID_JUMP_TARGET
add x9, x1, x2
lsl x2, x2, #1
8:
ld1 {v0.8h}, [x1], x2
ld1 {v1.8h}, [x9], x2
subs w4, w4, #2
sshl v0.8h, v0.8h, v31.8h
sshl v1.8h, v1.8h, v31.8h
sub v0.8h, v0.8h, v30.8h
sub v1.8h, v1.8h, v30.8h
st1 {v0.8h, v1.8h}, [x0], #32
b.gt 8b
ret
160: // w == 16: two rows per iteration
AARCH64_VALID_JUMP_TARGET
16:
ldp q0, q1, [x1]
add x1, x1, x2
sshl v0.8h, v0.8h, v31.8h
ldp q2, q3, [x1]
add x1, x1, x2
subs w4, w4, #2
sshl v1.8h, v1.8h, v31.8h
sshl v2.8h, v2.8h, v31.8h
sshl v3.8h, v3.8h, v31.8h
sub v0.8h, v0.8h, v30.8h
sub v1.8h, v1.8h, v30.8h
sub v2.8h, v2.8h, v30.8h
sub v3.8h, v3.8h, v30.8h
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
b.gt 16b
ret
320: // w == 32: one row per iteration
AARCH64_VALID_JUMP_TARGET
32:
ldp q0, q1, [x1]
sshl v0.8h, v0.8h, v31.8h
ldp q2, q3, [x1, #32]
add x1, x1, x2
sshl v1.8h, v1.8h, v31.8h
sshl v2.8h, v2.8h, v31.8h
sshl v3.8h, v3.8h, v31.8h
subs w4, w4, #1
sub v0.8h, v0.8h, v30.8h
sub v1.8h, v1.8h, v30.8h
sub v2.8h, v2.8h, v30.8h
sub v3.8h, v3.8h, v30.8h
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
b.gt 32b
ret
640: // w == 64: one row per iteration; loads interleaved with shifts
AARCH64_VALID_JUMP_TARGET
64:
ldp q0, q1, [x1]
subs w4, w4, #1
sshl v0.8h, v0.8h, v31.8h
ldp q2, q3, [x1, #32]
sshl v1.8h, v1.8h, v31.8h
ldp q4, q5, [x1, #64]
sshl v2.8h, v2.8h, v31.8h
sshl v3.8h, v3.8h, v31.8h
ldp q6, q7, [x1, #96]
add x1, x1, x2
sshl v4.8h, v4.8h, v31.8h
sshl v5.8h, v5.8h, v31.8h
sshl v6.8h, v6.8h, v31.8h
sshl v7.8h, v7.8h, v31.8h
sub v0.8h, v0.8h, v30.8h
sub v1.8h, v1.8h, v30.8h
sub v2.8h, v2.8h, v30.8h
sub v3.8h, v3.8h, v30.8h
stp q0, q1, [x0]
sub v4.8h, v4.8h, v30.8h
sub v5.8h, v5.8h, v30.8h
stp q2, q3, [x0, #32]
sub v6.8h, v6.8h, v30.8h
sub v7.8h, v7.8h, v30.8h
stp q4, q5, [x0, #64]
stp q6, q7, [x0, #96]
add x0, x0, x8 // dst advances by w*2 bytes
b.gt 64b
ret
1280: // w == 128: one row per iteration (256 bytes)
AARCH64_VALID_JUMP_TARGET
128:
ldp q0, q1, [x1]
subs w4, w4, #1
sshl v0.8h, v0.8h, v31.8h
ldp q2, q3, [x1, #32]
sshl v1.8h, v1.8h, v31.8h
ldp q4, q5, [x1, #64]
sshl v2.8h, v2.8h, v31.8h
sshl v3.8h, v3.8h, v31.8h
ldp q6, q7, [x1, #96]
sshl v4.8h, v4.8h, v31.8h
sshl v5.8h, v5.8h, v31.8h
ldp q16, q17, [x1, #128]
sshl v6.8h, v6.8h, v31.8h
sshl v7.8h, v7.8h, v31.8h
ldp q18, q19, [x1, #160]
sshl v16.8h, v16.8h, v31.8h
sshl v17.8h, v17.8h, v31.8h
ldp q20, q21, [x1, #192]
sshl v18.8h, v18.8h, v31.8h
sshl v19.8h, v19.8h, v31.8h
ldp q22, q23, [x1, #224]
add x1, x1, x2
sshl v20.8h, v20.8h, v31.8h
sshl v21.8h, v21.8h, v31.8h
sshl v22.8h, v22.8h, v31.8h
sshl v23.8h, v23.8h, v31.8h
sub v0.8h, v0.8h, v30.8h
sub v1.8h, v1.8h, v30.8h
sub v2.8h, v2.8h, v30.8h
sub v3.8h, v3.8h, v30.8h
stp q0, q1, [x0]
sub v4.8h, v4.8h, v30.8h
sub v5.8h, v5.8h, v30.8h
stp q2, q3, [x0, #32]
sub v6.8h, v6.8h, v30.8h
sub v7.8h, v7.8h, v30.8h
stp q4, q5, [x0, #64]
sub v16.8h, v16.8h, v30.8h
sub v17.8h, v17.8h, v30.8h
stp q6, q7, [x0, #96]
sub v18.8h, v18.8h, v30.8h
sub v19.8h, v19.8h, v30.8h
stp q16, q17, [x0, #128]
sub v20.8h, v20.8h, v30.8h
sub v21.8h, v21.8h, v30.8h
stp q18, q19, [x0, #160]
sub v22.8h, v22.8h, v30.8h
sub v23.8h, v23.8h, v30.8h
stp q20, q21, [x0, #192]
stp q22, q23, [x0, #224]
add x0, x0, x8 // dst advances by w*2 bytes
b.gt 128b
ret
endfunc
// Relative-offset jump table for prep_16bpc_neon, indexed by clz(w)-24;
// entries ordered from w=128 down to w=4 (there is no w=2 prep case).
jumptable prep_16bpc_tbl
.word 1280b - prep_16bpc_tbl
.word 640b - prep_16bpc_tbl
.word 320b - prep_16bpc_tbl
.word 160b - prep_16bpc_tbl
.word 80b - prep_16bpc_tbl
.word 40b - prep_16bpc_tbl
endjumptable
// Load single lanes (lane 0) of element width \wd into up to 7 registers,
// alternating between the two source pointers \s0 and \s1; each pointer is
// post-incremented by \strd after its load. \d2..\d6 are optional (\d2 and
// \d3 load as a pair).
.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
ld1 {\d0\wd}[0], [\s0], \strd
ld1 {\d1\wd}[0], [\s1], \strd
.ifnb \d2
ld1 {\d2\wd}[0], [\s0], \strd
ld1 {\d3\wd}[0], [\s1], \strd
.endif
.ifnb \d4
ld1 {\d4\wd}[0], [\s0], \strd
.endif
.ifnb \d5
ld1 {\d5\wd}[0], [\s1], \strd
.endif
.ifnb \d6
ld1 {\d6\wd}[0], [\s0], \strd
.endif
.endm
// Load full vector registers of arrangement \wd into up to 7 registers,
// alternating between source pointers \s0 and \s1; each pointer is
// post-incremented by \strd after its load. \d2..\d6 are optional (\d2 and
// \d3 load as a pair).
.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
ld1 {\d0\wd}, [\s0], \strd
ld1 {\d1\wd}, [\s1], \strd
.ifnb \d2
ld1 {\d2\wd}, [\s0], \strd
ld1 {\d3\wd}, [\s1], \strd
.endif
.ifnb \d4
ld1 {\d4\wd}, [\s0], \strd
.endif
.ifnb \d5
ld1 {\d5\wd}, [\s1], \strd
.endif
.ifnb \d6
ld1 {\d6\wd}, [\s0], \strd
.endif
.endm
// Load up to three register pairs of arrangement \wd, alternating between
// source pointers \s0 and \s1 per pair (s0, s1, s0); each pointer is
// post-incremented by \strd. The second and third pairs are optional.
.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
ld1 {\d0\wd, \d1\wd}, [\s0], \strd
.ifnb \d2
ld1 {\d2\wd, \d3\wd}, [\s1], \strd
.endif
.ifnb \d4
ld1 {\d4\wd, \d5\wd}, [\s0], \strd
.endif
.endm
// Width-specific wrappers around the generic load macros above:
// load_s loads single .s lanes, load_4h/load_8h load full .4h/.8h
// registers, and load_16h loads .8h register pairs.
.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
load_reg \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
load_reg \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
load_regpair \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
.endm
// Interleave consecutive rows pairwise via trn1: after expansion,
// r0 = {r0[0], r1[0]}, r1 = {r1[0], r2[0]}, and (if r3 is given)
// r2 = {r2[0], r3[0]}, r3 = {r3[0], r4[0]}. r4 is only read, not written.
.macro interleave_1 wd, r0, r1, r2, r3, r4
trn1 \r0\wd, \r0\wd, \r1\wd
trn1 \r1\wd, \r1\wd, \r2\wd
.ifnb \r3
trn1 \r2\wd, \r2\wd, \r3\wd
trn1 \r3\wd, \r3\wd, \r4\wd
.endif
.endm
// .2s-element specialization of interleave_1.
.macro interleave_1_s r0, r1, r2, r3, r4
interleave_1 .2s, \r0, \r1, \r2, \r3, \r4
.endm
// Unsigned-min each given register with \c (used to clamp pixels to the
// bitdepth maximum). r1 is optional; r2 and r3 come as a pair.
.macro umin_h c, wd, r0, r1, r2, r3
umin \r0\wd, \r0\wd, \c\wd
.ifnb \r1
umin \r1\wd, \r1\wd, \c\wd
.endif
.ifnb \r2
umin \r2\wd, \r2\wd, \c\wd
umin \r3\wd, \r3\wd, \c\wd
.endif
.endm
// Subtract \c from each given register (used to subtract PREP_BIAS).
// r1 is optional; r2 and r3 come as a pair.
.macro sub_h c, wd, r0, r1, r2, r3
sub \r0\wd, \r0\wd, \c\wd
.ifnb \r1
sub \r1\wd, \r1\wd, \c\wd
.endif
.ifnb \r2
sub \r2\wd, \r2\wd, \c\wd
sub \r3\wd, \r3\wd, \c\wd
.endif
.endm
// 4-tap filter multiply-accumulate: widen the low .4h halves of s0..s3 to
// 32 bit and accumulate with coefficients from v0.h[0..3] into \d.
.macro smull_smlal_4tap d, s0, s1, s2, s3
smull \d\().4s, \s0\().4h, v0.h[0]
smlal \d\().4s, \s1\().4h, v0.h[1]
smlal \d\().4s, \s2\().4h, v0.h[2]
smlal \d\().4s, \s3\().4h, v0.h[3]
.endm
// Same as smull_smlal_4tap but operating on the high .8h halves.
.macro smull2_smlal2_4tap d, s0, s1, s2, s3
smull2 \d\().4s, \s0\().8h, v0.h[0]
smlal2 \d\().4s, \s1\().8h, v0.h[1]
smlal2 \d\().4s, \s2\().8h, v0.h[2]
smlal2 \d\().4s, \s3\().8h, v0.h[3]
.endm
// 6-tap filter multiply-accumulate on the low .4h halves, using
// coefficients v0.h[1..6]. \s0 and \s7 are accepted (so the 6-/8-tap
// macros share a calling convention) but not used.
.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
smull \d\().4s, \s1\().4h, v0.h[1]
smlal \d\().4s, \s2\().4h, v0.h[2]
smlal \d\().4s, \s3\().4h, v0.h[3]
smlal \d\().4s, \s4\().4h, v0.h[4]
smlal \d\().4s, \s5\().4h, v0.h[5]
smlal \d\().4s, \s6\().4h, v0.h[6]
.endm
// Same as smull_smlal_6tap but operating on the high .8h halves.
.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
smull2 \d\().4s, \s1\().8h, v0.h[1]
smlal2 \d\().4s, \s2\().8h, v0.h[2]
smlal2 \d\().4s, \s3\().8h, v0.h[3]
smlal2 \d\().4s, \s4\().8h, v0.h[4]
smlal2 \d\().4s, \s5\().8h, v0.h[5]
smlal2 \d\().4s, \s6\().8h, v0.h[6]
.endm
// 8-tap filter multiply-accumulate on the low .4h halves, using all eight
// coefficients v0.h[0..7].
.macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
smull \d\().4s, \s0\().4h, v0.h[0]
smlal \d\().4s, \s1\().4h, v0.h[1]
smlal \d\().4s, \s2\().4h, v0.h[2]
smlal \d\().4s, \s3\().4h, v0.h[3]
smlal \d\().4s, \s4\().4h, v0.h[4]
smlal \d\().4s, \s5\().4h, v0.h[5]
smlal \d\().4s, \s6\().4h, v0.h[6]
smlal \d\().4s, \s7\().4h, v0.h[7]
.endm
// Same as smull_smlal_8tap but operating on the high .8h halves.
.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
smull2 \d\().4s, \s0\().8h, v0.h[0]
smlal2 \d\().4s, \s1\().8h, v0.h[1]
smlal2 \d\().4s, \s2\().8h, v0.h[2]
smlal2 \d\().4s, \s3\().8h, v0.h[3]
smlal2 \d\().4s, \s4\().8h, v0.h[4]
smlal2 \d\().4s, \s5\().8h, v0.h[5]
smlal2 \d\().4s, \s6\().8h, v0.h[6]
smlal2 \d\().4s, \s7\().8h, v0.h[7]
.endm
// Saturating rounding right shift with narrow to unsigned 16 bit:
// (r0,r1) are packed into r0.8h, and optionally (r2,r3) into r2.8h.
.macro sqrshrun_h shift, r0, r1, r2, r3
sqrshrun \r0\().4h, \r0\().4s, #\shift
.ifnb \r1
sqrshrun2 \r0\().8h, \r1\().4s, #\shift
.endif
.ifnb \r2
sqrshrun \r2\().4h, \r2\().4s, #\shift
sqrshrun2 \r2\().8h, \r3\().4s, #\shift
.endif
.endm
// Truncating narrow of 32-bit lanes to 16 bit: pack (r0,r1) into r0.8h
// (and optionally (r2,r3) into r2.8h) by taking the even halfwords.
.macro xtn_h r0, r1, r2, r3
uzp1 \r0\().8h, \r0\().8h, \r1\().8h // Same as xtn, xtn2
.ifnb \r2
uzp1 \r2\().8h, \r2\().8h, \r3\().8h // Ditto
.endif
.endm
// Signed rounding shift of 32-bit lanes by the per-lane amounts in \shift;
// callers pass negative amounts for a rounding right shift.
.macro srshl_s shift, r0, r1, r2, r3
srshl \r0\().4s, \r0\().4s, \shift\().4s
srshl \r1\().4s, \r1\().4s, \shift\().4s
.ifnb \r2
srshl \r2\().4s, \r2\().4s, \shift\().4s
srshl \r3\().4s, \r3\().4s, \shift\().4s
.endif
.endm
// Store .s lanes of \reg to alternating destination rows (x0, x9), each
// post-incremented by \strd; lanes 2/3 are stored too when \lanes > 2.
.macro st_s strd, reg, lanes
st1 {\reg\().s}[0], [x0], \strd
st1 {\reg\().s}[1], [x9], \strd
.if \lanes > 2
st1 {\reg\().s}[2], [x0], \strd
st1 {\reg\().s}[3], [x9], \strd
.endif
.endm
// Store one q register as two 8-byte rows: low half to x0, high half
// (d lane 1) to x9, both post-incremented by \strd. \r1 is optional.
.macro st_d strd, r0, r1
st1 {\r0\().8b}, [x0], \strd
st1 {\r0\().d}[1], [x9], \strd
.ifnb \r1
st1 {\r1\().8b}, [x0], \strd
st1 {\r1\().d}[1], [x9], \strd
.endif
.endm
// Final shift + store for w=4 columns of 32-bit filter accumulators.
// put:  rounding narrow by 6 and clamp to bitdepth_max (v31).
// prep: rounding shift by v30 = -(6-intermediate_bits), narrow, then
//       subtract PREP_BIAS (v29).
.macro shift_store_4 type, strd, r0, r1, r2, r3
.ifc \type, put
sqrshrun_h 6, \r0, \r1, \r2, \r3
umin_h v31, .8h, \r0, \r2
.else
srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
xtn_h \r0, \r1, \r2, \r3
sub_h v29, .8h, \r0, \r2 // PREP_BIAS
.endif
st_d \strd, \r0, \r2
.endm
// Store up to 8 full registers of arrangement \wd to alternating rows
// (x0, x9), each post-incremented by \strd. \r2/\r3 store as a pair;
// \r4..\r7 store as a group of four.
.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
st1 {\r0\wd}, [x0], \strd
st1 {\r1\wd}, [x9], \strd
.ifnb \r2
st1 {\r2\wd}, [x0], \strd
st1 {\r3\wd}, [x9], \strd
.endif
.ifnb \r4
st1 {\r4\wd}, [x0], \strd
st1 {\r5\wd}, [x9], \strd
st1 {\r6\wd}, [x0], \strd
st1 {\r7\wd}, [x9], \strd
.endif
.endm
// .8h specialization of st_reg.
.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
st_reg \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
// Final shift + store for w=8 rows of 32-bit filter accumulators.
// put:  rounding narrow by 6 and clamp to bitdepth_max (v31).
// prep: rounding shift by v30 = -(6-intermediate_bits), narrow, then
//       subtract PREP_BIAS (v29).
.macro shift_store_8 type, strd, r0, r1, r2, r3
.ifc \type, put
sqrshrun_h 6, \r0, \r1, \r2, \r3
umin_h v31, .8h, \r0, \r2
.else
srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
xtn_h \r0, \r1, \r2, \r3
sub_h v29, .8h, \r0, \r2 // PREP_BIAS
.endif
st_8h \strd, \r0, \r2
.endm
// Final shift + store for one 16-pixel row of 32-bit filter accumulators;
// the narrowed results end up in \r0/\r1 and are stored as a pair to \dst.
// put:  rounding narrow by 6 and clamp to bitdepth_max (v31).
// prep: rounding shift by v30 = -(6-intermediate_bits), narrow, then
//       subtract PREP_BIAS (v29).
.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
.ifc \type, put
sqrshrun_h 6, \r0, \r1, \r2, \r3
umin \r0\().8h, \r0\().8h, v31.8h
umin \r1\().8h, \r2\().8h, v31.8h
.else
srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
xtn_h \r0, \r1, \r2, \r3
sub \r0\().8h, \r0\().8h, v29.8h
sub \r1\().8h, \r2\().8h, v29.8h
.endif
st1 {\r0\().8h, \r1\().8h}, [\dst], \strd
.endm
// Emit an exported entry point for one (horizontal, vertical) filter-type
// combination: it loads the packed filter selectors into w9/w10 and
// tail-calls the shared \op\()_\taps\()_neon implementation.
.macro make_8tap_fn op, type, type_h, type_v, taps
function \op\()_8tap_\type\()_16bpc_neon, export=1
mov w9, \type_h
mov w10, \type_v
b \op\()_\taps\()_neon
endfunc
.endm
// No spaces in these expressions, due to gas-preprocessor.
// Packed filter-type selectors consumed via w9/w10 in the filter functions:
// two 7-bit fields, each a multiple of 15. NOTE(review): presumably offsets
// into X(mc_subpel_filters) (15 subpel positions per filter set) -- confirm
// against the table layout.
#define REGULAR ((0*15<<7)|3*15)
#define SMOOTH ((1*15<<7)|4*15)
#define SHARP ((2*15<<7)|3*15)
.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps
function \type\()_\taps\()_neon
.ifc \bdmax, w8
ldr w8, [sp]
.endif
mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
mul \mx, \mx, w11
mul \my, \my, w11
add \mx, \mx, w9 // mx, 8tap_h, 4tap_h
add \my, \my, w10 // my, 8tap_v, 4tap_v
.ifc \type, prep
uxtw \d_strd, \w
lsl \d_strd, \d_strd, #1
.endif
dup v31.8h, \bdmax // bitdepth_max
clz \bdmax, \bdmax
clz w9, \w
sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
mov w12, #6
tst \mx, #(0x7f << 14)
sub w9, w9, #24
add w13, w12, \bdmax // 6 + intermediate_bits
sub w12, w12, \bdmax // 6 - intermediate_bits
movrel x11, X(mc_subpel_filters), -8
b.ne L(\type\()_\taps\()_h)
tst \my, #(0x7f << 14)
b.ne L(\type\()_\taps\()_v)
b \type\()_16bpc_neon
L(\type\()_\taps\()_h):
cmp \w, #4
ubfx w10, \mx, #7, #7
and \mx, \mx, #0x7f
b.le 4f
mov \mx, w10
4:
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5 C=100 H=100 G=100