/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height,
// const int bitdepth_max);
// Fill a width x height block with the mid-grey DC value for the bitdepth.
// v0 = (bitdepth_max + 1) >> 1, i.e. 1 << (bitdepth - 1), replicated.
// The jump table is indexed by log2(width): clz(width) - 25 maps
// widths 64/32/16/8/4 to entries 0..4. Rows are written two at a time
// through the interleaved pointers x0 and x6 (stride doubled).
function ipred_dc_128_16bpc_neon, export=1
ldr w8, [sp] // bitdepth_max (9th argument, passed on the stack)
clz w3, w3
movrel x5, ipred_dc_128_tbl
sub w3, w3, #25 // jump table index = clz(width) - 25
ldrsw x3, [x5, w3, uxtw #2]
dup v0.8h, w8
add x5, x5, x3
add x6, x0, x1 // x6 = dst + stride (second row pointer)
lsl x1, x1, #1 // step two rows per store
urshr v0.8h, v0.8h, #1 // (bitdepth_max + 1) >> 1 = mid-grey
br x5
40:
AARCH64_VALID_JUMP_TARGET
4: // width 4: four rows per iteration
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
subs w4, w4, #4
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
b.gt 4b
ret
80:
AARCH64_VALID_JUMP_TARGET
8: // width 8
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt 8b
ret
160:
AARCH64_VALID_JUMP_TARGET
mov v1.16b, v0.16b // duplicate DC into v1 for 32-byte stores
16: // width 16
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
b.gt 16b
ret
320:
AARCH64_VALID_JUMP_TARGET
mov v1.16b, v0.16b // duplicate DC into v1-v3 for 64-byte stores
mov v2.16b, v0.16b
mov v3.16b, v0.16b
32: // width 32
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 32b
ret
640:
AARCH64_VALID_JUMP_TARGET
mov v1.16b, v0.16b
mov v2.16b, v0.16b
mov v3.16b, v0.16b
sub x1, x1, #64 // compensate for the post-incrementing #64 store below
64: // width 64: two 64-byte stores per row
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 64b
ret
endfunc
// Per-width entry points, ordered 64/32/16/8/4 to match clz(width) - 25.
jumptable ipred_dc_128_tbl
.word 640b - ipred_dc_128_tbl
.word 320b - ipred_dc_128_tbl
.word 160b - ipred_dc_128_tbl
.word 80b - ipred_dc_128_tbl
.word 40b - ipred_dc_128_tbl
endjumptable
// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// Vertical prediction: replicate the row above (topleft + 1 pixel onward)
// into every row of the block. The top row is loaded once, then stored
// to two interleaved row pointers (x0, x6) with doubled stride.
function ipred_v_16bpc_neon, export=1
clz w3, w3
movrel x5, ipred_v_tbl
sub w3, w3, #25 // jump table index = clz(width) - 25
ldrsw x3, [x5, w3, uxtw #2]
add x2, x2, #2 // skip the topleft pixel; x2 now points at the top row
add x5, x5, x3
add x6, x0, x1 // x6 = dst + stride
lsl x1, x1, #1 // step two rows per store
br x5
40:
AARCH64_VALID_JUMP_TARGET
ld1 {v0.4h}, [x2] // 4 top pixels
4:
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
subs w4, w4, #4
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
b.gt 4b
ret
80:
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8h}, [x2] // 8 top pixels
8:
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt 8b
ret
160:
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8h, v1.8h}, [x2] // 16 top pixels
16:
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
b.gt 16b
ret
320:
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] // 32 top pixels
32:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 32b
ret
640:
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 // first 32 top pixels
sub x1, x1, #64 // compensate for the post-incrementing #64 store below
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] // remaining 32 top pixels
64:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
b.gt 64b
ret
endfunc
// Per-width entry points, ordered 64/32/16/8/4 to match clz(width) - 25.
jumptable ipred_v_tbl
.word 640b - ipred_v_tbl
.word 320b - ipred_v_tbl
.word 160b - ipred_v_tbl
.word 80b - ipred_v_tbl
.word 40b - ipred_v_tbl
endjumptable
// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// Horizontal prediction: replicate each left-column pixel across its row.
// ld4r walks the left column backwards (x7 = -8) loading four pixels at a
// time, each replicated across a whole vector; rows are stored in reverse
// register order (v3 is the topmost of the four rows).
function ipred_h_16bpc_neon, export=1
clz w3, w3
movrel x5, ipred_h_tbl
sub w3, w3, #25 // jump table index = clz(width) - 25
ldrsw x3, [x5, w3, uxtw #2]
sub x2, x2, #8 // point at the last 4 left pixels (below topleft)
add x5, x5, x3
mov x7, #-8 // step backwards 4 pixels per ld4r
add x6, x0, x1 // x6 = dst + stride
lsl x1, x1, #1 // step two rows per store
br x5
40:
AARCH64_VALID_JUMP_TARGET
4: // width 4
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
st1 {v3.4h}, [x0], x1
st1 {v2.4h}, [x6], x1
subs w4, w4, #4
st1 {v1.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
b.gt 4b
ret
80:
AARCH64_VALID_JUMP_TARGET
8: // width 8
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
st1 {v3.8h}, [x0], x1
st1 {v2.8h}, [x6], x1
subs w4, w4, #4
st1 {v1.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt 8b
ret
160:
AARCH64_VALID_JUMP_TARGET
16: // width 16: str covers bytes 16-31, st1 covers 0-15 and advances
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
str q3, [x0, #16]
str q2, [x6, #16]
st1 {v3.8h}, [x0], x1
st1 {v2.8h}, [x6], x1
subs w4, w4, #4
str q1, [x0, #16]
str q0, [x6, #16]
st1 {v1.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt 16b
ret
320:
AARCH64_VALID_JUMP_TARGET
32: // width 32: str/stp cover bytes 16-63, st1 covers 0-15 and advances
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
str q3, [x0, #16]
str q2, [x6, #16]
stp q3, q3, [x0, #32]
stp q2, q2, [x6, #32]
st1 {v3.8h}, [x0], x1
st1 {v2.8h}, [x6], x1
subs w4, w4, #4
str q1, [x0, #16]
str q0, [x6, #16]
stp q1, q1, [x0, #32]
stp q0, q0, [x6, #32]
st1 {v1.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt 32b
ret
640:
AARCH64_VALID_JUMP_TARGET
64: // width 64: str/stp cover bytes 16-127, st1 covers 0-15 and advances
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
str q3, [x0, #16]
str q2, [x6, #16]
stp q3, q3, [x0, #32]
stp q2, q2, [x6, #32]
stp q3, q3, [x0, #64]
stp q2, q2, [x6, #64]
stp q3, q3, [x0, #96]
stp q2, q2, [x6, #96]
st1 {v3.8h}, [x0], x1
st1 {v2.8h}, [x6], x1
subs w4, w4, #4
str q1, [x0, #16]
str q0, [x6, #16]
stp q1, q1, [x0, #32]
stp q0, q0, [x6, #32]
stp q1, q1, [x0, #64]
stp q0, q0, [x6, #64]
stp q1, q1, [x0, #96]
stp q0, q0, [x6, #96]
st1 {v1.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt 64b
ret
endfunc
// Per-width entry points, ordered 64/32/16/8/4 to match clz(width) - 25.
jumptable ipred_h_tbl
.word 640b - ipred_h_tbl
.word 320b - ipred_h_tbl
.word 160b - ipred_h_tbl
.word 80b - ipred_h_tbl
.word 40b - ipred_h_tbl
endjumptable
// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// DC-top prediction: fill the block with the rounded average of the
// `width` pixels above the block. The sum is reduced with addv/uaddlv,
// rounded-shifted by log2(width), then broadcast. Widths are always a
// power of two so a shift is an exact division.
function ipred_dc_top_16bpc_neon, export=1
clz w3, w3
movrel x5, ipred_dc_top_tbl
sub w3, w3, #25 // jump table index = clz(width) - 25
ldrsw x3, [x5, w3, uxtw #2]
add x2, x2, #2 // skip topleft; x2 points at the top row
add x5, x5, x3
add x6, x0, x1 // x6 = dst + stride
lsl x1, x1, #1 // step two rows per store
br x5
40:
AARCH64_VALID_JUMP_TARGET
ld1 {v0.4h}, [x2]
addv h0, v0.4h // sum of 4 top pixels
urshr v0.4h, v0.4h, #2 // dc = (sum + 2) >> 2
dup v0.4h, v0.h[0]
4:
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
subs w4, w4, #4
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
b.gt 4b
ret
80:
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8h}, [x2]
addv h0, v0.8h // sum of 8 top pixels
urshr v0.4h, v0.4h, #3 // dc = (sum + 4) >> 3
dup v0.8h, v0.h[0]
8:
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt 8b
ret
160:
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8h, v1.8h}, [x2]
addp v0.8h, v0.8h, v1.8h // pairwise-reduce 16 pixels to 8
addv h0, v0.8h // sum of 16 top pixels
urshr v2.4h, v0.4h, #4 // dc = (sum + 8) >> 4
dup v0.8h, v2.h[0]
dup v1.8h, v2.h[0]
16:
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
b.gt 16b
ret
320:
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
addp v0.8h, v0.8h, v1.8h // pairwise-reduce 32 pixels...
addp v2.8h, v2.8h, v3.8h
addp v0.8h, v0.8h, v2.8h // ...to 8 partial sums
uaddlv s0, v0.8h // widen to 32 bit; sum can exceed 16 bit
rshrn v4.4h, v0.4s, #5 // dc = (sum + 16) >> 5
dup v0.8h, v4.h[0]
dup v1.8h, v4.h[0]
dup v2.8h, v4.h[0]
dup v3.8h, v4.h[0]
32:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 32b
ret
640:
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
addp v0.8h, v0.8h, v1.8h // pairwise-reduce 64 pixels...
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h
addp v6.8h, v6.8h, v7.8h
addp v0.8h, v0.8h, v2.8h
addp v4.8h, v4.8h, v6.8h
addp v0.8h, v0.8h, v4.8h // ...to 8 partial sums
uaddlv s0, v0.8h // widen to 32 bit; sum can exceed 16 bit
rshrn v4.4h, v0.4s, #6 // dc = (sum + 32) >> 6
sub x1, x1, #64 // compensate for the post-incrementing #64 store below
dup v0.8h, v4.h[0]
dup v1.8h, v4.h[0]
dup v2.8h, v4.h[0]
dup v3.8h, v4.h[0]
64:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 64b
ret
endfunc
// Per-width entry points, ordered 64/32/16/8/4 to match clz(width) - 25.
jumptable ipred_dc_top_tbl
.word 640b - ipred_dc_top_tbl
.word 320b - ipred_dc_top_tbl
.word 160b - ipred_dc_top_tbl
.word 80b - ipred_dc_top_tbl
.word 40b - ipred_dc_top_tbl
endjumptable
// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// DC-left prediction: fill the block with the rounded average of the
// `height` pixels left of the block. Two chained jump-table dispatches:
// x5 = height-specific summing code (first 5 table entries), which then
// branches via x3 to the width-specific store loop (last 5 entries).
function ipred_dc_left_16bpc_neon, export=1
sub x2, x2, w4, uxtw #1 // x2 = topleft - height = start of left column
clz w3, w3
clz w7, w4
movrel x5, ipred_dc_left_tbl
sub w3, w3, #20 // 25 leading bits, minus table offset 5
sub w7, w7, #25 // height index = clz(height) - 25
ldrsw x3, [x5, w3, uxtw #2]
ldrsw x7, [x5, w7, uxtw #2]
add x3, x5, x3 // x3 = width store-loop entry
add x5, x5, x7 // x5 = height summing entry
add x6, x0, x1 // x6 = dst + stride
lsl x1, x1, #1 // step two rows per store
br x5
L(ipred_dc_left_h4):
AARCH64_VALID_JUMP_TARGET
ld1 {v0.4h}, [x2]
addv h0, v0.4h // sum of 4 left pixels
urshr v0.4h, v0.4h, #2 // dc = (sum + 2) >> 2
dup v0.8h, v0.h[0]
br x3
L(ipred_dc_left_w4):
AARCH64_VALID_JUMP_TARGET
1:
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
subs w4, w4, #4
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
b.gt 1b
ret
L(ipred_dc_left_h8):
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8h}, [x2]
addv h0, v0.8h // sum of 8 left pixels
urshr v0.4h, v0.4h, #3 // dc = (sum + 4) >> 3
dup v0.8h, v0.h[0]
br x3
L(ipred_dc_left_w8):
AARCH64_VALID_JUMP_TARGET
1:
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt 1b
ret
L(ipred_dc_left_h16):
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8h, v1.8h}, [x2]
addp v0.8h, v0.8h, v1.8h // pairwise-reduce 16 pixels to 8
addv h0, v0.8h
urshr v2.4h, v0.4h, #4 // dc = (sum + 8) >> 4
dup v0.8h, v2.h[0]
dup v1.8h, v2.h[0]
br x3
L(ipred_dc_left_w16):
AARCH64_VALID_JUMP_TARGET
mov v1.16b, v0.16b
1:
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
b.gt 1b
ret
L(ipred_dc_left_h32):
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
addp v0.8h, v0.8h, v1.8h // pairwise-reduce 32 pixels...
addp v2.8h, v2.8h, v3.8h
addp v0.8h, v0.8h, v2.8h // ...to 8 partial sums
uaddlp v0.4s, v0.8h // widen to 32 bit; sum can exceed 16 bit
addv s0, v0.4s
rshrn v4.4h, v0.4s, #5 // dc = (sum + 16) >> 5
dup v0.8h, v4.h[0]
br x3
L(ipred_dc_left_w32):
AARCH64_VALID_JUMP_TARGET
mov v1.16b, v0.16b
mov v2.16b, v0.16b
mov v3.16b, v0.16b
1:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 1b
ret
L(ipred_dc_left_h64):
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
addp v0.8h, v0.8h, v1.8h // pairwise-reduce 64 pixels...
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h
addp v6.8h, v6.8h, v7.8h
addp v0.8h, v0.8h, v2.8h
addp v4.8h, v4.8h, v6.8h
addp v0.8h, v0.8h, v4.8h // ...to 8 partial sums
uaddlv s0, v0.8h // widen to 32 bit; sum can exceed 16 bit
rshrn v4.4h, v0.4s, #6 // dc = (sum + 32) >> 6
dup v0.8h, v4.h[0]
br x3
L(ipred_dc_left_w64):
AARCH64_VALID_JUMP_TARGET
mov v1.16b, v0.16b
mov v2.16b, v0.16b
mov v3.16b, v0.16b
sub x1, x1, #64 // compensate for the post-incrementing #64 store below
1:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 1b
ret
endfunc
// First 5 entries: height summing code (clz(height) - 25);
// last 5 entries: width store loops (clz(width) - 20).
jumptable ipred_dc_left_tbl
.word L(ipred_dc_left_h64) - ipred_dc_left_tbl
.word L(ipred_dc_left_h32) - ipred_dc_left_tbl
.word L(ipred_dc_left_h16) - ipred_dc_left_tbl
.word L(ipred_dc_left_h8) - ipred_dc_left_tbl
.word L(ipred_dc_left_h4) - ipred_dc_left_tbl
.word L(ipred_dc_left_w64) - ipred_dc_left_tbl
.word L(ipred_dc_left_w32) - ipred_dc_left_tbl
.word L(ipred_dc_left_w16) - ipred_dc_left_tbl
.word L(ipred_dc_left_w8) - ipred_dc_left_tbl
.word L(ipred_dc_left_w4) - ipred_dc_left_tbl
endjumptable
// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// Full DC prediction: dc = (sum(top) + sum(left) + (w+h)/2) / (w+h).
// Two chained jump-table dispatches: x5 = height branch (sums the left
// column into s0), then x3 = width branch (adds the top sum, bias, and
// divides). Division by w+h is done as a right shift by ctz(w+h)
// (ushl by a negative amount) for the power-of-two factor; when w != h,
// w+h = 3*2^k or 5*2^k, and the leftover factor is removed with a
// fixed-point reciprocal multiply: 0xAAAB ~= 2^17/3, 0x6667 ~= 2^17/5,
// each followed by ushr #17.
function ipred_dc_16bpc_neon, export=1
sub x2, x2, w4, uxtw #1 // x2 = topleft - height = start of left column
add w7, w3, w4 // width + height
clz w3, w3
clz w6, w4
dup v16.4s, w7 // width + height
movrel x5, ipred_dc_tbl
rbit w7, w7 // rbit(width + height)
sub w3, w3, #20 // 25 leading bits, minus table offset 5
sub w6, w6, #25 // height index = clz(height) - 25
clz w7, w7 // ctz(width + height)
ldrsw x3, [x5, w3, uxtw #2]
ldrsw x6, [x5, w6, uxtw #2]
neg w7, w7 // -ctz(width + height)
add x3, x5, x3 // x3 = width branch entry
add x5, x5, x6 // x5 = height branch entry
ushr v16.4s, v16.4s, #1 // (width + height) >> 1 = rounding bias
dup v17.4s, w7 // -ctz(width + height), for ushl = right shift
add x6, x0, x1 // x6 = dst + stride
lsl x1, x1, #1 // step two rows per store
br x5
L(ipred_dc_h4):
AARCH64_VALID_JUMP_TARGET
ld1 {v0.4h}, [x2], #8
uaddlv s0, v0.4h // s0 = sum of 4 left pixels
add x2, x2, #2 // skip topleft; x2 now points at the top row
br x3
L(ipred_dc_w4):
AARCH64_VALID_JUMP_TARGET
ld1 {v1.4h}, [x2]
add v0.2s, v0.2s, v16.2s // left sum + rounding bias
uaddlv s1, v1.4h // sum of 4 top pixels
cmp w4, #4
add v0.2s, v0.2s, v1.2s // total = left + top + bias
ushl v0.2s, v0.2s, v17.2s // >> ctz(w+h)
b.eq 1f // square block: shift was the full division
// h = 8/16
cmp w4, #16
mov w16, #0x6667 // ~2^17/5 (w+h = 20)
mov w17, #0xAAAB // ~2^17/3 (w+h = 12)
csel w16, w16, w17, eq
dup v16.2s, w16
mul v0.2s, v0.2s, v16.2s
ushr v0.2s, v0.2s, #17 // finish the reciprocal multiply
1:
dup v0.4h, v0.h[0] // broadcast dc
2:
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
subs w4, w4, #4
st1 {v0.4h}, [x0], x1
st1 {v0.4h}, [x6], x1
b.gt 2b
ret
L(ipred_dc_h8):
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8h}, [x2], #16
uaddlv s0, v0.8h // s0 = sum of 8 left pixels
add x2, x2, #2 // skip topleft; x2 now points at the top row
br x3
L(ipred_dc_w8):
AARCH64_VALID_JUMP_TARGET
ld1 {v1.8h}, [x2]
add v0.2s, v0.2s, v16.2s // left sum + rounding bias
uaddlv s1, v1.8h // sum of 8 top pixels
cmp w4, #8
add v0.2s, v0.2s, v1.2s // total = left + top + bias
ushl v0.2s, v0.2s, v17.2s // >> ctz(w+h)
b.eq 1f // square block: shift was the full division
// h = 4/16/32
cmp w4, #32
mov w16, #0x6667 // ~2^17/5 (w+h = 40)
mov w17, #0xAAAB // ~2^17/3 (w+h = 12 or 24)
csel w16, w16, w17, eq
dup v16.2s, w16
mul v0.2s, v0.2s, v16.2s
ushr v0.2s, v0.2s, #17 // finish the reciprocal multiply
1:
dup v0.8h, v0.h[0] // broadcast dc
2:
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h}, [x0], x1
st1 {v0.8h}, [x6], x1
b.gt 2b
ret
L(ipred_dc_h16):
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8h, v1.8h}, [x2], #32
addp v0.8h, v0.8h, v1.8h // pairwise-reduce 16 left pixels to 8
add x2, x2, #2 // skip topleft; x2 now points at the top row
uaddlv s0, v0.8h // s0 = sum of 16 left pixels
br x3
L(ipred_dc_w16):
AARCH64_VALID_JUMP_TARGET
ld1 {v1.8h, v2.8h}, [x2]
add v0.2s, v0.2s, v16.2s // left sum + rounding bias
addp v1.8h, v1.8h, v2.8h // pairwise-reduce 16 top pixels to 8
uaddlv s1, v1.8h
cmp w4, #16
add v0.2s, v0.2s, v1.2s // total = left + top + bias
ushl v4.2s, v0.2s, v17.2s // >> ctz(w+h)
b.eq 1f // square block: shift was the full division
// h = 4/8/32/64
tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
mov w16, #0x6667 // ~2^17/5 (h = 4 or 64)
mov w17, #0xAAAB // ~2^17/3 (h = 8 or 32)
csel w16, w16, w17, eq
dup v16.2s, w16
mul v4.2s, v4.2s, v16.2s
ushr v4.2s, v4.2s, #17 // finish the reciprocal multiply
1:
dup v0.8h, v4.h[0] // broadcast dc
dup v1.8h, v4.h[0]
2:
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h}, [x0], x1
st1 {v0.8h, v1.8h}, [x6], x1
b.gt 2b
ret
L(ipred_dc_h32):
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
addp v0.8h, v0.8h, v1.8h // pairwise-reduce 32 left pixels...
addp v2.8h, v2.8h, v3.8h
addp v0.8h, v0.8h, v2.8h // ...to 8 partial sums
add x2, x2, #2 // skip topleft; x2 now points at the top row
uaddlv s0, v0.8h // s0 = sum of 32 left pixels
br x3
L(ipred_dc_w32):
AARCH64_VALID_JUMP_TARGET
ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
add v0.2s, v0.2s, v16.2s // left sum + rounding bias
addp v1.8h, v1.8h, v2.8h // pairwise-reduce 32 top pixels...
addp v3.8h, v3.8h, v4.8h
addp v1.8h, v1.8h, v3.8h // ...to 8 partial sums
uaddlv s1, v1.8h
cmp w4, #32
add v0.2s, v0.2s, v1.2s // total = left + top + bias
ushl v4.2s, v0.2s, v17.2s // >> ctz(w+h)
b.eq 1f // square block: shift was the full division
// h = 8/16/64
cmp w4, #8
mov w16, #0x6667 // ~2^17/5 (w+h = 40)
mov w17, #0xAAAB // ~2^17/3 (w+h = 48 or 96)
csel w16, w16, w17, eq
dup v16.2s, w16
mul v4.2s, v4.2s, v16.2s
ushr v4.2s, v4.2s, #17 // finish the reciprocal multiply
1:
dup v0.8h, v4.h[0] // broadcast dc
dup v1.8h, v4.h[0]
dup v2.8h, v4.h[0]
dup v3.8h, v4.h[0]
2:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 2b
ret
L(ipred_dc_h64):
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
addp v0.8h, v0.8h, v1.8h // pairwise-reduce 64 left pixels...
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h
addp v6.8h, v6.8h, v7.8h
addp v0.8h, v0.8h, v2.8h
addp v4.8h, v4.8h, v6.8h
addp v0.8h, v0.8h, v4.8h // ...to 8 partial sums
add x2, x2, #2 // skip topleft; x2 now points at the top row
uaddlv s0, v0.8h // s0 = sum of 64 left pixels
br x3
L(ipred_dc_w64):
AARCH64_VALID_JUMP_TARGET
ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
add v0.2s, v0.2s, v16.2s // left sum + rounding bias
addp v1.8h, v1.8h, v2.8h // pairwise-reduce 64 top pixels...
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
addp v3.8h, v3.8h, v4.8h
addp v20.8h, v20.8h, v21.8h
addp v22.8h, v22.8h, v23.8h
addp v1.8h, v1.8h, v3.8h
addp v20.8h, v20.8h, v22.8h
addp v1.8h, v1.8h, v20.8h // ...to 8 partial sums
uaddlv s1, v1.8h
cmp w4, #64
add v0.2s, v0.2s, v1.2s // total = left + top + bias
ushl v4.2s, v0.2s, v17.2s // >> ctz(w+h)
b.eq 1f // square block: shift was the full division
// h = 16/32
cmp w4, #16
mov w16, #0x6667 // ~2^17/5 (w+h = 80)
mov w17, #0xAAAB // ~2^17/3 (w+h = 96)
csel w16, w16, w17, eq
dup v16.2s, w16
mul v4.2s, v4.2s, v16.2s
ushr v4.2s, v4.2s, #17 // finish the reciprocal multiply
1:
sub x1, x1, #64 // compensate for the post-incrementing #64 store below
dup v0.8h, v4.h[0] // broadcast dc
dup v1.8h, v4.h[0]
dup v2.8h, v4.h[0]
dup v3.8h, v4.h[0]
2:
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
b.gt 2b
ret
endfunc
// First 5 entries: height (left-sum) branches (clz(height) - 25);
// last 5 entries: width branches (clz(width) - 20).
jumptable ipred_dc_tbl
.word L(ipred_dc_h64) - ipred_dc_tbl
.word L(ipred_dc_h32) - ipred_dc_tbl
.word L(ipred_dc_h16) - ipred_dc_tbl
.word L(ipred_dc_h8) - ipred_dc_tbl
.word L(ipred_dc_h4) - ipred_dc_tbl
.word L(ipred_dc_w64) - ipred_dc_tbl
.word L(ipred_dc_w32) - ipred_dc_tbl
.word L(ipred_dc_w16) - ipred_dc_tbl
.word L(ipred_dc_w8) - ipred_dc_tbl
.word L(ipred_dc_w4) - ipred_dc_tbl
endjumptable
// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// Paeth prediction: for each pixel, base = left + top - topleft; the
// predictor is whichever of left/top/topleft is closest to base (left
// wins ties over top, top over topleft). Implemented branchlessly with
// sabd (absolute differences) + cmge masks + bsl/bit selects.
// v4 = topleft (replicated), v5 = top row, left pixels loaded backwards.
function ipred_paeth_16bpc_neon, export=1
clz w9, w3
movrel x5, ipred_paeth_tbl
sub w9, w9, #25 // jump table index = clz(width) - 25
ldrsw x9, [x5, w9, uxtw #2]
ld1r {v4.8h}, [x2] // v4 = topleft, replicated
add x8, x2, #2 // x8 = top row
sub x2, x2, #8 // x2 = last 4 left pixels
add x5, x5, x9
mov x7, #-8 // step backwards 4 left pixels at a time
add x6, x0, x1 // x6 = dst + stride
lsl x1, x1, #1 // step two rows per store
br x5
40:
AARCH64_VALID_JUMP_TARGET
ld1r {v5.2d}, [x8] // top row in both halves
sub v6.8h, v5.8h, v4.8h // top - topleft
4: // width 4, four rows per iteration (two rows per vector)
ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7
zip1 v0.2d, v0.2d, v1.2d // pack two left rows per vector
zip1 v2.2d, v2.2d, v3.2d
add v16.8h, v6.8h, v0.8h // base
add v17.8h, v6.8h, v2.8h
sabd v20.8h, v5.8h, v16.8h // tdiff
sabd v21.8h, v5.8h, v17.8h
sabd v22.8h, v4.8h, v16.8h // tldiff
sabd v23.8h, v4.8h, v17.8h
sabd v16.8h, v0.8h, v16.8h // ldiff
sabd v17.8h, v2.8h, v17.8h
umin v18.8h, v20.8h, v22.8h // min(tdiff, tldiff)
umin v19.8h, v21.8h, v23.8h
cmge v20.8h, v22.8h, v20.8h // tldiff >= tdiff
cmge v21.8h, v23.8h, v21.8h
cmge v16.8h, v18.8h, v16.8h // min(tdiff, tldiff) >= ldiff
cmge v17.8h, v19.8h, v17.8h
bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
bsl v20.16b, v5.16b, v4.16b
bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ...
bit v20.16b, v0.16b, v16.16b
st1 {v21.d}[1], [x0], x1 // rows come out bottom-up within each vector
st1 {v21.d}[0], [x6], x1
subs w4, w4, #4
st1 {v20.d}[1], [x0], x1
st1 {v20.d}[0], [x6], x1
b.gt 4b
ret
80:
160:
320:
640: // widths >= 8 share one path, iterating 8 columns at a time
AARCH64_VALID_JUMP_TARGET
ld1 {v5.8h}, [x8], #16 // first 8 top pixels
mov w9, w3 // save width for the row-loop reload
// Set up pointers for four rows in parallel; x0, x6, x5, x10
add x5, x0, x1
add x10, x6, x1
lsl x1, x1, #1
sub x1, x1, w3, uxtw #1 // stride minus bytes written per row
1: // per-4-rows loop: load 4 left pixels (replicated, backwards)
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
2: // per-8-columns loop
sub v6.8h, v5.8h, v4.8h // top - topleft
add v16.8h, v6.8h, v0.8h // base
add v17.8h, v6.8h, v1.8h
add v18.8h, v6.8h, v2.8h
add v19.8h, v6.8h, v3.8h
sabd v20.8h, v5.8h, v16.8h // tdiff
sabd v21.8h, v5.8h, v17.8h
sabd v22.8h, v5.8h, v18.8h
sabd v23.8h, v5.8h, v19.8h
sabd v24.8h, v4.8h, v16.8h // tldiff
sabd v25.8h, v4.8h, v17.8h
sabd v26.8h, v4.8h, v18.8h
sabd v27.8h, v4.8h, v19.8h
sabd v16.8h, v0.8h, v16.8h // ldiff
sabd v17.8h, v1.8h, v17.8h
sabd v18.8h, v2.8h, v18.8h
sabd v19.8h, v3.8h, v19.8h
umin v28.8h, v20.8h, v24.8h // min(tdiff, tldiff)
umin v29.8h, v21.8h, v25.8h
umin v30.8h, v22.8h, v26.8h
umin v31.8h, v23.8h, v27.8h
cmge v20.8h, v24.8h, v20.8h // tldiff >= tdiff
cmge v21.8h, v25.8h, v21.8h
cmge v22.8h, v26.8h, v22.8h
cmge v23.8h, v27.8h, v23.8h
cmge v16.8h, v28.8h, v16.8h // min(tdiff, tldiff) >= ldiff
cmge v17.8h, v29.8h, v17.8h
cmge v18.8h, v30.8h, v18.8h
cmge v19.8h, v31.8h, v19.8h
bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
bsl v22.16b, v5.16b, v4.16b
bsl v21.16b, v5.16b, v4.16b
bsl v20.16b, v5.16b, v4.16b
bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ...
bit v22.16b, v2.16b, v18.16b
bit v21.16b, v1.16b, v17.16b
bit v20.16b, v0.16b, v16.16b
st1 {v23.8h}, [x0], #16
st1 {v22.8h}, [x6], #16
subs w3, w3, #8
st1 {v21.8h}, [x5], #16
st1 {v20.8h}, [x10], #16
b.le 8f
ld1 {v5.8h}, [x8], #16 // next 8 top pixels
b 2b
8:
subs w4, w4, #4
b.le 9f
// End of horizontal loop, move pointers to next four rows
sub x8, x8, w9, uxtw #1 // rewind top pointer to row start
add x0, x0, x1
add x6, x6, x1
// Load the top row as early as possible
ld1 {v5.8h}, [x8], #16
add x5, x5, x1
add x10, x10, x1
mov w3, w9 // restore width counter
b 1b
9:
ret
endfunc
// Per-width entry points, ordered 64/32/16/8/4 to match clz(width) - 25.
jumptable ipred_paeth_tbl
.word 640b - ipred_paeth_tbl
.word 320b - ipred_paeth_tbl
.word 160b - ipred_paeth_tbl
.word 80b - ipred_paeth_tbl
.word 40b - ipred_paeth_tbl
endjumptable
// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// Smooth prediction:
// pred = ((bottom+right)*256 + (left-right)*w_hor + (top-bottom)*w_ver
// + 256) >> 9
// which is the AV1 blend of top/bottom with vertical weights plus
// left/right with horizontal weights, both from the sm_weights table.
// v4 = bottom (replicated last left pixel), v5 = right (replicated last
// top pixel), left pixels are consumed backwards (x7 = -8 or -4).
function ipred_smooth_16bpc_neon, export=1
movrel x10, X(sm_weights)
add x11, x10, w4, uxtw // x11 = vertical weights (indexed by height)
add x10, x10, w3, uxtw // x10 = horizontal weights (indexed by width)
clz w9, w3
movrel x5, ipred_smooth_tbl
sub x12, x2, w4, uxtw #1 // x12 = topleft - height = bottom-left pixel
sub w9, w9, #25 // jump table index = clz(width) - 25
ldrsw x9, [x5, w9, uxtw #2]
ld1r {v4.8h}, [x12] // bottom
add x8, x2, #2 // x8 = top row
add x5, x5, x9
add x6, x0, x1 // x6 = dst + stride
lsl x1, x1, #1 // step two rows per store
br x5
40:
AARCH64_VALID_JUMP_TARGET
ld1r {v6.2d}, [x8] // top
ld1r {v7.2s}, [x10] // weights_hor
sub x2, x2, #8 // x2 = last 4 left pixels
mov x7, #-8 // step backwards 4 left pixels at a time
dup v5.8h, v6.h[3] // right
sub v6.8h, v6.8h, v4.8h // top-bottom
uxtl v7.8h, v7.8b // weights_hor
add v31.4h, v4.4h, v5.4h // bottom+right
4: // width 4, four rows per iteration (two rows per vector)
ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left
ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
ushll v20.4s, v31.4h, #8 // (bottom+right)*256
ushll v21.4s, v31.4h, #8
ushll v22.4s, v31.4h, #8
ushll v23.4s, v31.4h, #8
zip1 v1.2d, v1.2d, v0.2d // left, flipped
zip1 v0.2d, v3.2d, v2.2d
zip1 v16.2s, v16.2s, v17.2s // weights_ver
zip1 v18.2s, v18.2s, v19.2s
sub v0.8h, v0.8h, v5.8h // left-right
sub v1.8h, v1.8h, v5.8h
uxtl v16.8h, v16.8b // weights_ver
uxtl v18.8h, v18.8b
smlal v20.4s, v0.4h, v7.4h // += (left-right)*weights_hor
smlal2 v21.4s, v0.8h, v7.8h
smlal v22.4s, v1.4h, v7.4h
smlal2 v23.4s, v1.8h, v7.8h
smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver
smlal2 v21.4s, v6.8h, v16.8h
smlal v22.4s, v6.4h, v18.4h
smlal2 v23.4s, v6.8h, v18.8h
rshrn v20.4h, v20.4s, #9 // rounded >> 9 back to pixel range
rshrn v21.4h, v21.4s, #9
rshrn v22.4h, v22.4s, #9
rshrn v23.4h, v23.4s, #9
st1 {v20.4h}, [x0], x1
st1 {v21.4h}, [x6], x1
subs w4, w4, #4
st1 {v22.4h}, [x0], x1
st1 {v23.4h}, [x6], x1
b.gt 4b
ret
80:
AARCH64_VALID_JUMP_TARGET
ld1 {v6.8h}, [x8] // top
ld1 {v7.8b}, [x10] // weights_hor
sub x2, x2, #8 // x2 = last 4 left pixels
mov x7, #-8 // step backwards 4 left pixels at a time
dup v5.8h, v6.h[7] // right
sub v6.8h, v6.8h, v4.8h // top-bottom
uxtl v7.8h, v7.8b // weights_hor
add v31.4h, v4.4h, v5.4h // bottom+right
8: // width 8, four rows per iteration
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
ushll v20.4s, v31.4h, #8 // (bottom+right)*256
ushll v21.4s, v31.4h, #8
ushll v22.4s, v31.4h, #8
ushll v23.4s, v31.4h, #8
ushll v24.4s, v31.4h, #8
ushll v25.4s, v31.4h, #8
ushll v26.4s, v31.4h, #8
ushll v27.4s, v31.4h, #8
sub v0.8h, v0.8h, v5.8h // left-right
sub v1.8h, v1.8h, v5.8h
sub v2.8h, v2.8h, v5.8h
sub v3.8h, v3.8h, v5.8h
uxtl v16.8h, v16.8b // weights_ver
uxtl v17.8h, v17.8b
uxtl v18.8h, v18.8b
uxtl v19.8h, v19.8b
smlal v20.4s, v3.4h, v7.4h // += (left-right)*weights_hor
smlal2 v21.4s, v3.8h, v7.8h // (left flipped)
smlal v22.4s, v2.4h, v7.4h
smlal2 v23.4s, v2.8h, v7.8h
smlal v24.4s, v1.4h, v7.4h
smlal2 v25.4s, v1.8h, v7.8h
smlal v26.4s, v0.4h, v7.4h
smlal2 v27.4s, v0.8h, v7.8h
smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver
smlal2 v21.4s, v6.8h, v16.8h
smlal v22.4s, v6.4h, v17.4h
smlal2 v23.4s, v6.8h, v17.8h
smlal v24.4s, v6.4h, v18.4h
smlal2 v25.4s, v6.8h, v18.8h
smlal v26.4s, v6.4h, v19.4h
smlal2 v27.4s, v6.8h, v19.8h
rshrn v20.4h, v20.4s, #9 // rounded >> 9 back to pixel range
rshrn2 v20.8h, v21.4s, #9
rshrn v21.4h, v22.4s, #9
rshrn2 v21.8h, v23.4s, #9
rshrn v22.4h, v24.4s, #9
rshrn2 v22.8h, v25.4s, #9
rshrn v23.4h, v26.4s, #9
rshrn2 v23.8h, v27.4s, #9
st1 {v20.8h}, [x0], x1
st1 {v21.8h}, [x6], x1
subs w4, w4, #4
st1 {v22.8h}, [x0], x1
st1 {v23.8h}, [x6], x1
b.gt 8b
ret
160:
320:
640: // widths >= 16 share one path, two rows x 16 columns per iteration
AARCH64_VALID_JUMP_TARGET
add x12, x2, w3, uxtw #1 // topleft + width = last top pixel
sub x1, x1, w3, uxtw #1 // stride minus bytes written per row
ld1r {v5.8h}, [x12] // right
sub x2, x2, #4 // x2 = last 2 left pixels
mov x7, #-4 // step backwards 2 left pixels at a time
mov w9, w3 // save width for the row-loop reload
add v31.4h, v4.4h, v5.4h // bottom+right
1: // per-2-rows loop
ld2r {v0.8h, v1.8h}, [x2], x7 // left
ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver
sub v0.8h, v0.8h, v5.8h // left-right
sub v1.8h, v1.8h, v5.8h
uxtl v16.8h, v16.8b // weights_ver
uxtl v17.8h, v17.8b
2: // per-16-columns loop
ld1 {v7.16b}, [x10], #16 // weights_hor
ld1 {v2.8h, v3.8h}, [x8], #32 // top
ushll v20.4s, v31.4h, #8 // (bottom+right)*256
ushll v21.4s, v31.4h, #8
ushll v22.4s, v31.4h, #8
ushll v23.4s, v31.4h, #8
ushll v24.4s, v31.4h, #8
ushll v25.4s, v31.4h, #8
ushll v26.4s, v31.4h, #8
ushll v27.4s, v31.4h, #8
uxtl v6.8h, v7.8b // weights_hor
uxtl2 v7.8h, v7.16b
sub v2.8h, v2.8h, v4.8h // top-bottom
sub v3.8h, v3.8h, v4.8h
smlal v20.4s, v1.4h, v6.4h // += (left-right)*weights_hor
smlal2 v21.4s, v1.8h, v6.8h // (left flipped)
smlal v22.4s, v1.4h, v7.4h
smlal2 v23.4s, v1.8h, v7.8h
smlal v24.4s, v0.4h, v6.4h
smlal2 v25.4s, v0.8h, v6.8h
smlal v26.4s, v0.4h, v7.4h
smlal2 v27.4s, v0.8h, v7.8h
smlal v20.4s, v2.4h, v16.4h // += (top-bottom)*weights_ver
smlal2 v21.4s, v2.8h, v16.8h
smlal v22.4s, v3.4h, v16.4h
smlal2 v23.4s, v3.8h, v16.8h
smlal v24.4s, v2.4h, v17.4h
smlal2 v25.4s, v2.8h, v17.8h
smlal v26.4s, v3.4h, v17.4h
smlal2 v27.4s, v3.8h, v17.8h
rshrn v20.4h, v20.4s, #9 // rounded >> 9 back to pixel range
rshrn2 v20.8h, v21.4s, #9
rshrn v21.4h, v22.4s, #9
rshrn2 v21.8h, v23.4s, #9
rshrn v22.4h, v24.4s, #9
rshrn2 v22.8h, v25.4s, #9
rshrn v23.4h, v26.4s, #9
rshrn2 v23.8h, v27.4s, #9
subs w3, w3, #16
st1 {v20.8h, v21.8h}, [x0], #32
st1 {v22.8h, v23.8h}, [x6], #32
b.gt 2b
subs w4, w4, #2
b.le 9f
sub x8, x8, w9, uxtw #1 // rewind top pointer to row start
sub x10, x10, w9, uxtw // rewind horizontal-weight pointer
add x0, x0, x1
add x6, x6, x1
mov w3, w9 // restore width counter
b 1b
9:
ret
endfunc
// Per-width entry points, ordered 64/32/16/8/4 to match clz(width) - 25.
jumptable ipred_smooth_tbl
.word 640b - ipred_smooth_tbl
.word 320b - ipred_smooth_tbl
.word 160b - ipred_smooth_tbl
.word 80b - ipred_smooth_tbl
.word 40b - ipred_smooth_tbl
endjumptable
// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// Smooth-vertical prediction: pred = bottom + ((top-bottom)*w_ver+128)>>8.
// The per-row weight w_ver comes from sm_weights[height]. The weighted
// term is computed with sqrdmulh on (w_ver << 7): sqrdmulh computes
// (2*a*b + 2^15) >> 16, so with b = w << 7 this is (a*w + 128) >> 8.
// v4 = bottom (replicated pixel below the left column).
function ipred_smooth_v_16bpc_neon, export=1
movrel x7, X(sm_weights)
add x7, x7, w4, uxtw // x7 = vertical weights (indexed by height)
clz w9, w3
movrel x5, ipred_smooth_v_tbl
sub x8, x2, w4, uxtw #1 // x8 = topleft - height = bottom-left pixel
sub w9, w9, #25 // jump table index = clz(width) - 25
ldrsw x9, [x5, w9, uxtw #2]
ld1r {v4.8h}, [x8] // bottom
add x2, x2, #2 // skip topleft; x2 points at the top row
add x5, x5, x9
add x6, x0, x1 // x6 = dst + stride
lsl x1, x1, #1 // step two rows per store
br x5
40:
AARCH64_VALID_JUMP_TARGET
ld1r {v6.2d}, [x2] // top
sub v6.8h, v6.8h, v4.8h // top-bottom
4: // width 4, four rows per iteration (two rows per vector)
ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
zip1 v16.2s, v16.2s, v17.2s // weights_ver
zip1 v18.2s, v18.2s, v19.2s
ushll v16.8h, v16.8b, #7 // weights_ver << 7
ushll v18.8h, v18.8b, #7
sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
sqrdmulh v21.8h, v6.8h, v18.8h
add v20.8h, v20.8h, v4.8h // + bottom
add v21.8h, v21.8h, v4.8h
st1 {v20.d}[0], [x0], x1
st1 {v20.d}[1], [x6], x1
subs w4, w4, #4
st1 {v21.d}[0], [x0], x1
st1 {v21.d}[1], [x6], x1
b.gt 4b
ret
80:
AARCH64_VALID_JUMP_TARGET
ld1 {v6.8h}, [x2] // top
sub v6.8h, v6.8h, v4.8h // top-bottom
8: // width 8, four rows per iteration
ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
ushll v16.8h, v16.8b, #7 // weights_ver << 7
ushll v17.8h, v17.8b, #7
ushll v18.8h, v18.8b, #7
ushll v19.8h, v19.8b, #7
sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
sqrdmulh v21.8h, v6.8h, v17.8h
sqrdmulh v22.8h, v6.8h, v18.8h
sqrdmulh v23.8h, v6.8h, v19.8h
add v20.8h, v20.8h, v4.8h // + bottom
add v21.8h, v21.8h, v4.8h
add v22.8h, v22.8h, v4.8h
add v23.8h, v23.8h, v4.8h
st1 {v20.8h}, [x0], x1
st1 {v21.8h}, [x6], x1
subs w4, w4, #4
st1 {v22.8h}, [x0], x1
st1 {v23.8h}, [x6], x1
b.gt 8b
ret
160:
320:
640: // widths >= 16 share one path, 4 rows x 16 columns per iteration
AARCH64_VALID_JUMP_TARGET
// Set up pointers for four rows in parallel; x0, x6, x5, x8
add x5, x0, x1
add x8, x6, x1
lsl x1, x1, #1
sub x1, x1, w3, uxtw #1 // stride minus bytes written per row
mov w9, w3 // save width for the row-loop reload
1: // per-4-rows loop: load 4 vertical weights
ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
ushll v16.8h, v16.8b, #7 // weights_ver << 7
ushll v17.8h, v17.8b, #7
ushll v18.8h, v18.8b, #7
ushll v19.8h, v19.8b, #7
2: // per-16-columns loop
ld1 {v2.8h, v3.8h}, [x2], #32 // top
sub v2.8h, v2.8h, v4.8h // top-bottom
sub v3.8h, v3.8h, v4.8h
sqrdmulh v20.8h, v2.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
sqrdmulh v21.8h, v3.8h, v16.8h
sqrdmulh v22.8h, v2.8h, v17.8h
sqrdmulh v23.8h, v3.8h, v17.8h
sqrdmulh v24.8h, v2.8h, v18.8h
sqrdmulh v25.8h, v3.8h, v18.8h
sqrdmulh v26.8h, v2.8h, v19.8h
sqrdmulh v27.8h, v3.8h, v19.8h
add v20.8h, v20.8h, v4.8h // + bottom
add v21.8h, v21.8h, v4.8h
add v22.8h, v22.8h, v4.8h
add v23.8h, v23.8h, v4.8h
add v24.8h, v24.8h, v4.8h
add v25.8h, v25.8h, v4.8h
add v26.8h, v26.8h, v4.8h
add v27.8h, v27.8h, v4.8h
subs w3, w3, #16
st1 {v20.8h, v21.8h}, [x0], #32
st1 {v22.8h, v23.8h}, [x6], #32
st1 {v24.8h, v25.8h}, [x5], #32
st1 {v26.8h, v27.8h}, [x8], #32
b.gt 2b
subs w4, w4, #4
b.le 9f
sub x2, x2, w9, uxtw #1 // rewind top pointer to row start
add x0, x0, x1
add x6, x6, x1
add x5, x5, x1
add x8, x8, x1
mov w3, w9 // restore width counter
b 1b
9:
ret
endfunc
// Per-width entry points, ordered 64/32/16/8/4 to match clz(width) - 25.
jumptable ipred_smooth_v_tbl
.word 640b - ipred_smooth_v_tbl
.word 320b - ipred_smooth_v_tbl
.word 160b - ipred_smooth_v_tbl
.word 80b - ipred_smooth_v_tbl
.word 40b - ipred_smooth_v_tbl
endjumptable
// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
//
// SMOOTH_H intra prediction: each output pixel is a weighted blend of its
// row's left-edge pixel and the single "right" pixel topleft[width], with
// the blend weight taken from sm_weights[] indexed by column.
//
// Register roles: x0/x6 (plus x5/x10 in the wide path) = output row
// pointers, x2 = left edge (walked backwards), x8 = &sm_weights[width],
// v5 = right pixel splatted to all lanes, w4 = remaining height.
function ipred_smooth_h_16bpc_neon, export=1
movrel x8, X(sm_weights)
add x8, x8, w3, uxtw // x8 = &sm_weights[width]
clz w9, w3 // jump table index = clz(width) - 25
movrel x5, ipred_smooth_h_tbl
add x12, x2, w3, uxtw #1 // x12 = &topleft[width]
sub w9, w9, #25
ldrsw x9, [x5, w9, uxtw #2]
ld1r {v5.8h}, [x12] // right
add x5, x5, x9
add x6, x0, x1 // x6 = dst + stride (second row)
lsl x1, x1, #1 // stride *= 2; two rows stored per pointer
br x5
40:
AARCH64_VALID_JUMP_TARGET
ld1r {v7.2s}, [x8] // weights_hor
sub x2, x2, #8 // x2 = &left[-4]
mov x7, #-8 // step the left pointer backwards, 4 pixels/iter
ushll v7.8h, v7.8b, #7 // weights_hor << 7
4:
ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left
zip1 v1.2d, v1.2d, v0.2d // left, flipped
zip1 v0.2d, v3.2d, v2.2d
sub v0.8h, v0.8h, v5.8h // left-right
sub v1.8h, v1.8h, v5.8h
sqrdmulh v20.8h, v0.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8
sqrdmulh v21.8h, v1.8h, v7.8h
add v20.8h, v20.8h, v5.8h // + right
add v21.8h, v21.8h, v5.8h
st1 {v20.d}[0], [x0], x1
st1 {v20.d}[1], [x6], x1
subs w4, w4, #4 // height -= 4
st1 {v21.d}[0], [x0], x1
st1 {v21.d}[1], [x6], x1
b.gt 4b
ret
80:
AARCH64_VALID_JUMP_TARGET
ld1 {v7.8b}, [x8] // weights_hor
sub x2, x2, #8 // x2 = &left[-4]
mov x7, #-8 // step the left pointer backwards, 4 pixels/iter
ushll v7.8h, v7.8b, #7 // weights_hor << 7
8:
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
sub v3.8h, v3.8h, v5.8h // left-right
sub v2.8h, v2.8h, v5.8h
sub v1.8h, v1.8h, v5.8h
sub v0.8h, v0.8h, v5.8h
sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8
sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped)
sqrdmulh v22.8h, v1.8h, v7.8h
sqrdmulh v23.8h, v0.8h, v7.8h
add v20.8h, v20.8h, v5.8h // + right
add v21.8h, v21.8h, v5.8h
add v22.8h, v22.8h, v5.8h
add v23.8h, v23.8h, v5.8h
st1 {v20.8h}, [x0], x1
st1 {v21.8h}, [x6], x1
subs w4, w4, #4 // height -= 4
st1 {v22.8h}, [x0], x1
st1 {v23.8h}, [x6], x1
b.gt 8b
ret
160:
320:
640:
AARCH64_VALID_JUMP_TARGET
sub x2, x2, #8 // x2 = &left[-4]
mov x7, #-8 // step the left pointer backwards, 4 pixels/iter
// Set up pointers for four rows in parallel; x0, x6, x5, x10
add x5, x0, x1
add x10, x6, x1
lsl x1, x1, #1
sub x1, x1, w3, uxtw #1 // 4-row advance minus the width already stored
mov w9, w3 // backup of width, restored per 4-row group
1:
ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
sub v0.8h, v0.8h, v5.8h // left-right
sub v1.8h, v1.8h, v5.8h
sub v2.8h, v2.8h, v5.8h
sub v3.8h, v3.8h, v5.8h
2:
ld1 {v7.16b}, [x8], #16 // weights_hor
ushll v6.8h, v7.8b, #7 // weights_hor << 7
ushll2 v7.8h, v7.16b, #7
sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8
sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped)
sqrdmulh v22.8h, v2.8h, v6.8h
sqrdmulh v23.8h, v2.8h, v7.8h
sqrdmulh v24.8h, v1.8h, v6.8h
sqrdmulh v25.8h, v1.8h, v7.8h
sqrdmulh v26.8h, v0.8h, v6.8h
sqrdmulh v27.8h, v0.8h, v7.8h
add v20.8h, v20.8h, v5.8h // + right
add v21.8h, v21.8h, v5.8h
add v22.8h, v22.8h, v5.8h
add v23.8h, v23.8h, v5.8h
add v24.8h, v24.8h, v5.8h
add v25.8h, v25.8h, v5.8h
add v26.8h, v26.8h, v5.8h
add v27.8h, v27.8h, v5.8h
subs w3, w3, #16 // columns remaining in this row group
st1 {v20.8h, v21.8h}, [x0], #32
st1 {v22.8h, v23.8h}, [x6], #32
st1 {v24.8h, v25.8h}, [x5], #32
st1 {v26.8h, v27.8h}, [x10], #32
b.gt 2b
subs w4, w4, #4 // height -= 4
b.le 9f
sub x8, x8, w9, uxtw // rewind weights to the start of the row
add x0, x0, x1
add x6, x6, x1
add x5, x5, x1
add x10, x10, x1
mov w3, w9 // restore width
b 1b
9:
ret
endfunc
jumptable ipred_smooth_h_tbl
// Offsets of the width-specialized entry points above, relative to this
// table; indexed by clz(width) - 25 (0 -> width 64, ..., 4 -> width 4).
.word 640b - ipred_smooth_h_tbl
.word 320b - ipred_smooth_h_tbl
.word 160b - ipred_smooth_h_tbl
.word 80b - ipred_smooth_h_tbl
.word 40b - ipred_smooth_h_tbl
endjumptable
const padding_mask_buf
// 48 zero bytes followed by 48 0xff bytes. The edge-padding code loads a
// 32-byte window at padding_mask - 2*n, which yields a mask whose first n
// 16-bit lanes are 0x0000 (keep the input pixel) and whose remaining
// lanes are 0xffff (take the replicated padding pixel via "bit").
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
padding_mask:
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
// void ipred_z1_upsample_edge_16bpc_neon(pixel *out, const int hsz,
// const pixel *const in, const int end,
// const int bitdepth_max);
//
// Doubles the edge resolution: out[2*i] = in[i+1] and out[2*i+1] is a
// 4-tap (-1, 9, 9, -1)/16 interpolation between in[i+1] and in[i+2],
// rounded and clamped to [0, bitdepth_max]. Input lanes past in[end]
// are padded with in[end] via the padding_mask window.
function ipred_z1_upsample_edge_16bpc_neon, export=1
dup v30.8h, w4 // bitdepth_max
movrel x4, padding_mask
ld1 {v0.8h, v1.8h}, [x2] // in[]
add x5, x2, w3, uxtw #1 // in[end]
sub x4, x4, w3, uxtw #1 // mask keeping the first "end" lanes
ld1r {v2.8h}, [x5] // padding
ld1 {v3.8h, v4.8h}, [x4] // padding_mask
movi v31.8h, #9
bit v0.16b, v2.16b, v3.16b // padded in[]
bit v1.16b, v2.16b, v4.16b
ext v4.16b, v0.16b, v1.16b, #2 // in[i+1]
ext v5.16b, v1.16b, v2.16b, #2
ext v6.16b, v0.16b, v1.16b, #4 // in[i+2]
ext v7.16b, v1.16b, v2.16b, #4
ext v16.16b, v0.16b, v1.16b, #6 // in[i+3]
ext v17.16b, v1.16b, v2.16b, #6
add v18.8h, v4.8h, v6.8h // in[i+1] + in[i+2]
add v19.8h, v5.8h, v7.8h
add v20.8h, v0.8h, v16.8h // in[i+0] + in[i+3]
add v21.8h, v1.8h, v17.8h
umull v22.4s, v18.4h, v31.4h // 9*(in[i+1] + in[i+2])
umull2 v23.4s, v18.8h, v31.8h
umull v24.4s, v19.4h, v31.4h
umull2 v25.4s, v19.8h, v31.8h
usubw v22.4s, v22.4s, v20.4h // ... - (in[i+0] + in[i+3])
usubw2 v23.4s, v23.4s, v20.8h
usubw v24.4s, v24.4s, v21.4h
usubw2 v25.4s, v25.4s, v21.8h
sqrshrun v16.4h, v22.4s, #4 // interpolated pixels, (x + 8) >> 4
sqrshrun2 v16.8h, v23.4s, #4
sqrshrun v17.4h, v24.4s, #4
sqrshrun2 v17.8h, v25.4s, #4
smin v16.8h, v16.8h, v30.8h // clamp to bitdepth_max
smin v17.8h, v17.8h, v30.8h
zip1 v0.8h, v4.8h, v16.8h // interleave source and interpolated pixels
zip2 v1.8h, v4.8h, v16.8h
zip1 v2.8h, v5.8h, v17.8h
zip2 v3.8h, v5.8h, v17.8h
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
ret
endfunc
// void ipred_z2_upsample_edge_16bpc_neon(pixel *out, const int sz,
// const pixel *const in,
// const int bitdepth_max);
//
// Same (-1, 9, 9, -1)/16 upsampling as ipred_z1_upsample_edge, but
// anchored one pixel earlier: out[2*i] = in[i] and out[2*i+1]
// interpolates between in[i] and in[i+1], with in[-1] clamped to in[0].
function ipred_z2_upsample_edge_16bpc_neon, export=1
dup v30.8h, w3 // bitdepth_max
// Here, sz is 4 or 8, and we produce 2*sz+1 output elements.
movrel x4, padding_mask
ld1 {v0.8h, v1.8h}, [x2] // in[]
add x5, x2, w1, uxtw #1 // in[sz]
sub x4, x4, w1, uxtw #1 // mask keeping the first "sz" lanes
ld1r {v3.8h}, [x2] // in[0] for padding
ld1r {v2.8h}, [x5] // padding
ld1 {v4.8h, v5.8h}, [x4] // padding_mask
movi v31.8h, #9
bit v0.16b, v2.16b, v4.16b // padded in[]
bit v1.16b, v2.16b, v5.16b
ext v4.16b, v3.16b, v0.16b, #14 // in[i-1], with in[-1] = in[0]
ext v5.16b, v0.16b, v1.16b, #2 // in[i+1]
ext v6.16b, v0.16b, v1.16b, #4 // in[i+2]
add v16.8h, v0.8h, v5.8h // in[i+0] + in[i+1]
add v17.8h, v4.8h, v6.8h // in[i-1] + in[i+2]
umull v18.4s, v16.4h, v31.4h // 9*(in[i+0] + in[i+1])
umull2 v19.4s, v16.8h, v31.8h
usubw v18.4s, v18.4s, v17.4h // ... - (in[i-1] + in[i+2])
usubw2 v19.4s, v19.4s, v17.8h
sqrshrun v16.4h, v18.4s, #4 // interpolated pixels, (x + 8) >> 4
sqrshrun2 v16.8h, v19.4s, #4
add x5, x0, #2*16
smin v16.8h, v16.8h, v30.8h // clamp to bitdepth_max
zip1 v4.8h, v0.8h, v16.8h // interleave source and interpolated pixels
zip2 v5.8h, v0.8h, v16.8h
// In case sz=8, output one single pixel in out[16].
st1 {v2.h}[0], [x5]
st1 {v4.8h, v5.8h}, [x0]
ret
endfunc
const edge_filter
// 3-tap edge smoothing kernels, one row of four .short entries per filter
// strength; ipred_z1_filter_edge below loads only kernel[1] (side tap)
// and kernel[2] (center tap) from each row.
.short 0, 4, 8, 0 // strength=1
.short 0, 5, 6, 0 // strength=2
// Leaving out the coeffs for strength=3 (handled by the 5-tap path)
// .short 2, 4, 4, 0
endconst
// void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz,
// const pixel *const in, const int end,
// const int strength);
function ipred_z1_filter_edge_16bpc_neon, export=1
cmp w4, #3
b.eq L(fivetap) // if (strength == 3) goto fivetap
movrel x5, edge_filter, -6
add x5, x5, w4, uxtw #3 // edge_filter + 2*((strength - 1)*4 + 1)
ld1 {v31.s}[0], [x5] // kernel[1-2]
ld1 {v0.8h}, [x2], #16
dup v30.8h, v31.h[0]
dup v31.8h, v31.h[1]
1:
// in[end], is the last valid pixel. We produce 16 pixels out by
// using 18 pixels in - the last pixel used is [17] of the ones
// read/buffered.
cmp w3, #17
ld1 {v1.8h, v2.8h}, [x2], #32
b.lt 2f
ext v3.16b, v0.16b, v1.16b, #2
ext v4.16b, v1.16b, v2.16b, #2
ext v5.16b, v0.16b, v1.16b, #4
ext v6.16b, v1.16b, v2.16b, #4
mul v16.8h, v0.8h, v30.8h
mla v16.8h, v3.8h, v31.8h
mla v16.8h, v5.8h, v30.8h
mul v17.8h, v1.8h, v30.8h
mla v17.8h, v4.8h, v31.8h
mla v17.8h, v6.8h, v30.8h
subs w1, w1, #16
mov v0.16b, v2.16b
urshr v16.8h, v16.8h, #4
urshr v17.8h, v17.8h, #4
sub w3, w3, #16
st1 {v16.8h, v17.8h}, [x0], #32
b.gt 1b
ret
2:
// Right padding
// x2[w3-24] is the padding pixel (x2 points 24 pixels ahead)
movrel x5, padding_mask
sub w6, w3, #24
sub x5, x5, w3, uxtw #1
add x6, x2, w6, sxtw #1
ld1 {v3.8h, v4.8h}, [x5] // padding_mask
ld1r {v2.8h}, [x6]
bit v0.16b, v2.16b, v3.16b // Pad v0-v1
bit v1.16b, v2.16b, v4.16b
// Filter one block
ext v3.16b, v0.16b, v1.16b, #2
ext v4.16b, v1.16b, v2.16b, #2
ext v5.16b, v0.16b, v1.16b, #4
ext v6.16b, v1.16b, v2.16b, #4
mul v16.8h, v0.8h, v30.8h
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5 C=98 H=100 G=98