Anforderungen  |   Konzepte  |   Entwurf  |   Entwicklung  |   Qualitätssicherung  |   Lebenszyklus  |   Steuerung
 
 
 
 


Quelle  mc16.S   Sprache: AArch64-Assembler (ARM64, GNU-as-Syntax)

 
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Janne Grunau
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


#include "src/arm/asm.S"
#include "util.S"

// Bias constant used by the kernels below when turning the 16-bit
// intermediates read from x2/x3 back into pixels (see the v28 setup in
// bidir_fn and the v30 setup in w_mask_fn).
#define PREP_BIAS 8192

// avg d0, d1, t0, t1, t2, t3
//
// Plain average of 16 intermediate values from each of two sources.
//
// Operands:
//   d0, d1  destination vectors (8 halfwords each)
//   t0..t3  scratch vectors; t0/t1 are loaded from [x2] and then overwritten,
//           t2/t3 are loaded from [x3] and only read afterwards
// Expects (set up by bidir_fn for type avg):
//   v28.8h = -2*PREP_BIAS - (1 << intermediate_bits)
//   v29.8h = -(intermediate_bits+1)
// Side effects: x2 and x3 are each post-incremented by 32 bytes.
//
// Per lane:
//   d = (max(sat(tmp1 + tmp2), v28) - v28) >> (intermediate_bits+1)
// Subtracting v28 adds both the 2*PREP_BIAS offset and the rounding constant
// in one step; the preceding smax keeps that difference non-negative. sshl
// with a negative count is a truncating right shift.
.macro avg d0, d1, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
        sqadd           \t0\().8h,  \t0\().8h,  \t2\().8h
        sqadd           \t1\().8h,  \t1\().8h,  \t3\().8h
        smax            \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        smax            \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        sqsub           \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        sqsub           \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        sshl            \d0\().8h,  \t0\().8h,  v29.8h // -(intermediate_bits+1)
        sshl            \d1\().8h,  \t1\().8h,  v29.8h // -(intermediate_bits+1)
.endm

// w_avg d0, d1, t0, t1, t2, t3
//
// Weighted average of 16 intermediate values from each of two sources.
//
// Operands:
//   d0, d1  destination vectors (8 halfwords each); also used as 32-bit
//           temporaries on the way
//   t0..t3  scratch vectors; t0/t1 are loaded from [x2] and then overwritten
//           with 32-bit partial results, t2/t3 keep the [x3] data
// Expects (set up by bidir_fn for type w_avg):
//   v27.4s = negated weight (bidir_fn negates the value it receives in w6)
//   v28.8h = PREP_BIAS >> intermediate_bits
//   v29.8h = -intermediate_bits
//   v30.8h = 0
//   v31.8h = bitdepth_max
// Side effects: x2 and x3 are each post-incremented by 32 bytes.
//
// Per lane, with tmp1 from [x2] and tmp2 from [x3]:
//   x = low 16 bits of (tmp2 + (((tmp2 - tmp1) * v27) >> 4))
//   d = clamp(rounding_shift_right(x, intermediate_bits) + v28,
//             0, bitdepth_max)
.macro w_avg d0, d1, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
        // This difference requires a 17 bit range, and all bits are
        // significant for the following multiplication.
        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h
        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
        mul             \d0\().4s,  \d0\().4s,  v27.4s
        mul             \t0\().4s,  \t0\().4s,  v27.4s
        mul             \d1\().4s,  \d1\().4s,  v27.4s
        mul             \t1\().4s,  \t1\().4s,  v27.4s
        sshr            \d0\().4s,  \d0\().4s,  #4
        sshr            \t0\().4s,  \t0\().4s,  #4
        sshr            \d1\().4s,  \d1\().4s,  #4
        sshr            \t1\().4s,  \t1\().4s,  #4
        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h
        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
        uzp1            \d0\().8h,  \d0\().8h,  \t0\().8h // Same as xtn, xtn2
        uzp1            \d1\().8h,  \d1\().8h,  \t1\().8h // Ditto
        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
.endm

// mask d0, d1, t0, t1, t2, t3
//
// Blend 16 intermediate values from each of two sources using one weight
// byte per value, read from [x6].
//
// Operands:
//   d0, d1  destination vectors (8 halfwords each); also used as 32-bit
//           temporaries on the way
//   t0..t3  scratch vectors; t0/t1 are loaded from [x2] and then overwritten
//           with 32-bit partial results, t2/t3 keep the [x3] data
// Expects (set up by bidir_fn for type mask):
//   v28.8h = PREP_BIAS >> intermediate_bits
//   v29.8h = -intermediate_bits
//   v30.8h = 0
//   v31.8h = bitdepth_max
// Clobbers: v24, v25, v26, v27 (the negated mask bytes widened to 32 bit).
// Side effects: x2 and x3 are each post-incremented by 32 bytes, x6 by 16.
//
// Per lane, with tmp1 from [x2], tmp2 from [x3] and m from [x6]:
//   x = low 16 bits of (tmp2 + (((tmp2 - tmp1) * -m) >> 6))
//   d = clamp(rounding_shift_right(x, intermediate_bits) + v28,
//             0, bitdepth_max)
.macro mask d0, d1, t0, t1, t2, t3
        ld1             {v27.16b}, [x6],  16
        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
        neg             v27.16b, v27.16b
        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
        // Sign-extend the 16 negated mask bytes in two steps (8 -> 16 -> 32
        // bit) so that v24..v27 line up with the four 32-bit difference
        // vectors below.
        sxtl            v26.8h,  v27.8b
        sxtl2           v27.8h,  v27.16b
        sxtl            v24.4s,  v26.4h
        sxtl2           v25.4s,  v26.8h
        sxtl            v26.4s,  v27.4h
        sxtl2           v27.4s,  v27.8h
        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h
        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
        mul             \d0\().4s,  \d0\().4s,  v24.4s
        mul             \t0\().4s,  \t0\().4s,  v25.4s
        mul             \d1\().4s,  \d1\().4s,  v26.4s
        mul             \t1\().4s,  \t1\().4s,  v27.4s
        sshr            \d0\().4s,  \d0\().4s,  #6
        sshr            \t0\().4s,  \t0\().4s,  #6
        sshr            \d1\().4s,  \d1\().4s,  #6
        sshr            \t1\().4s,  \t1\().4s,  #6
        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h
        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
        uzp1            \d0\().8h,  \d0\().8h,  \t0\().8h  // Same as xtn, xtn2
        uzp1            \d1\().8h,  \d1\().8h,  \t1\().8h  // Ditto
        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
.endm

// bidir_fn type, bdmax
//
// Emits the exported function <type>_16bpc_neon together with its jump table
// <type>_tbl. type names one of the kernel macros avg, w_avg or mask; bdmax
// names the w register that holds bitdepth_max on entry.
//
// Registers as used by the generated code:
//   x0      destination pointer (16-bit pixels)
//   x1      destination stride in bytes
//   x2, x3  the two sources of 16-bit intermediates, read linearly
//   w4      width; clz(w4) - 24 indexes the jump table
//           (128 -> entry 0, 64 -> 1, 32 -> 2, 16 -> 3, 8 -> 4, 4 -> 5)
//   w5      height (rows remaining)
//   w6      avg: bitdepth_max; w_avg: weight
//   x6      mask: pointer to the mask bytes, read linearly
//   w7      w_avg and mask: bitdepth_max (overwritten during setup)
// Scratch: x4, x7, x8, w9 and v0-v7, v16-v19, v24-v31 (not every type uses
// all of them).
//
// The first 16 output pixels are computed into v4/v5 before the indirect
// branch, and every loop computes the next v4/v5 at its tail, so each width
// path begins by consuming v4/v5.
//
// function, endfunc, movrel, jumptable, endjumptable and
// AARCH64_VALID_JUMP_TARGET are not defined in this file; presumably they
// come from the included src/arm/asm.S.
.macro bidir_fn type, bdmax
function \type\()_16bpc_neon, export=1
        clz             w4,  w4
.ifnc \type, avg
        // Must happen before w7 is overwritten below: for w_avg and mask the
        // bdmax register is w7 itself.
        dup             v31.8h,  \bdmax // bitdepth_max
        movi            v30.8h,  #0
.endif
        clz             w7,  \bdmax
        sub             w7,  w7,  #18   // intermediate_bits = clz(bitdepth_max) - 18
.ifc \type, avg
        mov             w9,  #1
        mov             w8,  #-2*PREP_BIAS
        lsl             w9,  w9,  w7    // 1 << intermediate_bits
        add             w7,  w7,  #1
        sub             w8,  w8,  w9    // -2*PREP_BIAS - 1 << intermediate_bits
        neg             w7,  w7         // -(intermediate_bits+1)
        dup             v28.8h,   w8    // -2*PREP_BIAS - 1 << intermediate_bits
        dup             v29.8h,   w7    // -(intermediate_bits+1)
.else
        mov             w8,  #PREP_BIAS
        lsr             w8,  w8,  w7    // PREP_BIAS >> intermediate_bits
        neg             w7,  w7         // -intermediate_bits
        dup             v28.8h,  w8     // PREP_BIAS >> intermediate_bits
        dup             v29.8h,  w7     // -intermediate_bits
.endif
.ifc \type, w_avg
        // The w_avg kernel multiplies (tmp2 - tmp1) by v27, so it is given
        // the negated weight.
        dup             v27.4s,  w6
        neg             v27.4s,  v27.4s
.endif
        movrel          x7,  \type\()_tbl
        sub             w4,  w4,  #24
        // Compute the first 16 pixels while the table entry is being loaded.
        \type           v4,  v5,  v0,  v1,  v2,  v3
        ldrsw           x4,  [x7, x4, lsl #2]
        add             x7,  x7,  x4
        br              x7
40:
        AARCH64_VALID_JUMP_TARGET
        // Width 4: v4/v5 hold four rows of four pixels. x0 walks the even
        // rows and x7 the odd rows, each stepping by two strides.
        // NOTE(review): four rows are stored per iteration, so this appears
        // to rely on the height being a multiple of 4 - confirm with callers.
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
4:
        subs            w5,  w5,  #4
        st1             {v4.8b},    [x0], x1
        st1             {v4.d}[1],  [x7], x1
        st1             {v5.8b},    [x0], x1
        st1             {v5.d}[1],  [x7], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               4b
80:
        AARCH64_VALID_JUMP_TARGET
        // Width 8: v4 is one row and v5 the next; two rows per iteration.
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
8:
        st1             {v4.8h},  [x0], x1
        subs            w5,  w5,  #2
        st1             {v5.8h},  [x7], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               8b
160:
        AARCH64_VALID_JUMP_TARGET
        // Width 16: v4/v5 form one row, v6/v7 the next; two rows per
        // iteration through the single pointer x0.
16:
        \type           v6,  v7,  v0,  v1,  v2,  v3
        st1             {v4.8h, v5.8h}, [x0], x1
        subs            w5,  w5,  #2
        st1             {v6.8h, v7.8h}, [x0], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               16b
320:
        AARCH64_VALID_JUMP_TARGET
        // Width 32: v4-v7 form one row; one row per iteration.
32:
        \type           v6,  v7,  v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               32b
640:
        AARCH64_VALID_JUMP_TARGET
        // Width 64: x0 writes the first 64 bytes of the row and x7 the
        // second 64 bytes; one row per iteration.
        add             x7,  x0,  #64
64:
        \type           v6,  v7,  v0,  v1,  v2,  v3
        \type           v16, v17, v0,  v1,  v2,  v3
        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
        \type           v18, v19, v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               64b
1280:
        AARCH64_VALID_JUMP_TARGET
        // Width 128 (256 bytes per row): x0 writes bytes 0-63 and 128-191,
        // x7 writes bytes 64-127 and 192-255. The first store of each pair
        // advances by x8 = 128; x1 is reduced by 128 so that the second
        // store lands each pointer on the next row.
        add             x7,  x0,  #64
        mov             x8,  #128
        sub             x1,  x1,  #128
128:
        \type           v6,  v7,  v0,  v1,  v2,  v3
        \type           v16, v17, v0,  v1,  v2,  v3
        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x8
        \type           v18, v19, v0,  v1,  v2,  v3
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8
        \type           v4,  v5,  v0,  v1,  v2,  v3
        \type           v6,  v7,  v0,  v1,  v2,  v3
        \type           v16, v17, v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
        \type           v18, v19, v0,  v1,  v2,  v3
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               128b
0:
        ret
endfunc

// Offsets of the width paths relative to the table itself, ordered from the
// widest (index 0) to the narrowest, matching the clz(w4) - 24 index.
jumptable \type\()_tbl
        .word 1280b - \type\()_tbl
        .word 640b  - \type\()_tbl
        .word 320b  - \type\()_tbl
        .word 160b  - \type\()_tbl
        .word 80b   - \type\()_tbl
        .word 40b   - \type\()_tbl
endjumptable
.endm

// Instantiate avg_16bpc_neon, w_avg_16bpc_neon and mask_16bpc_neon. The
// second argument is the register that carries bitdepth_max for that kernel.
bidir_fn avg, w6
bidir_fn w_avg, w7
bidir_fn mask, w7


// w_mask_fn type
//
// Emits the exported function w_mask_<type>_16bpc_neon together with its
// jump table, for type = 444, 422 or 420. The function blends two sources of
// 16-bit intermediates with a weight derived from their absolute difference
// and also writes that weight out as a mask.
//
// Registers as used by the generated code:
//   x0      destination pointer (16-bit pixels)
//   x1      destination stride in bytes
//   x2      tmp1, read linearly
//   x3      tmp2, read linearly
//   w4      width; clz(w4) - 24 indexes the jump table
//   w5      height (rows remaining)
//   x6      mask output pointer
//   w7      sign (read by the 422 and 420 variants only)
//   [sp]    bitdepth_max (32-bit load from the top of the stack)
// Constants kept in vector registers:
//   v0.8h  = 27615
//   v1.16b = 64                      (444)
//   v3     = 129 - sign as bytes     (422), 256 - sign as halfwords (420)
//   v29.4s = -sh, with sh = clz(bitdepth_max) - 12
//   v30.4s = PREP_BIAS*64
//   v31.8h = bitdepth_max
// Scratch: w8, w9, x7, x9, x10, w11, x12 and v2, v4-v7, v16-v28.
//
// Per pixel:
//   64 - m = (27615 - abs(tmp1 - tmp2)) >> 10     (subtraction saturates at 0)
//   dst    = min(sat_u16(rounding_shift_right(
//                (tmp1 << 6) + (tmp2 - tmp1) * (64 - m) + PREP_BIAS*64, sh)),
//                bitdepth_max)
// Mask output:
//   444: one byte m per pixel
//   422: one byte per horizontal pair,
//        ((129 - sign) - ((64 - m) + (64 - n))) >> 1
//   420: one byte per 2x2 group,
//        ((256 - sign) - (sum of the four (64 - m)) + 2) >> 2
//
// function, endfunc, movrel, jumptable, endjumptable and
// AARCH64_VALID_JUMP_TARGET are not defined in this file; presumably they
// come from the included src/arm/asm.S.
.macro w_mask_fn type
function w_mask_\type\()_16bpc_neon, export=1
        ldr             w8,  [sp]
        clz             w9,  w4
        movrel          x10, w_mask_\type\()_tbl
        dup             v31.8h,  w8   // bitdepth_max
        sub             w9,  w9,  #24
        clz             w8,  w8       // clz(bitdepth_max)
        ldrsw           x9,  [x10,  x9,  lsl #2]
        add             x10, x10, x9
        sub             w8,  w8,  #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
        mov             w9,  #PREP_BIAS*64
        neg             w8,  w8       // -sh
        mov             w11, #27615   // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
        dup             v30.4s,  w9   // PREP_BIAS*64
        dup             v29.4s,  w8   // -sh
        dup             v0.8h,   w11
.if \type == 444
        movi            v1.16b,  #64
.elseif \type == 422
        dup             v2.8b,   w7
        movi            v3.8b,   #129
        sub             v3.8b,   v3.8b,   v2.8b
.elseif \type == 420
        dup             v2.8h,   w7
        movi            v3.8h,   #1, lsl #8
        sub             v3.8h,   v3.8h,   v2.8h
.endif
        // All paths handle two destination rows at a time: x0 walks the even
        // rows, x12 the odd rows, and x1 becomes twice the stride.
        add             x12,  x0,  x1
        lsl             x1,   x1,  #1
        br              x10
40:
        AARCH64_VALID_JUMP_TARGET
4:
        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
        subs            w5,  w5,  #4
        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
        sabd            v21.8h,  v5.8h,   v7.8h
        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v17.4s,  v6.8h,   v4.8h
        ssubl           v18.4s,  v7.4h,   v5.4h
        ssubl2          v19.4s,  v7.8h,   v5.8h
        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
        uqsub           v21.8h,  v0.8h,   v21.8h
        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
        sshll           v6.4s,   v5.4h,   #6
        sshll2          v5.4s,   v4.8h,   #6
        sshll           v4.4s,   v4.4h,   #6
        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h,  v21.8h,  #10
        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
        add             v5.4s,   v5.4s,   v30.4s
        add             v6.4s,   v6.4s,   v30.4s
        add             v7.4s,   v7.4s,   v30.4s
        uxtl            v22.4s,  v20.4h
        uxtl2           v23.4s,  v20.8h
        uxtl            v24.4s,  v21.4h
        uxtl2           v25.4s,  v21.8h
        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
        mla             v5.4s,   v17.4s,  v23.4s
        mla             v6.4s,   v18.4s,  v24.4s
        mla             v7.4s,   v19.4s,  v25.4s
        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s,   v5.4s,   v29.4s
        srshl           v6.4s,   v6.4s,   v29.4s
        srshl           v7.4s,   v7.4s,   v29.4s
        sqxtun          v4.4h,   v4.4s            // iclip_pixel
        sqxtun2         v4.8h,   v5.4s
        sqxtun          v5.4h,   v6.4s
        sqxtun2         v5.8h,   v7.4s
        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
        umin            v5.8h,   v5.8h,   v31.8h
.if \type == 444
        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
        sub             v20.16b, v1.16b,  v20.16b // m
        st1             {v20.16b}, [x6], #16
.elseif \type == 422
        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
        xtn             v20.8b,  v20.8h
        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
        st1             {v20.8b}, [x6], #8
.elseif \type == 420
        // v20 holds rows 0 and 1, v21 rows 2 and 3; regroup so that rows
        // 0/2 and rows 1/3 sit in matching lanes before the row-wise add.
        trn1            v24.2d,  v20.2d,  v21.2d
        trn2            v25.2d,  v20.2d,  v21.2d
        add             v24.8h,  v24.8h,  v25.8h  // (64 - my1) + (64 - my2) (row wise addition)
        addp            v20.8h,  v24.8h,  v24.8h  // (128 - m) + (128 - n) (column wise addition)
        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        str             s20,        [x6],  #4
.endif
        st1             {v4.8b},    [x0],  x1
        st1             {v4.d}[1],  [x12], x1
        st1             {v5.8b},    [x0],  x1
        st1             {v5.d}[1],  [x12], x1
        // Flags are still those of the subs on w5 at the top of the loop.
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
8:
        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1
        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2
        subs            w5,  w5,  #2
        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
        sabd            v21.8h,  v5.8h,   v7.8h
        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v17.4s,  v6.8h,   v4.8h
        ssubl           v18.4s,  v7.4h,   v5.4h
        ssubl2          v19.4s,  v7.8h,   v5.8h
        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
        uqsub           v21.8h,  v0.8h,   v21.8h
        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
        sshll           v6.4s,   v5.4h,   #6
        sshll2          v5.4s,   v4.8h,   #6
        sshll           v4.4s,   v4.4h,   #6
        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h,  v21.8h,  #10
        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
        add             v5.4s,   v5.4s,   v30.4s
        add             v6.4s,   v6.4s,   v30.4s
        add             v7.4s,   v7.4s,   v30.4s
        uxtl            v22.4s,  v20.4h
        uxtl2           v23.4s,  v20.8h
        uxtl            v24.4s,  v21.4h
        uxtl2           v25.4s,  v21.8h
        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
        mla             v5.4s,   v17.4s,  v23.4s
        mla             v6.4s,   v18.4s,  v24.4s
        mla             v7.4s,   v19.4s,  v25.4s
        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s,   v5.4s,   v29.4s
        srshl           v6.4s,   v6.4s,   v29.4s
        srshl           v7.4s,   v7.4s,   v29.4s
        sqxtun          v4.4h,   v4.4s            // iclip_pixel
        sqxtun2         v4.8h,   v5.4s
        sqxtun          v5.4h,   v6.4s
        sqxtun2         v5.8h,   v7.4s
        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
        umin            v5.8h,   v5.8h,   v31.8h
.if \type == 444
        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
        sub             v20.16b, v1.16b,  v20.16b // m
        st1             {v20.16b}, [x6], #16
.elseif \type == 422
        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
        xtn             v20.8b,  v20.8h
        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
        st1             {v20.8b}, [x6], #8
.elseif \type == 420
        add             v20.8h,  v20.8h,  v21.8h  // (64 - my1) + (64 - my2) (row wise addition)
        addp            v20.8h,  v20.8h,  v20.8h  // (128 - m) + (128 - n) (column wise addition)
        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        str             s20,     [x6],  #4
.endif
        st1             {v4.8h}, [x0],  x1
        st1             {v5.8h}, [x12], x1
        // Flags are still those of the subs on w5 at the top of the loop.
        b.gt            8b
        ret
1280:
640:
320:
160:
        AARCH64_VALID_JUMP_TARGET
        // Widths of 16 and above share one path: an inner loop walks two
        // rows in steps of 16 pixels, an outer loop steps two rows down.
        mov             w11, w4
        // The inner loop post-increments x0/x12 across the whole row, so
        // take the row width in bytes off the (already doubled) stride.
        sub             x1,  x1,  w4,  uxtw #1
        // x10 = mask pointer for the second row; 420 emits a single mask
        // row per two pixel rows and so does not need it.
.if \type == 444
        add             x10, x6,  w4,  uxtw
.elseif \type == 422
        add             x10, x6,  x11, lsr #1
.endif
        // x7/x9 = tmp1/tmp2 pointers for the second row.
        add             x9,  x3,  w4,  uxtw #1
        add             x7,  x2,  w4,  uxtw #1
161:
        mov             w8,  w4
16:
        ld1             {v4.8h,   v5.8h},  [x2], #32 // tmp1
        ld1             {v16.8h,  v17.8h}, [x3], #32 // tmp2
        ld1             {v6.8h,   v7.8h},  [x7], #32
        ld1             {v18.8h,  v19.8h}, [x9], #32
        subs            w8,  w8,  #16
        sabd            v20.8h,  v4.8h,   v16.8h  // abs(tmp1 - tmp2)
        sabd            v21.8h,  v5.8h,   v17.8h
        ssubl           v22.4s,  v16.4h,  v4.4h   // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v23.4s,  v16.8h,  v4.8h
        ssubl           v24.4s,  v17.4h,  v5.4h
        ssubl2          v25.4s,  v17.8h,  v5.8h
        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
        uqsub           v21.8h,  v0.8h,   v21.8h
        sshll2          v27.4s,  v5.8h,   #6      // tmp1 << 6
        sshll           v26.4s,  v5.4h,   #6
        sshll2          v5.4s,   v4.8h,   #6
        sshll           v4.4s,   v4.4h,   #6
        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h,  v21.8h,  #10
        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
        add             v5.4s,   v5.4s,   v30.4s
        add             v26.4s,  v26.4s,  v30.4s
        add             v27.4s,  v27.4s,  v30.4s
        uxtl            v16.4s,  v20.4h
        uxtl2           v17.4s,  v20.8h
        uxtl            v28.4s,  v21.4h
        mla             v4.4s,   v22.4s,  v16.4s  // (tmp2-tmp1)*(64-m)
        uxtl2           v16.4s,  v21.8h
        mla             v5.4s,   v23.4s,  v17.4s
        mla             v26.4s,  v24.4s,  v28.4s
        mla             v27.4s,  v25.4s,  v16.4s
        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s,   v5.4s,   v29.4s
        srshl           v26.4s,  v26.4s,  v29.4s
        srshl           v27.4s,  v27.4s,  v29.4s
        sqxtun          v4.4h,   v4.4s            // iclip_pixel
        sqxtun2         v4.8h,   v5.4s
        sqxtun          v5.4h,   v26.4s
        sqxtun2         v5.8h,   v27.4s

        // Start of other half
        sabd            v22.8h,  v6.8h,   v18.8h  // abs(tmp1 - tmp2)
        sabd            v23.8h,  v7.8h,   v19.8h

        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
        umin            v5.8h,   v5.8h,   v31.8h

        ssubl           v16.4s,  v18.4h,  v6.4h   // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v17.4s,  v18.8h,  v6.8h
        ssubl           v18.4s,  v19.4h,  v7.4h
        ssubl2          v19.4s,  v19.8h,  v7.8h
        uqsub           v22.8h,  v0.8h,   v22.8h  // 27615 - abs()
        uqsub           v23.8h,  v0.8h,   v23.8h
        sshll           v24.4s,  v6.4h,   #6      // tmp1 << 6
        sshll2          v25.4s,  v6.8h,   #6
        sshll           v26.4s,  v7.4h,   #6
        sshll2          v27.4s,  v7.8h,   #6
        ushr            v22.8h,  v22.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
        ushr            v23.8h,  v23.8h,  #10
        add             v24.4s,  v24.4s,  v30.4s  // += PREP_BIAS*64
        add             v25.4s,  v25.4s,  v30.4s
        add             v26.4s,  v26.4s,  v30.4s
        add             v27.4s,  v27.4s,  v30.4s
        uxtl            v6.4s,   v22.4h
        uxtl2           v7.4s,   v22.8h
        uxtl            v28.4s,  v23.4h
        mla             v24.4s,  v16.4s,  v6.4s   // (tmp2-tmp1)*(64-m)
        uxtl2           v6.4s,   v23.8h
        mla             v25.4s,  v17.4s,  v7.4s
        mla             v26.4s,  v18.4s,  v28.4s
        mla             v27.4s,  v19.4s,  v6.4s
        srshl           v24.4s,  v24.4s,  v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v25.4s,  v25.4s,  v29.4s
        srshl           v26.4s,  v26.4s,  v29.4s
        srshl           v27.4s,  v27.4s,  v29.4s
        sqxtun          v6.4h,   v24.4s           // iclip_pixel
        sqxtun2         v6.8h,   v25.4s
        sqxtun          v7.4h,   v26.4s
        sqxtun2         v7.8h,   v27.4s
        umin            v6.8h,   v6.8h,   v31.8h  // iclip_pixel
        umin            v7.8h,   v7.8h,   v31.8h
.if \type == 444
        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
        uzp1            v21.16b, v22.16b, v23.16b
        sub             v20.16b, v1.16b,  v20.16b // m
        sub             v21.16b, v1.16b,  v21.16b
        st1             {v20.16b}, [x6],  #16
        st1             {v21.16b}, [x10], #16
.elseif \type == 422
        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
        addp            v21.8h,  v22.8h,  v23.8h
        xtn             v20.8b,  v20.8h
        xtn             v21.8b,  v21.8h
        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
        uhsub           v21.8b,  v3.8b,   v21.8b
        st1             {v20.8b}, [x6],  #8
        st1             {v21.8b}, [x10], #8
.elseif \type == 420
        add             v20.8h,  v20.8h,  v22.8h  // (64 - my1) + (64 - my2) (row wise addition)
        add             v21.8h,  v21.8h,  v23.8h
        addp            v20.8h,  v20.8h,  v21.8h  // (128 - m) + (128 - n) (column wise addition)
        sub             v20.8h,  v3.8h,   v20.8h  // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        st1             {v20.8b}, [x6], #8
.endif
        st1             {v4.8h, v5.8h}, [x0],  #32
        st1             {v6.8h, v7.8h}, [x12], #32
        // Flags are still those of the subs on w8 (columns left in this
        // pair of rows).
        b.gt            16b
        subs            w5,  w5,  #2
        // Each source pointer has walked one row; skip the row that the
        // other pointer of the pair has just consumed.
        add             x2,  x2,  w4,  uxtw #1
        add             x3,  x3,  w4,  uxtw #1
        add             x7,  x7,  w4,  uxtw #1
        add             x9,  x9,  w4,  uxtw #1
.if \type == 444
        add             x6,  x6,  w4,  uxtw
        add             x10, x10, w4,  uxtw
.elseif \type == 422
        add             x6,  x6,  x11, lsr #1
        add             x10, x10, x11, lsr #1
.endif
        add             x0,  x0,  x1
        add             x12, x12, x1
        // The adds above leave the flags from the subs on w5 untouched.
        b.gt            161b
        ret
endfunc

// Offsets of the width paths relative to the table itself, ordered from the
// widest (index 0) to the narrowest, matching the clz(w4) - 24 index. The
// four widest entries all resolve to the shared wide path.
jumptable w_mask_\type\()_tbl
        .word 1280b - w_mask_\type\()_tbl
        .word 640b  - w_mask_\type\()_tbl
        .word 320b  - w_mask_\type\()_tbl
        .word 160b  - w_mask_\type\()_tbl
        .word 80b   - w_mask_\type\()_tbl
        .word 40b   - w_mask_\type\()_tbl
endjumptable
.endm

// Instantiate w_mask_444_16bpc_neon, w_mask_422_16bpc_neon and
// w_mask_420_16bpc_neon; the argument selects the mask output layout.
w_mask_fn 444
w_mask_fn 422
w_mask_fn 420


// blend_16bpc_neon
//
// Blends a block of 16-bit values into the destination in place, using one
// mask byte per pixel.
//
// Registers as used by this function:
//   x0  destination pointer (16-bit pixels, read and written)
//   x1  destination stride in bytes
//   x2  second source (16-bit values), read linearly
//   w3  width; clz(w3) - 26 indexes blend_tbl
//       (32 -> entry 0, 16 -> 1, 8 -> 2, 4 -> 3)
//   w4  height (rows remaining)
//   x5  mask bytes, read linearly
// Scratch: x6, x8 and v0-v7, v16-v19.
//
// Per pixel, with a = dst, b = the value from [x2] and m = the mask byte:
//   dst = a + (((a - b) * -m + 32) >> 6)
// sqrdmulh yields (2*x*y + 0x8000) >> 16, so multiplying by (-m << 9) gives
// exactly that rounded shift by 6.
//
// The widths 4, 8 and 16 handle two rows per iteration (x8 = second row,
// x1 doubled); width 32 handles one row per iteration.
function blend_16bpc_neon, export=1
        movrel          x6,  blend_tbl
        clz             w3,  w3
        sub             w3,  w3,  #26
        ldrsw           x3,  [x6,  x3,  lsl #2]
        add             x6,  x6,  x3
        add             x8,  x0,  x1
        br              x6
40:
        AARCH64_VALID_JUMP_TARGET
        lsl             x1,  x1,  #1
4:
        ld1             {v2.8b},   [x5], #8
        ld1             {v1.8h},   [x2], #16
        ldr             d0,        [x0]
        neg             v2.8b,   v2.8b            // -m
        subs            w4,  w4,  #2
        ld1             {v0.d}[1], [x8]
        sxtl            v2.8h,   v2.8b
        shl             v2.8h,   v2.8h,   #9      // -m << 9
        sub             v1.8h,   v0.8h,   v1.8h   // a - b
        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
        add             v0.8h,   v0.8h,   v1.8h
        st1             {v0.8b},   [x0], x1
        st1             {v0.d}[1], [x8], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        lsl             x1,  x1,  #1
8:
        ld1             {v4.16b},       [x5], #16
        ld1             {v2.8h, v3.8h}, [x2], #32
        neg             v5.16b,  v4.16b           // -m
        ld1             {v0.8h},   [x0]
        ld1             {v1.8h},   [x8]
        sxtl            v4.8h,   v5.8b
        sxtl2           v5.8h,   v5.16b
        shl             v4.8h,   v4.8h,   #9      // -m << 9
        shl             v5.8h,   v5.8h,   #9
        sub             v2.8h,   v0.8h,   v2.8h   // a - b
        sub             v3.8h,   v1.8h,   v3.8h
        subs            w4,  w4,  #2
        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h,   v3.8h,   v5.8h
        add             v0.8h,   v0.8h,   v2.8h
        add             v1.8h,   v1.8h,   v3.8h
        st1             {v0.8h}, [x0], x1
        st1             {v1.8h}, [x8], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        lsl             x1,  x1,  #1
16:
        ld1             {v16.16b, v17.16b},           [x5], #32
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        subs            w4,  w4,  #2
        neg             v18.16b, v16.16b          // -m
        neg             v19.16b, v17.16b
        ld1             {v0.8h, v1.8h}, [x0]
        sxtl            v16.8h,  v18.8b
        sxtl2           v17.8h,  v18.16b
        sxtl            v18.8h,  v19.8b
        sxtl2           v19.8h,  v19.16b
        ld1             {v2.8h, v3.8h}, [x8]
        shl             v16.8h,  v16.8h,  #9      // -m << 9
        shl             v17.8h,  v17.8h,  #9
        shl             v18.8h,  v18.8h,  #9
        shl             v19.8h,  v19.8h,  #9
        sub             v4.8h,   v0.8h,   v4.8h   // a - b
        sub             v5.8h,   v1.8h,   v5.8h
        sub             v6.8h,   v2.8h,   v6.8h
        sub             v7.8h,   v3.8h,   v7.8h
        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h,   v5.8h,   v17.8h
        sqrdmulh        v6.8h,   v6.8h,   v18.8h
        sqrdmulh        v7.8h,   v7.8h,   v19.8h
        add             v0.8h,   v0.8h,   v4.8h
        add             v1.8h,   v1.8h,   v5.8h
        add             v2.8h,   v2.8h,   v6.8h
        add             v3.8h,   v3.8h,   v7.8h
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v2.8h, v3.8h}, [x8], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        // One full row per iteration: x1 stays at the single-row stride and
        // x8 is not used on this path.
32:
        ld1             {v16.16b, v17.16b},           [x5], #32
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        subs            w4,  w4,  #1
        neg             v18.16b, v16.16b          // -m
        neg             v19.16b, v17.16b
        sxtl            v16.8h,  v18.8b
        sxtl2           v17.8h,  v18.16b
        sxtl            v18.8h,  v19.8b
        sxtl2           v19.8h,  v19.16b
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        shl             v16.8h,  v16.8h,  #9      // -m << 9
        shl             v17.8h,  v17.8h,  #9
        shl             v18.8h,  v18.8h,  #9
        shl             v19.8h,  v19.8h,  #9
        sub             v4.8h,   v0.8h,   v4.8h   // a - b
        sub             v5.8h,   v1.8h,   v5.8h
        sub             v6.8h,   v2.8h,   v6.8h
        sub             v7.8h,   v3.8h,   v7.8h
        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h,   v5.8h,   v17.8h
        sqrdmulh        v6.8h,   v6.8h,   v18.8h
        sqrdmulh        v7.8h,   v7.8h,   v19.8h
        add             v0.8h,   v0.8h,   v4.8h
        add             v1.8h,   v1.8h,   v5.8h
        add             v2.8h,   v2.8h,   v6.8h
        add             v3.8h,   v3.8h,   v7.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        b.gt            32b
        ret
endfunc

// Jump table for blend_16bpc_neon: offsets of the width paths relative to
// the table itself, ordered from width 32 (index 0) down to width 4
// (index 3), matching the clz(w3) - 26 index computed by the function.
jumptable blend_tbl
        .word 320b - blend_tbl
        .word 160b - blend_tbl
        .word 80b  - blend_tbl
        .word 40b  - blend_tbl
endjumptable

// Blends the 16 bit rows at x0 in place with the values streamed from x2,
// using one obmc_masks byte per row (every pixel of a row gets the same weight).
// Register use, as established by the code below:
//   x0  first row, read and written. x8 = x0 + x1 is the row after it
//   x1  row stride in bytes (doubled below, two rows are handled per pass)
//   x2  second operand, read as packed rows of w3 pixels
//   w3  width. clz(w3) - 24 indexes blend_h_tbl (128 -> entry 0, 2 -> entry 6)
//   w4  height. The mask bytes start at obmc_masks + w4 and only
//       w4 - (w4 >> 2) rows are processed
// Per pixel: a += sqrdmulh(a - b, -m << 9), which equals
// a + (((a - b) * -m + 32) >> 6).
// Modifies x0-x2, w4, x5-x8 and the vector registers of the selected case.
function blend_h_16bpc_neon, export=1
        movrel          x6,  blend_h_tbl
        movrel          x5,  X(obmc_masks)
        add             x5,  x5,  w4,  uxtw
        sub             w4,  w4,  w4,  lsr #2
        clz             w7,  w3
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        sub             w7,  w7,  #24
        ldrsw           x7,  [x6,  x7,  lsl #2]
        add             x6,  x6,  x7
        br              x6
// Width 2: two rows of two pixels per iteration, held in the low half of v0.
20:
        AARCH64_VALID_JUMP_TARGET
2:
        // v2 = mask of the first row in every byte, v3 = mask of the second row
        ld2r            {v2.8b, v3.8b}, [x5], #2
        ld1             {v1.4h},        [x2], #8
        // Low four bytes become m0 m0 m1 m1, matching the pixel layout of v0
        ext             v2.8b,   v2.8b,   v3.8b,   #6
        subs            w4,  w4,  #2
        neg             v2.8b,   v2.8b            // -m
        ldr             s0,        [x0]
        ld1             {v0.s}[1], [x8]
        sxtl            v2.8h,   v2.8b
        shl             v2.4h,   v2.4h,   #9      // -m << 9
        sub             v1.4h,   v0.4h,   v1.4h   // a - b
        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
        add             v0.4h,   v0.4h,   v1.4h
        st1             {v0.s}[0], [x0], x1
        st1             {v0.s}[1], [x8], x1
        b.gt            2b
        ret
// Width 4: two rows of four pixels per iteration, one row per half of v0.
40:
        AARCH64_VALID_JUMP_TARGET
4:
        ld2r            {v2.8b, v3.8b}, [x5], #2
        ld1             {v1.8h},        [x2], #16
        // Bytes become 4 x m0 followed by 4 x m1
        ext             v2.8b,   v2.8b,   v3.8b,   #4
        subs            w4,  w4,  #2
        neg             v2.8b,   v2.8b            // -m
        ldr             d0,          [x0]
        ld1             {v0.d}[1],   [x8]
        sxtl            v2.8h,   v2.8b
        shl             v2.8h,   v2.8h,   #9      // -m << 9
        sub             v1.8h,   v0.8h,   v1.8h   // a - b
        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
        add             v0.8h,   v0.8h,   v1.8h
        st1             {v0.8b},   [x0], x1
        st1             {v0.d}[1], [x8], x1
        b.gt            4b
        ret
// Width 8: v0/v2/v4 belong to the first row, v1/v3/v5 to the second row.
80:
        AARCH64_VALID_JUMP_TARGET
8:
        ld2r            {v4.8b, v5.8b}, [x5], #2
        ld1             {v2.8h, v3.8h}, [x2], #32
        neg             v4.8b,   v4.8b            // -m
        neg             v5.8b,   v5.8b
        ld1             {v0.8h}, [x0]
        subs            w4,  w4,  #2
        sxtl            v4.8h,   v4.8b
        sxtl            v5.8h,   v5.8b
        ld1             {v1.8h}, [x8]
        shl             v4.8h,   v4.8h,   #9      // -m << 9
        shl             v5.8h,   v5.8h,   #9
        sub             v2.8h,   v0.8h,   v2.8h   // a - b
        sub             v3.8h,   v1.8h,   v3.8h
        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h,   v3.8h,   v5.8h
        add             v0.8h,   v0.8h,   v2.8h
        add             v1.8h,   v1.8h,   v3.8h
        st1             {v0.8h}, [x0], x1
        st1             {v1.8h}, [x8], x1
        b.gt            8b
        ret
// Width 16: v0-v1 (first row) use mask v16, v2-v3 (second row) use mask v17.
160:
        AARCH64_VALID_JUMP_TARGET
16:
        ld2r            {v16.8b, v17.8b}, [x5], #2
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        neg             v16.8b,  v16.8b           // -m
        neg             v17.8b,  v17.8b
        ld1             {v0.8h, v1.8h},  [x0]
        ld1             {v2.8h, v3.8h},  [x8]
        subs            w4,  w4,  #2
        sxtl            v16.8h,  v16.8b
        sxtl            v17.8h,  v17.8b
        shl             v16.8h,  v16.8h,  #9      // -m << 9
        shl             v17.8h,  v17.8h,  #9
        sub             v4.8h,   v0.8h,   v4.8h   // a - b
        sub             v5.8h,   v1.8h,   v5.8h
        sub             v6.8h,   v2.8h,   v6.8h
        sub             v7.8h,   v3.8h,   v7.8h
        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h,   v5.8h,   v16.8h
        sqrdmulh        v6.8h,   v6.8h,   v17.8h
        sqrdmulh        v7.8h,   v7.8h,   v17.8h
        add             v0.8h,   v0.8h,   v4.8h
        add             v1.8h,   v1.8h,   v5.8h
        add             v2.8h,   v2.8h,   v6.8h
        add             v3.8h,   v3.8h,   v7.8h
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v2.8h, v3.8h}, [x8], x1
        b.gt            16b
        ret
// Widths 32, 64 and 128 share one path: the outer loop (321) steps over row
// pairs, the inner loop (32) walks each pair in chunks of 32 pixels.
1280:
640:
320:
        AARCH64_VALID_JUMP_TARGET
        // x0/x8 advance by w3*2 bytes inside the inner loop, so remove that
        // amount from the (already doubled) stride that is added afterwards.
        sub             x1,  x1,  w3,  uxtw #1
        // x7 reads the second row of the packed operand, w3*2 bytes after x2
        add             x7,  x2,  w3,  uxtw #1
321:
        ld2r            {v24.8b, v25.8b}, [x5], #2
        mov             w6,  w3                   // w6 = pixels left in this row pair
        neg             v24.8b,  v24.8b           // -m
        neg             v25.8b,  v25.8b
        sxtl            v24.8h,  v24.8b
        sxtl            v25.8h,  v25.8b
        shl             v24.8h,  v24.8h,  #9      // -m << 9
        shl             v25.8h,  v25.8h,  #9
32:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0]
        subs            w6,  w6,  #32
        sub             v16.8h,  v0.8h,   v16.8h  // a - b
        sub             v17.8h,  v1.8h,   v17.8h
        sub             v18.8h,  v2.8h,   v18.8h
        sub             v19.8h,  v3.8h,   v19.8h
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8]
        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v17.8h,  v17.8h,  v24.8h
        sqrdmulh        v18.8h,  v18.8h,  v24.8h
        sqrdmulh        v19.8h,  v19.8h,  v24.8h
        sub             v20.8h,  v4.8h,   v20.8h  // a - b
        sub             v21.8h,  v5.8h,   v21.8h
        sub             v22.8h,  v6.8h,   v22.8h
        sub             v23.8h,  v7.8h,   v23.8h
        add             v0.8h,   v0.8h,   v16.8h
        add             v1.8h,   v1.8h,   v17.8h
        add             v2.8h,   v2.8h,   v18.8h
        add             v3.8h,   v3.8h,   v19.8h
        sqrdmulh        v20.8h,  v20.8h,  v25.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v21.8h,  v21.8h,  v25.8h
        sqrdmulh        v22.8h,  v22.8h,  v25.8h
        sqrdmulh        v23.8h,  v23.8h,  v25.8h
        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], #64
        add             v4.8h,   v4.8h,   v20.8h
        add             v5.8h,   v5.8h,   v21.8h
        add             v6.8h,   v6.8h,   v22.8h
        add             v7.8h,   v7.8h,   v23.8h
        st1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8], #64
        b.gt            32b
        // The adds below leave the flags from this subs intact for b.gt 321b
        subs            w4,  w4,  #2
        add             x0,  x0,  x1
        add             x8,  x8,  x1
        // Each operand pointer skips the row that the other pointer consumed
        add             x2,  x2,  w3,  uxtw #1
        add             x7,  x7,  w3,  uxtw #1
        b.gt            321b
        ret
endfunc

// Dispatch table of blend_h_16bpc_neon, indexed by clz(w) - 24 and read with
// ldrsw. Entry 0 is the 1280 label and entry 6 the 20 label. Each entry is
// the distance from blend_h_tbl to the label.
jumptable blend_h_tbl
        .word 1280b - blend_h_tbl
        .word 640b  - blend_h_tbl
        .word 320b  - blend_h_tbl
        .word 160b  - blend_h_tbl
        .word 80b   - blend_h_tbl
        .word 40b   - blend_h_tbl
        .word 20b   - blend_h_tbl
endjumptable

// Blends the 16 bit rows at x0 in place with the values streamed from x2,
// using per-column weights read once from obmc_masks + w3 and reused for
// every row.
// Register use, as established by the code below:
//   x0  first row, read and written. x8 = x0 + x1 is the row after it
//   x1  row stride in bytes (doubled below, two rows per iteration)
//   x2  second operand, read as packed rows of w3 pixels
//   w3  width. clz(w3) - 26 indexes blend_v_tbl (32 -> entry 0, 2 -> entry 4)
//   w4  row count, decremented by 2 per iteration
// Only the leftmost 1, 3, 6, 12 or 24 pixels of a row are stored for the
// widths 2, 4, 8, 16 and 32 respectively.
// Per pixel: a += sqrdmulh(a - b, -m << 9), which equals
// a + (((a - b) * -m + 32) >> 6).
function blend_v_16bpc_neon, export=1
        movrel          x6,  blend_v_tbl
        movrel          x5,  X(obmc_masks)
        add             x5,  x5,  w3,  uxtw
        clz             w3,  w3
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        sub             w3,  w3,  #26
        ldrsw           x3,  [x6,  x3,  lsl #2]
        add             x6,  x6,  x3
        br              x6
// Width 2: a single mask byte, one pixel stored per row.
20:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v2.8b}, [x5]
        neg             v2.8b,   v2.8b            // -m
        sxtl            v2.8h,   v2.8b
        shl             v2.4h,   v2.4h,   #9      // -m << 9
2:
        // Lane 0 = first pixel of the first row, lane 1 = first pixel of the
        // second row. x2 advances by 4 bytes per row in total.
        ldr             s1,  [x2],  #4
        ldr             h0,  [x0]
        subs            w4,  w4,  #2
        ld1             {v1.h}[1], [x2]
        ld1             {v0.h}[1], [x8]
        add             x2,  x2,  #4
        sub             v1.4h,   v0.4h,   v1.4h   // a - b
        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
        add             v0.4h,   v0.4h,   v1.4h
        st1             {v0.h}[0], [x0],  x1
        st1             {v0.h}[1], [x8],  x1
        b.gt            2b
        ret
// Width 4: four mask bytes replicated into both halves, three pixels stored
// per row (a 32 bit store followed by a 16 bit store).
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v2.2s}, [x5]
        sub             x1,  x1,  #4              // the first store already adds 4
        neg             v2.8b,   v2.8b            // -m
        sxtl            v2.8h,   v2.8b
        shl             v2.8h,   v2.8h,   #9      // -m << 9
4:
        ld1             {v1.8h},   [x2], #16
        ldr             d0,        [x0]
        ld1             {v0.d}[1], [x8]
        subs            w4,  w4,  #2
        sub             v1.8h,   v0.8h,   v1.8h   // a - b
        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
        add             v0.8h,   v0.8h,   v1.8h
        str             s0,        [x0], #4
        st1             {v0.s}[2], [x8], #4
        st1             {v0.h}[2], [x0], x1
        st1             {v0.h}[6], [x8], x1
        b.gt            4b
        ret
// Width 8: eight mask bytes, six pixels stored per row (64 bit + 32 bit).
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v4.8b}, [x5]
        sub             x1,  x1,  #8              // the first store already adds 8
        neg             v4.8b,   v4.8b            // -m
        sxtl            v4.8h,   v4.8b
        shl             v4.8h,   v4.8h,   #9      // -m << 9
8:
        ld1             {v2.8h, v3.8h}, [x2], #32
        ld1             {v0.8h}, [x0]
        ld1             {v1.8h}, [x8]
        subs            w4,  w4,  #2
        sub             v2.8h,   v0.8h,   v2.8h   // a - b
        sub             v3.8h,   v1.8h,   v3.8h
        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h,   v3.8h,   v4.8h
        add             v0.8h,   v0.8h,   v2.8h
        add             v1.8h,   v1.8h,   v3.8h
        str             d0,        [x0], #8
        str             d1,        [x8], #8
        st1             {v0.s}[2], [x0], x1
        st1             {v1.s}[2], [x8], x1
        b.gt            8b
        ret
// Width 16: twelve pixels stored per row, so the second vector of each row
// is only processed as .4h.
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v16.16b}, [x5]
        sub             x1,  x1,  #16             // the first store already adds 16
        neg             v17.16b, v16.16b          // -m
        sxtl            v16.8h,  v17.8b
        sxtl2           v17.8h,  v17.16b
        shl             v16.8h,  v16.8h,  #9      // -m << 9
        shl             v17.4h,  v17.4h,  #9
16:
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        ld1             {v0.8h, v1.8h}, [x0]
        subs            w4,  w4,  #2
        ld1             {v2.8h, v3.8h}, [x8]
        sub             v4.8h,   v0.8h,   v4.8h   // a - b
        sub             v5.4h,   v1.4h,   v5.4h
        sub             v6.8h,   v2.8h,   v6.8h
        sub             v7.4h,   v3.4h,   v7.4h
        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.4h,   v5.4h,   v17.4h
        sqrdmulh        v6.8h,   v6.8h,   v16.8h
        sqrdmulh        v7.4h,   v7.4h,   v17.4h
        add             v0.8h,   v0.8h,   v4.8h
        add             v1.4h,   v1.4h,   v5.4h
        add             v2.8h,   v2.8h,   v6.8h
        add             v3.4h,   v3.4h,   v7.4h
        st1             {v0.8h}, [x0], #16
        st1             {v2.8h}, [x8], #16
        st1             {v1.4h}, [x0], x1
        st1             {v3.4h}, [x8], x1
        b.gt            16b
        ret
// Width 32: 24 pixels (three vectors) blended and stored per row. The fourth
// vector of each operand row (v19, v23) is loaded but not used.
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v24.16b, v25.16b},  [x5]
        neg             v26.16b, v24.16b          // -m
        neg             v27.8b,  v25.8b
        sxtl            v24.8h,  v26.8b
        sxtl2           v25.8h,  v26.16b
        sxtl            v26.8h,  v27.8b
        shl             v24.8h,  v24.8h,  #9      // -m << 9
        shl             v25.8h,  v25.8h,  #9
        shl             v26.8h,  v26.8h,  #9
32:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
        ld1             {v0.8h, v1.8h, v2.8h}, [x0]
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
        ld1             {v4.8h, v5.8h, v6.8h}, [x8]
        subs            w4,  w4,  #2
        sub             v16.8h,  v0.8h,   v16.8h  // a - b
        sub             v17.8h,  v1.8h,   v17.8h
        sub             v18.8h,  v2.8h,   v18.8h
        sub             v20.8h,  v4.8h,   v20.8h
        sub             v21.8h,  v5.8h,   v21.8h
        sub             v22.8h,  v6.8h,   v22.8h
        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v17.8h,  v17.8h,  v25.8h
        sqrdmulh        v18.8h,  v18.8h,  v26.8h
        sqrdmulh        v20.8h,  v20.8h,  v24.8h
        sqrdmulh        v21.8h,  v21.8h,  v25.8h
        sqrdmulh        v22.8h,  v22.8h,  v26.8h
        add             v0.8h,   v0.8h,   v16.8h
        add             v1.8h,   v1.8h,   v17.8h
        add             v2.8h,   v2.8h,   v18.8h
        add             v4.8h,   v4.8h,   v20.8h
        add             v5.8h,   v5.8h,   v21.8h
        add             v6.8h,   v6.8h,   v22.8h
        st1             {v0.8h, v1.8h, v2.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h}, [x8], x1
        b.gt            32b
        ret
endfunc

// Dispatch table of blend_v_16bpc_neon, indexed by clz(w) - 26 and read with
// ldrsw. Entry 0 is the 320 label and entry 4 the 20 label. Each entry is
// the distance from blend_v_tbl to the label.
jumptable blend_v_tbl
        .word 320b - blend_v_tbl
        .word 160b - blend_v_tbl
        .word 80b  - blend_v_tbl
        .word 40b  - blend_v_tbl
        .word 20b  - blend_v_tbl
endjumptable


// Plain row-by-row copy of 16 bit pixels, without any filtering.
// This has got the same signature as the put_8tap functions,
// and assumes that x9 is set to (clz(w)-24).
// Register use, as established by the code below:
//   x0  destination pointer, x1 destination stride in bytes
//   x2  source pointer,      x3 source stride in bytes
//   w5  number of rows (decremented by 2 per iteration for the widths
//       2, 4 and 8, by 1 for the wider cases)
//   x9  index into put_16bpc_tbl on entry, reused as a scratch register
// Clobbers x6-x13 and the vector registers used by the selected case.
function put_16bpc_neon, export=1
        movrel          x10, put_16bpc_tbl
        ldrsw           x9, [x10, x9, lsl #2]
        add             x10, x10, x9
        br              x10

// Width 2: 4 bytes per row, two rows per iteration.
20:
        AARCH64_VALID_JUMP_TARGET
2:
        ld1r            {v0.4s},   [x2], x3
        ld1r            {v1.4s},   [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.s}[0], [x0], x1
        st1             {v1.s}[0], [x0], x1
        b.gt            2b
        ret
// Width 4: 8 bytes per row, two rows per iteration.
40:
        AARCH64_VALID_JUMP_TARGET
4:
        ld1             {v0.4h}, [x2], x3
        ld1             {v1.4h}, [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.4h}, [x0], x1
        st1             {v1.4h}, [x0], x1
        b.gt            4b
        ret
// Width 8: 16 bytes per row. x8/x9 track the odd rows of destination and
// source, and both strides are doubled.
80:
        AARCH64_VALID_JUMP_TARGET
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        add             x9,  x2,  x3
        lsl             x3,  x3,  #1
8:
        ld1             {v0.8h}, [x2], x3
        ld1             {v1.8h}, [x9], x3
        subs            w5,  w5,  #2
        st1             {v0.8h}, [x0], x1
        st1             {v1.8h}, [x8], x1
        b.gt            8b
        ret
// Width 16: 32 bytes per row, copied through general purpose registers.
160:
        AARCH64_VALID_JUMP_TARGET
16:
        ldp             x6,  x7,  [x2]
        ldp             x8,  x9,  [x2, #16]
        stp             x6,  x7,  [x0]
        subs            w5,  w5,  #1
        stp             x8,  x9,  [x0, #16]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            16b
        ret
// Width 32: 64 bytes per row, copied through general purpose registers.
320:
        AARCH64_VALID_JUMP_TARGET
32:
        ldp             x6,  x7,  [x2]
        ldp             x8,  x9,  [x2, #16]
        stp             x6,  x7,  [x0]
        ldp             x10, x11, [x2, #32]
        stp             x8,  x9,  [x0, #16]
        subs            w5,  w5,  #1
        ldp             x12, x13, [x2, #48]
        stp             x10, x11, [x0, #32]
        stp             x12, x13, [x0, #48]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            32b
        ret
// Width 64: 128 bytes per row, copied through q registers.
640:
        AARCH64_VALID_JUMP_TARGET
64:
        ldp             q0,  q1,  [x2]
        ldp             q2,  q3,  [x2, #32]
        stp             q0,  q1,  [x0]
        ldp             q4,  q5,  [x2, #64]
        stp             q2,  q3,  [x0, #32]
        ldp             q6,  q7,  [x2, #96]
        subs            w5,  w5,  #1
        stp             q4,  q5,  [x0, #64]
        stp             q6,  q7,  [x0, #96]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            64b
        ret
// Width 128: 256 bytes per row, copied through q registers.
1280:
        AARCH64_VALID_JUMP_TARGET
128:
        ldp             q0,  q1,  [x2]
        ldp             q2,  q3,  [x2, #32]
        stp             q0,  q1,  [x0]
        ldp             q4,  q5,  [x2, #64]
        stp             q2,  q3,  [x0, #32]
        ldp             q6,  q7,  [x2, #96]
        subs            w5,  w5,  #1
        stp             q4,  q5,  [x0, #64]
        ldp             q16, q17, [x2, #128]
        stp             q6,  q7,  [x0, #96]
        ldp             q18, q19, [x2, #160]
        stp             q16, q17, [x0, #128]
        ldp             q20, q21, [x2, #192]
        stp             q18, q19, [x0, #160]
        ldp             q22, q23, [x2, #224]
        stp             q20, q21, [x0, #192]
        stp             q22, q23, [x0, #224]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            128b
        ret
endfunc

// Dispatch table of put_16bpc_neon, indexed by the value passed in x9
// (documented there as clz(w) - 24) and read with ldrsw. Entry 0 is the 1280
// label and entry 6 the 20 label. Each entry is the distance from
// put_16bpc_tbl to the label.
jumptable put_16bpc_tbl
        .word 1280b - put_16bpc_tbl
        .word 640b  - put_16bpc_tbl
        .word 320b  - put_16bpc_tbl
        .word 160b  - put_16bpc_tbl
        .word 80b   - put_16bpc_tbl
        .word 40b   - put_16bpc_tbl
        .word 20b   - put_16bpc_tbl
endjumptable


// Converts unfiltered 16 bit pixels into the intermediate format:
// every element becomes (pixel << intermediate_bits) - PREP_BIAS.
// This has got the same signature as the prep_8tap functions,
// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
// x8 to w*2.
// Register use, as established by the code below:
//   x0  output pointer, written as tightly packed rows
//   x1  source pointer, x2 source stride in bytes
//   w4  number of rows (decremented by 2 per iteration for the widths
//       4, 8 and 16, by 1 for the wider cases)
//   x8  only used by the 64 and 128 wide cases, to advance x0 per row
//   x9  index into prep_16bpc_tbl on entry, reused as a scratch register
//   v30 PREP_BIAS in every lane, v31 the shift amount in every lane
function prep_16bpc_neon
        movrel          x10, prep_16bpc_tbl
        ldrsw           x9, [x10, x9, lsl #2]
        dup             v31.8h,  w7   // intermediate_bits
        movi            v30.8h,  #(PREP_BIAS >> 8), lsl #8
        add             x10, x10, x9
        br              x10

// Width 4: two rows packed into one vector. x9 tracks the odd source rows.
40:
        AARCH64_VALID_JUMP_TARGET
        add             x9,  x1,  x2
        lsl             x2,  x2,  #1
4:
        ld1             {v0.8b},   [x1], x2
        ld1             {v0.d}[1], [x9], x2
        subs            w4,  w4,  #2
        sshl            v0.8h,   v0.8h,   v31.8h
        sub             v0.8h,   v0.8h,   v30.8h
        st1             {v0.8h}, [x0], #16
        b.gt            4b
        ret
// Width 8: one vector per row, two rows per iteration.
80:
        AARCH64_VALID_JUMP_TARGET
        add             x9,  x1,  x2
        lsl             x2,  x2,  #1
8:
        ld1             {v0.8h}, [x1], x2
        ld1             {v1.8h}, [x9], x2
        subs            w4,  w4,  #2
        sshl            v0.8h,   v0.8h,   v31.8h
        sshl            v1.8h,   v1.8h,   v31.8h
        sub             v0.8h,   v0.8h,   v30.8h
        sub             v1.8h,   v1.8h,   v30.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            8b
        ret
// Width 16: two vectors per row, two rows per iteration.
160:
        AARCH64_VALID_JUMP_TARGET
16:
        ldp             q0,  q1,  [x1]
        add             x1,  x1,  x2
        sshl            v0.8h,   v0.8h,   v31.8h
        ldp             q2,  q3,  [x1]
        add             x1,  x1,  x2
        subs            w4,  w4,  #2
        sshl            v1.8h,   v1.8h,   v31.8h
        sshl            v2.8h,   v2.8h,   v31.8h
        sshl            v3.8h,   v3.8h,   v31.8h
        sub             v0.8h,   v0.8h,   v30.8h
        sub             v1.8h,   v1.8h,   v30.8h
        sub             v2.8h,   v2.8h,   v30.8h
        sub             v3.8h,   v3.8h,   v30.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            16b
        ret
// Width 32: four vectors per row, one row per iteration.
320:
        AARCH64_VALID_JUMP_TARGET
32:
        ldp             q0,  q1,  [x1]
        sshl            v0.8h,   v0.8h,   v31.8h
        ldp             q2,  q3,  [x1, #32]
        add             x1,  x1,  x2
        sshl            v1.8h,   v1.8h,   v31.8h
        sshl            v2.8h,   v2.8h,   v31.8h
        sshl            v3.8h,   v3.8h,   v31.8h
        subs            w4,  w4,  #1
        sub             v0.8h,   v0.8h,   v30.8h
        sub             v1.8h,   v1.8h,   v30.8h
        sub             v2.8h,   v2.8h,   v30.8h
        sub             v3.8h,   v3.8h,   v30.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            32b
        ret
// Width 64: eight vectors per row. The output advances by x8 per row.
640:
        AARCH64_VALID_JUMP_TARGET
64:
        ldp             q0,  q1,  [x1]
        subs            w4,  w4,  #1
        sshl            v0.8h,   v0.8h,   v31.8h
        ldp             q2,  q3,  [x1, #32]
        sshl            v1.8h,   v1.8h,   v31.8h
        ldp             q4,  q5,  [x1, #64]
        sshl            v2.8h,   v2.8h,   v31.8h
        sshl            v3.8h,   v3.8h,   v31.8h
        ldp             q6,  q7,  [x1, #96]
        add             x1,  x1,  x2
        sshl            v4.8h,   v4.8h,   v31.8h
        sshl            v5.8h,   v5.8h,   v31.8h
        sshl            v6.8h,   v6.8h,   v31.8h
        sshl            v7.8h,   v7.8h,   v31.8h
        sub             v0.8h,   v0.8h,   v30.8h
        sub             v1.8h,   v1.8h,   v30.8h
        sub             v2.8h,   v2.8h,   v30.8h
        sub             v3.8h,   v3.8h,   v30.8h
        stp             q0,  q1,  [x0]
        sub             v4.8h,   v4.8h,   v30.8h
        sub             v5.8h,   v5.8h,   v30.8h
        stp             q2,  q3,  [x0, #32]
        sub             v6.8h,   v6.8h,   v30.8h
        sub             v7.8h,   v7.8h,   v30.8h
        stp             q4,  q5,  [x0, #64]
        stp             q6,  q7,  [x0, #96]
        add             x0,  x0,  x8
        b.gt            64b
        ret
// Width 128: sixteen vectors per row. The output advances by x8 per row.
1280:
        AARCH64_VALID_JUMP_TARGET
128:
        ldp             q0,  q1,  [x1]
        subs            w4,  w4,  #1
        sshl            v0.8h,   v0.8h,   v31.8h
        ldp             q2,  q3,  [x1, #32]
        sshl            v1.8h,   v1.8h,   v31.8h
        ldp             q4,  q5,  [x1, #64]
        sshl            v2.8h,   v2.8h,   v31.8h
        sshl            v3.8h,   v3.8h,   v31.8h
        ldp             q6,  q7,  [x1, #96]
        sshl            v4.8h,   v4.8h,   v31.8h
        sshl            v5.8h,   v5.8h,   v31.8h
        ldp             q16, q17, [x1, #128]
        sshl            v6.8h,   v6.8h,   v31.8h
        sshl            v7.8h,   v7.8h,   v31.8h
        ldp             q18, q19, [x1, #160]
        sshl            v16.8h,  v16.8h,  v31.8h
        sshl            v17.8h,  v17.8h,  v31.8h
        ldp             q20, q21, [x1, #192]
        sshl            v18.8h,  v18.8h,  v31.8h
        sshl            v19.8h,  v19.8h,  v31.8h
        ldp             q22, q23, [x1, #224]
        add             x1,  x1,  x2
        sshl            v20.8h,  v20.8h,  v31.8h
        sshl            v21.8h,  v21.8h,  v31.8h
        sshl            v22.8h,  v22.8h,  v31.8h
        sshl            v23.8h,  v23.8h,  v31.8h
        sub             v0.8h,   v0.8h,   v30.8h
        sub             v1.8h,   v1.8h,   v30.8h
        sub             v2.8h,   v2.8h,   v30.8h
        sub             v3.8h,   v3.8h,   v30.8h
        stp             q0,  q1,  [x0]
        sub             v4.8h,   v4.8h,   v30.8h
        sub             v5.8h,   v5.8h,   v30.8h
        stp             q2,  q3,  [x0, #32]
        sub             v6.8h,   v6.8h,   v30.8h
        sub             v7.8h,   v7.8h,   v30.8h
        stp             q4,  q5,  [x0, #64]
        sub             v16.8h,  v16.8h,  v30.8h
        sub             v17.8h,  v17.8h,  v30.8h
        stp             q6,  q7,  [x0, #96]
        sub             v18.8h,  v18.8h,  v30.8h
        sub             v19.8h,  v19.8h,  v30.8h
        stp             q16, q17, [x0, #128]
        sub             v20.8h,  v20.8h,  v30.8h
        sub             v21.8h,  v21.8h,  v30.8h
        stp             q18, q19, [x0, #160]
        sub             v22.8h,  v22.8h,  v30.8h
        sub             v23.8h,  v23.8h,  v30.8h
        stp             q20, q21, [x0, #192]
        stp             q22, q23, [x0, #224]
        add             x0,  x0,  x8
        b.gt            128b
        ret
endfunc

// Dispatch table of prep_16bpc_neon, indexed by the value passed in x9
// (documented there as clz(w) - 24) and read with ldrsw. Entry 0 is the 1280
// label and entry 5 the 40 label. The table has no entry for a width of 2.
jumptable prep_16bpc_tbl
        .word 1280b - prep_16bpc_tbl
        .word 640b  - prep_16bpc_tbl
        .word 320b  - prep_16bpc_tbl
        .word 160b  - prep_16bpc_tbl
        .word 80b   - prep_16bpc_tbl
        .word 40b   - prep_16bpc_tbl
endjumptable


// Loads lane 0 (element size given by wd, e.g. .s) of up to seven registers,
// alternating between the pointers s0 and s1 and post-incrementing the used
// pointer by strd after every load: d0, d2, d4, d6 come from s0 and
// d1, d3, d5 from s1.
// d0 and d1 are mandatory. d2 and d3 are loaded together when d2 is given.
// d4, d5 and d6 are each optional on their own.
.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}[0], [\s0], \strd
        ld1             {\d1\wd}[0], [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}[0], [\s0], \strd
        ld1             {\d3\wd}[0], [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}[0], [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}[0], [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}[0], [\s0], \strd
.endif
.endm
// Loads up to seven whole registers with the arrangement wd (e.g. .4h or
// .8h), alternating between the pointers s0 and s1 and post-incrementing the
// used pointer by strd after every load: d0, d2, d4, d6 come from s0 and
// d1, d3, d5 from s1.
// d0 and d1 are mandatory. d2 and d3 are loaded together when d2 is given.
// d4, d5 and d6 are each optional on their own.
.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}, [\s0], \strd
        ld1             {\d1\wd}, [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}, [\s0], \strd
        ld1             {\d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}, [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}, [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}, [\s0], \strd
.endif
.endm
// Loads up to three register pairs with the arrangement wd, one ld1 per
// pair, each post-incrementing its pointer by strd: d0/d1 from s0, then
// d2/d3 from s1 when d2 is given, then d4/d5 from s0 when d4 is given.
.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
        ld1             {\d0\wd, \d1\wd}, [\s0], \strd
.ifnb \d2
        ld1             {\d2\wd, \d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd, \d5\wd}, [\s0], \strd
.endif
.endm
// Convenience wrappers that fix the element size or arrangement of the
// generic load macros.
// load_s: one 32 bit lane (lane 0) per register, via load_slice.
.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
// load_4h: four halfwords (64 bits) per register, via load_reg.
.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
// load_8h: eight halfwords (128 bits) per register, via load_reg.
.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
// load_16h: sixteen halfwords per load, as a pair of .8h registers, via
// load_regpair.
.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
        load_regpair    \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
.endm
// Chains neighbouring registers with trn1: each rN becomes trn1(rN, rN+1)
// for the arrangement wd, so the even-numbered elements of rN+1 land in the
// odd-numbered positions of rN. The registers are updated in ascending
// order, so every step still reads the unmodified successor.
// r0-r2 are mandatory. When r3 is given, r2 and r3 are updated as well and
// r4 is read (but never written).
.macro interleave_1 wd, r0, r1, r2, r3, r4
        trn1            \r0\wd, \r0\wd, \r1\wd
        trn1            \r1\wd, \r1\wd, \r2\wd
.ifnb \r3
        trn1            \r2\wd, \r2\wd, \r3\wd
        trn1            \r3\wd, \r3\wd, \r4\wd
.endif
.endm
// interleave_1 for 32 bit elements in 64 bit registers (.2s).
.macro interleave_1_s r0, r1, r2, r3, r4
        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
.endm
// Clamps each given register to an unsigned upper bound: rN = umin(rN, c),
// using the arrangement wd. r0 is mandatory and r1 is optional. r2 and r3
// are processed together when r2 is given.
.macro umin_h c, wd, r0, r1, r2, r3
        umin            \r0\wd,  \r0\wd,  \c\wd
.ifnb \r1
        umin            \r1\wd,  \r1\wd,  \c\wd
.endif
.ifnb \r2
        umin            \r2\wd,  \r2\wd,  \c\wd
        umin            \r3\wd,  \r3\wd,  \c\wd
.endif
.endm
// Subtracts the register c from each given register: rN = rN - c, using the
// arrangement wd. r0 is mandatory and r1 is optional. r2 and r3 are
// processed together when r2 is given.
.macro sub_h c, wd, r0, r1, r2, r3
        sub             \r0\wd,  \r0\wd,  \c\wd
.ifnb \r1
        sub             \r1\wd,  \r1\wd,  \c\wd
.endif
.ifnb \r2
        sub             \r2\wd,  \r2\wd,  \c\wd
        sub             \r3\wd,  \r3\wd,  \c\wd
.endif
.endm
// 4-tap multiply-accumulate on the low four halfwords of s0-s3:
// d.4s = s0 * v0.h[0] + s1 * v0.h[1] + s2 * v0.h[2] + s3 * v0.h[3].
// The coefficients are taken from v0, which the caller must have set up.
.macro smull_smlal_4tap d, s0, s1, s2, s3
        smull           \d\().4s,  \s0\().4h,  v0.h[0]
        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
.endm
// Same as smull_smlal_4tap, but operating on the high four halfwords of
// s0-s3 (the smull2/smlal2 forms).
.macro smull2_smlal2_4tap d, s0, s1, s2, s3
        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
.endm
// 6-tap multiply-accumulate on the low four halfwords of s1-s6:
// d.4s = sum of sN * v0.h[N] for N = 1..6.
// The parameter list matches the 8-tap variant, but s0 and s7 (and the
// coefficients v0.h[0] and v0.h[7]) are not used here.
.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
        smull           \d\().4s,  \s1\().4h,  v0.h[1]
        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
        smlal           \d\().4s,  \s4\().4h,  v0.h[4]
        smlal           \d\().4s,  \s5\().4h,  v0.h[5]
        smlal           \d\().4s,  \s6\().4h,  v0.h[6]
.endm
// Same as smull_smlal_6tap, but operating on the high four halfwords of
// s1-s6 (the smull2/smlal2 forms). s0 and s7 are unused here as well.
.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
        smull2          \d\().4s,  \s1\().8h,  v0.h[1]
        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
        smlal2          \d\().4s,  \s4\().8h,  v0.h[4]
        smlal2          \d\().4s,  \s5\().8h,  v0.h[5]
        smlal2          \d\().4s,  \s6\().8h,  v0.h[6]
.endm
// 8-tap multiply-accumulate on the low four halfwords of s0-s7:
// d.4s = sum of sN * v0.h[N] for N = 0..7.
// The coefficients are taken from v0, which the caller must have set up.
.macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
        smull           \d\().4s,  \s0\().4h,  v0.h[0]
        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
        smlal           \d\().4s,  \s4\().4h,  v0.h[4]
        smlal           \d\().4s,  \s5\().4h,  v0.h[5]
        smlal           \d\().4s,  \s6\().4h,  v0.h[6]
        smlal           \d\().4s,  \s7\().4h,  v0.h[7]
.endm
// Same as smull_smlal_8tap, but operating on the high four halfwords of
// s0-s7 (the smull2/smlal2 forms).
.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
        smlal2          \d\().4s,  \s4\().8h,  v0.h[4]
        smlal2          \d\().4s,  \s5\().8h,  v0.h[5]
        smlal2          \d\().4s,  \s6\().8h,  v0.h[6]
        smlal2          \d\().4s,  \s7\().8h,  v0.h[7]
.endm
// Narrows signed 32 bit lanes to unsigned 16 bit lanes with a rounding right
// shift by the immediate shift and unsigned saturation.
// r0 is narrowed into its own low half. When r1 is given, it fills the high
// half of r0. When r2 is given, r2 and r3 are packed into r2 the same way.
.macro sqrshrun_h shift, r0, r1, r2, r3
        sqrshrun        \r0\().4h, \r0\().4s,  #\shift
.ifnb \r1
        sqrshrun2       \r0\().8h, \r1\().4s,  #\shift
.endif
.ifnb \r2
        sqrshrun        \r2\().4h, \r2\().4s,  #\shift
        sqrshrun2       \r2\().8h, \r3\().4s,  #\shift
.endif
.endm
// Truncating narrow of two .4s registers into one .8h register: r0 receives
// the even-numbered halfwords of r0 followed by those of r1. When r2 is
// given, r2 and r3 are packed into r2 the same way.
.macro xtn_h r0, r1, r2, r3
        uzp1            \r0\().8h,  \r0\().8h,  \r1\().8h // Same as xtn, xtn2
.ifnb \r2
        uzp1            \r2\().8h,  \r2\().8h,  \r3\().8h // Ditto
.endif
.endm
// Applies srshl (signed rounding shift left by a register) to the 32 bit
// lanes of r0 and r1, and of r2 and r3 when r2 is given. The shift amounts
// come from the lanes of the shift register, where a negative amount
// produces a rounding right shift.
.macro srshl_s shift, r0, r1, r2, r3
        srshl           \r0\().4s,  \r0\().4s,  \shift\().4s
        srshl           \r1\().4s,  \r1\().4s,  \shift\().4s
.ifnb \r2
        srshl           \r2\().4s,  \r2\().4s,  \shift\().4s
        srshl           \r3\().4s,  \r3\().4s,  \shift\().4s
.endif
.endm
// Stores the 32 bit lanes of reg alternately through x0 and x9, each store
// post-incrementing its pointer by strd: lane 0 to x0 and lane 1 to x9, and,
// when lanes is greater than 2, lane 2 to x0 and lane 3 to x9.
.macro st_s strd, reg, lanes
        st1             {\reg\().s}[0], [x0], \strd
        st1             {\reg\().s}[1], [x9], \strd
.if \lanes > 2
        st1             {\reg\().s}[2], [x0], \strd
        st1             {\reg\().s}[3], [x9], \strd
.endif
.endm
// Stores the low 64 bits of r0 through x0 and the high 64 bits through x9,
// each store post-incrementing its pointer by strd. When r1 is given, it is
// stored the same way afterwards.
.macro st_d strd, r0, r1
        st1             {\r0\().8b},   [x0], \strd
        st1             {\r0\().d}[1], [x9], \strd
.ifnb \r1
        st1             {\r1\().8b},   [x0], \strd
        st1             {\r1\().d}[1], [x9], \strd
.endif
.endm
// Finishes 32 bit filter sums for blocks four pixels wide and stores them.
// r0/r1 are packed into r0, and r2/r3 into r2 when r2 is given.
//   type put:  rounding shift right by 6 with unsigned saturation, then
//              clamp to the value held in v31.
//   otherwise: rounding shift by v30, truncate to 16 bits, subtract v29.
// Each result register holds two rows: the low half is stored through x0 and
// the high half through x9 (see st_d).
.macro shift_store_4 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h      6,   \r0, \r1, \r2, \r3
        umin_h          v31, .8h, \r0, \r2
.else
        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
        xtn_h           \r0, \r1, \r2, \r3
        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
.endif
        st_d            \strd, \r0, \r2
.endm
// Stores whole registers with the arrangement wd alternately through x0 and
// x9, each store post-incrementing its pointer by strd: r0, r2, r4, r6 go to
// x0 and r1, r3, r5, r7 to x9.
// r0 and r1 are mandatory. r2/r3 are stored when r2 is given, and r4-r7 are
// stored when r4 is given.
.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
        st1             {\r0\wd}, [x0], \strd
        st1             {\r1\wd}, [x9], \strd
.ifnb \r2
        st1             {\r2\wd}, [x0], \strd
        st1             {\r3\wd}, [x9], \strd
.endif
.ifnb \r4
        st1             {\r4\wd}, [x0], \strd
        st1             {\r5\wd}, [x9], \strd
        st1             {\r6\wd}, [x0], \strd
        st1             {\r7\wd}, [x9], \strd
.endif
.endm
// st_reg for full .8h registers.
.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
// Finishes 32 bit filter sums for blocks eight pixels wide and stores them.
// r0/r1 are packed into r0 and r2/r3 into r2.
//   type put:  rounding shift right by 6 with unsigned saturation, then
//              clamp to the value held in v31.
//   otherwise: rounding shift by v30, truncate to 16 bits, subtract v29.
// r0 is then stored through x0 and r2 through x9 as full .8h registers.
.macro shift_store_8 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h      6,   \r0, \r1, \r2, \r3
        umin_h          v31, .8h, \r0, \r2
.else
        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
        xtn_h           \r0, \r1, \r2, \r3
        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
.endif
        st_8h           \strd, \r0, \r2
.endm
// Finishes 32 bit filter sums for sixteen pixels of one row and stores them
// with a single st1 through dst, post-incremented by strd.
// r0/r1 are packed into r0 and r2/r3 into r2, as in shift_store_8. The final
// clamp (put) or bias subtraction (otherwise) then writes the second half
// into r1, so that r0 and r1 form the register pair that is stored.
// Unlike shift_store_4 and shift_store_8, the pointer is an explicit
// parameter instead of the fixed x0/x9 pair.
.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h      6,   \r0, \r1, \r2, \r3
        umin            \r0\().8h, \r0\().8h, v31.8h
        umin            \r1\().8h, \r2\().8h, v31.8h
.else
        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
        xtn_h           \r0, \r1, \r2, \r3
        sub             \r0\().8h, \r0\().8h, v29.8h
        sub             \r1\().8h, \r2\().8h, v29.8h
.endif
        st1             {\r0\().8h, \r1\().8h}, [\dst], \strd
.endm

// Defines the exported entry point <op>_8tap_<type>_16bpc_neon. It only
// loads the horizontal filter selector type_h into w9 and the vertical one
// type_v into w10, then tail-branches to the shared <op>_<taps>_neon body
// (no link register update, so that body returns straight to the caller).
.macro make_8tap_fn op, type, type_h, type_v, taps
function \op\()_8tap_\type\()_16bpc_neon, export=1
        mov             w9,  \type_h
        mov             w10, \type_v
        b               \op\()_\taps\()_neon
endfunc
.endm

// Filter type selectors, passed as type_h/type_v to make_8tap_fn and added
// to mx/my in filter_fn after mx/my have been multiplied by 0x4081.
// Each value packs two offsets, both a multiple of 15:
//   bits 7 and up: offset for the field labelled 8tap in filter_fn
//                  (0, 1, 2 times 15 for REGULAR, SMOOTH, SHARP)
//   bits 0-6:      offset for the field labelled 4tap in filter_fn
//                  (3, 4, 3 times 15; SHARP shares the REGULAR value)
// NOTE(review): 15 is presumably the number of entries per filter set in
// mc_subpel_filters -- confirm against the table definition.
// No spaces in these expressions, due to gas-preprocessor.
#define REGULAR ((0*15<<7)|3*15)
#define SMOOTH  ((1*15<<7)|4*15)
#define SHARP   ((2*15<<7)|3*15)

.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps
function \type\()_\taps\()_neon
.ifc \bdmax, w8
        ldr             w8,  [sp]
.endif
        mov             w11,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
        mul             \mx,  \mx, w11
        mul             \my,  \my, w11
        add             \mx,  \mx, w9  // mx, 8tap_h, 4tap_h
        add             \my,  \my, w10 // my, 8tap_v, 4tap_v
.ifc \type, prep
        uxtw            \d_strd, \w
        lsl             \d_strd, \d_strd, #1
.endif

        dup             v31.8h,  \bdmax        // bitdepth_max
        clz             \bdmax,  \bdmax
        clz             w9,  \w
        sub             \bdmax,  \bdmax,  #18  // intermediate_bits = clz(bitdepth_max) - 18
        mov             w12, #6
        tst             \mx, #(0x7f << 14)
        sub             w9,  w9,  #24
        add             w13, w12, \bdmax       // 6 + intermediate_bits
        sub             w12, w12, \bdmax       // 6 - intermediate_bits
        movrel          x11, X(mc_subpel_filters), -8
        b.ne            L(\type\()_\taps\()_h)
        tst             \my, #(0x7f << 14)
        b.ne            L(\type\()_\taps\()_v)
        b               \type\()_16bpc_neon

L(\type\()_\taps\()_h):
        cmp             \w,   #4
        ubfx            w10,  \mx, #7, #7
        and             \mx,  \mx, #0x7f
        b.le            4f
        mov             \mx,  w10
4:
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5
C=100 H=100 G=100

¤ Dauer der Verarbeitung: 0.22 Sekunden  (vorverarbeitet)  ¤

*© Formatika GbR, Deutschland






Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit noch Qualität der bereitgestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.






                                                                                                                                                                                                                                                                                                                                                                                                     


Neuigkeiten

     Aktuelles
     Motto des Tages

Software

     Produkte
     Quellcodebibliothek

Aktivitäten

     Artikel über Sicherheit
     Anleitung zur Aktivierung von SSL

Muße

     Gedichte
     Musik
     Bilder

Jenseits des Üblichen ....

Besucherstatistik

Besucherstatistik

Monitoring

Montastic status badge